author    James Morris <james.l.morris@oracle.com>    2014-04-13 21:23:14 -0400
committer James Morris <james.l.morris@oracle.com>    2014-04-13 21:23:14 -0400
commit    ecd740c6f2f092b90b95fa35f757973589eaaca2 (patch)
tree      ce02b1e18c4fc5729699251460cd8be7604d8401 /fs
parent    f64410ec665479d7b4b77b7519e814253ed0f686 (diff)
parent    455c6fdbd219161bd09b1165f11699d6d73de11c (diff)
Merge commit 'v3.14' into next
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 4
-rw-r--r--  fs/9p/cache.c | 3
-rw-r--r--  fs/9p/v9fs.c | 9
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_addr.c | 7
-rw-r--r--  fs/9p/vfs_file.c | 142
-rw-r--r--  fs/9p/vfs_inode.c | 26
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 17
-rw-r--r--  fs/9p/vfs_super.c | 8
-rw-r--r--  fs/9p/xattr.c | 10
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/affs/super.c | 57
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/proc.c | 122
-rw-r--r--  fs/anon_inodes.c | 34
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 4
-rw-r--r--  fs/autofs4/dev-ioctl.c | 16
-rw-r--r--  fs/autofs4/expire.c | 14
-rw-r--r--  fs/autofs4/inode.c | 49
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/autofs4/symlink.c | 4
-rw-r--r--  fs/autofs4/waitq.c | 16
-rw-r--r--  fs/befs/linuxvfs.c | 4
-rw-r--r--  fs/binfmt_elf.c | 3
-rw-r--r--  fs/bio-integrity.c | 184
-rw-r--r--  fs/bio.c | 507
-rw-r--r--  fs/btrfs/Kconfig | 3
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 142
-rw-r--r--  fs/btrfs/backref.c | 195
-rw-r--r--  fs/btrfs/btrfs_inode.h | 4
-rw-r--r--  fs/btrfs/check-integrity.c | 24
-rw-r--r--  fs/btrfs/compression.c | 41
-rw-r--r--  fs/btrfs/ctree.c | 552
-rw-r--r--  fs/btrfs/ctree.h | 141
-rw-r--r--  fs/btrfs/delayed-inode.c | 208
-rw-r--r--  fs/btrfs/delayed-inode.h | 8
-rw-r--r--  fs/btrfs/delayed-ref.c | 300
-rw-r--r--  fs/btrfs/delayed-ref.h | 26
-rw-r--r--  fs/btrfs/dev-replace.c | 56
-rw-r--r--  fs/btrfs/dir-item.c | 8
-rw-r--r--  fs/btrfs/disk-io.c | 269
-rw-r--r--  fs/btrfs/extent-tree.c | 618
-rw-r--r--  fs/btrfs/extent_io.c | 277
-rw-r--r--  fs/btrfs/extent_io.h | 9
-rw-r--r--  fs/btrfs/extent_map.c | 74
-rw-r--r--  fs/btrfs/file-item.c | 23
-rw-r--r--  fs/btrfs/file.c | 216
-rw-r--r--  fs/btrfs/free-space-cache.c | 23
-rw-r--r--  fs/btrfs/hash.c | 50
-rw-r--r--  fs/btrfs/hash.h | 11
-rw-r--r--  fs/btrfs/inode-item.c | 65
-rw-r--r--  fs/btrfs/inode.c | 500
-rw-r--r--  fs/btrfs/ioctl.c | 404
-rw-r--r--  fs/btrfs/lzo.c | 6
-rw-r--r--  fs/btrfs/ordered-data.c | 15
-rw-r--r--  fs/btrfs/orphan.c | 20
-rw-r--r--  fs/btrfs/print-tree.c | 4
-rw-r--r--  fs/btrfs/props.c | 427
-rw-r--r--  fs/btrfs/props.h | 42
-rw-r--r--  fs/btrfs/qgroup.c | 57
-rw-r--r--  fs/btrfs/raid56.c | 22
-rw-r--r--  fs/btrfs/reada.c | 9
-rw-r--r--  fs/btrfs/relocation.c | 105
-rw-r--r--  fs/btrfs/root-tree.c | 19
-rw-r--r--  fs/btrfs/scrub.c | 146
-rw-r--r--  fs/btrfs/send.c | 973
-rw-r--r--  fs/btrfs/super.c | 254
-rw-r--r--  fs/btrfs/sysfs.c | 623
-rw-r--r--  fs/btrfs/sysfs.h | 64
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 2
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 55
-rw-r--r--  fs/btrfs/transaction.h | 3
-rw-r--r--  fs/btrfs/tree-log.c | 209
-rw-r--r--  fs/btrfs/ulist.c | 117
-rw-r--r--  fs/btrfs/ulist.h | 39
-rw-r--r--  fs/btrfs/uuid-tree.c | 13
-rw-r--r--  fs/btrfs/volumes.c | 108
-rw-r--r--  fs/btrfs/xattr.c | 17
-rw-r--r--  fs/btrfs/xattr.h | 2
-rw-r--r--  fs/btrfs/zlib.c | 8
-rw-r--r--  fs/buffer.c | 20
-rw-r--r--  fs/ceph/Kconfig | 13
-rw-r--r--  fs/ceph/Makefile | 1
-rw-r--r--  fs/ceph/acl.c | 200
-rw-r--r--  fs/ceph/addr.c | 93
-rw-r--r--  fs/ceph/cache.h | 13
-rw-r--r--  fs/ceph/caps.c | 338
-rw-r--r--  fs/ceph/dir.c | 36
-rw-r--r--  fs/ceph/file.c | 438
-rw-r--r--  fs/ceph/inode.c | 36
-rw-r--r--  fs/ceph/ioctl.c | 8
-rw-r--r--  fs/ceph/mds_client.c | 132
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/ceph/strings.c | 2
-rw-r--r--  fs/ceph/super.c | 35
-rw-r--r--  fs/ceph/super.h | 49
-rw-r--r--  fs/ceph/xattr.c | 115
-rw-r--r--  fs/cifs/cifsacl.c | 101
-rw-r--r--  fs/cifs/cifsglob.h | 27
-rw-r--r--  fs/cifs/cifsproto.h | 31
-rw-r--r--  fs/cifs/cifssmb.c | 173
-rw-r--r--  fs/cifs/dir.c | 61
-rw-r--r--  fs/cifs/file.c | 96
-rw-r--r--  fs/cifs/inode.c | 193
-rw-r--r--  fs/cifs/link.c | 323
-rw-r--r--  fs/cifs/readdir.c | 2
-rw-r--r--  fs/cifs/smb1ops.c | 135
-rw-r--r--  fs/cifs/smb2glob.h | 3
-rw-r--r--  fs/cifs/smb2ops.c | 14
-rw-r--r--  fs/cifs/smb2pdu.c | 9
-rw-r--r--  fs/cifs/smb2proto.h | 3
-rw-r--r--  fs/cifs/transport.c | 29
-rw-r--r--  fs/cifs/xattr.c | 64
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/coredump.c | 1
-rw-r--r--  fs/coredump.h | 6
-rw-r--r--  fs/cramfs/inode.c | 50
-rw-r--r--  fs/cramfs/internal.h | 4
-rw-r--r--  fs/cramfs/uncompress.c | 2
-rw-r--r--  fs/dcache.c | 21
-rw-r--r--  fs/dcookies.c | 2
-rw-r--r--  fs/direct-io.c | 4
-rw-r--r--  fs/dlm/lowcomms.c | 12
-rw-r--r--  fs/ecryptfs/inode.c | 29
-rw-r--r--  fs/efs/super.c | 39
-rw-r--r--  fs/eventfd.c | 13
-rw-r--r--  fs/exec.c | 165
-rw-r--r--  fs/exofs/inode.c | 31
-rw-r--r--  fs/exofs/ore.c | 45
-rw-r--r--  fs/ext2/acl.c | 188
-rw-r--r--  fs/ext2/acl.h | 8
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/xattr.c | 8
-rw-r--r--  fs/ext2/xattr.h | 2
-rw-r--r--  fs/ext3/acl.c | 223
-rw-r--r--  fs/ext3/acl.h | 9
-rw-r--r--  fs/ext3/dir.c | 44
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/namei.c | 2
-rw-r--r--  fs/ext3/xattr.c | 8
-rw-r--r--  fs/ext3/xattr.h | 2
-rw-r--r--  fs/ext4/acl.c | 223
-rw-r--r--  fs/ext4/acl.h | 9
-rw-r--r--  fs/ext4/block_validity.c | 33
-rw-r--r--  fs/ext4/dir.c | 35
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/extents.c | 5
-rw-r--r--  fs/ext4/file.c | 3
-rw-r--r--  fs/ext4/inline.c | 26
-rw-r--r--  fs/ext4/inode.c | 32
-rw-r--r--  fs/ext4/ioctl.c | 9
-rw-r--r--  fs/ext4/namei.c | 7
-rw-r--r--  fs/ext4/page-io.c | 8
-rw-r--r--  fs/ext4/resize.c | 34
-rw-r--r--  fs/ext4/super.c | 20
-rw-r--r--  fs/ext4/xattr.c | 8
-rw-r--r--  fs/ext4/xattr.h | 2
-rw-r--r--  fs/f2fs/Makefile | 2
-rw-r--r--  fs/f2fs/acl.c | 174
-rw-r--r--  fs/f2fs/acl.h | 7
-rw-r--r--  fs/f2fs/checkpoint.c | 195
-rw-r--r--  fs/f2fs/data.c | 614
-rw-r--r--  fs/f2fs/debug.c | 53
-rw-r--r--  fs/f2fs/dir.c | 47
-rw-r--r--  fs/f2fs/f2fs.h | 199
-rw-r--r--  fs/f2fs/file.c | 87
-rw-r--r--  fs/f2fs/gc.c | 22
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/inline.c | 222
-rw-r--r--  fs/f2fs/inode.c | 23
-rw-r--r--  fs/f2fs/namei.c | 7
-rw-r--r--  fs/f2fs/node.c | 272
-rw-r--r--  fs/f2fs/node.h | 8
-rw-r--r--  fs/f2fs/recovery.c | 49
-rw-r--r--  fs/f2fs/segment.c | 584
-rw-r--r--  fs/f2fs/segment.h | 81
-rw-r--r--  fs/f2fs/super.c | 72
-rw-r--r--  fs/f2fs/xattr.c | 11
-rw-r--r--  fs/f2fs/xattr.h | 2
-rw-r--r--  fs/file.c | 123
-rw-r--r--  fs/file_table.c | 1
-rw-r--r--  fs/fs-writeback.c | 48
-rw-r--r--  fs/fscache/object-list.c | 5
-rw-r--r--  fs/fscache/object.c | 3
-rw-r--r--  fs/fuse/dev.c | 25
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 44
-rw-r--r--  fs/fuse/fuse_i.h | 5
-rw-r--r--  fs/generic_acl.c | 184
-rw-r--r--  fs/gfs2/acl.c | 234
-rw-r--r--  fs/gfs2/acl.h | 4
-rw-r--r--  fs/gfs2/aops.c | 23
-rw-r--r--  fs/gfs2/dir.c | 90
-rw-r--r--  fs/gfs2/dir.h | 19
-rw-r--r--  fs/gfs2/glock.c | 29
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 26
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 152
-rw-r--r--  fs/gfs2/lops.c | 7
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 60
-rw-r--r--  fs/gfs2/quota.c | 342
-rw-r--r--  fs/gfs2/quota.h | 1
-rw-r--r--  fs/gfs2/rgrp.c | 113
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  fs/gfs2/xattr.c | 4
-rw-r--r--  fs/hfsplus/acl.h | 9
-rw-r--r--  fs/hfsplus/catalog.c | 41
-rw-r--r--  fs/hfsplus/dir.c | 3
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 1
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 6
-rw-r--r--  fs/hfsplus/inode.c | 73
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hfsplus/posix_acl.c | 168
-rw-r--r--  fs/hfsplus/wrapper.c | 2
-rw-r--r--  fs/hfsplus/xattr.c | 150
-rw-r--r--  fs/hfsplus/xattr.h | 4
-rw-r--r--  fs/hostfs/hostfs_kern.c | 53
-rw-r--r--  fs/hpfs/alloc.c | 66
-rw-r--r--  fs/hpfs/buffer.c | 96
-rw-r--r--  fs/hpfs/hpfs_fn.h | 2
-rw-r--r--  fs/hpfs/super.c | 29
-rw-r--r--  fs/jbd/journal.c | 8
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/transaction.c | 6
-rw-r--r--  fs/jffs2/acl.c | 141
-rw-r--r--  fs/jffs2/acl.h | 7
-rw-r--r--  fs/jffs2/dir.c | 1
-rw-r--r--  fs/jffs2/file.c | 1
-rw-r--r--  fs/jffs2/fs.c | 7
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/nodelist.c | 28
-rw-r--r--  fs/jffs2/readinode.c | 26
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/xattr.c | 9
-rw-r--r--  fs/jfs/acl.c | 107
-rw-r--r--  fs/jfs/file.c | 4
-rw-r--r--  fs/jfs/jfs_acl.h | 7
-rw-r--r--  fs/jfs/jfs_logmgr.c | 12
-rw-r--r--  fs/jfs/jfs_metapage.c | 9
-rw-r--r--  fs/jfs/jfs_xattr.h | 2
-rw-r--r--  fs/jfs/namei.c | 1
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/jfs/xattr.c | 123
-rw-r--r--  fs/kernfs/Makefile | 5
-rw-r--r--  fs/kernfs/dir.c | 1077
-rw-r--r--  fs/kernfs/file.c | 867
-rw-r--r--  fs/kernfs/inode.c | 377
-rw-r--r--  fs/kernfs/kernfs-internal.h | 122
-rw-r--r--  fs/kernfs/mount.c | 171
-rw-r--r--  fs/kernfs/symlink.c | 151
-rw-r--r--  fs/lockd/svclock.c | 8
-rw-r--r--  fs/logfs/dev_bdev.c | 38
-rw-r--r--  fs/logfs/segment.c | 3
-rw-r--r--  fs/mount.h | 6
-rw-r--r--  fs/mpage.c | 19
-rw-r--r--  fs/namei.c | 91
-rw-r--r--  fs/namespace.c | 179
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 43
-rw-r--r--  fs/nfs/delegation.c | 11
-rw-r--r--  fs/nfs/dir.c | 36
-rw-r--r--  fs/nfs/direct.c | 279
-rw-r--r--  fs/nfs/file.c | 6
-rw-r--r--  fs/nfs/inode.c | 117
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/nfs3acl.c | 295
-rw-r--r--  fs/nfs/nfs3proc.c | 77
-rw-r--r--  fs/nfs/nfs3super.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4client.c | 29
-rw-r--r--  fs/nfs/nfs4filelayout.c | 34
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 2
-rw-r--r--  fs/nfs/nfs4namespace.c | 12
-rw-r--r--  fs/nfs/nfs4proc.c | 106
-rw-r--r--  fs/nfs/nfs4session.c | 25
-rw-r--r--  fs/nfs/nfs4session.h | 2
-rw-r--r--  fs/nfs/nfs4state.c | 23
-rw-r--r--  fs/nfs/nfs4super.c | 14
-rw-r--r--  fs/nfs/nfs4xdr.c | 49
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfs/pnfs.c | 67
-rw-r--r--  fs/nfs/pnfs.h | 16
-rw-r--r--  fs/nfs/read.c | 12
-rw-r--r--  fs/nfs/write.c | 26
-rw-r--r--  fs/nfsd/acl.h | 18
-rw-r--r--  fs/nfsd/cache.h | 8
-rw-r--r--  fs/nfsd/idmap.h | 4
-rw-r--r--  fs/nfsd/netns.h | 1
-rw-r--r--  fs/nfsd/nfs2acl.c | 72
-rw-r--r--  fs/nfsd/nfs3acl.c | 62
-rw-r--r--  fs/nfsd/nfs3xdr.c | 14
-rw-r--r--  fs/nfsd/nfs4acl.c | 139
-rw-r--r--  fs/nfsd/nfs4idmap.c | 50
-rw-r--r--  fs/nfsd/nfs4proc.c | 58
-rw-r--r--  fs/nfsd/nfs4state.c | 40
-rw-r--r--  fs/nfsd/nfs4xdr.c | 178
-rw-r--r--  fs/nfsd/nfscache.c | 36
-rw-r--r--  fs/nfsd/nfssvc.c | 30
-rw-r--r--  fs/nfsd/nfsxdr.c | 2
-rw-r--r--  fs/nfsd/vfs.c | 282
-rw-r--r--  fs/nfsd/vfs.h | 10
-rw-r--r--  fs/nfsd/xdr3.h | 3
-rw-r--r--  fs/nfsd/xdr4.h | 4
-rw-r--r--  fs/nilfs2/ioctl.c | 371
-rw-r--r--  fs/nilfs2/segbuf.c | 3
-rw-r--r--  fs/nilfs2/segment.c | 10
-rw-r--r--  fs/nls/mac-celtic.c | 1
-rw-r--r--  fs/nls/mac-centeuro.c | 1
-rw-r--r--  fs/nls/mac-croatian.c | 1
-rw-r--r--  fs/nls/mac-cyrillic.c | 1
-rw-r--r--  fs/nls/mac-gaelic.c | 1
-rw-r--r--  fs/nls/mac-greek.c | 1
-rw-r--r--  fs/nls/mac-iceland.c | 1
-rw-r--r--  fs/nls/mac-inuit.c | 1
-rw-r--r--  fs/nls/mac-roman.c | 1
-rw-r--r--  fs/nls/mac-romanian.c | 1
-rw-r--r--  fs/nls/mac-turkish.c | 1
-rw-r--r--  fs/nls/nls_ascii.c | 1
-rw-r--r--  fs/nls/nls_base.c | 5
-rw-r--r--  fs/nls/nls_cp1250.c | 1
-rw-r--r--  fs/nls/nls_cp1251.c | 1
-rw-r--r--  fs/nls/nls_cp1255.c | 1
-rw-r--r--  fs/nls/nls_cp437.c | 1
-rw-r--r--  fs/nls/nls_cp737.c | 1
-rw-r--r--  fs/nls/nls_cp775.c | 1
-rw-r--r--  fs/nls/nls_cp850.c | 1
-rw-r--r--  fs/nls/nls_cp852.c | 1
-rw-r--r--  fs/nls/nls_cp855.c | 1
-rw-r--r--  fs/nls/nls_cp857.c | 1
-rw-r--r--  fs/nls/nls_cp860.c | 1
-rw-r--r--  fs/nls/nls_cp861.c | 1
-rw-r--r--  fs/nls/nls_cp862.c | 1
-rw-r--r--  fs/nls/nls_cp863.c | 1
-rw-r--r--  fs/nls/nls_cp864.c | 1
-rw-r--r--  fs/nls/nls_cp865.c | 1
-rw-r--r--  fs/nls/nls_cp866.c | 1
-rw-r--r--  fs/nls/nls_cp869.c | 1
-rw-r--r--  fs/nls/nls_cp874.c | 1
-rw-r--r--  fs/nls/nls_cp932.c | 1
-rw-r--r--  fs/nls/nls_cp936.c | 1
-rw-r--r--  fs/nls/nls_cp949.c | 1
-rw-r--r--  fs/nls/nls_cp950.c | 1
-rw-r--r--  fs/nls/nls_euc-jp.c | 1
-rw-r--r--  fs/nls/nls_iso8859-1.c | 1
-rw-r--r--  fs/nls/nls_iso8859-13.c | 1
-rw-r--r--  fs/nls/nls_iso8859-14.c | 1
-rw-r--r--  fs/nls/nls_iso8859-15.c | 1
-rw-r--r--  fs/nls/nls_iso8859-2.c | 1
-rw-r--r--  fs/nls/nls_iso8859-3.c | 1
-rw-r--r--  fs/nls/nls_iso8859-4.c | 1
-rw-r--r--  fs/nls/nls_iso8859-5.c | 1
-rw-r--r--  fs/nls/nls_iso8859-6.c | 1
-rw-r--r--  fs/nls/nls_iso8859-7.c | 1
-rw-r--r--  fs/nls/nls_iso8859-9.c | 1
-rw-r--r--  fs/nls/nls_koi8-r.c | 1
-rw-r--r--  fs/nls/nls_koi8-ru.c | 1
-rw-r--r--  fs/nls/nls_koi8-u.c | 1
-rw-r--r--  fs/nls/nls_utf8.c | 1
-rw-r--r--  fs/notify/dnotify/dnotify.c | 34
-rw-r--r--  fs/notify/fanotify/fanotify.c | 234
-rw-r--r--  fs/notify/fanotify/fanotify.h | 30
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 63
-rw-r--r--  fs/notify/fsnotify.c | 42
-rw-r--r--  fs/notify/group.c | 7
-rw-r--r--  fs/notify/inotify/inotify.h | 21
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 161
-rw-r--r--  fs/notify/inotify/inotify_user.c | 131
-rw-r--r--  fs/notify/notification.c | 358
-rw-r--r--  fs/ntfs/file.c | 2
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 234
-rw-r--r--  fs/ocfs2/acl.h | 13
-rw-r--r--  fs/ocfs2/alloc.c | 50
-rw-r--r--  fs/ocfs2/cluster/Makefile | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 4
-rw-r--r--  fs/ocfs2/cluster/ver.c | 42
-rw-r--r--  fs/ocfs2/cluster/ver.h | 31
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmver.h | 31
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c | 42
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h | 31
-rw-r--r--  fs/ocfs2/dlmglue.c | 4
-rw-r--r--  fs/ocfs2/file.c | 67
-rw-r--r--  fs/ocfs2/ioctl.c | 7
-rw-r--r--  fs/ocfs2/localalloc.c | 42
-rw-r--r--  fs/ocfs2/localalloc.h | 6
-rw-r--r--  fs/ocfs2/move_extents.c | 77
-rw-r--r--  fs/ocfs2/namei.c | 44
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/quota_global.c | 27
-rw-r--r--  fs/ocfs2/quota_local.c | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 3
-rw-r--r--  fs/ocfs2/stack_user.c | 308
-rw-r--r--  fs/ocfs2/stackglue.c | 18
-rw-r--r--  fs/ocfs2/stackglue.h | 15
-rw-r--r--  fs/ocfs2/suballoc.c | 12
-rw-r--r--  fs/ocfs2/suballoc.h | 12
-rw-r--r--  fs/ocfs2/super.c | 20
-rw-r--r--  fs/ocfs2/ver.c | 43
-rw-r--r--  fs/ocfs2/ver.h | 31
-rw-r--r--  fs/ocfs2/xattr.c | 21
-rw-r--r--  fs/ocfs2/xattr.h | 6
-rw-r--r--  fs/open.c | 4
-rw-r--r--  fs/pipe.c | 3
-rw-r--r--  fs/pnode.c | 26
-rw-r--r--  fs/pnode.h | 4
-rw-r--r--  fs/posix_acl.c | 526
-rw-r--r--  fs/proc/array.c | 18
-rw-r--r--  fs/proc/base.c | 70
-rw-r--r--  fs/proc/cmdline.c | 2
-rw-r--r--  fs/proc/consoles.c | 2
-rw-r--r--  fs/proc/cpuinfo.c | 2
-rw-r--r--  fs/proc/devices.c | 2
-rw-r--r--  fs/proc/generic.c | 3
-rw-r--r--  fs/proc/interrupts.c | 2
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/kmsg.c | 2
-rw-r--r--  fs/proc/loadavg.c | 2
-rw-r--r--  fs/proc/meminfo.c | 39
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 9
-rw-r--r--  fs/proc/proc_devtree.c | 5
-rw-r--r--  fs/proc/softirqs.c | 2
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/proc/uptime.c | 2
-rw-r--r--  fs/proc/version.c | 2
-rw-r--r--  fs/proc/vmcore.c | 28
-rw-r--r--  fs/proc_namespace.c | 7
-rw-r--r--  fs/qnx4/inode.c | 63
-rw-r--r--  fs/qnx4/qnx4.h | 2
-rw-r--r--  fs/quota/dquot.c | 14
-rw-r--r--  fs/ramfs/file-mmu.c | 7
-rw-r--r--  fs/ramfs/file-nommu.c | 17
-rw-r--r--  fs/ramfs/inode.c | 9
-rw-r--r--  fs/ramfs/internal.h | 1
-rw-r--r--  fs/read_write.c | 64
-rw-r--r--  fs/reiserfs/acl.h | 4
-rw-r--r--  fs/reiserfs/do_balan.c | 895
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/namei.c | 4
-rw-r--r--  fs/reiserfs/procfs.c | 4
-rw-r--r--  fs/reiserfs/reiserfs.h | 10
-rw-r--r--  fs/reiserfs/super.c | 8
-rw-r--r--  fs/reiserfs/xattr.c | 5
-rw-r--r--  fs/reiserfs/xattr_acl.c | 190
-rw-r--r--  fs/romfs/super.c | 6
-rw-r--r--  fs/splice.c | 18
-rw-r--r--  fs/super.c | 6
-rw-r--r--  fs/sync.c | 32
-rw-r--r--  fs/sysfs/Makefile | 2
-rw-r--r--  fs/sysfs/dir.c | 1075
-rw-r--r--  fs/sysfs/file.c | 961
-rw-r--r--  fs/sysfs/group.c | 102
-rw-r--r--  fs/sysfs/inode.c | 331
-rw-r--r--  fs/sysfs/mount.c | 185
-rw-r--r--  fs/sysfs/symlink.c | 219
-rw-r--r--  fs/sysfs/sysfs.h | 236
-rw-r--r--  fs/ubifs/debug.c | 22
-rw-r--r--  fs/ubifs/log.c | 21
-rw-r--r--  fs/ubifs/orphan.c | 21
-rw-r--r--  fs/ubifs/recovery.c | 21
-rw-r--r--  fs/ubifs/super.c | 24
-rw-r--r--  fs/ubifs/tnc.c | 22
-rw-r--r--  fs/udf/file.c | 14
-rw-r--r--  fs/udf/inode.c | 1
-rw-r--r--  fs/udf/namei.c | 2
-rw-r--r--  fs/xattr_acl.c | 180
-rw-r--r--  fs/xfs/xfs_acl.c | 151
-rw-r--r--  fs/xfs/xfs_acl.h | 9
-rw-r--r--  fs/xfs/xfs_aops.c | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/xfs_attr_list.c | 8
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 36
-rw-r--r--  fs/xfs/xfs_buf.c | 44
-rw-r--r--  fs/xfs/xfs_buf.h | 20
-rw-r--r--  fs/xfs/xfs_buf_item.c | 103
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 4
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 58
-rw-r--r--  fs/xfs/xfs_dquot.c | 7
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 67
-rw-r--r--  fs/xfs/xfs_dquot_item.h | 3
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 19
-rw-r--r--  fs/xfs/xfs_ialloc.c | 53
-rw-r--r--  fs/xfs/xfs_ialloc.h | 21
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_inode_fork.c | 17
-rw-r--r--  fs/xfs/xfs_inode_item.c | 400
-rw-r--r--  fs/xfs/xfs_inode_item.h | 5
-rw-r--r--  fs/xfs/xfs_ioctl.c | 6
-rw-r--r--  fs/xfs/xfs_iops.c | 145
-rw-r--r--  fs/xfs/xfs_iops.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 46
-rw-r--r--  fs/xfs/xfs_log_cil.c | 89
-rw-r--r--  fs/xfs/xfs_log_recover.c | 33
-rw-r--r--  fs/xfs/xfs_mount.c | 24
-rw-r--r--  fs/xfs/xfs_qm.c | 6
-rw-r--r--  fs/xfs/xfs_qm.h | 18
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 18
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 42
-rw-r--r--  fs/xfs/xfs_sb.c | 10
-rw-r--r--  fs/xfs/xfs_super.c | 2
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 10
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_vnode.h | 9
-rw-r--r--  fs/xfs/xfs_xattr.c | 4
530 files changed, 18623 insertions, 15505 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 7af425f53bee..8482f2d11606 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
 		return -EOPNOTSUPP;
 	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
 	if (acl) {
-		retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+		retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
 		if (retval)
 			return retval;
 		set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
@@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 	if (acl) {
 		if (S_ISDIR(mode))
 			*dpacl = posix_acl_dup(acl);
-		retval = posix_acl_create(&acl, GFP_NOFS, &mode);
+		retval = __posix_acl_create(&acl, GFP_NOFS, &mode);
 		if (retval < 0)
 			return retval;
 		if (retval > 0)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 2b7a032c37bc..a69260f27555 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
 void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_fid *fid;
 
 	if (!v9inode->fscache)
 		return;
 
 	spin_lock(&v9inode->fscache_lock);
-	fid = filp->private_data;
+
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		v9fs_cache_inode_flush_cookie(inode);
 	else
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 08f2e1e9a7e6..14da82564f4e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -56,7 +56,7 @@ enum {
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
-	Opt_cache_loose, Opt_fscache,
+	Opt_cache_loose, Opt_fscache, Opt_mmap,
 	/* Access options */
 	Opt_access, Opt_posixacl,
 	/* Error token */
@@ -74,6 +74,7 @@ static const match_table_t tokens = {
 	{Opt_cache, "cache=%s"},
 	{Opt_cache_loose, "loose"},
 	{Opt_fscache, "fscache"},
+	{Opt_mmap, "mmap"},
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_posixacl, "posixacl"},
@@ -91,6 +92,9 @@ static int get_cache_mode(char *s)
 	} else if (!strcmp(s, "fscache")) {
 		version = CACHE_FSCACHE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
+	} else if (!strcmp(s, "mmap")) {
+		version = CACHE_MMAP;
+		p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
 	} else if (!strcmp(s, "none")) {
 		version = CACHE_NONE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
@@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_fscache:
 			v9ses->cache = CACHE_FSCACHE;
 			break;
+		case Opt_mmap:
+			v9ses->cache = CACHE_MMAP;
+			break;
 		case Opt_cachetag:
 #ifdef CONFIG_9P_FSCACHE
 			v9ses->cachetag = match_strdup(&args[0]);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a8e127c89627..099c7712631c 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -64,6 +64,7 @@ enum p9_session_flags {
 
 enum p9_cache_modes {
 	CACHE_NONE,
+	CACHE_MMAP,
 	CACHE_LOOSE,
 	CACHE_FSCACHE,
 };
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index dc95a252523d..b83ebfbf3fdc 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 extern const struct file_operations v9fs_cached_file_operations;
 extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern const struct file_operations v9fs_mmap_file_operations;
+extern const struct file_operations v9fs_mmap_file_operations_dotl;
 extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9ff073f4090a..c71e88602ff4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int retval;
 
+	p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+
 	retval = v9fs_vfs_writepage_locked(page);
 	if (retval < 0) {
 		if (retval == -EAGAIN) {
@@ -282,6 +284,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = mapping->host;
 
+
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
 	v9inode = V9FS_I(inode);
start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
@@ -312,6 +317,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	loff_t last_pos = pos + copied;
 	struct inode *inode = page->mapping->host;
 
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
 	if (unlikely(copied < len)) {
 		/*
 		 * zero out the rest of the area
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a0df3e73c2b1..a16b0ff497ca 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -45,6 +45,7 @@
 #include "cache.h"
 
 static const struct vm_operations_struct v9fs_file_vm_ops;
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	file->private_data = fid;
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		v9inode->writeback_fid = (void *) fid;
 	}
 	mutex_unlock(&v9inode->v_mutex);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(inode, file);
 	return 0;
 out_error:
@@ -461,14 +463,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	int n;
 	loff_t i_size;
 	size_t total = 0;
-	struct p9_client *clnt;
 	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
 		 data, (int)count, (int)*offset);
 
-	clnt = fid->clnt;
 	do {
 		n = p9_client_write(fid, NULL, data+total, origin+total, count);
 		if (n <= 0)
@@ -581,11 +581,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 }
 
 static int
-v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int retval;
 
-	retval = generic_file_mmap(file, vma);
+
+	retval = generic_file_mmap(filp, vma);
 	if (!retval)
 		vma->vm_ops = &v9fs_file_vm_ops;
 
@@ -593,6 +594,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 static int
+v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	int retval;
+	struct inode *inode;
+	struct v9fs_inode *v9inode;
+	struct p9_fid *fid;
+
+	inode = file_inode(filp);
+	v9inode = V9FS_I(inode);
+	mutex_lock(&v9inode->v_mutex);
+	if (!v9inode->writeback_fid &&
+	    (vma->vm_flags & VM_WRITE)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during mmap instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		fid = v9fs_writeback_fid(filp->f_path.dentry);
+		if (IS_ERR(fid)) {
+			retval = PTR_ERR(fid);
+			mutex_unlock(&v9inode->v_mutex);
+			return retval;
+		}
+		v9inode->writeback_fid = (void *) fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+
+	retval = generic_file_mmap(filp, vma);
+	if (!retval)
+		vma->vm_ops = &v9fs_mmap_file_vm_ops;
+
+	return retval;
+}
+
+static int
 v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct v9fs_inode *v9inode;
@@ -660,6 +698,22 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
 	return do_sync_read(filp, data, count, offset);
 }
 
+/**
+ * v9fs_mmap_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count,
+		    loff_t *offset)
+{
+	/* TODO: Check if there are dirty pages */
+	return v9fs_file_read(filp, data, count, offset);
+}
+
 static ssize_t
 v9fs_direct_write(struct file *filp, const char __user * data,
 		  size_t count, loff_t *offsetp)
@@ -730,12 +784,65 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
 	return do_sync_write(filp, data, count, offset);
 }
 
+
+/**
+ * v9fs_mmap_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_write(struct file *filp, const char __user *data,
+		     size_t count, loff_t *offset)
+{
+	/*
+	 * TODO: invalidate mmaps on filp's inode between
+	 * offset and offset+count
+	 */
+	return v9fs_file_write(filp, data, count, offset);
+}
+
+static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	struct writeback_control wbc = {
+		.nr_to_write = LONG_MAX,
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = vma->vm_pgoff * PAGE_SIZE,
+		 /* absolute end, byte at end included */
+		.range_end = vma->vm_pgoff * PAGE_SIZE +
+			(vma->vm_end - vma->vm_start - 1),
+	};
+
+
+	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
+
+	inode = file_inode(vma->vm_file);
+
+	if (!mapping_cap_writeback_dirty(inode->i_mapping))
+		wbc.nr_to_write = 0;
+
+	might_sleep();
+	sync_inode(inode, &wbc);
+}
+
+
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
 
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
+	.close = v9fs_mmap_vm_close,
+	.fault = filemap_fault,
+	.page_mkwrite = v9fs_vm_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
+};
+
 
 const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
@@ -786,3 +893,26 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync_dotl,
 };
+
+const struct file_operations v9fs_mmap_file_operations = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_mmap_file_read,
+	.write = v9fs_mmap_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_mmap_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_mmap_file_read,
+	.write = v9fs_mmap_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4e65aa903345..bb7991c7e5c7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFREG:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			if (v9ses->cache)
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
 				inode->i_fop =
 					&v9fs_cached_file_operations_dotl;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations_dotl;
 			else
 				inode->i_fop = &v9fs_file_operations_dotl;
 		} else {
 			inode->i_op = &v9fs_file_inode_operations;
-			if (v9ses->cache)
-				inode->i_fop = &v9fs_cached_file_operations;
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
+				inode->i_fop =
+					&v9fs_cached_file_operations;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations;
 			else
 				inode->i_fop = &v9fs_file_operations;
 		}
@@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 			      unsigned int flags)
 {
 	struct dentry *res;
-	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
 	struct inode *inode;
@@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	/* We can walk d_parent because we hold the dir->i_mutex */
 	dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	 * unlink. For cached mode create calls request for new
 	 * inode. But with cache disabled, lookup should do this.
 	 */
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		return finish_no_open(file, res);
 
 	err = 0;
-	fid = NULL;
+
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode);
 	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
@@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	v9fs_invalidate_inode_attr(dir);
 	v9inode = V9FS_I(dentry->d_inode);
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		goto error;
 
 	file->private_data = fid;
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(dentry->d_inode, file);
 
 	*opened |= FILE_CREATED;
@@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	 */
 	i_size = inode->i_size;
 	v9fs_stat2inode(st, inode, inode->i_sb);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
 out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4c10edec26a0..59dc8e87647f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto err_clunk_old_fid;
 	file->private_data = ofid;
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(inode, file);
 	*opened |= FILE_CREATED;
 out:
@@ -473,13 +474,11 @@ static int
 v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
-	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_stat_dotl *st;
 
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
@@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
 int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 {
 	int retval;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_iattr_dotl p9attr;
 	struct inode *inode = dentry->d_inode;
@@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
 	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
 
-	retval = -EPERM;
-	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
@@ -715,7 +711,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	v9fs_invalidate_inode_attr(dir);
-	if (v9ses->cache) {
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		/* Now walk from the parent so we can get an unopened fid. */
 		fid = p9_client_walk(dfid, 1, &name, 1);
 		if (IS_ERR(fid)) {
@@ -768,7 +764,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		   struct dentry *dentry)
 {
 	int err;
-	char *name;
 	struct dentry *dir_dentry;
 	struct p9_fid *dfid, *oldfid;
 	struct v9fs_session_info *v9ses;
@@ -786,8 +781,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	if (IS_ERR(oldfid))
 		return PTR_ERR(oldfid);
 
-	name = (char *) dentry->d_name.name;
-
 	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
 
 	if (err < 0) {
@@ -973,7 +966,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	 */
 	i_size = inode->i_size;
 	v9fs_stat2inode_dotl(st, inode);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
 out:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2756dcd5de6e..0afd0382822b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	}
 	v9fs_fill_super(sb, v9ses, flags, data);
 
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		sb->s_d_op = &v9fs_cached_dentry_operations;
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
@@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode)
 {
 	struct v9fs_session_info *v9ses;
 	v9ses = v9fs_inode2v9ses(inode);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		return generic_drop_inode(inode);
 	/*
 	 * in case of non cached mode always drop the
@@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode,
 	 * send an fsync request to server irrespective of
 	 * wbc->sync_mode.
 	 */
-	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
 	v9inode = V9FS_I(inode);
+	p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
+		 __func__, inode, v9inode->writeback_fid);
 	if (!v9inode->writeback_fid)
 		return 0;
+
 	ret = p9_client_fsync(v9inode->writeback_fid, 0);
 	if (ret < 0) {
 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 3c28cdfb8c47..04133a1fd9cb 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
 	if (retval < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
 			 retval);
-		p9_client_clunk(fid);
-		return retval;
+		goto err;
 	}
 	msize = fid->clnt->msize;
 	while (value_len) {
@@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
 		if (write_count < 0) {
 			/* error in xattr write */
 			retval = write_count;
-			break;
+			goto err;
 		}
 		offset += write_count;
 		value_len -= write_count;
 	}
-	return p9_client_clunk(fid);
+	retval = offset;
+err:
+	p9_client_clunk(fid);
+	return retval;
 }
 
 ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
diff --git a/fs/Kconfig b/fs/Kconfig
index c229f828eb01..7385e54be4b9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,10 +68,6 @@ source "fs/quota/Kconfig"
68source "fs/autofs4/Kconfig" 68source "fs/autofs4/Kconfig"
69source "fs/fuse/Kconfig" 69source "fs/fuse/Kconfig"
70 70
71config GENERIC_ACL
72 bool
73 select FS_POSIX_ACL
74
75menu "Caches" 71menu "Caches"
76 72
77source "fs/fscache/Kconfig" 73source "fs/fscache/Kconfig"
@@ -119,7 +115,7 @@ config TMPFS_POSIX_ACL
119 bool "Tmpfs POSIX Access Control Lists" 115 bool "Tmpfs POSIX Access Control Lists"
120 depends on TMPFS 116 depends on TMPFS
121 select TMPFS_XATTR 117 select TMPFS_XATTR
122 select GENERIC_ACL 118 select FS_POSIX_ACL
123 help 119 help
124 POSIX Access Control Lists (ACLs) support additional access rights 120 POSIX Access Control Lists (ACLs) support additional access rights
125 for users and groups beyond the standard owner/group/world scheme, 121 for users and groups beyond the standard owner/group/world scheme,
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..47ac07bb4acc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -42,9 +42,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
-obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
+obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
-obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
@@ -53,7 +52,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 45161a832bbc..d098731b82ff 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -49,11 +49,6 @@ affs_put_super(struct super_block *sb)
49 pr_debug("AFFS: put_super()\n"); 49 pr_debug("AFFS: put_super()\n");
50 50
51 cancel_delayed_work_sync(&sbi->sb_work); 51 cancel_delayed_work_sync(&sbi->sb_work);
52 kfree(sbi->s_prefix);
53 affs_free_bitmap(sb);
54 affs_brelse(sbi->s_root_bh);
55 kfree(sbi);
56 sb->s_fs_info = NULL;
57} 52}
58 53
59static int 54static int
@@ -316,7 +311,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long		 mount_flags;
 	int			 tmp_flags;	/* fix remount prototype... */
 	u8			 sig[4];
-	int			 ret = -EINVAL;
+	int			 ret;
 
 	save_mount_options(sb, data);
 
@@ -412,17 +407,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!silent)
 			printk(KERN_ERR "AFFS: No valid root block on device %s\n",
 				sb->s_id);
-		goto out_error;
+		return -EINVAL;
 
 	/* N.B. after this point bh must be released */
 got_root:
+	/* Keep super block in cache */
+	sbi->s_root_bh = root_bh;
 	root_block = sbi->s_root_block;
 
 	/* Find out which kind of FS we have */
 	boot_bh = sb_bread(sb, 0);
 	if (!boot_bh) {
 		printk(KERN_ERR "AFFS: Cannot read boot block\n");
-		goto out_error;
+		return -EINVAL;
 	}
 	memcpy(sig, boot_bh->b_data, 4);
 	brelse(boot_bh);
@@ -471,7 +468,7 @@ got_root:
 	default:
 		printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n",
 			sb->s_id, chksum);
-		goto out_error;
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -488,22 +485,17 @@ got_root:
488 if (sbi->s_flags & SF_OFS) 485 if (sbi->s_flags & SF_OFS)
489 sbi->s_data_blksize -= 24; 486 sbi->s_data_blksize -= 24;
490 487
491 /* Keep super block in cache */
492 sbi->s_root_bh = root_bh;
493 /* N.B. after this point s_root_bh must be released */
494
495 tmp_flags = sb->s_flags; 488 tmp_flags = sb->s_flags;
496 if (affs_init_bitmap(sb, &tmp_flags)) 489 ret = affs_init_bitmap(sb, &tmp_flags);
497 goto out_error; 490 if (ret)
491 return ret;
498 sb->s_flags = tmp_flags; 492 sb->s_flags = tmp_flags;
499 493
500 /* set up enough so that it can read an inode */ 494 /* set up enough so that it can read an inode */
501 495
502 root_inode = affs_iget(sb, root_block); 496 root_inode = affs_iget(sb, root_block);
503 if (IS_ERR(root_inode)) { 497 if (IS_ERR(root_inode))
504 ret = PTR_ERR(root_inode); 498 return PTR_ERR(root_inode);
505 goto out_error;
506 }
507 499
508 if (AFFS_SB(sb)->s_flags & SF_INTL) 500 if (AFFS_SB(sb)->s_flags & SF_INTL)
509 sb->s_d_op = &affs_intl_dentry_operations; 501 sb->s_d_op = &affs_intl_dentry_operations;
@@ -513,22 +505,11 @@ got_root:
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root) {
 		printk(KERN_ERR "AFFS: Get root inode failed\n");
-		goto out_error;
+		return -ENOMEM;
 	}
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
 	return 0;
-
-	/*
-	 * Begin the cascaded cleanup ...
-	 */
-out_error:
-	kfree(sbi->s_bitmap);
-	affs_brelse(root_bh);
-	kfree(sbi->s_prefix);
-	kfree(sbi);
-	sb->s_fs_info = NULL;
-	return ret;
 }
 
 static int
@@ -615,11 +596,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type,
 	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
 }
 
+static void affs_kill_sb(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	kill_block_super(sb);
+	if (sbi) {
+		affs_free_bitmap(sb);
+		affs_brelse(sbi->s_root_bh);
+		kfree(sbi->s_prefix);
+		kfree(sbi);
+	}
+}
+
 static struct file_system_type affs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "affs",
 	.mount		= affs_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= affs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("affs");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a306bb6d88d9..6621f8008122 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -195,7 +195,6 @@ struct afs_cell {
 	struct list_head	link;		/* main cell list link */
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
-	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
 #ifdef CONFIG_AFS_FSCACHE
 	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 526e4bbbde59..24a905b076fd 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = {
 	.write		= afs_proc_cells_write,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
-	.owner		= THIS_MODULE,
 };
 
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file);
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file);
 static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
 				      size_t size, loff_t *_pos);
 static ssize_t afs_proc_rootcell_write(struct file *file,
@@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 				       size_t size, loff_t *_pos);
 
 static const struct file_operations afs_proc_rootcell_fops = {
-	.open		= afs_proc_rootcell_open,
 	.read		= afs_proc_rootcell_read,
 	.write		= afs_proc_rootcell_write,
 	.llseek		= no_llseek,
-	.release	= afs_proc_rootcell_release,
-	.owner		= THIS_MODULE,
 };
 
 static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_volumes_release(struct inode *inode,
-					 struct file *file);
 static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = {
 	.open		= afs_proc_cell_volumes_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= afs_proc_cell_volumes_release,
-	.owner		= THIS_MODULE,
+	.release	= seq_release,
 };
 
 static int afs_proc_cell_vlservers_open(struct inode *inode,
 					struct file *file);
-static int afs_proc_cell_vlservers_release(struct inode *inode,
-					   struct file *file);
 static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *pos);
@@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = {
 	.open		= afs_proc_cell_vlservers_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= afs_proc_cell_vlservers_release,
-	.owner		= THIS_MODULE,
+	.release	= seq_release,
 };
 
 static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_servers_release(struct inode *inode,
-					 struct file *file);
 static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = {
130 .open = afs_proc_cell_servers_open, 116 .open = afs_proc_cell_servers_open,
131 .read = seq_read, 117 .read = seq_read,
132 .llseek = seq_lseek, 118 .llseek = seq_lseek,
133 .release = afs_proc_cell_servers_release, 119 .release = seq_release,
134 .owner = THIS_MODULE,
135}; 120};
136 121
137/* 122/*
@@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = {
139 */ 124 */
140int afs_proc_init(void) 125int afs_proc_init(void)
141{ 126{
142 struct proc_dir_entry *p;
143
144 _enter(""); 127 _enter("");
145 128
146 proc_afs = proc_mkdir("fs/afs", NULL); 129 proc_afs = proc_mkdir("fs/afs", NULL);
147 if (!proc_afs) 130 if (!proc_afs)
148 goto error_dir; 131 goto error_dir;
149 132
150 p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); 133 if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
151 if (!p) 134 !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
152 goto error_cells; 135 goto error_tree;
153
154 p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops);
155 if (!p)
156 goto error_rootcell;
157 136
158 _leave(" = 0"); 137 _leave(" = 0");
159 return 0; 138 return 0;
160 139
161error_rootcell: 140error_tree:
162 remove_proc_entry("cells", proc_afs); 141 remove_proc_subtree("fs/afs", NULL);
163error_cells:
164 remove_proc_entry("fs/afs", NULL);
165error_dir: 142error_dir:
166 _leave(" = -ENOMEM"); 143 _leave(" = -ENOMEM");
167 return -ENOMEM; 144 return -ENOMEM;
@@ -172,9 +149,7 @@ error_dir:
172 */ 149 */
173void afs_proc_cleanup(void) 150void afs_proc_cleanup(void)
174{ 151{
175 remove_proc_entry("rootcell", proc_afs); 152 remove_proc_subtree("fs/afs", NULL);
176 remove_proc_entry("cells", proc_afs);
177 remove_proc_entry("fs/afs", NULL);
178} 153}
179 154
180/* 155/*
@@ -319,19 +294,6 @@ inval:
319 goto done; 294 goto done;
320} 295}
321 296
322/*
323 * Stubs for /proc/fs/afs/rootcell
324 */
325static int afs_proc_rootcell_open(struct inode *inode, struct file *file)
326{
327 return 0;
328}
329
330static int afs_proc_rootcell_release(struct inode *inode, struct file *file)
331{
332 return 0;
333}
334
335static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, 297static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
336 size_t size, loff_t *_pos) 298 size_t size, loff_t *_pos)
337{ 299{
@@ -387,38 +349,27 @@ nomem:
387 */ 349 */
388int afs_proc_cell_setup(struct afs_cell *cell) 350int afs_proc_cell_setup(struct afs_cell *cell)
389{ 351{
390 struct proc_dir_entry *p; 352 struct proc_dir_entry *dir;
391 353
392 _enter("%p{%s}", cell, cell->name); 354 _enter("%p{%s}", cell, cell->name);
393 355
394 cell->proc_dir = proc_mkdir(cell->name, proc_afs); 356 dir = proc_mkdir(cell->name, proc_afs);
395 if (!cell->proc_dir) 357 if (!dir)
396 goto error_dir; 358 goto error_dir;
397 359
398 p = proc_create_data("servers", 0, cell->proc_dir, 360 if (!proc_create_data("servers", 0, dir,
399 &afs_proc_cell_servers_fops, cell); 361 &afs_proc_cell_servers_fops, cell) ||
400 if (!p) 362 !proc_create_data("vlservers", 0, dir,
401 goto error_servers; 363 &afs_proc_cell_vlservers_fops, cell) ||
402 364 !proc_create_data("volumes", 0, dir,
403 p = proc_create_data("vlservers", 0, cell->proc_dir, 365 &afs_proc_cell_volumes_fops, cell))
404 &afs_proc_cell_vlservers_fops, cell); 366 goto error_tree;
405 if (!p)
406 goto error_vlservers;
407
408 p = proc_create_data("volumes", 0, cell->proc_dir,
409 &afs_proc_cell_volumes_fops, cell);
410 if (!p)
411 goto error_volumes;
412 367
413 _leave(" = 0"); 368 _leave(" = 0");
414 return 0; 369 return 0;
415 370
416error_volumes: 371error_tree:
417 remove_proc_entry("vlservers", cell->proc_dir); 372 remove_proc_subtree(cell->name, proc_afs);
418error_vlservers:
419 remove_proc_entry("servers", cell->proc_dir);
420error_servers:
421 remove_proc_entry(cell->name, proc_afs);
422error_dir: 373error_dir:
423 _leave(" = -ENOMEM"); 374 _leave(" = -ENOMEM");
424 return -ENOMEM; 375 return -ENOMEM;
@@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell)
431{ 382{
432 _enter(""); 383 _enter("");
433 384
434 remove_proc_entry("volumes", cell->proc_dir); 385 remove_proc_subtree(cell->name, proc_afs);
435 remove_proc_entry("vlservers", cell->proc_dir);
436 remove_proc_entry("servers", cell->proc_dir);
437 remove_proc_entry(cell->name, proc_afs);
438 386
439 _leave(""); 387 _leave("");
440} 388}
@@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
463} 411}
464 412
465/* 413/*
466 * close the file and release the ref to the cell
467 */
468static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
469{
470 return seq_release(inode, file);
471}
472
473/*
474 * set up the iterator to start reading from the cells list and return the 414 * set up the iterator to start reading from the cells list and return the
475 * first item 415 * first item
476 */ 416 */
@@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
569} 509}
570 510
571/* 511/*
572 * close the file and release the ref to the cell
573 */
574static int afs_proc_cell_vlservers_release(struct inode *inode,
575 struct file *file)
576{
577 return seq_release(inode, file);
578}
579
580/*
581 * set up the iterator to start reading from the cells list and return the 512 * set up the iterator to start reading from the cells list and return the
582 * first item 513 * first item
583 */ 514 */
@@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
673} 604}
674 605
675/* 606/*
676 * close the file and release the ref to the cell
677 */
678static int afs_proc_cell_servers_release(struct inode *inode,
679 struct file *file)
680{
681 return seq_release(inode, file);
682}
683
684/*
685 * set up the iterator to start reading from the cells list and return the 607 * set up the iterator to start reading from the cells list and return the
686 * first item 608 * first item
687 */ 609 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24084732b1d0..80ef38c73e5a 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -41,19 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	struct dentry *root;
-	root = mount_pseudo(fs_type, "anon_inode:", NULL,
+	return mount_pseudo(fs_type, "anon_inode:", NULL,
 			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
-	if (!IS_ERR(root)) {
-		struct super_block *s = root->d_sb;
-		anon_inode_inode = alloc_anon_inode(s);
-		if (IS_ERR(anon_inode_inode)) {
-			dput(root);
-			deactivate_locked_super(s);
-			root = ERR_CAST(anon_inode_inode);
-		}
-	}
-	return root;
 }
 
 static struct file_system_type anon_inode_fs_type = {
@@ -175,22 +164,15 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 static int __init anon_inode_init(void)
 {
-	int error;
-
-	error = register_filesystem(&anon_inode_fs_type);
-	if (error)
-		goto err_exit;
 	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
-	if (IS_ERR(anon_inode_mnt)) {
-		error = PTR_ERR(anon_inode_mnt);
-		goto err_unregister_filesystem;
-	}
-	return 0;
+	if (IS_ERR(anon_inode_mnt))
+		panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
 
-err_unregister_filesystem:
-	unregister_filesystem(&anon_inode_fs_type);
-err_exit:
-	panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+	anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	if (IS_ERR(anon_inode_inode))
+		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+
+	return 0;
 }
 
 fs_initcall(anon_inode_init);
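Two things change together here: anon_inodefs no longer allocates a per-mount inode, because a single anon_inode_inode set up at boot is shared by every anon-inode file, and the init path now panics on failure, which is defensible for an fs_initcall that core in-kernel users depend on. For orientation, a consumer obtains such a file through anon_inode_getfd(); a sketch under assumed names (mydev is hypothetical):

#include <linux/anon_inodes.h>
#include <linux/fs.h>

/* Hand out a descriptor backed by the shared anonymous inode. Each
 * call still gets a private struct file; only the inode is shared. */
static int mydev_create_handle(void *priv, const struct file_operations *fops)
{
	return anon_inode_getfd("[mydev]", fops, priv, O_RDWR | O_CLOEXEC);
}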
diff --git a/fs/attr.c b/fs/attr.c
index 267968d94673..5d4e59d56e85 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		return -EPERM;
 	}
 
-	if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
-		if (attr->ia_size != inode->i_size)
-			inode_inc_iversion(inode);
-	}
-
 	if ((ia_valid & ATTR_MODE)) {
 		umode_t amode = attr->ia_mode;
 		/* Flag setting protected by i_mutex */
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4218e26df916..acf32054edd8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -104,7 +104,7 @@ struct autofs_sb_info {
 	u32 magic;
 	int pipefd;
 	struct file *pipe;
-	pid_t oz_pgrp;
+	struct pid *oz_pgrp;
 	int catatonic;
 	int version;
 	int sub_version;
@@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
    filesystem without "magic".) */
 
 static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
 /* Does a dentry have some pending activity? */
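Switching oz_pgrp from a numeric pid_t to a struct pid * is what makes the rest of this series namespace-correct: a struct pid is a stable, reference-counted object, so the comparison in autofs4_oz_mode() holds no matter which pid namespace the observer lives in. A sketch of the reference discipline the autofs4 changes follow (example_ctx is hypothetical):

#include <linux/pid.h>
#include <linux/sched.h>

struct example_ctx {
	struct pid *oz_pgrp;
};

static void example_store_pgrp(struct example_ctx *ctx)
{
	struct pid *pgrp = get_task_pid(current, PIDTYPE_PGID); /* +1 ref */

	put_pid(ctx->oz_pgrp);	/* drop the previous reference, if any */
	ctx->oz_pgrp = pgrp;	/* later compared via task_pgrp(current) */
}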
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1818ce7f5a06..3182c0e68b42 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 {
 	int pipefd;
 	int err = 0;
+	struct pid *new_pid = NULL;
 
 	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
@@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		mutex_unlock(&sbi->wq_mutex);
 		return -EBUSY;
 	} else {
-		struct file *pipe = fget(pipefd);
+		struct file *pipe;
+
+		new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+		if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+			AUTOFS_WARN("Not allowed to change PID namespace");
+			err = -EINVAL;
+			goto out;
+		}
+
+		pipe = fget(pipefd);
 		if (!pipe) {
 			err = -EBADF;
 			goto out;
@@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 			fput(pipe);
 			goto out;
 		}
-		sbi->oz_pgrp = task_pgrp_nr(current);
+		swap(sbi->oz_pgrp, new_pid);
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
 		sbi->catatonic = 0;
 	}
 out:
+	put_pid(new_pid);
 	mutex_unlock(&sbi->wq_mutex);
 	return err;
 }
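The swap()/put_pid() pairing above is the subtle part: after the swap, new_pid holds whatever reference was previously published in sbi->oz_pgrp, so the single put_pid() at the out label drops the old group's reference on success and the never-published reference on the early-error paths (put_pid(NULL) is a no-op). The flow in isolation, as a sketch with a hypothetical checks_ok condition:

static int example_set_pgrp(struct autofs_sb_info *sbi, bool checks_ok)
{
	struct pid *new_pid = get_task_pid(current, PIDTYPE_PGID);
	int err = 0;

	if (checks_ok)
		swap(sbi->oz_pgrp, new_pid);	/* publish new, capture old */
	else
		err = -EINVAL;

	put_pid(new_pid);	/* old ref on success, unused ref on failure */
	return err;
}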
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3d9d3f5d5dda..394e90b02c5e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			goto next;
 		}
 
+		if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+			DPRINTK("checking symlink %p %.*s",
+				dentry, (int)dentry->d_name.len, dentry->d_name.name);
+			/*
+			 * A symlink can't be "busy" in the usual sense so
+			 * just check last used for expire timeout.
+			 */
+			if (autofs4_can_expire(dentry, timeout, do_now)) {
+				expired = dentry;
+				goto found;
+			}
+			goto next;
+		}
+
 		if (simple_empty(dentry))
 			goto next;
 
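Symlinks now take part in expiry purely on age, since no mount or open file can pin them the way it pins a directory. The timeout test that autofs4_can_expire() applies is roughly the following (a sketch, not the verbatim helper):

#include <linux/jiffies.h>

static int example_can_expire(struct autofs_info *ino,
			      unsigned long timeout, int do_now)
{
	/* immediate expire requests skip the age check */
	if (do_now)
		return 1;

	/* otherwise expire only when last_used is older than the timeout */
	return time_after(jiffies, ino->last_used + timeout);
}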
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 3b9cc9b973c2..d7bd395ab586 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb)
 	 * just call kill_anon_super when we are called from
 	 * deactivate_super.
 	 */
-	if (sbi) /* Free wait queues, close pipe */
+	if (sbi) {
+		/* Free wait queues, close pipe */
 		autofs4_catatonic_mode(sbi);
+		put_pid(sbi->oz_pgrp);
+	}
 
 	DPRINTK("shutting down");
 	kill_litter_super(sb);
@@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
 		seq_printf(m, ",gid=%u",
 			from_kgid_munged(&init_user_ns, root_inode->i_gid));
-	seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+	seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
 	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -124,7 +127,8 @@ static const match_table_t tokens = {
 };
 
 static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
-		pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+			 int *pgrp, bool *pgrp_set, unsigned int *type,
+			 int *minproto, int *maxproto)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 	*uid = current_uid();
 	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
 
 	*minproto = AUTOFS_MIN_PROTO_VERSION;
 	*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 			if (match_int(args, &option))
 				return 1;
 			*pgrp = option;
+			*pgrp_set = true;
 			break;
 		case Opt_minproto:
 			if (match_int(args, &option))
@@ -206,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
+	int pgrp;
+	bool pgrp_set = false;
+	int ret = -EINVAL;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		goto fail_unlock;
+		return -ENOMEM;
 	DPRINTK("starting up, sbi = %p",sbi);
 
 	s->s_fs_info = sbi;
@@ -218,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipe = NULL;
 	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = task_pgrp_nr(current);
+	sbi->oz_pgrp = NULL;
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
@@ -243,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
 	ino = autofs4_new_ino(sbi);
-	if (!ino)
+	if (!ino) {
+		ret = -ENOMEM;
 		goto fail_free;
+	}
 	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	root = d_make_root(root_inode);
 	if (!root)
@@ -255,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	/* Can this call block? */
 	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
-			  &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
+			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
 			  &sbi->max_proto)) {
 		printk("autofs: called with bogus options\n");
 		goto fail_dput;
 	}
 
+	if (pgrp_set) {
+		sbi->oz_pgrp = find_get_pid(pgrp);
+		if (!sbi->oz_pgrp) {
+			pr_warn("autofs: could not find process group %d\n",
+				pgrp);
+			goto fail_dput;
+		}
+	} else {
+		sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+	}
+
 	if (autofs_type_trigger(sbi->type))
 		__managed_dentry_set_managed(root);
 
@@ -284,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->version = sbi->max_proto;
 	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
 	pipe = fget(pipefd);
 
 	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
-	if (autofs_prepare_pipe(pipe) < 0)
+	ret = autofs_prepare_pipe(pipe);
+	if (ret < 0)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->pipefd = pipefd;
@@ -316,10 +337,10 @@ fail_dput:
 fail_ino:
 	kfree(ino);
 fail_free:
+	put_pid(sbi->oz_pgrp);
 	kfree(sbi);
 	s->s_fs_info = NULL;
-fail_unlock:
-	return -EINVAL;
+	return ret;
 }
 
 struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
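Two behavioural points in this fill_super rework are easy to miss: an explicit pgrp= option is now resolved through find_get_pid(), which looks the number up in the mounter's pid namespace and returns a referenced struct pid or NULL, and the function finally propagates a real error code in ret instead of a blanket -EINVAL. The lookup in isolation (a sketch; the -ENOENT choice is illustrative):

#include <linux/err.h>
#include <linux/pid.h>

static struct pid *example_resolve_pgrp(int pgrp)
{
	struct pid *pid = find_get_pid(pgrp);	/* takes a reference */

	if (!pid)
		return ERR_PTR(-ENOENT);
	return pid;	/* caller must eventually put_pid() */
}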
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 92ef341ba0cf..2caf36ac3e93 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
-	if (p_ino && dentry->d_parent != dentry)
+	if (p_ino && !IS_ROOT(dentry))
 		atomic_inc(&p_ino->count);
 
 	dir->i_mtime = CURRENT_TIME;
@@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
-		if (p_ino && dentry->d_parent != dentry)
+		if (p_ino && !IS_ROOT(dentry))
 			atomic_dec(&p_ino->count);
 	}
 	dput(ino->dentry);
@@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
 	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
-	if (p_ino && dentry->d_parent != dentry)
+	if (p_ino && !IS_ROOT(dentry))
 		atomic_inc(&p_ino->count);
 	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
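All three hunks are the same purely cosmetic substitution: IS_ROOT() is the dcache helper for exactly this self-parent test.

/* include/linux/dcache.h */
#define IS_ROOT(x) ((x) == (x)->d_parent)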
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index f27c094a1919..1e8ea192be2b 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,6 +14,10 @@
 
 static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino && !autofs4_oz_mode(sbi))
+		ino->last_used = jiffies;
 	nd_set_link(nd, dentry->d_inode->i_private);
 	return NULL;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 689e40d983ad..116fd38ee472 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 	struct qstr qstr;
 	char *name;
 	int status, ret, type;
+	pid_t pid;
+	pid_t tgid;
 
 	/* In catatonic mode, we don't wait for nobody */
 	if (sbi->catatonic)
 		return -ENOENT;
 
+	/*
+	 * Try translating pids to the namespace of the daemon.
+	 *
+	 * Zero means failure: we are in an unrelated pid namespace.
+	 */
+	pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	if (pid == 0 || tgid == 0)
+		return -ENOENT;
+
 	if (!dentry->d_inode) {
 		/*
 		 * A wait for a negative dentry is invalid for certain
@@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->ino = autofs4_get_ino(sbi);
 		wq->uid = current_uid();
 		wq->gid = current_gid();
-		wq->pid = current->pid;
-		wq->tgid = current->tgid;
+		wq->pid = pid;
+		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		wq->wait_ctr = 2;
 
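The point of the waitq change is that the pid and tgid handed to the automount daemon are now expressed in the daemon's own pid namespace rather than the kernel's global one. The translation idiom in isolation (a sketch):

#include <linux/pid.h>
#include <linux/sched.h>

/* Report the current task's pid as the daemon would see it; 0 means
 * the task is not visible from the daemon's pid namespace at all. */
static pid_t example_pid_as_seen_by(struct pid *daemon_pgrp)
{
	return task_pid_nr_ns(current, ns_of_pid(daemon_pgrp));
}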
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index daa15d6ba450..845d2d690ce2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -324,8 +324,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino);
 
 	inode = iget_locked(sb, ino);
-	if (IS_ERR(inode))
-		return inode;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
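This corrects a plain API misuse: iget_locked() signals allocation failure with NULL, never with an ERR_PTR. The canonical calling pattern, for reference (a sketch; the fill step is filesystem-specific):

static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)			/* NULL on failure, not an ERR_PTR */
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* found fully set up in the cache */

	/* ... read the on-disk inode and populate the fields here ... */

	unlock_new_inode(inode);
	return inode;
}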
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 571a42326908..67be2951b98a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -543,9 +543,6 @@ out:
 * libraries. There is no binary dependent code anywhere else.
 */
 
-#define INTERPRETER_NONE 0
-#define INTERPRETER_ELF 2
-
 #ifndef STACK_RND_MASK
 #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
 #endif
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index fc60b31453ee..4f70f383132c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
+static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
+{
+	if (bip->bip_slab == BIO_POOL_NONE)
+		return BIP_INLINE_VECS;
+
+	return bvec_nr_vecs(bip->bip_slab);
+}
+
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -129,13 +137,12 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
+	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
 
-	iv = bip_vec_idx(bip, bip->bip_vcnt);
-	BUG_ON(iv == NULL);
+	iv = bip->bip_vec + bip->bip_vcnt;
 
 	iv->bv_page = page;
 	iv->bv_len = len;
@@ -203,6 +210,12 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
 	return sectors;
 }
 
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
+}
+
 /**
  * bio_integrity_tag_size - Retrieve integrity tag space
  * @bio:	bio to inspect
@@ -215,13 +228,14 @@ unsigned int bio_integrity_tag_size(struct bio *bio)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	BUG_ON(bio->bi_size == 0);
+	BUG_ON(bio->bi_iter.bi_size == 0);
 
-	return bi->tag_size * (bio->bi_size / bi->sector_size);
+	return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
 }
 EXPORT_SYMBOL(bio_integrity_tag_size);
 
-int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
+			     int set)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
@@ -235,9 +249,9 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
 	nr_sectors = bio_integrity_hw_sectors(bi,
 					DIV_ROUND_UP(len, bi->tag_size));
 
-	if (nr_sectors * bi->tuple_size > bip->bip_size) {
-		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
-		       __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+	if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
+		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
+		       nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
 		return -1;
 	}
 
@@ -299,29 +313,30 @@ static void bio_integrity_generate(struct bio *bio)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	struct blk_integrity_exchg bix;
-	struct bio_vec *bv;
-	sector_t sector = bio->bi_sector;
-	unsigned int i, sectors, total;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	sector_t sector = bio->bi_iter.bi_sector;
+	unsigned int sectors, total;
 	void *prot_buf = bio->bi_integrity->bip_buf;
 
 	total = 0;
 	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	bix.sector_size = bi->sector_size;
 
-	bio_for_each_segment(bv, bio, i) {
-		void *kaddr = kmap_atomic(bv->bv_page);
-		bix.data_buf = kaddr + bv->bv_offset;
-		bix.data_size = bv->bv_len;
+	bio_for_each_segment(bv, bio, iter) {
+		void *kaddr = kmap_atomic(bv.bv_page);
+		bix.data_buf = kaddr + bv.bv_offset;
+		bix.data_size = bv.bv_len;
 		bix.prot_buf = prot_buf;
 		bix.sector = sector;
 
 		bi->generate_fn(&bix);
 
-		sectors = bv->bv_len / bi->sector_size;
+		sectors = bv.bv_len / bi->sector_size;
 		sector += sectors;
 		prot_buf += sectors * bi->tuple_size;
 		total += sectors * bi->tuple_size;
-		BUG_ON(total > bio->bi_integrity->bip_size);
+		BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
 
 		kunmap_atomic(kaddr);
 	}
@@ -386,8 +401,8 @@ int bio_integrity_prep(struct bio *bio)
 
 	bip->bip_owns_buf = 1;
 	bip->bip_buf = buf;
-	bip->bip_size = len;
-	bip->bip_sector = bio->bi_sector;
+	bip->bip_iter.bi_size = len;
+	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
 
 	/* Map it */
 	offset = offset_in_page(buf);
@@ -442,16 +457,17 @@ static int bio_integrity_verify(struct bio *bio)
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	struct blk_integrity_exchg bix;
 	struct bio_vec *bv;
-	sector_t sector = bio->bi_integrity->bip_sector;
-	unsigned int i, sectors, total, ret;
+	sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
+	unsigned int sectors, ret = 0;
 	void *prot_buf = bio->bi_integrity->bip_buf;
+	int i;
 
-	ret = total = 0;
 	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	bix.sector_size = bi->sector_size;
 
-	bio_for_each_segment(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i) {
 		void *kaddr = kmap_atomic(bv->bv_page);
+
 		bix.data_buf = kaddr + bv->bv_offset;
 		bix.data_size = bv->bv_len;
 		bix.prot_buf = prot_buf;
@@ -467,8 +483,6 @@ static int bio_integrity_verify(struct bio *bio)
 		sectors = bv->bv_len / bi->sector_size;
 		sector += sectors;
 		prot_buf += sectors * bi->tuple_size;
-		total += sectors * bi->tuple_size;
-		BUG_ON(total > bio->bi_integrity->bip_size);
 
 		kunmap_atomic(kaddr);
 	}
@@ -495,7 +509,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio(bio, error);
+	bio_endio_nodec(bio, error);
 }
 
 /**
@@ -533,56 +547,6 @@ void bio_integrity_endio(struct bio *bio, int error)
 EXPORT_SYMBOL(bio_integrity_endio);
 
 /**
- * bio_integrity_mark_head - Advance bip_vec skip bytes
- * @bip:	Integrity vector to advance
- * @skip:	Number of bytes to advance it
- */
-void bio_integrity_mark_head(struct bio_integrity_payload *bip,
-			     unsigned int skip)
-{
-	struct bio_vec *iv;
-	unsigned int i;
-
-	bip_for_each_vec(iv, bip, i) {
-		if (skip == 0) {
-			bip->bip_idx = i;
-			return;
-		} else if (skip >= iv->bv_len) {
-			skip -= iv->bv_len;
-		} else { /* skip < iv->bv_len) */
-			iv->bv_offset += skip;
-			iv->bv_len -= skip;
-			bip->bip_idx = i;
-			return;
-		}
-	}
-}
-
-/**
- * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
- * @bip:	Integrity vector to truncate
- * @len:	New length of integrity vector
- */
-void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
-			     unsigned int len)
-{
-	struct bio_vec *iv;
-	unsigned int i;
-
-	bip_for_each_vec(iv, bip, i) {
-		if (len == 0) {
-			bip->bip_vcnt = i;
-			return;
-		} else if (len >= iv->bv_len) {
-			len -= iv->bv_len;
-		} else { /* len < iv->bv_len) */
-			iv->bv_len = len;
-			len = 0;
-		}
-	}
-}
-
-/**
 * bio_integrity_advance - Advance integrity vector
 * @bio:	bio whose integrity vector to update
 * @bytes_done:	number of data bytes that have been completed
@@ -595,13 +559,9 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
-
-	BUG_ON(bip == NULL);
-	BUG_ON(bi == NULL);
+	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
 
-	nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
-	bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
 }
 EXPORT_SYMBOL(bio_integrity_advance);
 
@@ -621,64 +581,13 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset,
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
 
-	BUG_ON(bip == NULL);
-	BUG_ON(bi == NULL);
-	BUG_ON(!bio_flagged(bio, BIO_CLONED));
-
-	nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-	bip->bip_sector = bip->bip_sector + offset;
-	bio_integrity_mark_head(bip, offset * bi->tuple_size);
-	bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+	bio_integrity_advance(bio, offset << 9);
+	bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
 }
 EXPORT_SYMBOL(bio_integrity_trim);
 
 /**
- * bio_integrity_split - Split integrity metadata
- * @bio:	Protected bio
- * @bp:		Resulting bio_pair
- * @sectors:	Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
-	struct blk_integrity *bi;
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	unsigned int nr_sectors;
-
-	if (bio_integrity(bio) == 0)
-		return;
-
-	bi = bdev_get_integrity(bio->bi_bdev);
-	BUG_ON(bi == NULL);
-	BUG_ON(bip->bip_vcnt != 1);
-
-	nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
-	bp->bio1.bi_integrity = &bp->bip1;
-	bp->bio2.bi_integrity = &bp->bip2;
-
-	bp->iv1 = bip->bip_vec[bip->bip_idx];
-	bp->iv2 = bip->bip_vec[bip->bip_idx];
-
-	bp->bip1.bip_vec = &bp->iv1;
-	bp->bip2.bip_vec = &bp->iv2;
-
-	bp->iv1.bv_len = sectors * bi->tuple_size;
-	bp->iv2.bv_offset += sectors * bi->tuple_size;
-	bp->iv2.bv_len -= sectors * bi->tuple_size;
-
-	bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
-	bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
-
-	bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
-	bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
 * bio_integrity_clone - Callback for cloning bios with integrity metadata
 * @bio:	New bio
 * @bio_src:	Original bio
@@ -702,9 +611,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 	memcpy(bip->bip_vec, bip_src->bip_vec,
 	       bip_src->bip_vcnt * sizeof(struct bio_vec));
 
-	bip->bip_sector = bip_src->bip_sector;
 	bip->bip_vcnt = bip_src->bip_vcnt;
-	bip->bip_idx = bip_src->bip_idx;
+	bip->bip_iter = bip_src->bip_iter;
 
 	return 0;
 }
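Most of the churn in this file is the immutable-biovec conversion: the position state that used to live in separate fields (bip_sector, bip_size, bip_idx) is folded into a struct bvec_iter at bip->bip_iter, and iteration yields struct bio_vec by value, so walking a payload never modifies it and clones can safely share the parent's vector. The new-style loop in isolation (a sketch):

/* Sum the bytes a bio covers without mutating the bio itself. */
static unsigned example_bio_bytes(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned bytes = 0;

	bio_for_each_segment(bv, bio, iter)	/* iterates over a copy */
		bytes += bv.bv_len;

	return bytes;
}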
diff --git a/fs/bio.c b/fs/bio.c
index 33d79a4eb92d..8754e7b6eb49 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -38,8 +38,6 @@
38 */ 38 */
39#define BIO_INLINE_VECS 4 39#define BIO_INLINE_VECS 4
40 40
41static mempool_t *bio_split_pool __read_mostly;
42
43/* 41/*
44 * if you change this list, also change bvec_alloc or things will 42 * if you change this list, also change bvec_alloc or things will
45 * break badly! cannot be bigger than what you can fit into an 43 * break badly! cannot be bigger than what you can fit into an
@@ -273,6 +271,7 @@ void bio_init(struct bio *bio)
273{ 271{
274 memset(bio, 0, sizeof(*bio)); 272 memset(bio, 0, sizeof(*bio));
275 bio->bi_flags = 1 << BIO_UPTODATE; 273 bio->bi_flags = 1 << BIO_UPTODATE;
274 atomic_set(&bio->bi_remaining, 1);
276 atomic_set(&bio->bi_cnt, 1); 275 atomic_set(&bio->bi_cnt, 1);
277} 276}
278EXPORT_SYMBOL(bio_init); 277EXPORT_SYMBOL(bio_init);
@@ -295,9 +294,35 @@ void bio_reset(struct bio *bio)
295 294
296 memset(bio, 0, BIO_RESET_BYTES); 295 memset(bio, 0, BIO_RESET_BYTES);
297 bio->bi_flags = flags|(1 << BIO_UPTODATE); 296 bio->bi_flags = flags|(1 << BIO_UPTODATE);
297 atomic_set(&bio->bi_remaining, 1);
298} 298}
299EXPORT_SYMBOL(bio_reset); 299EXPORT_SYMBOL(bio_reset);
300 300
301static void bio_chain_endio(struct bio *bio, int error)
302{
303 bio_endio(bio->bi_private, error);
304 bio_put(bio);
305}
306
307/**
308 * bio_chain - chain bio completions
309 *
310 * The caller won't have a bi_end_io called when @bio completes - instead,
311 * @parent's bi_end_io won't be called until both @parent and @bio have
312 * completed; the chained bio will also be freed when it completes.
313 *
314 * The caller must not set bi_private or bi_end_io in @bio.
315 */
316void bio_chain(struct bio *bio, struct bio *parent)
317{
318 BUG_ON(bio->bi_private || bio->bi_end_io);
319
320 bio->bi_private = parent;
321 bio->bi_end_io = bio_chain_endio;
322 atomic_inc(&parent->bi_remaining);
323}
324EXPORT_SYMBOL(bio_chain);
325
301static void bio_alloc_rescue(struct work_struct *work) 326static void bio_alloc_rescue(struct work_struct *work)
302{ 327{
303 struct bio_set *bs = container_of(work, struct bio_set, rescue_work); 328 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -473,13 +498,13 @@ EXPORT_SYMBOL(bio_alloc_bioset);
473void zero_fill_bio(struct bio *bio) 498void zero_fill_bio(struct bio *bio)
474{ 499{
475 unsigned long flags; 500 unsigned long flags;
476 struct bio_vec *bv; 501 struct bio_vec bv;
477 int i; 502 struct bvec_iter iter;
478 503
479 bio_for_each_segment(bv, bio, i) { 504 bio_for_each_segment(bv, bio, iter) {
480 char *data = bvec_kmap_irq(bv, &flags); 505 char *data = bvec_kmap_irq(&bv, &flags);
481 memset(data, 0, bv->bv_len); 506 memset(data, 0, bv.bv_len);
482 flush_dcache_page(bv->bv_page); 507 flush_dcache_page(bv.bv_page);
483 bvec_kunmap_irq(data, &flags); 508 bvec_kunmap_irq(data, &flags);
484 } 509 }
485} 510}
@@ -515,51 +540,49 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
515EXPORT_SYMBOL(bio_phys_segments); 540EXPORT_SYMBOL(bio_phys_segments);
516 541
517/** 542/**
518 * __bio_clone - clone a bio 543 * __bio_clone_fast - clone a bio that shares the original bio's biovec
519 * @bio: destination bio 544 * @bio: destination bio
520 * @bio_src: bio to clone 545 * @bio_src: bio to clone
521 * 546 *
522 * Clone a &bio. Caller will own the returned bio, but not 547 * Clone a &bio. Caller will own the returned bio, but not
523 * the actual data it points to. Reference count of returned 548 * the actual data it points to. Reference count of returned
524 * bio will be one. 549 * bio will be one.
550 *
551 * Caller must ensure that @bio_src is not freed before @bio.
525 */ 552 */
526void __bio_clone(struct bio *bio, struct bio *bio_src) 553void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
527{ 554{
528 memcpy(bio->bi_io_vec, bio_src->bi_io_vec, 555 BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
529 bio_src->bi_max_vecs * sizeof(struct bio_vec));
530 556
531 /* 557 /*
532 * most users will be overriding ->bi_bdev with a new target, 558 * most users will be overriding ->bi_bdev with a new target,
533 * so we don't set nor calculate new physical/hw segment counts here 559 * so we don't set nor calculate new physical/hw segment counts here
534 */ 560 */
535 bio->bi_sector = bio_src->bi_sector;
536 bio->bi_bdev = bio_src->bi_bdev; 561 bio->bi_bdev = bio_src->bi_bdev;
537 bio->bi_flags |= 1 << BIO_CLONED; 562 bio->bi_flags |= 1 << BIO_CLONED;
538 bio->bi_rw = bio_src->bi_rw; 563 bio->bi_rw = bio_src->bi_rw;
539 bio->bi_vcnt = bio_src->bi_vcnt; 564 bio->bi_iter = bio_src->bi_iter;
540 bio->bi_size = bio_src->bi_size; 565 bio->bi_io_vec = bio_src->bi_io_vec;
541 bio->bi_idx = bio_src->bi_idx;
542} 566}
543EXPORT_SYMBOL(__bio_clone); 567EXPORT_SYMBOL(__bio_clone_fast);
544 568
545/** 569/**
546 * bio_clone_bioset - clone a bio 570 * bio_clone_fast - clone a bio that shares the original bio's biovec
547 * @bio: bio to clone 571 * @bio: bio to clone
548 * @gfp_mask: allocation priority 572 * @gfp_mask: allocation priority
549 * @bs: bio_set to allocate from 573 * @bs: bio_set to allocate from
550 * 574 *
551 * Like __bio_clone, only also allocates the returned bio 575 * Like __bio_clone_fast, only also allocates the returned bio
552 */ 576 */
553struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, 577struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
554 struct bio_set *bs)
555{ 578{
556 struct bio *b; 579 struct bio *b;
557 580
558 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); 581 b = bio_alloc_bioset(gfp_mask, 0, bs);
559 if (!b) 582 if (!b)
560 return NULL; 583 return NULL;
561 584
562 __bio_clone(b, bio); 585 __bio_clone_fast(b, bio);
563 586
564 if (bio_integrity(bio)) { 587 if (bio_integrity(bio)) {
565 int ret; 588 int ret;
@@ -574,6 +597,79 @@ struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
574 597
575 return b; 598 return b;
576} 599}
600EXPORT_SYMBOL(bio_clone_fast);
601
602/**
603 * bio_clone_bioset - clone a bio
604 * @bio_src: bio to clone
605 * @gfp_mask: allocation priority
606 * @bs: bio_set to allocate from
607 *
608 * Clone bio. Caller will own the returned bio, but not the actual data it
609 * points to. Reference count of returned bio will be one.
610 */
611struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
612 struct bio_set *bs)
613{
614 struct bvec_iter iter;
615 struct bio_vec bv;
616 struct bio *bio;
617
618 /*
619 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
620 * bio_src->bi_io_vec to bio->bi_io_vec.
621 *
622 * We can't do that anymore, because:
623 *
624 * - The point of cloning the biovec is to produce a bio with a biovec
625 * the caller can modify: bi_idx and bi_bvec_done should be 0.
626 *
627 * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
628 * we tried to clone the whole thing bio_alloc_bioset() would fail.
629 * But the clone should succeed as long as the number of biovecs we
630 * actually need to allocate is fewer than BIO_MAX_PAGES.
631 *
632 * - Lastly, bi_vcnt should not be looked at or relied upon by code
633 * that does not own the bio - reason being drivers don't use it for
634 * iterating over the biovec anymore, so expecting it to be kept up
635 * to date (i.e. for clones that share the parent biovec) is just
636 * asking for trouble and would force extra work on
637 * __bio_clone_fast() anyways.
638 */
639
640 bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
641 if (!bio)
642 return NULL;
643
644 bio->bi_bdev = bio_src->bi_bdev;
645 bio->bi_rw = bio_src->bi_rw;
646 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
647 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
648
649 if (bio->bi_rw & REQ_DISCARD)
650 goto integrity_clone;
651
652 if (bio->bi_rw & REQ_WRITE_SAME) {
653 bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
654 goto integrity_clone;
655 }
656
657 bio_for_each_segment(bv, bio_src, iter)
658 bio->bi_io_vec[bio->bi_vcnt++] = bv;
659
660integrity_clone:
661 if (bio_integrity(bio_src)) {
662 int ret;
663
664 ret = bio_integrity_clone(bio, bio_src, gfp_mask);
665 if (ret < 0) {
666 bio_put(bio);
667 return NULL;
668 }
669 }
670
671 return bio;
672}
577EXPORT_SYMBOL(bio_clone_bioset); 673EXPORT_SYMBOL(bio_clone_bioset);
578 674
579/** 675/**
@@ -612,7 +708,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
612 if (unlikely(bio_flagged(bio, BIO_CLONED))) 708 if (unlikely(bio_flagged(bio, BIO_CLONED)))
613 return 0; 709 return 0;
614 710
615 if (((bio->bi_size + len) >> 9) > max_sectors) 711 if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
616 return 0; 712 return 0;
617 713
618 /* 714 /*
@@ -635,8 +731,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
635 simulate merging updated prev_bvec 731 simulate merging updated prev_bvec
636 as new bvec. */ 732 as new bvec. */
637 .bi_bdev = bio->bi_bdev, 733 .bi_bdev = bio->bi_bdev,
638 .bi_sector = bio->bi_sector, 734 .bi_sector = bio->bi_iter.bi_sector,
639 .bi_size = bio->bi_size - prev_bv_len, 735 .bi_size = bio->bi_iter.bi_size -
736 prev_bv_len,
640 .bi_rw = bio->bi_rw, 737 .bi_rw = bio->bi_rw,
641 }; 738 };
642 739
@@ -684,8 +781,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
684 if (q->merge_bvec_fn) { 781 if (q->merge_bvec_fn) {
685 struct bvec_merge_data bvm = { 782 struct bvec_merge_data bvm = {
686 .bi_bdev = bio->bi_bdev, 783 .bi_bdev = bio->bi_bdev,
687 .bi_sector = bio->bi_sector, 784 .bi_sector = bio->bi_iter.bi_sector,
688 .bi_size = bio->bi_size, 785 .bi_size = bio->bi_iter.bi_size,
689 .bi_rw = bio->bi_rw, 786 .bi_rw = bio->bi_rw,
690 }; 787 };
691 788
@@ -708,7 +805,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
708 bio->bi_vcnt++; 805 bio->bi_vcnt++;
709 bio->bi_phys_segments++; 806 bio->bi_phys_segments++;
710 done: 807 done:
711 bio->bi_size += len; 808 bio->bi_iter.bi_size += len;
712 return len; 809 return len;
713} 810}
714 811
@@ -807,28 +904,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
807 if (bio_integrity(bio)) 904 if (bio_integrity(bio))
808 bio_integrity_advance(bio, bytes); 905 bio_integrity_advance(bio, bytes);
809 906
810 bio->bi_sector += bytes >> 9; 907 bio_advance_iter(bio, &bio->bi_iter, bytes);
811 bio->bi_size -= bytes;
812
813 if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
814 return;
815
816 while (bytes) {
817 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
818 WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
819 bio->bi_idx, bio->bi_vcnt);
820 break;
821 }
822
823 if (bytes >= bio_iovec(bio)->bv_len) {
824 bytes -= bio_iovec(bio)->bv_len;
825 bio->bi_idx++;
826 } else {
827 bio_iovec(bio)->bv_len -= bytes;
828 bio_iovec(bio)->bv_offset += bytes;
829 bytes = 0;
830 }
831 }
832} 908}
833EXPORT_SYMBOL(bio_advance); 909EXPORT_SYMBOL(bio_advance);
834 910
@@ -874,117 +950,80 @@ EXPORT_SYMBOL(bio_alloc_pages);
874 */ 950 */
875void bio_copy_data(struct bio *dst, struct bio *src) 951void bio_copy_data(struct bio *dst, struct bio *src)
876{ 952{
877 struct bio_vec *src_bv, *dst_bv; 953 struct bvec_iter src_iter, dst_iter;
878 unsigned src_offset, dst_offset, bytes; 954 struct bio_vec src_bv, dst_bv;
879 void *src_p, *dst_p; 955 void *src_p, *dst_p;
956 unsigned bytes;
880 957
881 src_bv = bio_iovec(src); 958 src_iter = src->bi_iter;
882 dst_bv = bio_iovec(dst); 959 dst_iter = dst->bi_iter;
883
884 src_offset = src_bv->bv_offset;
885 dst_offset = dst_bv->bv_offset;
886 960
887 while (1) { 961 while (1) {
888 if (src_offset == src_bv->bv_offset + src_bv->bv_len) { 962 if (!src_iter.bi_size) {
889 src_bv++; 963 src = src->bi_next;
890 if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { 964 if (!src)
891 src = src->bi_next; 965 break;
892 if (!src)
893 break;
894
895 src_bv = bio_iovec(src);
896 }
897 966
898 src_offset = src_bv->bv_offset; 967 src_iter = src->bi_iter;
899 } 968 }
900 969
901 if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { 970 if (!dst_iter.bi_size) {
902 dst_bv++; 971 dst = dst->bi_next;
903 if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { 972 if (!dst)
904 dst = dst->bi_next; 973 break;
905 if (!dst)
906 break;
907
908 dst_bv = bio_iovec(dst);
909 }
910 974
911 dst_offset = dst_bv->bv_offset; 975 dst_iter = dst->bi_iter;
912 } 976 }
913 977
914 bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, 978 src_bv = bio_iter_iovec(src, src_iter);
915 src_bv->bv_offset + src_bv->bv_len - src_offset); 979 dst_bv = bio_iter_iovec(dst, dst_iter);
916 980
917 src_p = kmap_atomic(src_bv->bv_page); 981 bytes = min(src_bv.bv_len, dst_bv.bv_len);
918 dst_p = kmap_atomic(dst_bv->bv_page);
919 982
920 memcpy(dst_p + dst_offset, 983 src_p = kmap_atomic(src_bv.bv_page);
921 src_p + src_offset, 984 dst_p = kmap_atomic(dst_bv.bv_page);
985
986 memcpy(dst_p + dst_bv.bv_offset,
987 src_p + src_bv.bv_offset,
922 bytes); 988 bytes);
923 989
924 kunmap_atomic(dst_p); 990 kunmap_atomic(dst_p);
925 kunmap_atomic(src_p); 991 kunmap_atomic(src_p);
926 992
927 src_offset += bytes; 993 bio_advance_iter(src, &src_iter, bytes);
928 dst_offset += bytes; 994 bio_advance_iter(dst, &dst_iter, bytes);
929 } 995 }
930} 996}
931EXPORT_SYMBOL(bio_copy_data); 997EXPORT_SYMBOL(bio_copy_data);
932 998
933struct bio_map_data { 999struct bio_map_data {
934 struct bio_vec *iovecs;
935 struct sg_iovec *sgvecs;
936 int nr_sgvecs; 1000 int nr_sgvecs;
937 int is_our_pages; 1001 int is_our_pages;
1002 struct sg_iovec sgvecs[];
938}; 1003};
939 1004
940static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 1005static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
941 struct sg_iovec *iov, int iov_count, 1006 struct sg_iovec *iov, int iov_count,
942 int is_our_pages) 1007 int is_our_pages)
943{ 1008{
944 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
945 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 1009 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
946 bmd->nr_sgvecs = iov_count; 1010 bmd->nr_sgvecs = iov_count;
947 bmd->is_our_pages = is_our_pages; 1011 bmd->is_our_pages = is_our_pages;
948 bio->bi_private = bmd; 1012 bio->bi_private = bmd;
949} 1013}
950 1014
951static void bio_free_map_data(struct bio_map_data *bmd)
952{
953 kfree(bmd->iovecs);
954 kfree(bmd->sgvecs);
955 kfree(bmd);
956}
957
958static struct bio_map_data *bio_alloc_map_data(int nr_segs, 1015static struct bio_map_data *bio_alloc_map_data(int nr_segs,
959 unsigned int iov_count, 1016 unsigned int iov_count,
960 gfp_t gfp_mask) 1017 gfp_t gfp_mask)
961{ 1018{
962 struct bio_map_data *bmd;
963
964 if (iov_count > UIO_MAXIOV) 1019 if (iov_count > UIO_MAXIOV)
965 return NULL; 1020 return NULL;
966 1021
967 bmd = kmalloc(sizeof(*bmd), gfp_mask); 1022 return kmalloc(sizeof(struct bio_map_data) +
968 if (!bmd) 1023 sizeof(struct sg_iovec) * iov_count, gfp_mask);
969 return NULL;
970
971 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
972 if (!bmd->iovecs) {
973 kfree(bmd);
974 return NULL;
975 }
976
977 bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
978 if (bmd->sgvecs)
979 return bmd;
980
981 kfree(bmd->iovecs);
982 kfree(bmd);
983 return NULL;
984} 1024}
985 1025
986static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, 1026static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
987 struct sg_iovec *iov, int iov_count,
988 int to_user, int from_user, int do_free_page) 1027 int to_user, int from_user, int do_free_page)
989{ 1028{
990 int ret = 0, i; 1029 int ret = 0, i;
@@ -994,7 +1033,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
994 1033
995 bio_for_each_segment_all(bvec, bio, i) { 1034 bio_for_each_segment_all(bvec, bio, i) {
996 char *bv_addr = page_address(bvec->bv_page); 1035 char *bv_addr = page_address(bvec->bv_page);
997 unsigned int bv_len = iovecs[i].bv_len; 1036 unsigned int bv_len = bvec->bv_len;
998 1037
999 while (bv_len && iov_idx < iov_count) { 1038 while (bv_len && iov_idx < iov_count) {
1000 unsigned int bytes; 1039 unsigned int bytes;
@@ -1054,14 +1093,14 @@ int bio_uncopy_user(struct bio *bio)
1054 * don't copy into a random user address space, just free. 1093 * don't copy into a random user address space, just free.
1055 */ 1094 */
1056 if (current->mm) 1095 if (current->mm)
1057 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, 1096 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
1058 bmd->nr_sgvecs, bio_data_dir(bio) == READ, 1097 bio_data_dir(bio) == READ,
1059 0, bmd->is_our_pages); 1098 0, bmd->is_our_pages);
1060 else if (bmd->is_our_pages) 1099 else if (bmd->is_our_pages)
1061 bio_for_each_segment_all(bvec, bio, i) 1100 bio_for_each_segment_all(bvec, bio, i)
1062 __free_page(bvec->bv_page); 1101 __free_page(bvec->bv_page);
1063 } 1102 }
1064 bio_free_map_data(bmd); 1103 kfree(bmd);
1065 bio_put(bio); 1104 bio_put(bio);
1066 return ret; 1105 return ret;
1067} 1106}
@@ -1175,7 +1214,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1175 */ 1214 */
1176 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || 1215 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
1177 (map_data && map_data->from_user)) { 1216 (map_data && map_data->from_user)) {
1178 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); 1217 ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
1179 if (ret) 1218 if (ret)
1180 goto cleanup; 1219 goto cleanup;
1181 } 1220 }
@@ -1189,7 +1228,7 @@ cleanup:
1189 1228
1190 bio_put(bio); 1229 bio_put(bio);
1191out_bmd: 1230out_bmd:
1192 bio_free_map_data(bmd); 1231 kfree(bmd);
1193 return ERR_PTR(ret); 1232 return ERR_PTR(ret);
1194} 1233}
1195 1234
@@ -1485,7 +1524,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1485 if (IS_ERR(bio)) 1524 if (IS_ERR(bio))
1486 return bio; 1525 return bio;
1487 1526
1488 if (bio->bi_size == len) 1527 if (bio->bi_iter.bi_size == len)
1489 return bio; 1528 return bio;
1490 1529
1491 /* 1530 /*
@@ -1506,16 +1545,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1506 1545
1507 bio_for_each_segment_all(bvec, bio, i) { 1546 bio_for_each_segment_all(bvec, bio, i) {
1508 char *addr = page_address(bvec->bv_page); 1547 char *addr = page_address(bvec->bv_page);
1509 int len = bmd->iovecs[i].bv_len;
1510 1548
1511 if (read) 1549 if (read)
1512 memcpy(p, addr, len); 1550 memcpy(p, addr, bvec->bv_len);
1513 1551
1514 __free_page(bvec->bv_page); 1552 __free_page(bvec->bv_page);
1515 p += len; 1553 p += bvec->bv_len;
1516 } 1554 }
1517 1555
1518 bio_free_map_data(bmd); 1556 kfree(bmd);
1519 bio_put(bio); 1557 bio_put(bio);
1520} 1558}
1521 1559
@@ -1686,11 +1724,11 @@ void bio_check_pages_dirty(struct bio *bio)
1686#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1724#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1687void bio_flush_dcache_pages(struct bio *bi) 1725void bio_flush_dcache_pages(struct bio *bi)
1688{ 1726{
1689 int i; 1727 struct bio_vec bvec;
1690 struct bio_vec *bvec; 1728 struct bvec_iter iter;
1691 1729
1692 bio_for_each_segment(bvec, bi, i) 1730 bio_for_each_segment(bvec, bi, iter)
1693 flush_dcache_page(bvec->bv_page); 1731 flush_dcache_page(bvec.bv_page);
1694} 1732}
1695EXPORT_SYMBOL(bio_flush_dcache_pages); 1733EXPORT_SYMBOL(bio_flush_dcache_pages);
1696#endif 1734#endif
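
The conversion above shows the immutable-biovec iteration style this series introduces: bio_for_each_segment now yields a struct bio_vec by value and keeps its position in a separate struct bvec_iter instead of an integer index. A minimal sketch written against that style, assuming nothing beyond <linux/bio.h> (the helper name is invented):

#include <linux/bio.h>

/* Sum the payload bytes of a bio via the bvec_iter-based iterator.
 * Illustrative only; the same number is available as bio->bi_iter.bi_size. */
static unsigned int example_bio_bytes(struct bio *bio)
{
	struct bio_vec bvec;	/* copied out per segment, not a pointer */
	struct bvec_iter iter;	/* cursor, replaces the old integer index */
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}
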
@@ -1711,96 +1749,86 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1711 **/ 1749 **/
1712void bio_endio(struct bio *bio, int error) 1750void bio_endio(struct bio *bio, int error)
1713{ 1751{
1714 if (error) 1752 while (bio) {
1715 clear_bit(BIO_UPTODATE, &bio->bi_flags); 1753 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1716 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1717 error = -EIO;
1718 1754
1719 if (bio->bi_end_io) 1755 if (error)
1720 bio->bi_end_io(bio, error); 1756 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1721} 1757 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1722EXPORT_SYMBOL(bio_endio); 1758 error = -EIO;
1723 1759
1724void bio_pair_release(struct bio_pair *bp) 1760 if (!atomic_dec_and_test(&bio->bi_remaining))
1725{ 1761 return;
1726 if (atomic_dec_and_test(&bp->cnt)) {
1727 struct bio *master = bp->bio1.bi_private;
1728 1762
1729 bio_endio(master, bp->error); 1763 /*
1730 mempool_free(bp, bp->bio2.bi_private); 1764 * Need to have a real endio function for chained bios,
1765 * otherwise various corner cases will break (like stacking
1766 * block devices that save/restore bi_end_io) - however, we want
1767 * to avoid unbounded recursion and blowing the stack. Tail call
1768 * optimization would handle this, but compiling with frame
1769 * pointers also disables gcc's sibling call optimization.
1770 */
1771 if (bio->bi_end_io == bio_chain_endio) {
1772 struct bio *parent = bio->bi_private;
1773 bio_put(bio);
1774 bio = parent;
1775 } else {
1776 if (bio->bi_end_io)
1777 bio->bi_end_io(bio, error);
1778 bio = NULL;
1779 }
1731 } 1780 }
1732} 1781}
1733EXPORT_SYMBOL(bio_pair_release); 1782EXPORT_SYMBOL(bio_endio);
1734
1735static void bio_pair_end_1(struct bio *bi, int err)
1736{
1737 struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
1738
1739 if (err)
1740 bp->error = err;
1741
1742 bio_pair_release(bp);
1743}
1744 1783
1745static void bio_pair_end_2(struct bio *bi, int err) 1784/**
1785 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1786 * @bio: bio
1787 * @error: error, if any
1788 *
 1789 * For code that has saved and restored bi_end_io; think hard before using this
 1790 * function, you probably should have cloned the entire bio.
1791 **/
1792void bio_endio_nodec(struct bio *bio, int error)
1746{ 1793{
1747 struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); 1794 atomic_inc(&bio->bi_remaining);
1748 1795 bio_endio(bio, error);
1749 if (err)
1750 bp->error = err;
1751
1752 bio_pair_release(bp);
1753} 1796}
1797EXPORT_SYMBOL(bio_endio_nodec);
1754 1798
1755/* 1799/**
1756 * split a bio - only worry about a bio with a single page in its iovec 1800 * bio_split - split a bio
1801 * @bio: bio to split
1802 * @sectors: number of sectors to split from the front of @bio
1803 * @gfp: gfp mask
1804 * @bs: bio set to allocate from
1805 *
1806 * Allocates and returns a new bio which represents @sectors from the start of
1807 * @bio, and updates @bio to represent the remaining sectors.
1808 *
1809 * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
1810 * responsibility to ensure that @bio is not freed before the split.
1757 */ 1811 */
1758struct bio_pair *bio_split(struct bio *bi, int first_sectors) 1812struct bio *bio_split(struct bio *bio, int sectors,
1813 gfp_t gfp, struct bio_set *bs)
1759{ 1814{
1760 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); 1815 struct bio *split = NULL;
1761
1762 if (!bp)
1763 return bp;
1764
1765 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1766 bi->bi_sector + first_sectors);
1767
1768 BUG_ON(bio_segments(bi) > 1);
1769 atomic_set(&bp->cnt, 3);
1770 bp->error = 0;
1771 bp->bio1 = *bi;
1772 bp->bio2 = *bi;
1773 bp->bio2.bi_sector += first_sectors;
1774 bp->bio2.bi_size -= first_sectors << 9;
1775 bp->bio1.bi_size = first_sectors << 9;
1776
1777 if (bi->bi_vcnt != 0) {
1778 bp->bv1 = *bio_iovec(bi);
1779 bp->bv2 = *bio_iovec(bi);
1780
1781 if (bio_is_rw(bi)) {
1782 bp->bv2.bv_offset += first_sectors << 9;
1783 bp->bv2.bv_len -= first_sectors << 9;
1784 bp->bv1.bv_len = first_sectors << 9;
1785 }
1786 1816
1787 bp->bio1.bi_io_vec = &bp->bv1; 1817 BUG_ON(sectors <= 0);
1788 bp->bio2.bi_io_vec = &bp->bv2; 1818 BUG_ON(sectors >= bio_sectors(bio));
1789 1819
1790 bp->bio1.bi_max_vecs = 1; 1820 split = bio_clone_fast(bio, gfp, bs);
1791 bp->bio2.bi_max_vecs = 1; 1821 if (!split)
1792 } 1822 return NULL;
1793 1823
1794 bp->bio1.bi_end_io = bio_pair_end_1; 1824 split->bi_iter.bi_size = sectors << 9;
1795 bp->bio2.bi_end_io = bio_pair_end_2;
1796 1825
1797 bp->bio1.bi_private = bi; 1826 if (bio_integrity(split))
1798 bp->bio2.bi_private = bio_split_pool; 1827 bio_integrity_trim(split, 0, sectors);
1799 1828
1800 if (bio_integrity(bi)) 1829 bio_advance(bio, split->bi_iter.bi_size);
1801 bio_integrity_split(bi, bp, first_sectors);
1802 1830
1803 return bp; 1831 return split;
1804} 1832}
1805EXPORT_SYMBOL(bio_split); 1833EXPORT_SYMBOL(bio_split);
1806 1834
@@ -1814,80 +1842,20 @@ void bio_trim(struct bio *bio, int offset, int size)
1814{ 1842{
1815 /* 'bio' is a cloned bio which we need to trim to match 1843 /* 'bio' is a cloned bio which we need to trim to match
1816 * the given offset and size. 1844 * the given offset and size.
1817 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1818 */ 1845 */
1819 int i;
1820 struct bio_vec *bvec;
1821 int sofar = 0;
1822 1846
1823 size <<= 9; 1847 size <<= 9;
1824 if (offset == 0 && size == bio->bi_size) 1848 if (offset == 0 && size == bio->bi_iter.bi_size)
1825 return; 1849 return;
1826 1850
1827 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 1851 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1828 1852
1829 bio_advance(bio, offset << 9); 1853 bio_advance(bio, offset << 9);
1830 1854
1831 bio->bi_size = size; 1855 bio->bi_iter.bi_size = size;
1832
 1833 /* avoid any complications with bi_idx being non-zero */
1834 if (bio->bi_idx) {
1835 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1836 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1837 bio->bi_vcnt -= bio->bi_idx;
1838 bio->bi_idx = 0;
1839 }
1840 /* Make sure vcnt and last bv are not too big */
1841 bio_for_each_segment(bvec, bio, i) {
1842 if (sofar + bvec->bv_len > size)
1843 bvec->bv_len = size - sofar;
1844 if (bvec->bv_len == 0) {
1845 bio->bi_vcnt = i;
1846 break;
1847 }
1848 sofar += bvec->bv_len;
1849 }
1850} 1856}
1851EXPORT_SYMBOL_GPL(bio_trim); 1857EXPORT_SYMBOL_GPL(bio_trim);
1852 1858
1853/**
1854 * bio_sector_offset - Find hardware sector offset in bio
1855 * @bio: bio to inspect
1856 * @index: bio_vec index
1857 * @offset: offset in bv_page
1858 *
1859 * Return the number of hardware sectors between beginning of bio
1860 * and an end point indicated by a bio_vec index and an offset
1861 * within that vector's page.
1862 */
1863sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1864 unsigned int offset)
1865{
1866 unsigned int sector_sz;
1867 struct bio_vec *bv;
1868 sector_t sectors;
1869 int i;
1870
1871 sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1872 sectors = 0;
1873
1874 if (index >= bio->bi_idx)
1875 index = bio->bi_vcnt - 1;
1876
1877 bio_for_each_segment_all(bv, bio, i) {
1878 if (i == index) {
1879 if (offset > bv->bv_offset)
1880 sectors += (offset - bv->bv_offset) / sector_sz;
1881 break;
1882 }
1883
1884 sectors += bv->bv_len / sector_sz;
1885 }
1886
1887 return sectors;
1888}
1889EXPORT_SYMBOL(bio_sector_offset);
1890
1891/* 1859/*
1892 * create memory pools for biovec's in a bio_set. 1860 * create memory pools for biovec's in a bio_set.
1893 * use the global biovec slabs created for general use. 1861 * use the global biovec slabs created for general use.
@@ -2065,11 +2033,6 @@ static int __init init_bio(void)
2065 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) 2033 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
2066 panic("bio: can't create integrity pool\n"); 2034 panic("bio: can't create integrity pool\n");
2067 2035
2068 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
2069 sizeof(struct bio_pair));
2070 if (!bio_split_pool)
2071 panic("bio: can't create split pool\n");
2072
2073 return 0; 2036 return 0;
2074} 2037}
2075subsys_initcall(init_bio); 2038subsys_initcall(init_bio);
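
The bio_endio_nodec comment above warns about code that saves and restores bi_end_io. A hedged sketch of the situation it exists for, with the context structure and all names invented for illustration:

#include <linux/bio.h>
#include <linux/slab.h>

/* Hypothetical per-bio state a stacking driver might attach. */
struct example_ctx {
	bio_end_io_t	*saved_end_io;
	void		*saved_private;
};

static void example_end_io(struct bio *bio, int error)
{
	struct example_ctx *ctx = bio->bi_private;

	/* Put back what we hijacked before completing the bio. */
	bio->bi_end_io = ctx->saved_end_io;
	bio->bi_private = ctx->saved_private;
	kfree(ctx);

	/*
	 * bio_endio() already decremented bi_remaining on the way into
	 * this handler, so complete without a second decrement.
	 */
	bio_endio_nodec(bio, error);
}
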
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index aa976eced2d2..a66768ebc8d1 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,6 +1,7 @@
1config BTRFS_FS 1config BTRFS_FS
2 tristate "Btrfs filesystem support" 2 tristate "Btrfs filesystem support"
3 select LIBCRC32C 3 select CRYPTO
4 select CRYPTO_CRC32C
4 select ZLIB_INFLATE 5 select ZLIB_INFLATE
5 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
6 select LZO_COMPRESS 7 select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 1a44e42d602a..f341a98031d2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
12 uuid-tree.o 12 uuid-tree.o props.o hash.o
13 13
14btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
15btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 15btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0890c83643e9..ff9b3995d453 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
35 char *value = NULL; 35 char *value = NULL;
36 struct posix_acl *acl; 36 struct posix_acl *acl;
37 37
38 if (!IS_POSIXACL(inode))
39 return NULL;
40
41 acl = get_cached_acl(inode, type);
42 if (acl != ACL_NOT_CACHED)
43 return acl;
44
45 switch (type) { 38 switch (type) {
46 case ACL_TYPE_ACCESS: 39 case ACL_TYPE_ACCESS:
47 name = POSIX_ACL_XATTR_ACCESS; 40 name = POSIX_ACL_XATTR_ACCESS;
@@ -76,31 +69,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
76 return acl; 69 return acl;
77} 70}
78 71
79static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
80 void *value, size_t size, int type)
81{
82 struct posix_acl *acl;
83 int ret = 0;
84
85 if (!IS_POSIXACL(dentry->d_inode))
86 return -EOPNOTSUPP;
87
88 acl = btrfs_get_acl(dentry->d_inode, type);
89
90 if (IS_ERR(acl))
91 return PTR_ERR(acl);
92 if (acl == NULL)
93 return -ENODATA;
94 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
95 posix_acl_release(acl);
96
97 return ret;
98}
99
100/* 72/*
101 * Needs to be called with fs_mutex held 73 * Needs to be called with fs_mutex held
102 */ 74 */
103static int btrfs_set_acl(struct btrfs_trans_handle *trans, 75static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
104 struct inode *inode, struct posix_acl *acl, int type) 76 struct inode *inode, struct posix_acl *acl, int type)
105{ 77{
106 int ret, size = 0; 78 int ret, size = 0;
@@ -158,35 +130,9 @@ out:
158 return ret; 130 return ret;
159} 131}
160 132
161static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, 133int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
162 const void *value, size_t size, int flags, int type)
163{ 134{
164 int ret; 135 return __btrfs_set_acl(NULL, inode, acl, type);
165 struct posix_acl *acl = NULL;
166
167 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM;
169
170 if (!IS_POSIXACL(dentry->d_inode))
171 return -EOPNOTSUPP;
172
173 if (value) {
174 acl = posix_acl_from_xattr(&init_user_ns, value, size);
175 if (IS_ERR(acl))
176 return PTR_ERR(acl);
177
178 if (acl) {
179 ret = posix_acl_valid(acl);
180 if (ret)
181 goto out;
182 }
183 }
184
185 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
186out:
187 posix_acl_release(acl);
188
189 return ret;
190} 136}
191 137
192/* 138/*
@@ -197,83 +143,31 @@ out:
197int btrfs_init_acl(struct btrfs_trans_handle *trans, 143int btrfs_init_acl(struct btrfs_trans_handle *trans,
198 struct inode *inode, struct inode *dir) 144 struct inode *inode, struct inode *dir)
199{ 145{
200 struct posix_acl *acl = NULL; 146 struct posix_acl *default_acl, *acl;
201 int ret = 0; 147 int ret = 0;
202 148
203 /* this happens with subvols */ 149 /* this happens with subvols */
204 if (!dir) 150 if (!dir)
205 return 0; 151 return 0;
206 152
207 if (!S_ISLNK(inode->i_mode)) { 153 ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
208 if (IS_POSIXACL(dir)) { 154 if (ret)
209 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); 155 return ret;
210 if (IS_ERR(acl))
211 return PTR_ERR(acl);
212 }
213 156
214 if (!acl) 157 if (default_acl) {
215 inode->i_mode &= ~current_umask(); 158 ret = __btrfs_set_acl(trans, inode, default_acl,
159 ACL_TYPE_DEFAULT);
160 posix_acl_release(default_acl);
216 } 161 }
217 162
218 if (IS_POSIXACL(dir) && acl) { 163 if (acl) {
219 if (S_ISDIR(inode->i_mode)) { 164 if (!ret)
220 ret = btrfs_set_acl(trans, inode, acl, 165 ret = __btrfs_set_acl(trans, inode, acl,
221 ACL_TYPE_DEFAULT); 166 ACL_TYPE_ACCESS);
222 if (ret) 167 posix_acl_release(acl);
223 goto failed;
224 }
225 ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
226 if (ret < 0)
227 return ret;
228
229 if (ret > 0) {
230 /* we need an acl */
231 ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
232 } else if (ret < 0) {
233 cache_no_acl(inode);
234 }
235 } else {
236 cache_no_acl(inode);
237 } 168 }
238failed:
239 posix_acl_release(acl);
240
241 return ret;
242}
243 169
244int btrfs_acl_chmod(struct inode *inode) 170 if (!default_acl && !acl)
245{ 171 cache_no_acl(inode);
246 struct posix_acl *acl;
247 int ret = 0;
248
249 if (S_ISLNK(inode->i_mode))
250 return -EOPNOTSUPP;
251
252 if (!IS_POSIXACL(inode))
253 return 0;
254
255 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
256 if (IS_ERR_OR_NULL(acl))
257 return PTR_ERR(acl);
258
259 ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
260 if (ret)
261 return ret;
262 ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS);
263 posix_acl_release(acl);
264 return ret; 172 return ret;
265} 173}
266
267const struct xattr_handler btrfs_xattr_acl_default_handler = {
268 .prefix = POSIX_ACL_XATTR_DEFAULT,
269 .flags = ACL_TYPE_DEFAULT,
270 .get = btrfs_xattr_acl_get,
271 .set = btrfs_xattr_acl_set,
272};
273
274const struct xattr_handler btrfs_xattr_acl_access_handler = {
275 .prefix = POSIX_ACL_XATTR_ACCESS,
276 .flags = ACL_TYPE_ACCESS,
277 .get = btrfs_xattr_acl_get,
278 .set = btrfs_xattr_acl_set,
279};
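
This file shrinks because v3.14 moves POSIX ACL handling into the VFS: the hand-rolled xattr handlers above are deleted and btrfs_set_acl takes the prototype the generic layer calls. A sketch of how a filesystem wires into that layer; treat the exact field and handler names as assumptions from the generic ACL rework rather than something this diff shows:

#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>

/* Reuse the VFS-provided ACL xattr handlers instead of private ones. */
static const struct xattr_handler *example_xattr_handlers[] = {
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
	NULL,
};

/* get/set route through the generic code, which calls ->set_acl. */
static const struct inode_operations example_dir_iops = {
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
};
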
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3775947429b2..aded3ef3d3d4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
66 return 0; 66 return 0;
67} 67}
68 68
69static void free_inode_elem_list(struct extent_inode_elem *eie)
70{
71 struct extent_inode_elem *eie_next;
72
73 for (; eie; eie = eie_next) {
74 eie_next = eie->next;
75 kfree(eie);
76 }
77}
78
69static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, 79static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
70 u64 extent_item_pos, 80 u64 extent_item_pos,
71 struct extent_inode_elem **eie) 81 struct extent_inode_elem **eie)
@@ -209,18 +219,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
209} 219}
210 220
211static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
212 struct ulist *parents, int level, 222 struct ulist *parents, struct __prelim_ref *ref,
213 struct btrfs_key *key_for_search, u64 time_seq, 223 int level, u64 time_seq, const u64 *extent_item_pos)
214 u64 wanted_disk_byte,
215 const u64 *extent_item_pos)
216{ 224{
217 int ret = 0; 225 int ret = 0;
218 int slot; 226 int slot;
219 struct extent_buffer *eb; 227 struct extent_buffer *eb;
220 struct btrfs_key key; 228 struct btrfs_key key;
229 struct btrfs_key *key_for_search = &ref->key_for_search;
221 struct btrfs_file_extent_item *fi; 230 struct btrfs_file_extent_item *fi;
222 struct extent_inode_elem *eie = NULL, *old = NULL; 231 struct extent_inode_elem *eie = NULL, *old = NULL;
223 u64 disk_byte; 232 u64 disk_byte;
233 u64 wanted_disk_byte = ref->wanted_disk_byte;
234 u64 count = 0;
224 235
225 if (level != 0) { 236 if (level != 0) {
226 eb = path->nodes[level]; 237 eb = path->nodes[level];
@@ -238,7 +249,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
238 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 249 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
239 ret = btrfs_next_old_leaf(root, path, time_seq); 250 ret = btrfs_next_old_leaf(root, path, time_seq);
240 251
241 while (!ret) { 252 while (!ret && count < ref->count) {
242 eb = path->nodes[0]; 253 eb = path->nodes[0];
243 slot = path->slots[0]; 254 slot = path->slots[0];
244 255
@@ -254,6 +265,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
254 if (disk_byte == wanted_disk_byte) { 265 if (disk_byte == wanted_disk_byte) {
255 eie = NULL; 266 eie = NULL;
256 old = NULL; 267 old = NULL;
268 count++;
257 if (extent_item_pos) { 269 if (extent_item_pos) {
258 ret = check_extent_in_eb(&key, eb, fi, 270 ret = check_extent_in_eb(&key, eb, fi,
259 *extent_item_pos, 271 *extent_item_pos,
@@ -273,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
273 old = old->next; 285 old = old->next;
274 old->next = eie; 286 old->next = eie;
275 } 287 }
288 eie = NULL;
276 } 289 }
277next: 290next:
278 ret = btrfs_next_old_item(root, path, time_seq); 291 ret = btrfs_next_old_item(root, path, time_seq);
@@ -280,6 +293,8 @@ next:
280 293
281 if (ret > 0) 294 if (ret > 0)
282 ret = 0; 295 ret = 0;
296 else if (ret < 0)
297 free_inode_elem_list(eie);
283 return ret; 298 return ret;
284} 299}
285 300
@@ -299,23 +314,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
299 int ret = 0; 314 int ret = 0;
300 int root_level; 315 int root_level;
301 int level = ref->level; 316 int level = ref->level;
317 int index;
302 318
303 root_key.objectid = ref->root_id; 319 root_key.objectid = ref->root_id;
304 root_key.type = BTRFS_ROOT_ITEM_KEY; 320 root_key.type = BTRFS_ROOT_ITEM_KEY;
305 root_key.offset = (u64)-1; 321 root_key.offset = (u64)-1;
322
323 index = srcu_read_lock(&fs_info->subvol_srcu);
324
306 root = btrfs_read_fs_root_no_name(fs_info, &root_key); 325 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
307 if (IS_ERR(root)) { 326 if (IS_ERR(root)) {
327 srcu_read_unlock(&fs_info->subvol_srcu, index);
308 ret = PTR_ERR(root); 328 ret = PTR_ERR(root);
309 goto out; 329 goto out;
310 } 330 }
311 331
312 root_level = btrfs_old_root_level(root, time_seq); 332 root_level = btrfs_old_root_level(root, time_seq);
313 333
314 if (root_level + 1 == level) 334 if (root_level + 1 == level) {
335 srcu_read_unlock(&fs_info->subvol_srcu, index);
315 goto out; 336 goto out;
337 }
316 338
317 path->lowest_level = level; 339 path->lowest_level = level;
318 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); 340 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
341
342 /* root node has been locked, we can release @subvol_srcu safely here */
343 srcu_read_unlock(&fs_info->subvol_srcu, index);
344
319 pr_debug("search slot in root %llu (level %d, ref count %d) returned " 345 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
320 "%d for key (%llu %u %llu)\n", 346 "%d for key (%llu %u %llu)\n",
321 ref->root_id, level, ref->count, ret, 347 ref->root_id, level, ref->count, ret,
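
The srcu_read_lock/srcu_read_unlock bracketing added here keeps the subvolume root from being freed while it is looked up and its node locked. A minimal read-side sketch of that pattern (everything except the SRCU API itself is a placeholder):

#include <linux/srcu.h>

static struct srcu_struct example_srcu;	/* set up with init_srcu_struct() */

extern int do_lookup(void);	/* hypothetical work done under SRCU */

static int example_lookup(void)
{
	int idx, ret;

	idx = srcu_read_lock(&example_srcu);
	ret = do_lookup();	/* the looked-up object cannot vanish here */
	srcu_read_unlock(&example_srcu, idx);

	return ret;
}
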
@@ -334,9 +360,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
334 eb = path->nodes[level]; 360 eb = path->nodes[level];
335 } 361 }
336 362
337 ret = add_all_parents(root, path, parents, level, &ref->key_for_search, 363 ret = add_all_parents(root, path, parents, ref, level, time_seq,
338 time_seq, ref->wanted_disk_byte, 364 extent_item_pos);
339 extent_item_pos);
340out: 365out:
341 path->lowest_level = 0; 366 path->lowest_level = 0;
342 btrfs_release_path(path); 367 btrfs_release_path(path);
@@ -376,10 +401,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
376 continue; 401 continue;
377 err = __resolve_indirect_ref(fs_info, path, time_seq, ref, 402 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
378 parents, extent_item_pos); 403 parents, extent_item_pos);
379 if (err == -ENOMEM) 404 /*
 380 goto out; 405 * we can only tolerate ENOENT; otherwise we should catch the error
381 if (err) 406 * and return directly.
407 */
408 if (err == -ENOENT) {
382 continue; 409 continue;
410 } else if (err) {
411 ret = err;
412 goto out;
413 }
383 414
384 /* we put the first parent into the ref at hand */ 415 /* we put the first parent into the ref at hand */
385 ULIST_ITER_INIT(&uiter); 416 ULIST_ITER_INIT(&uiter);
@@ -538,14 +569,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
538 if (extent_op && extent_op->update_key) 569 if (extent_op && extent_op->update_key)
539 btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 570 btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
540 571
541 while ((n = rb_prev(n))) { 572 spin_lock(&head->lock);
573 n = rb_first(&head->ref_root);
574 while (n) {
542 struct btrfs_delayed_ref_node *node; 575 struct btrfs_delayed_ref_node *node;
543 node = rb_entry(n, struct btrfs_delayed_ref_node, 576 node = rb_entry(n, struct btrfs_delayed_ref_node,
544 rb_node); 577 rb_node);
545 if (node->bytenr != head->node.bytenr) 578 n = rb_next(n);
546 break;
547 WARN_ON(node->is_head);
548
549 if (node->seq > seq) 579 if (node->seq > seq)
550 continue; 580 continue;
551 581
@@ -612,10 +642,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
612 WARN_ON(1); 642 WARN_ON(1);
613 } 643 }
614 if (ret) 644 if (ret)
615 return ret; 645 break;
616 } 646 }
617 647 spin_unlock(&head->lock);
618 return 0; 648 return ret;
619} 649}
620 650
621/* 651/*
@@ -828,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
828 struct list_head prefs_delayed; 858 struct list_head prefs_delayed;
829 struct list_head prefs; 859 struct list_head prefs;
830 struct __prelim_ref *ref; 860 struct __prelim_ref *ref;
861 struct extent_inode_elem *eie = NULL;
831 862
832 INIT_LIST_HEAD(&prefs); 863 INIT_LIST_HEAD(&prefs);
833 INIT_LIST_HEAD(&prefs_delayed); 864 INIT_LIST_HEAD(&prefs_delayed);
@@ -882,15 +913,15 @@ again:
882 btrfs_put_delayed_ref(&head->node); 913 btrfs_put_delayed_ref(&head->node);
883 goto again; 914 goto again;
884 } 915 }
916 spin_unlock(&delayed_refs->lock);
885 ret = __add_delayed_refs(head, time_seq, 917 ret = __add_delayed_refs(head, time_seq,
886 &prefs_delayed); 918 &prefs_delayed);
887 mutex_unlock(&head->mutex); 919 mutex_unlock(&head->mutex);
888 if (ret) { 920 if (ret)
889 spin_unlock(&delayed_refs->lock);
890 goto out; 921 goto out;
891 } 922 } else {
923 spin_unlock(&delayed_refs->lock);
892 } 924 }
893 spin_unlock(&delayed_refs->lock);
894 } 925 }
895 926
896 if (path->slots[0]) { 927 if (path->slots[0]) {
@@ -941,7 +972,6 @@ again:
941 goto out; 972 goto out;
942 } 973 }
943 if (ref->count && ref->parent) { 974 if (ref->count && ref->parent) {
944 struct extent_inode_elem *eie = NULL;
945 if (extent_item_pos && !ref->inode_list) { 975 if (extent_item_pos && !ref->inode_list) {
946 u32 bsz; 976 u32 bsz;
947 struct extent_buffer *eb; 977 struct extent_buffer *eb;
@@ -976,6 +1006,7 @@ again:
976 eie = eie->next; 1006 eie = eie->next;
977 eie->next = ref->inode_list; 1007 eie->next = ref->inode_list;
978 } 1008 }
1009 eie = NULL;
979 } 1010 }
980 list_del(&ref->list); 1011 list_del(&ref->list);
981 kmem_cache_free(btrfs_prelim_ref_cache, ref); 1012 kmem_cache_free(btrfs_prelim_ref_cache, ref);
@@ -994,7 +1025,8 @@ out:
994 list_del(&ref->list); 1025 list_del(&ref->list);
995 kmem_cache_free(btrfs_prelim_ref_cache, ref); 1026 kmem_cache_free(btrfs_prelim_ref_cache, ref);
996 } 1027 }
997 1028 if (ret < 0)
1029 free_inode_elem_list(eie);
998 return ret; 1030 return ret;
999} 1031}
1000 1032
@@ -1002,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks)
1002{ 1034{
1003 struct ulist_node *node = NULL; 1035 struct ulist_node *node = NULL;
1004 struct extent_inode_elem *eie; 1036 struct extent_inode_elem *eie;
1005 struct extent_inode_elem *eie_next;
1006 struct ulist_iterator uiter; 1037 struct ulist_iterator uiter;
1007 1038
1008 ULIST_ITER_INIT(&uiter); 1039 ULIST_ITER_INIT(&uiter);
@@ -1010,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks)
1010 if (!node->aux) 1041 if (!node->aux)
1011 continue; 1042 continue;
1012 eie = (struct extent_inode_elem *)(uintptr_t)node->aux; 1043 eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
1013 for (; eie; eie = eie_next) { 1044 free_inode_elem_list(eie);
1014 eie_next = eie->next;
1015 kfree(eie);
1016 }
1017 node->aux = 0; 1045 node->aux = 0;
1018 } 1046 }
1019 1047
@@ -1101,44 +1129,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1101 if (!node) 1129 if (!node)
1102 break; 1130 break;
1103 bytenr = node->val; 1131 bytenr = node->val;
1132 cond_resched();
1104 } 1133 }
1105 1134
1106 ulist_free(tmp); 1135 ulist_free(tmp);
1107 return 0; 1136 return 0;
1108} 1137}
1109 1138
1110
1111static int __inode_info(u64 inum, u64 ioff, u8 key_type,
1112 struct btrfs_root *fs_root, struct btrfs_path *path,
1113 struct btrfs_key *found_key)
1114{
1115 int ret;
1116 struct btrfs_key key;
1117 struct extent_buffer *eb;
1118
1119 key.type = key_type;
1120 key.objectid = inum;
1121 key.offset = ioff;
1122
1123 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1124 if (ret < 0)
1125 return ret;
1126
1127 eb = path->nodes[0];
1128 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1129 ret = btrfs_next_leaf(fs_root, path);
1130 if (ret)
1131 return ret;
1132 eb = path->nodes[0];
1133 }
1134
1135 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1136 if (found_key->type != key.type || found_key->objectid != key.objectid)
1137 return 1;
1138
1139 return 0;
1140}
1141
1142/* 1139/*
1143 * this makes the path point to (inum INODE_ITEM ioff) 1140 * this makes the path point to (inum INODE_ITEM ioff)
1144 */ 1141 */
@@ -1146,16 +1143,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1146 struct btrfs_path *path) 1143 struct btrfs_path *path)
1147{ 1144{
1148 struct btrfs_key key; 1145 struct btrfs_key key;
1149 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, 1146 return btrfs_find_item(fs_root, path, inum, ioff,
1150 &key); 1147 BTRFS_INODE_ITEM_KEY, &key);
1151} 1148}
1152 1149
1153static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, 1150static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1154 struct btrfs_path *path, 1151 struct btrfs_path *path,
1155 struct btrfs_key *found_key) 1152 struct btrfs_key *found_key)
1156{ 1153{
1157 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, 1154 return btrfs_find_item(fs_root, path, inum, ioff,
1158 found_key); 1155 BTRFS_INODE_REF_KEY, found_key);
1159} 1156}
1160 1157
1161int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, 1158int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
@@ -1335,20 +1332,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1335 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 1332 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
1336 if (ret < 0) 1333 if (ret < 0)
1337 return ret; 1334 return ret;
1338 ret = btrfs_previous_item(fs_info->extent_root, path,
1339 0, BTRFS_EXTENT_ITEM_KEY);
1340 if (ret < 0)
1341 return ret;
1342 1335
1343 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 1336 while (1) {
1337 u32 nritems;
1338 if (path->slots[0] == 0) {
1339 btrfs_set_path_blocking(path);
1340 ret = btrfs_prev_leaf(fs_info->extent_root, path);
1341 if (ret != 0) {
1342 if (ret > 0) {
1343 pr_debug("logical %llu is not within "
1344 "any extent\n", logical);
1345 ret = -ENOENT;
1346 }
1347 return ret;
1348 }
1349 } else {
1350 path->slots[0]--;
1351 }
1352 nritems = btrfs_header_nritems(path->nodes[0]);
1353 if (nritems == 0) {
1354 pr_debug("logical %llu is not within any extent\n",
1355 logical);
1356 return -ENOENT;
1357 }
1358 if (path->slots[0] == nritems)
1359 path->slots[0]--;
1360
1361 btrfs_item_key_to_cpu(path->nodes[0], found_key,
1362 path->slots[0]);
1363 if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
1364 found_key->type == BTRFS_METADATA_ITEM_KEY)
1365 break;
1366 }
1367
1344 if (found_key->type == BTRFS_METADATA_ITEM_KEY) 1368 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1345 size = fs_info->extent_root->leafsize; 1369 size = fs_info->extent_root->leafsize;
1346 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) 1370 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
1347 size = found_key->offset; 1371 size = found_key->offset;
1348 1372
1349 if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && 1373 if (found_key->objectid > logical ||
1350 found_key->type != BTRFS_METADATA_ITEM_KEY) ||
1351 found_key->objectid > logical ||
1352 found_key->objectid + size <= logical) { 1374 found_key->objectid + size <= logical) {
1353 pr_debug("logical %llu is not within any extent\n", logical); 1375 pr_debug("logical %llu is not within any extent\n", logical);
1354 return -ENOENT; 1376 return -ENOENT;
@@ -1601,7 +1623,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1601 struct btrfs_key found_key; 1623 struct btrfs_key found_key;
1602 1624
1603 while (!ret) { 1625 while (!ret) {
1604 path->leave_spinning = 1;
1605 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1626 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
1606 &found_key); 1627 &found_key);
1607 if (ret < 0) 1628 if (ret < 0)
@@ -1614,9 +1635,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1614 1635
1615 parent = found_key.offset; 1636 parent = found_key.offset;
1616 slot = path->slots[0]; 1637 slot = path->slots[0];
1617 eb = path->nodes[0]; 1638 eb = btrfs_clone_extent_buffer(path->nodes[0]);
1618 /* make sure we can use eb after releasing the path */ 1639 if (!eb) {
1619 atomic_inc(&eb->refs); 1640 ret = -ENOMEM;
1641 break;
1642 }
1643 extent_buffer_get(eb);
1620 btrfs_tree_read_lock(eb); 1644 btrfs_tree_read_lock(eb);
1621 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1645 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1622 btrfs_release_path(path); 1646 btrfs_release_path(path);
@@ -1674,17 +1698,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
1674 ++found; 1698 ++found;
1675 1699
1676 slot = path->slots[0]; 1700 slot = path->slots[0];
1677 eb = path->nodes[0]; 1701 eb = btrfs_clone_extent_buffer(path->nodes[0]);
1678 /* make sure we can use eb after releasing the path */ 1702 if (!eb) {
1679 atomic_inc(&eb->refs); 1703 ret = -ENOMEM;
1704 break;
1705 }
1706 extent_buffer_get(eb);
1680 1707
1681 btrfs_tree_read_lock(eb); 1708 btrfs_tree_read_lock(eb);
1682 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1709 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1683 btrfs_release_path(path); 1710 btrfs_release_path(path);
1684 1711
1685 leaf = path->nodes[0]; 1712 leaf = path->nodes[0];
1686 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1713 item_size = btrfs_item_size_nr(leaf, slot);
1687 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1714 ptr = btrfs_item_ptr_offset(leaf, slot);
1688 cur_offset = 0; 1715 cur_offset = 0;
1689 1716
1690 while (cur_offset < item_size) { 1717 while (cur_offset < item_size) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ac0b39db27d1..8fed2125689e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -43,6 +43,7 @@
43#define BTRFS_INODE_COPY_EVERYTHING 8 43#define BTRFS_INODE_COPY_EVERYTHING 8
44#define BTRFS_INODE_IN_DELALLOC_LIST 9 44#define BTRFS_INODE_IN_DELALLOC_LIST 9
45#define BTRFS_INODE_READDIO_NEED_LOCK 10 45#define BTRFS_INODE_READDIO_NEED_LOCK 10
46#define BTRFS_INODE_HAS_PROPS 11
46 47
47/* in memory btrfs inode */ 48/* in memory btrfs inode */
48struct btrfs_inode { 49struct btrfs_inode {
@@ -135,6 +136,9 @@ struct btrfs_inode {
135 */ 136 */
136 u64 index_cnt; 137 u64 index_cnt;
137 138
 139 /* Cache the directory index number to speed up dir/file removal */
140 u64 dir_index;
141
138 /* the fsync log has some corner cases that mean we have to check 142 /* the fsync log has some corner cases that mean we have to check
139 * directories to see if any unlinks have been done before 143 * directories to see if any unlinks have been done before
140 * the directory was logged. See tree-log.c for all the 144 * the directory was logged. See tree-log.c for all the
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 131d82800b3a..0e8388e72d8d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -92,11 +92,11 @@
92#include <linux/slab.h> 92#include <linux/slab.h>
93#include <linux/buffer_head.h> 93#include <linux/buffer_head.h>
94#include <linux/mutex.h> 94#include <linux/mutex.h>
95#include <linux/crc32c.h>
96#include <linux/genhd.h> 95#include <linux/genhd.h>
97#include <linux/blkdev.h> 96#include <linux/blkdev.h>
98#include "ctree.h" 97#include "ctree.h"
99#include "disk-io.h" 98#include "disk-io.h"
99#include "hash.h"
100#include "transaction.h" 100#include "transaction.h"
101#include "extent_io.h" 101#include "extent_io.h"
102#include "volumes.h" 102#include "volumes.h"
@@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data(
1456 btrfsic_read_from_block_data(block_ctx, &file_extent_item, 1456 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1457 file_extent_item_offset, 1457 file_extent_item_offset,
1458 sizeof(struct btrfs_file_extent_item)); 1458 sizeof(struct btrfs_file_extent_item));
1459 next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + 1459 next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
1460 btrfs_stack_file_extent_offset(&file_extent_item); 1460 if (btrfs_stack_file_extent_compression(&file_extent_item) ==
1461 generation = btrfs_stack_file_extent_generation(&file_extent_item); 1461 BTRFS_COMPRESS_NONE) {
1462 num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); 1462 next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
1463 num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
1464 } else {
1465 num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
1466 }
1463 generation = btrfs_stack_file_extent_generation(&file_extent_item); 1467 generation = btrfs_stack_file_extent_generation(&file_extent_item);
1464 1468
1465 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1469 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
@@ -1695,7 +1699,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1695 return -1; 1699 return -1;
1696 } 1700 }
1697 bio->bi_bdev = block_ctx->dev->bdev; 1701 bio->bi_bdev = block_ctx->dev->bdev;
1698 bio->bi_sector = dev_bytenr >> 9; 1702 bio->bi_iter.bi_sector = dev_bytenr >> 9;
1699 1703
1700 for (j = i; j < num_pages; j++) { 1704 for (j = i; j < num_pages; j++) {
1701 ret = bio_add_page(bio, block_ctx->pagev[j], 1705 ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1819,7 +1823,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1819 size_t sublen = i ? PAGE_CACHE_SIZE : 1823 size_t sublen = i ? PAGE_CACHE_SIZE :
1820 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); 1824 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
1821 1825
1822 crc = crc32c(crc, data, sublen); 1826 crc = btrfs_crc32c(crc, data, sublen);
1823 } 1827 }
1824 btrfs_csum_final(crc, csum); 1828 btrfs_csum_final(crc, csum);
1825 if (memcmp(csum, h->csum, state->csum_size)) 1829 if (memcmp(csum, h->csum, state->csum_size))
@@ -3013,7 +3017,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
3013 int bio_is_patched; 3017 int bio_is_patched;
3014 char **mapped_datav; 3018 char **mapped_datav;
3015 3019
3016 dev_bytenr = 512 * bio->bi_sector; 3020 dev_bytenr = 512 * bio->bi_iter.bi_sector;
3017 bio_is_patched = 0; 3021 bio_is_patched = 0;
3018 if (dev_state->state->print_mask & 3022 if (dev_state->state->print_mask &
3019 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3023 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
@@ -3021,8 +3025,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
3021 "submit_bio(rw=0x%x, bi_vcnt=%u," 3025 "submit_bio(rw=0x%x, bi_vcnt=%u,"
3022 " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", 3026 " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
3023 rw, bio->bi_vcnt, 3027 rw, bio->bi_vcnt,
3024 (unsigned long long)bio->bi_sector, dev_bytenr, 3028 (unsigned long long)bio->bi_iter.bi_sector,
3025 bio->bi_bdev); 3029 dev_bytenr, bio->bi_bdev);
3026 3030
3027 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, 3031 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
3028 GFP_NOFS); 3032 GFP_NOFS);
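
check-integrity.c now calls btrfs_crc32c() from the new hash.h instead of crc32c() directly, matching the Kconfig move from LIBCRC32C to CRYPTO_CRC32C above. The wrapper's body is not part of this diff, so the following is only a plausible sketch of a crc32c helper built on the crypto shash API:

#include <crypto/hash.h>
#include <linux/err.h>

static struct crypto_shash *tfm;	/* crc32c transform, allocated once */

int example_hash_init(void)
{
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	return PTR_ERR_OR_ZERO(tfm);
}

u32 example_crc32c(u32 crc, const void *address, unsigned int length)
{
	struct {
		struct shash_desc shash;
		char ctx[4];	/* crc32c's state is just the 32-bit crc */
	} desc;

	desc.shash.tfm = tfm;
	desc.shash.flags = 0;
	*(u32 *)desc.ctx = crc;	/* seed with the running crc */

	if (crypto_shash_update(&desc.shash, address, length))
		return crc;	/* crc32c update cannot realistically fail */

	return *(u32 *)desc.ctx;
}
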
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1499b27b4186..b01fb6c527e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode,
128 kunmap_atomic(kaddr); 128 kunmap_atomic(kaddr);
129 129
130 if (csum != *cb_sum) { 130 if (csum != *cb_sum) {
131 printk(KERN_INFO "btrfs csum failed ino %llu " 131 btrfs_info(BTRFS_I(inode)->root->fs_info,
132 "extent %llu csum %u " 132 "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
133 "wanted %u mirror %d\n", 133 btrfs_ino(inode), disk_start, csum, *cb_sum,
134 btrfs_ino(inode), disk_start, csum, *cb_sum, 134 cb->mirror_num);
135 cb->mirror_num);
136 ret = -EIO; 135 ret = -EIO;
137 goto fail; 136 goto fail;
138 } 137 }
@@ -172,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err)
172 goto out; 171 goto out;
173 172
174 inode = cb->inode; 173 inode = cb->inode;
175 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); 174 ret = check_compressed_csum(inode, cb,
175 (u64)bio->bi_iter.bi_sector << 9);
176 if (ret) 176 if (ret)
177 goto csum_failed; 177 goto csum_failed;
178 178
@@ -201,18 +201,16 @@ csum_failed:
201 if (cb->errors) { 201 if (cb->errors) {
202 bio_io_error(cb->orig_bio); 202 bio_io_error(cb->orig_bio);
203 } else { 203 } else {
204 int bio_index = 0; 204 int i;
205 struct bio_vec *bvec = cb->orig_bio->bi_io_vec; 205 struct bio_vec *bvec;
206 206
207 /* 207 /*
208 * we have verified the checksum already, set page 208 * we have verified the checksum already, set page
209 * checked so the end_io handlers know about it 209 * checked so the end_io handlers know about it
210 */ 210 */
211 while (bio_index < cb->orig_bio->bi_vcnt) { 211 bio_for_each_segment_all(bvec, cb->orig_bio, i)
212 SetPageChecked(bvec->bv_page); 212 SetPageChecked(bvec->bv_page);
213 bvec++; 213
214 bio_index++;
215 }
216 bio_endio(cb->orig_bio, 0); 214 bio_endio(cb->orig_bio, 0);
217 } 215 }
218 216
@@ -372,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
372 for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { 370 for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
373 page = compressed_pages[pg_index]; 371 page = compressed_pages[pg_index];
374 page->mapping = inode->i_mapping; 372 page->mapping = inode->i_mapping;
375 if (bio->bi_size) 373 if (bio->bi_iter.bi_size)
376 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, 374 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
377 PAGE_CACHE_SIZE, 375 PAGE_CACHE_SIZE,
378 bio, 0); 376 bio, 0);
@@ -412,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 410 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 } 411 }
414 if (bytes_left < PAGE_CACHE_SIZE) { 412 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n", 413 btrfs_info(BTRFS_I(inode)->root->fs_info,
414 "bytes left %lu compress len %lu nr %lu",
416 bytes_left, cb->compressed_len, cb->nr_pages); 415 bytes_left, cb->compressed_len, cb->nr_pages);
417 } 416 }
418 bytes_left -= PAGE_CACHE_SIZE; 417 bytes_left -= PAGE_CACHE_SIZE;
@@ -506,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
506 505
507 if (!em || last_offset < em->start || 506 if (!em || last_offset < em->start ||
508 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || 507 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
509 (em->block_start >> 9) != cb->orig_bio->bi_sector) { 508 (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
510 free_extent_map(em); 509 free_extent_map(em);
511 unlock_extent(tree, last_offset, end); 510 unlock_extent(tree, last_offset, end);
512 unlock_page(page); 511 unlock_page(page);
@@ -552,7 +551,7 @@ next:
552 * in it. We don't actually do IO on those pages but allocate new ones 551 * in it. We don't actually do IO on those pages but allocate new ones
553 * to hold the compressed pages on disk. 552 * to hold the compressed pages on disk.
554 * 553 *
555 * bio->bi_sector points to the compressed extent on disk 554 * bio->bi_iter.bi_sector points to the compressed extent on disk
556 * bio->bi_io_vec points to all of the inode pages 555 * bio->bi_io_vec points to all of the inode pages
557 * bio->bi_vcnt is a count of pages 556 * bio->bi_vcnt is a count of pages
558 * 557 *
@@ -573,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
573 struct page *page; 572 struct page *page;
574 struct block_device *bdev; 573 struct block_device *bdev;
575 struct bio *comp_bio; 574 struct bio *comp_bio;
576 u64 cur_disk_byte = (u64)bio->bi_sector << 9; 575 u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
577 u64 em_len; 576 u64 em_len;
578 u64 em_start; 577 u64 em_start;
579 struct extent_map *em; 578 struct extent_map *em;
@@ -659,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
659 page->mapping = inode->i_mapping; 658 page->mapping = inode->i_mapping;
660 page->index = em_start >> PAGE_CACHE_SHIFT; 659 page->index = em_start >> PAGE_CACHE_SHIFT;
661 660
662 if (comp_bio->bi_size) 661 if (comp_bio->bi_iter.bi_size)
663 ret = tree->ops->merge_bio_hook(READ, page, 0, 662 ret = tree->ops->merge_bio_hook(READ, page, 0,
664 PAGE_CACHE_SIZE, 663 PAGE_CACHE_SIZE,
665 comp_bio, 0); 664 comp_bio, 0);
@@ -687,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 comp_bio, sums); 686 comp_bio, sums);
688 BUG_ON(ret); /* -ENOMEM */ 687 BUG_ON(ret); /* -ENOMEM */
689 } 688 }
690 sums += (comp_bio->bi_size + root->sectorsize - 1) / 689 sums += (comp_bio->bi_iter.bi_size +
691 root->sectorsize; 690 root->sectorsize - 1) / root->sectorsize;
692 691
693 ret = btrfs_map_bio(root, READ, comp_bio, 692 ret = btrfs_map_bio(root, READ, comp_bio,
694 mirror_num, 0); 693 mirror_num, 0);
@@ -1011,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
1011 bytes = min(bytes, working_bytes); 1010 bytes = min(bytes, working_bytes);
1012 kaddr = kmap_atomic(page_out); 1011 kaddr = kmap_atomic(page_out);
1013 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); 1012 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
1013 if (*pg_index == (vcnt - 1) && *pg_offset == 0)
1014 memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
1014 kunmap_atomic(kaddr); 1015 kunmap_atomic(kaddr);
1015 flush_dcache_page(page_out); 1016 flush_dcache_page(page_out);
1016 1017
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 316136bd6dd7..cbd3a7d6fa68 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41 int level, int slot); 41 int level, int slot);
42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43 struct extent_buffer *eb); 43 struct extent_buffer *eb);
44static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
45 44
46struct btrfs_path *btrfs_alloc_path(void) 45struct btrfs_path *btrfs_alloc_path(void)
47{ 46{
@@ -475,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
475 * the index is the shifted logical of the *new* root node for root replace 474 * the index is the shifted logical of the *new* root node for root replace
476 * operations, or the shifted logical of the affected block for all other 475 * operations, or the shifted logical of the affected block for all other
477 * operations. 476 * operations.
477 *
478 * Note: must be called with write lock (tree_mod_log_write_lock).
478 */ 479 */
479static noinline int 480static noinline int
480__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) 481__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -483,24 +484,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
483 struct rb_node **new; 484 struct rb_node **new;
484 struct rb_node *parent = NULL; 485 struct rb_node *parent = NULL;
485 struct tree_mod_elem *cur; 486 struct tree_mod_elem *cur;
486 int ret = 0;
487 487
488 BUG_ON(!tm); 488 BUG_ON(!tm);
489 489
490 tree_mod_log_write_lock(fs_info);
491 if (list_empty(&fs_info->tree_mod_seq_list)) {
492 tree_mod_log_write_unlock(fs_info);
493 /*
494 * Ok we no longer care about logging modifications, free up tm
495 * and return 0. Any callers shouldn't be using tm after
496 * calling tree_mod_log_insert, but if they do we can just
497 * change this to return a special error code to let the callers
498 * do their own thing.
499 */
500 kfree(tm);
501 return 0;
502 }
503
504 spin_lock(&fs_info->tree_mod_seq_lock); 490 spin_lock(&fs_info->tree_mod_seq_lock);
505 tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); 491 tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
506 spin_unlock(&fs_info->tree_mod_seq_lock); 492 spin_unlock(&fs_info->tree_mod_seq_lock);
@@ -518,18 +504,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
518 new = &((*new)->rb_left); 504 new = &((*new)->rb_left);
519 else if (cur->seq > tm->seq) 505 else if (cur->seq > tm->seq)
520 new = &((*new)->rb_right); 506 new = &((*new)->rb_right);
521 else { 507 else
522 ret = -EEXIST; 508 return -EEXIST;
523 kfree(tm);
524 goto out;
525 }
526 } 509 }
527 510
528 rb_link_node(&tm->node, parent, new); 511 rb_link_node(&tm->node, parent, new);
529 rb_insert_color(&tm->node, tm_root); 512 rb_insert_color(&tm->node, tm_root);
530out: 513 return 0;
531 tree_mod_log_write_unlock(fs_info);
532 return ret;
533} 514}
534 515
535/* 516/*
@@ -545,19 +526,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
545 return 1; 526 return 1;
546 if (eb && btrfs_header_level(eb) == 0) 527 if (eb && btrfs_header_level(eb) == 0)
547 return 1; 528 return 1;
529
530 tree_mod_log_write_lock(fs_info);
531 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
532 tree_mod_log_write_unlock(fs_info);
533 return 1;
534 }
535
548 return 0; 536 return 0;
549} 537}
550 538
551static inline int 539/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
552__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, 540static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
553 struct extent_buffer *eb, int slot, 541 struct extent_buffer *eb)
554 enum mod_log_op op, gfp_t flags) 542{
543 smp_mb();
544 if (list_empty(&(fs_info)->tree_mod_seq_list))
545 return 0;
546 if (eb && btrfs_header_level(eb) == 0)
547 return 0;
548
549 return 1;
550}
551
552static struct tree_mod_elem *
553alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
554 enum mod_log_op op, gfp_t flags)
555{ 555{
556 struct tree_mod_elem *tm; 556 struct tree_mod_elem *tm;
557 557
558 tm = kzalloc(sizeof(*tm), flags); 558 tm = kzalloc(sizeof(*tm), flags);
559 if (!tm) 559 if (!tm)
560 return -ENOMEM; 560 return NULL;
561 561
562 tm->index = eb->start >> PAGE_CACHE_SHIFT; 562 tm->index = eb->start >> PAGE_CACHE_SHIFT;
563 if (op != MOD_LOG_KEY_ADD) { 563 if (op != MOD_LOG_KEY_ADD) {
@@ -567,8 +567,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
567 tm->op = op; 567 tm->op = op;
568 tm->slot = slot; 568 tm->slot = slot;
569 tm->generation = btrfs_node_ptr_generation(eb, slot); 569 tm->generation = btrfs_node_ptr_generation(eb, slot);
570 RB_CLEAR_NODE(&tm->node);
570 571
571 return __tree_mod_log_insert(fs_info, tm); 572 return tm;
572} 573}
573 574
574static noinline int 575static noinline int
@@ -576,10 +577,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
576 struct extent_buffer *eb, int slot, 577 struct extent_buffer *eb, int slot,
577 enum mod_log_op op, gfp_t flags) 578 enum mod_log_op op, gfp_t flags)
578{ 579{
579 if (tree_mod_dont_log(fs_info, eb)) 580 struct tree_mod_elem *tm;
581 int ret;
582
583 if (!tree_mod_need_log(fs_info, eb))
580 return 0; 584 return 0;
581 585
582 return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); 586 tm = alloc_tree_mod_elem(eb, slot, op, flags);
587 if (!tm)
588 return -ENOMEM;
589
590 if (tree_mod_dont_log(fs_info, eb)) {
591 kfree(tm);
592 return 0;
593 }
594
595 ret = __tree_mod_log_insert(fs_info, tm);
596 tree_mod_log_write_unlock(fs_info);
597 if (ret)
598 kfree(tm);
599
600 return ret;
583} 601}
584 602
585static noinline int 603static noinline int
@@ -587,53 +605,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
587 struct extent_buffer *eb, int dst_slot, int src_slot, 605 struct extent_buffer *eb, int dst_slot, int src_slot,
588 int nr_items, gfp_t flags) 606 int nr_items, gfp_t flags)
589{ 607{
590 struct tree_mod_elem *tm; 608 struct tree_mod_elem *tm = NULL;
591 int ret; 609 struct tree_mod_elem **tm_list = NULL;
610 int ret = 0;
592 int i; 611 int i;
612 int locked = 0;
593 613
594 if (tree_mod_dont_log(fs_info, eb)) 614 if (!tree_mod_need_log(fs_info, eb))
595 return 0; 615 return 0;
596 616
617 tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
618 if (!tm_list)
619 return -ENOMEM;
620
621 tm = kzalloc(sizeof(*tm), flags);
622 if (!tm) {
623 ret = -ENOMEM;
624 goto free_tms;
625 }
626
627 tm->index = eb->start >> PAGE_CACHE_SHIFT;
628 tm->slot = src_slot;
629 tm->move.dst_slot = dst_slot;
630 tm->move.nr_items = nr_items;
631 tm->op = MOD_LOG_MOVE_KEYS;
632
633 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
634 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
635 MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
636 if (!tm_list[i]) {
637 ret = -ENOMEM;
638 goto free_tms;
639 }
640 }
641
642 if (tree_mod_dont_log(fs_info, eb))
643 goto free_tms;
644 locked = 1;
645
597 /* 646 /*
598 * When we override something during the move, we log these removals. 647 * When we override something during the move, we log these removals.
599 * This can only happen when we move towards the beginning of the 648 * This can only happen when we move towards the beginning of the
600 * buffer, i.e. dst_slot < src_slot. 649 * buffer, i.e. dst_slot < src_slot.
601 */ 650 */
602 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 651 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
603 ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, 652 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
604 MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); 653 if (ret)
605 BUG_ON(ret < 0); 654 goto free_tms;
606 } 655 }
607 656
608 tm = kzalloc(sizeof(*tm), flags); 657 ret = __tree_mod_log_insert(fs_info, tm);
609 if (!tm) 658 if (ret)
610 return -ENOMEM; 659 goto free_tms;
660 tree_mod_log_write_unlock(fs_info);
661 kfree(tm_list);
611 662
612 tm->index = eb->start >> PAGE_CACHE_SHIFT; 663 return 0;
613 tm->slot = src_slot; 664free_tms:
614 tm->move.dst_slot = dst_slot; 665 for (i = 0; i < nr_items; i++) {
615 tm->move.nr_items = nr_items; 666 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
616 tm->op = MOD_LOG_MOVE_KEYS; 667 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
668 kfree(tm_list[i]);
669 }
670 if (locked)
671 tree_mod_log_write_unlock(fs_info);
672 kfree(tm_list);
673 kfree(tm);
617 674
618 return __tree_mod_log_insert(fs_info, tm); 675 return ret;
619} 676}
620 677
621static inline void 678static inline int
622__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) 679__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
680 struct tree_mod_elem **tm_list,
681 int nritems)
623{ 682{
624 int i; 683 int i, j;
625 u32 nritems;
626 int ret; 684 int ret;
627 685
628 if (btrfs_header_level(eb) == 0)
629 return;
630
631 nritems = btrfs_header_nritems(eb);
632 for (i = nritems - 1; i >= 0; i--) { 686 for (i = nritems - 1; i >= 0; i--) {
633 ret = __tree_mod_log_insert_key(fs_info, eb, i, 687 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
634 MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); 688 if (ret) {
635 BUG_ON(ret < 0); 689 for (j = nritems - 1; j > i; j--)
690 rb_erase(&tm_list[j]->node,
691 &fs_info->tree_mod_log);
692 return ret;
693 }
636 } 694 }
695
696 return 0;
637} 697}
638 698
639static noinline int 699static noinline int
@@ -642,17 +702,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
642 struct extent_buffer *new_root, gfp_t flags, 702 struct extent_buffer *new_root, gfp_t flags,
643 int log_removal) 703 int log_removal)
644{ 704{
645 struct tree_mod_elem *tm; 705 struct tree_mod_elem *tm = NULL;
706 struct tree_mod_elem **tm_list = NULL;
707 int nritems = 0;
708 int ret = 0;
709 int i;
646 710
647 if (tree_mod_dont_log(fs_info, NULL)) 711 if (!tree_mod_need_log(fs_info, NULL))
648 return 0; 712 return 0;
649 713
650 if (log_removal) 714 if (log_removal && btrfs_header_level(old_root) > 0) {
651 __tree_mod_log_free_eb(fs_info, old_root); 715 nritems = btrfs_header_nritems(old_root);
716 tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
717 flags);
718 if (!tm_list) {
719 ret = -ENOMEM;
720 goto free_tms;
721 }
722 for (i = 0; i < nritems; i++) {
723 tm_list[i] = alloc_tree_mod_elem(old_root, i,
724 MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
725 if (!tm_list[i]) {
726 ret = -ENOMEM;
727 goto free_tms;
728 }
729 }
730 }
652 731
653 tm = kzalloc(sizeof(*tm), flags); 732 tm = kzalloc(sizeof(*tm), flags);
654 if (!tm) 733 if (!tm) {
655 return -ENOMEM; 734 ret = -ENOMEM;
735 goto free_tms;
736 }
656 737
657 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 738 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
658 tm->old_root.logical = old_root->start; 739 tm->old_root.logical = old_root->start;
@@ -660,7 +741,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
660 tm->generation = btrfs_header_generation(old_root); 741 tm->generation = btrfs_header_generation(old_root);
661 tm->op = MOD_LOG_ROOT_REPLACE; 742 tm->op = MOD_LOG_ROOT_REPLACE;
662 743
663 return __tree_mod_log_insert(fs_info, tm); 744 if (tree_mod_dont_log(fs_info, NULL))
745 goto free_tms;
746
747 if (tm_list)
748 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
749 if (!ret)
750 ret = __tree_mod_log_insert(fs_info, tm);
751
752 tree_mod_log_write_unlock(fs_info);
753 if (ret)
754 goto free_tms;
755 kfree(tm_list);
756
757 return ret;
758
759free_tms:
760 if (tm_list) {
761 for (i = 0; i < nritems; i++)
762 kfree(tm_list[i]);
763 kfree(tm_list);
764 }
765 kfree(tm);
766
767 return ret;
664} 768}
665 769
666static struct tree_mod_elem * 770static struct tree_mod_elem *
@@ -729,31 +833,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
729 return __tree_mod_log_search(fs_info, start, min_seq, 0); 833 return __tree_mod_log_search(fs_info, start, min_seq, 0);
730} 834}
731 835
732static noinline void 836static noinline int
733tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 837tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
734 struct extent_buffer *src, unsigned long dst_offset, 838 struct extent_buffer *src, unsigned long dst_offset,
735 unsigned long src_offset, int nr_items) 839 unsigned long src_offset, int nr_items)
736{ 840{
737 int ret; 841 int ret = 0;
842 struct tree_mod_elem **tm_list = NULL;
843 struct tree_mod_elem **tm_list_add, **tm_list_rem;
738 int i; 844 int i;
845 int locked = 0;
739 846
740 if (tree_mod_dont_log(fs_info, NULL)) 847 if (!tree_mod_need_log(fs_info, NULL))
741 return; 848 return 0;
742 849
743 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 850 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
744 return; 851 return 0;
852
853 tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
854 GFP_NOFS);
855 if (!tm_list)
856 return -ENOMEM;
745 857
858 tm_list_add = tm_list;
859 tm_list_rem = tm_list + nr_items;
746 for (i = 0; i < nr_items; i++) { 860 for (i = 0; i < nr_items; i++) {
747 ret = __tree_mod_log_insert_key(fs_info, src, 861 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
748 i + src_offset, 862 MOD_LOG_KEY_REMOVE, GFP_NOFS);
749 MOD_LOG_KEY_REMOVE, GFP_NOFS); 863 if (!tm_list_rem[i]) {
750 BUG_ON(ret < 0); 864 ret = -ENOMEM;
751 ret = __tree_mod_log_insert_key(fs_info, dst, 865 goto free_tms;
752 i + dst_offset, 866 }
753 MOD_LOG_KEY_ADD, 867
754 GFP_NOFS); 868 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
755 BUG_ON(ret < 0); 869 MOD_LOG_KEY_ADD, GFP_NOFS);
870 if (!tm_list_add[i]) {
871 ret = -ENOMEM;
872 goto free_tms;
873 }
756 } 874 }
875
876 if (tree_mod_dont_log(fs_info, NULL))
877 goto free_tms;
878 locked = 1;
879
880 for (i = 0; i < nr_items; i++) {
881 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
882 if (ret)
883 goto free_tms;
884 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
885 if (ret)
886 goto free_tms;
887 }
888
889 tree_mod_log_write_unlock(fs_info);
890 kfree(tm_list);
891
892 return 0;
893
894free_tms:
895 for (i = 0; i < nr_items * 2; i++) {
896 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
897 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
898 kfree(tm_list[i]);
899 }
900 if (locked)
901 tree_mod_log_write_unlock(fs_info);
902 kfree(tm_list);
903
904 return ret;
757} 905}
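
The reworked tree-mod-log helpers above all share one shape: every tree_mod_elem is allocated before the log lock is taken, the whole batch is inserted under the lock, and a failed insertion unwinds whatever was already published. A minimal userspace sketch of that allocate-then-publish pattern, assuming illustrative names (elem, log_insert, log_erase) rather than the kernel API:

#include <stdlib.h>
#include <pthread.h>

struct elem { int key; };

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for __tree_mod_log_insert(); always succeeds in this model */
static int log_insert(struct elem *e) { (void)e; return 0; }
/* stand-in for rb_erase() of an already published element */
static void log_erase(struct elem *e) { (void)e; }

static int log_batch(int nr)
{
	struct elem **list;
	int i, j, ret = -1;

	list = calloc(nr, sizeof(*list));
	if (!list)
		return -1;

	/* phase 1: allocate with no lock held, so allocation stalls
	 * never lengthen the critical section */
	for (i = 0; i < nr; i++) {
		list[i] = malloc(sizeof(*list[i]));
		if (!list[i])
			goto free_all;
		list[i]->key = i;
	}

	/* phase 2: publish under the lock; on failure erase what was
	 * already inserted so the log never holds a partial batch */
	pthread_mutex_lock(&log_lock);
	for (i = 0; i < nr; i++) {
		ret = log_insert(list[i]);
		if (ret) {
			for (j = 0; j < i; j++)
				log_erase(list[j]);
			pthread_mutex_unlock(&log_lock);
			goto free_all;
		}
	}
	pthread_mutex_unlock(&log_lock);
	free(list);		/* elements now live in the log */
	return 0;

free_all:
	for (i = 0; i < nr; i++)
		free(list[i]);	/* free(NULL) is a no-op */
	free(list);
	return ret;
}

tree_mod_log_eb_copy() above follows exactly this flow: build tm_list_add/tm_list_rem first, take the lock via tree_mod_dont_log(), and roll back with rb_erase() on failure.
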
758 906
759static inline void 907static inline void
@@ -772,18 +920,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
772{ 920{
773 int ret; 921 int ret;
774 922
775 ret = __tree_mod_log_insert_key(fs_info, eb, slot, 923 ret = tree_mod_log_insert_key(fs_info, eb, slot,
776 MOD_LOG_KEY_REPLACE, 924 MOD_LOG_KEY_REPLACE,
777 atomic ? GFP_ATOMIC : GFP_NOFS); 925 atomic ? GFP_ATOMIC : GFP_NOFS);
778 BUG_ON(ret < 0); 926 BUG_ON(ret < 0);
779} 927}
780 928
781static noinline void 929static noinline int
782tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) 930tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
783{ 931{
932 struct tree_mod_elem **tm_list = NULL;
933 int nritems = 0;
934 int i;
935 int ret = 0;
936
937 if (btrfs_header_level(eb) == 0)
938 return 0;
939
940 if (!tree_mod_need_log(fs_info, NULL))
941 return 0;
942
943 nritems = btrfs_header_nritems(eb);
944 tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
945 GFP_NOFS);
946 if (!tm_list)
947 return -ENOMEM;
948
949 for (i = 0; i < nritems; i++) {
950 tm_list[i] = alloc_tree_mod_elem(eb, i,
951 MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
952 if (!tm_list[i]) {
953 ret = -ENOMEM;
954 goto free_tms;
955 }
956 }
957
784 if (tree_mod_dont_log(fs_info, eb)) 958 if (tree_mod_dont_log(fs_info, eb))
785 return; 959 goto free_tms;
786 __tree_mod_log_free_eb(fs_info, eb); 960
961 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
962 tree_mod_log_write_unlock(fs_info);
963 if (ret)
964 goto free_tms;
965 kfree(tm_list);
966
967 return 0;
968
969free_tms:
970 for (i = 0; i < nritems; i++)
971 kfree(tm_list[i]);
972 kfree(tm_list);
973
974 return ret;
787} 975}
788 976
789static noinline void 977static noinline void
@@ -1041,8 +1229,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1041 btrfs_set_node_ptr_generation(parent, parent_slot, 1229 btrfs_set_node_ptr_generation(parent, parent_slot,
1042 trans->transid); 1230 trans->transid);
1043 btrfs_mark_buffer_dirty(parent); 1231 btrfs_mark_buffer_dirty(parent);
1044 if (last_ref) 1232 if (last_ref) {
1045 tree_mod_log_free_eb(root->fs_info, buf); 1233 ret = tree_mod_log_free_eb(root->fs_info, buf);
1234 if (ret) {
1235 btrfs_abort_transaction(trans, root, ret);
1236 return ret;
1237 }
1238 }
1046 btrfs_free_tree_block(trans, root, buf, parent_start, 1239 btrfs_free_tree_block(trans, root, buf, parent_start,
1047 last_ref); 1240 last_ref);
1048 } 1241 }
@@ -1287,8 +1480,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1287 old = read_tree_block(root, logical, blocksize, 0); 1480 old = read_tree_block(root, logical, blocksize, 0);
1288 if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1481 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1289 free_extent_buffer(old); 1482 free_extent_buffer(old);
1290 pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", 1483 btrfs_warn(root->fs_info,
1291 logical); 1484 "failed to read tree block %llu from get_old_root", logical);
1292 } else { 1485 } else {
1293 eb = btrfs_clone_extent_buffer(old); 1486 eb = btrfs_clone_extent_buffer(old);
1294 free_extent_buffer(old); 1487 free_extent_buffer(old);
@@ -2462,6 +2655,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2462 return 0; 2655 return 0;
2463} 2656}
2464 2657
2658int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2659 u64 iobjectid, u64 ioff, u8 key_type,
2660 struct btrfs_key *found_key)
2661{
2662 int ret;
2663 struct btrfs_key key;
2664 struct extent_buffer *eb;
2665 struct btrfs_path *path;
2666
2667 key.type = key_type;
2668 key.objectid = iobjectid;
2669 key.offset = ioff;
2670
2671 if (found_path == NULL) {
2672 path = btrfs_alloc_path();
2673 if (!path)
2674 return -ENOMEM;
2675 } else
2676 path = found_path;
2677
2678 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2679 if ((ret < 0) || (found_key == NULL)) {
2680 if (path != found_path)
2681 btrfs_free_path(path);
2682 return ret;
2683 }
2684
2685 eb = path->nodes[0];
2686 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2687 ret = btrfs_next_leaf(fs_root, path);
2688 if (ret)
2689 return ret;
2690 eb = path->nodes[0];
2691 }
2692
2693 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2694 if (found_key->type != key.type ||
2695 found_key->objectid != key.objectid)
2696 return 1;
2697
2698 return 0;
2699}
2700
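
btrfs_find_item() above packages a common three-step lookup: search for the key, step to the next leaf if the slot lands one past the end, then verify that the key found actually matches. A small userspace model of that control flow, using sorted arrays as stand-in leaves (none of this is kernel API):

#include <stdio.h>

struct key { unsigned long long objectid; unsigned char type; };

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	return 0;
}

/* returns 0 and fills *found on an exact match, 1 if not found */
static int find_item(const struct key leaves[][4], int nleaves,
		     const struct key *target, struct key *found)
{
	for (int l = 0; l < nleaves; l++) {
		for (int slot = 0; slot < 4; slot++) {
			if (key_cmp(&leaves[l][slot], target) < 0)
				continue;
			/* first key >= target: verify it matches,
			 * as btrfs_find_item() does after the search */
			*found = leaves[l][slot];
			return key_cmp(found, target) ? 1 : 0;
		}
		/* ran off the end of this leaf: continue in the next
		 * one, the btrfs_next_leaf() step */
	}
	return 1;
}

int main(void)
{
	const struct key leaves[2][4] = {
		{ {1, 1}, {1, 84}, {2, 1}, {2, 84} },
		{ {3, 1}, {3, 84}, {4, 1}, {4, 84} },
	};
	struct key want = { 3, 1 }, got;

	if (!find_item(leaves, 2, &want, &got))
		printf("found %llu type %u\n", got.objectid,
		       (unsigned)got.type);
	return 0;
}
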
2465/* 2701/*
2466 * look for key in the tree. path is filled in with nodes along the way 2702 * look for key in the tree. path is filled in with nodes along the way
2467 * if key is found, we return zero and you can find the item in the leaf 2703 * if key is found, we return zero and you can find the item in the leaf
@@ -2495,6 +2731,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2495 lowest_level = p->lowest_level; 2731 lowest_level = p->lowest_level;
2496 WARN_ON(lowest_level && ins_len > 0); 2732 WARN_ON(lowest_level && ins_len > 0);
2497 WARN_ON(p->nodes[0] != NULL); 2733 WARN_ON(p->nodes[0] != NULL);
2734 BUG_ON(!cow && ins_len);
2498 2735
2499 if (ins_len < 0) { 2736 if (ins_len < 0) {
2500 lowest_unlock = 2; 2737 lowest_unlock = 2;
@@ -2603,8 +2840,6 @@ again:
2603 } 2840 }
2604 } 2841 }
2605cow_done: 2842cow_done:
2606 BUG_ON(!cow && ins_len);
2607
2608 p->nodes[level] = b; 2843 p->nodes[level] = b;
2609 btrfs_clear_path_blocking(p, NULL, 0); 2844 btrfs_clear_path_blocking(p, NULL, 0);
2610 2845
@@ -2614,13 +2849,19 @@ cow_done:
2614 * It is safe to drop the lock on our parent before we 2849 * It is safe to drop the lock on our parent before we
2615 * go through the expensive btree search on b. 2850 * go through the expensive btree search on b.
2616 * 2851 *
2617 * If cow is true, then we might be changing slot zero, 2852 * If we're inserting or deleting (ins_len != 0), then we might
2618 * which may require changing the parent. So, we can't 2853 * be changing slot zero, which may require changing the parent.
2619 * drop the lock until after we know which slot we're 2854 * So, we can't drop the lock until after we know which slot
2620 * operating on. 2855 * we're operating on.
2621 */ 2856 */
2622 if (!cow) 2857 if (!ins_len && !p->keep_locks) {
2623 btrfs_unlock_up_safe(p, level + 1); 2858 int u = level + 1;
2859
2860 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2861 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2862 p->locks[u] = 0;
2863 }
2864 }
2624 2865
2625 ret = key_search(b, key, level, &prev_cmp, &slot); 2866 ret = key_search(b, key, level, &prev_cmp, &slot);
2626 2867
@@ -2648,7 +2889,7 @@ cow_done:
2648 * which means we must have a write lock 2889 * which means we must have a write lock
2649 * on the parent 2890 * on the parent
2650 */ 2891 */
2651 if (slot == 0 && cow && 2892 if (slot == 0 && ins_len &&
2652 write_lock_level < level + 1) { 2893 write_lock_level < level + 1) {
2653 write_lock_level = level + 1; 2894 write_lock_level = level + 1;
2654 btrfs_release_path(p); 2895 btrfs_release_path(p);
@@ -2901,7 +3142,9 @@ again:
2901 if (ret < 0) 3142 if (ret < 0)
2902 return ret; 3143 return ret;
2903 if (!ret) { 3144 if (!ret) {
2904 p->slots[0] = btrfs_header_nritems(leaf) - 1; 3145 leaf = p->nodes[0];
3146 if (p->slots[0] == btrfs_header_nritems(leaf))
3147 p->slots[0]--;
2905 return 0; 3148 return 0;
2906 } 3149 }
2907 if (!return_any) 3150 if (!return_any)
@@ -3022,8 +3265,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
3022 } else 3265 } else
3023 push_items = min(src_nritems - 8, push_items); 3266 push_items = min(src_nritems - 8, push_items);
3024 3267
3025 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 3268 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3026 push_items); 3269 push_items);
3270 if (ret) {
3271 btrfs_abort_transaction(trans, root, ret);
3272 return ret;
3273 }
3027 copy_extent_buffer(dst, src, 3274 copy_extent_buffer(dst, src,
3028 btrfs_node_key_ptr_offset(dst_nritems), 3275 btrfs_node_key_ptr_offset(dst_nritems),
3029 btrfs_node_key_ptr_offset(0), 3276 btrfs_node_key_ptr_offset(0),
@@ -3093,8 +3340,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
3093 (dst_nritems) * 3340 (dst_nritems) *
3094 sizeof(struct btrfs_key_ptr)); 3341 sizeof(struct btrfs_key_ptr));
3095 3342
3096 tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 3343 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3097 src_nritems - push_items, push_items); 3344 src_nritems - push_items, push_items);
3345 if (ret) {
3346 btrfs_abort_transaction(trans, root, ret);
3347 return ret;
3348 }
3098 copy_extent_buffer(dst, src, 3349 copy_extent_buffer(dst, src,
3099 btrfs_node_key_ptr_offset(0), 3350 btrfs_node_key_ptr_offset(0),
3100 btrfs_node_key_ptr_offset(src_nritems - push_items), 3351 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3295,7 +3546,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3295 btrfs_header_chunk_tree_uuid(split), 3546 btrfs_header_chunk_tree_uuid(split),
3296 BTRFS_UUID_SIZE); 3547 BTRFS_UUID_SIZE);
3297 3548
3298 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); 3549 ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3550 mid, c_nritems - mid);
3551 if (ret) {
3552 btrfs_abort_transaction(trans, root, ret);
3553 return ret;
3554 }
3299 copy_extent_buffer(split, c, 3555 copy_extent_buffer(split, c,
3300 btrfs_node_key_ptr_offset(0), 3556 btrfs_node_key_ptr_offset(0),
3301 btrfs_node_key_ptr_offset(mid), 3557 btrfs_node_key_ptr_offset(mid),
@@ -3362,8 +3618,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3362 int ret; 3618 int ret;
3363 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); 3619 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3364 if (ret < 0) { 3620 if (ret < 0) {
3365 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " 3621 btrfs_crit(root->fs_info,
3366 "used %d nritems %d\n", 3622 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3367 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), 3623 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3368 leaf_space_used(leaf, 0, nritems), nritems); 3624 leaf_space_used(leaf, 0, nritems), nritems);
3369 } 3625 }
@@ -3571,6 +3827,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3571 if (left_nritems == 0) 3827 if (left_nritems == 0)
3572 goto out_unlock; 3828 goto out_unlock;
3573 3829
3830 if (path->slots[0] == left_nritems && !empty) {
3831 /* Key greater than all keys in the leaf, right neighbor has
3832 * enough room for it and we're not emptying our leaf to delete
3833 * it, therefore use right neighbor to insert the new item and
 3834 * no need to touch/dirty our left leaf. */
3835 btrfs_tree_unlock(left);
3836 free_extent_buffer(left);
3837 path->nodes[0] = right;
3838 path->slots[0] = 0;
3839 path->slots[1]++;
3840 return 0;
3841 }
3842
3574 return __push_leaf_right(trans, root, path, min_data_size, empty, 3843 return __push_leaf_right(trans, root, path, min_data_size, empty,
3575 right, free_space, left_nritems, min_slot); 3844 right, free_space, left_nritems, min_slot);
3576out_unlock: 3845out_unlock:
@@ -3887,14 +4156,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3887 int progress = 0; 4156 int progress = 0;
3888 int slot; 4157 int slot;
3889 u32 nritems; 4158 u32 nritems;
4159 int space_needed = data_size;
3890 4160
3891 slot = path->slots[0]; 4161 slot = path->slots[0];
4162 if (slot < btrfs_header_nritems(path->nodes[0]))
4163 space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
3892 4164
3893 /* 4165 /*
3894 * try to push all the items after our slot into the 4166 * try to push all the items after our slot into the
3895 * right leaf 4167 * right leaf
3896 */ 4168 */
3897 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); 4169 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3898 if (ret < 0) 4170 if (ret < 0)
3899 return ret; 4171 return ret;
3900 4172
@@ -3914,7 +4186,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3914 4186
3915 /* try to push all the items before our slot into the next leaf */ 4187 /* try to push all the items before our slot into the next leaf */
3916 slot = path->slots[0]; 4188 slot = path->slots[0];
3917 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); 4189 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3918 if (ret < 0) 4190 if (ret < 0)
3919 return ret; 4191 return ret;
3920 4192
@@ -3958,13 +4230,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
3958 4230
3959 /* first try to make some room by pushing left and right */ 4231 /* first try to make some room by pushing left and right */
3960 if (data_size && path->nodes[1]) { 4232 if (data_size && path->nodes[1]) {
3961 wret = push_leaf_right(trans, root, path, data_size, 4233 int space_needed = data_size;
3962 data_size, 0, 0); 4234
4235 if (slot < btrfs_header_nritems(l))
4236 space_needed -= btrfs_leaf_free_space(root, l);
4237
4238 wret = push_leaf_right(trans, root, path, space_needed,
4239 space_needed, 0, 0);
3963 if (wret < 0) 4240 if (wret < 0)
3964 return wret; 4241 return wret;
3965 if (wret) { 4242 if (wret) {
3966 wret = push_leaf_left(trans, root, path, data_size, 4243 wret = push_leaf_left(trans, root, path, space_needed,
3967 data_size, 0, (u32)-1); 4244 space_needed, 0, (u32)-1);
3968 if (wret < 0) 4245 if (wret < 0)
3969 return wret; 4246 return wret;
3970 } 4247 }
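
Both hunks above apply the same accounting fix: when the new item will land in the existing leaf (slot < nritems), the leaf's current free space already counts toward the requirement, so only the shortfall has to be pushed to a neighbor. A tiny sketch of that arithmetic:

/* sketch: compute how much space must actually be pushed away before
 * an insertion of data_size bytes can succeed */
static int space_needed(int data_size, int leaf_free_space,
			int slot, int nritems)
{
	int need = data_size;

	/* inserting before the end: the item lands in this leaf, so
	 * existing free space offsets the requirement */
	if (slot < nritems)
		need -= leaf_free_space;
	return need;
}
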
@@ -4432,7 +4709,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4432 BUG_ON(slot < 0); 4709 BUG_ON(slot < 0);
4433 if (slot >= nritems) { 4710 if (slot >= nritems) {
4434 btrfs_print_leaf(root, leaf); 4711 btrfs_print_leaf(root, leaf);
4435 printk(KERN_CRIT "slot %d too large, nritems %d\n", 4712 btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4436 slot, nritems); 4713 slot, nritems);
4437 BUG_ON(1); 4714 BUG_ON(1);
4438 } 4715 }
@@ -4495,7 +4772,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4495 4772
4496 if (btrfs_leaf_free_space(root, leaf) < total_size) { 4773 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4497 btrfs_print_leaf(root, leaf); 4774 btrfs_print_leaf(root, leaf);
4498 printk(KERN_CRIT "not enough freespace need %u have %d\n", 4775 btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4499 total_size, btrfs_leaf_free_space(root, leaf)); 4776 total_size, btrfs_leaf_free_space(root, leaf));
4500 BUG(); 4777 BUG();
4501 } 4778 }
@@ -4505,7 +4782,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4505 4782
4506 if (old_data < data_end) { 4783 if (old_data < data_end) {
4507 btrfs_print_leaf(root, leaf); 4784 btrfs_print_leaf(root, leaf);
4508 printk(KERN_CRIT "slot %d old_data %d data_end %d\n", 4785 btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4509 slot, old_data, data_end); 4786 slot, old_data, data_end);
4510 BUG_ON(1); 4787 BUG_ON(1);
4511 } 4788 }
@@ -4817,7 +5094,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4817 * This may release the path, and so you may lose any locks held at the 5094 * This may release the path, and so you may lose any locks held at the
4818 * time you call it. 5095 * time you call it.
4819 */ 5096 */
4820static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 5097int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4821{ 5098{
4822 struct btrfs_key key; 5099 struct btrfs_key key;
4823 struct btrfs_disk_key found_key; 5100 struct btrfs_disk_key found_key;
@@ -5240,7 +5517,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5240 5517
5241 if (!left_start_ctransid || !right_start_ctransid) { 5518 if (!left_start_ctransid || !right_start_ctransid) {
5242 WARN(1, KERN_WARNING 5519 WARN(1, KERN_WARNING
5243 "btrfs: btrfs_compare_tree detected " 5520 "BTRFS: btrfs_compare_tree detected "
5244 "a change in one of the trees while " 5521 "a change in one of the trees while "
5245 "iterating. This is probably a " 5522 "iterating. This is probably a "
5246 "bug.\n"); 5523 "bug.\n");
@@ -5680,3 +5957,46 @@ int btrfs_previous_item(struct btrfs_root *root,
5680 } 5957 }
5681 return 1; 5958 return 1;
5682} 5959}
5960
5961/*
5963 * search in the extent tree to find a previous Metadata/Data extent item
5964 * with min objectid.
5964 *
5965 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5966 */
5967int btrfs_previous_extent_item(struct btrfs_root *root,
5968 struct btrfs_path *path, u64 min_objectid)
5969{
5970 struct btrfs_key found_key;
5971 struct extent_buffer *leaf;
5972 u32 nritems;
5973 int ret;
5974
5975 while (1) {
5976 if (path->slots[0] == 0) {
5977 btrfs_set_path_blocking(path);
5978 ret = btrfs_prev_leaf(root, path);
5979 if (ret != 0)
5980 return ret;
5981 } else {
5982 path->slots[0]--;
5983 }
5984 leaf = path->nodes[0];
5985 nritems = btrfs_header_nritems(leaf);
5986 if (nritems == 0)
5987 return 1;
5988 if (path->slots[0] == nritems)
5989 path->slots[0]--;
5990
5991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5992 if (found_key.objectid < min_objectid)
5993 break;
5994 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5995 found_key.type == BTRFS_METADATA_ITEM_KEY)
5996 return 0;
5997 if (found_key.objectid == min_objectid &&
5998 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5999 break;
6000 }
6001 return 1;
6002}
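
The loop above reduces to a simple backwards walk: move to the previous slot, fall back to the previous leaf when the current one is exhausted, and stop as soon as the objectid drops below the minimum. A compact userspace sketch of the same walk over fixed three-item leaves (the key-type constants are illustrative):

#define EXTENT_ITEM	168	/* illustrative stand-ins for the key types */
#define METADATA_ITEM	169

struct ekey { unsigned long long objectid; unsigned char type; };

/* walk backwards through fixed 3-item leaves; returns 0 when a
 * Metadata/Data extent item is found, 1 when the walk must stop */
static int previous_extent_item(const struct ekey leaves[][3], int leaf,
				int slot, unsigned long long min_objectid,
				struct ekey *out)
{
	while (1) {
		if (slot == 0) {
			if (--leaf < 0)		/* btrfs_prev_leaf() failed */
				return 1;
			slot = 3;		/* one past the last slot */
		}
		slot--;
		*out = leaves[leaf][slot];
		if (out->objectid < min_objectid)
			return 1;
		if (out->type == EXTENT_ITEM || out->type == METADATA_ITEM)
			return 0;
		if (out->objectid == min_objectid && out->type < EXTENT_ITEM)
			return 1;
	}
}
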
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54ab86127f7a..2c1a42ca519f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -521,9 +521,15 @@ struct btrfs_super_block {
521#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 521#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
522#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) 522#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
523#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) 523#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
524#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
524 525
525#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 526#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
527#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
528#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
526#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 529#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
530#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
531#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
532
527#define BTRFS_FEATURE_INCOMPAT_SUPP \ 533#define BTRFS_FEATURE_INCOMPAT_SUPP \
528 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 534 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
529 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 535 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -532,7 +538,12 @@ struct btrfs_super_block {
532 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 538 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
533 BTRFS_FEATURE_INCOMPAT_RAID56 | \ 539 BTRFS_FEATURE_INCOMPAT_RAID56 | \
534 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ 540 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
535 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 541 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
542 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
543
544#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
545 (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
546#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
536 547
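
The new SAFE_SET/SAFE_CLEAR masks presumably tell the sysfs feature handlers which bits may be flipped while the filesystem is mounted; here only EXTENDED_IREF is safe to set online. A hypothetical check of that form (an illustration under that assumption, not the sysfs.c code):

/* a flag may be toggled online only if every bit of it is both
 * supported and listed in the corresponding safe mask */
static int feature_change_allowed(unsigned long long flag,
				  unsigned long long supported,
				  unsigned long long safe_mask)
{
	if (flag & ~supported)
		return 0;	/* unknown or unsupported feature bit */
	return (flag & ~safe_mask) == 0;
}
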
537/* 548/*
538 * A leaf is full of items. offset and size tell us where to find 549 * A leaf is full of items. offset and size tell us where to find
@@ -1094,7 +1105,7 @@ struct btrfs_qgroup_limit_item {
1094} __attribute__ ((__packed__)); 1105} __attribute__ ((__packed__));
1095 1106
1096struct btrfs_space_info { 1107struct btrfs_space_info {
1097 u64 flags; 1108 spinlock_t lock;
1098 1109
1099 u64 total_bytes; /* total bytes in the space, 1110 u64 total_bytes; /* total bytes in the space,
1100 this doesn't take mirrors into account */ 1111 this doesn't take mirrors into account */
@@ -1104,14 +1115,25 @@ struct btrfs_space_info {
1104 transaction finishes */ 1115 transaction finishes */
1105 u64 bytes_reserved; /* total bytes the allocator has reserved for 1116 u64 bytes_reserved; /* total bytes the allocator has reserved for
1106 current allocations */ 1117 current allocations */
1107 u64 bytes_readonly; /* total bytes that are read only */
1108
1109 u64 bytes_may_use; /* number of bytes that may be used for 1118 u64 bytes_may_use; /* number of bytes that may be used for
1110 delalloc/allocations */ 1119 delalloc/allocations */
1120 u64 bytes_readonly; /* total bytes that are read only */
1121
1122 unsigned int full:1; /* indicates that we cannot allocate any more
1123 chunks for this space */
1124 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
1125
1126 unsigned int flush:1; /* set if we are trying to make space */
1127
1128 unsigned int force_alloc; /* set if we need to force a chunk
1129 alloc for this space */
1130
1111 u64 disk_used; /* total bytes used on disk */ 1131 u64 disk_used; /* total bytes used on disk */
1112 u64 disk_total; /* total bytes on disk, takes mirrors into 1132 u64 disk_total; /* total bytes on disk, takes mirrors into
1113 account */ 1133 account */
1114 1134
1135 u64 flags;
1136
1115 /* 1137 /*
1116 * bytes_pinned is kept in line with what is actually pinned, as in 1138 * bytes_pinned is kept in line with what is actually pinned, as in
1117 * we've called update_block_group and dropped the bytes_used counter 1139 * we've called update_block_group and dropped the bytes_used counter
@@ -1124,22 +1146,15 @@ struct btrfs_space_info {
1124 */ 1146 */
1125 struct percpu_counter total_bytes_pinned; 1147 struct percpu_counter total_bytes_pinned;
1126 1148
1127 unsigned int full:1; /* indicates that we cannot allocate any more
1128 chunks for this space */
1129 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
1130
1131 unsigned int flush:1; /* set if we are trying to make space */
1132
1133 unsigned int force_alloc; /* set if we need to force a chunk
1134 alloc for this space */
1135
1136 struct list_head list; 1149 struct list_head list;
1137 1150
1151 struct rw_semaphore groups_sem;
1138 /* for block groups in our same type */ 1152 /* for block groups in our same type */
1139 struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 1153 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
1140 spinlock_t lock;
1141 struct rw_semaphore groups_sem;
1142 wait_queue_head_t wait; 1154 wait_queue_head_t wait;
1155
1156 struct kobject kobj;
1157 struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
1143}; 1158};
1144 1159
1145#define BTRFS_BLOCK_RSV_GLOBAL 1 1160#define BTRFS_BLOCK_RSV_GLOBAL 1
@@ -1346,6 +1361,7 @@ struct btrfs_fs_info {
1346 1361
1347 u64 generation; 1362 u64 generation;
1348 u64 last_trans_committed; 1363 u64 last_trans_committed;
1364 u64 avg_delayed_ref_runtime;
1349 1365
1350 /* 1366 /*
1351 * this is updated to the current trans every time a full commit 1367 * this is updated to the current trans every time a full commit
@@ -1448,7 +1464,6 @@ struct btrfs_fs_info {
1448 spinlock_t tree_mod_seq_lock; 1464 spinlock_t tree_mod_seq_lock;
1449 atomic64_t tree_mod_seq; 1465 atomic64_t tree_mod_seq;
1450 struct list_head tree_mod_seq_list; 1466 struct list_head tree_mod_seq_list;
1451 struct seq_list tree_mod_seq_elem;
1452 1467
1453 /* this protects tree_mod_log */ 1468 /* this protects tree_mod_log */
1454 rwlock_t tree_mod_log_lock; 1469 rwlock_t tree_mod_log_lock;
@@ -1515,6 +1530,8 @@ struct btrfs_fs_info {
1515 int thread_pool_size; 1530 int thread_pool_size;
1516 1531
1517 struct kobject super_kobj; 1532 struct kobject super_kobj;
1533 struct kobject *space_info_kobj;
1534 struct kobject *device_dir_kobj;
1518 struct completion kobj_unregister; 1535 struct completion kobj_unregister;
1519 int do_barriers; 1536 int do_barriers;
1520 int closing; 1537 int closing;
@@ -1643,6 +1660,10 @@ struct btrfs_fs_info {
1643 spinlock_t reada_lock; 1660 spinlock_t reada_lock;
1644 struct radix_tree_root reada_tree; 1661 struct radix_tree_root reada_tree;
1645 1662
1663 /* Extent buffer radix tree */
1664 spinlock_t buffer_lock;
1665 struct radix_tree_root buffer_radix;
1666
1646 /* next backup root to be overwritten */ 1667 /* next backup root to be overwritten */
1647 int backup_root_index; 1668 int backup_root_index;
1648 1669
@@ -1795,6 +1816,12 @@ struct btrfs_root {
1795 struct list_head ordered_extents; 1816 struct list_head ordered_extents;
1796 struct list_head ordered_root; 1817 struct list_head ordered_root;
1797 u64 nr_ordered_extents; 1818 u64 nr_ordered_extents;
1819
1820 /*
1821 * Number of currently running SEND ioctls to prevent
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */
1824 int send_in_progress;
1798}; 1825};
1799 1826
1800struct btrfs_ioctl_defrag_range_args { 1827struct btrfs_ioctl_defrag_range_args {
@@ -1997,6 +2024,7 @@ struct btrfs_ioctl_defrag_range_args {
1997#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2024#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1998#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) 2025#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
1999#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) 2026#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
2027#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2000 2028
2001#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2029#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2002 2030
@@ -2925,6 +2953,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
2925 struct btrfs_file_extent_item, generation, 64); 2953 struct btrfs_file_extent_item, generation, 64);
2926BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, 2954BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
2927 struct btrfs_file_extent_item, num_bytes, 64); 2955 struct btrfs_file_extent_item, num_bytes, 64);
2956BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
2957 struct btrfs_file_extent_item, disk_num_bytes, 64);
2958BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
2959 struct btrfs_file_extent_item, compression, 8);
2928 2960
2929static inline unsigned long 2961static inline unsigned long
2930btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) 2962btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -2958,15 +2990,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
2958BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, 2990BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
2959 other_encoding, 16); 2991 other_encoding, 16);
2960 2992
2961/* this returns the number of file bytes represented by the inline item.
2962 * If an item is compressed, this is the uncompressed size
2963 */
2964static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
2965 struct btrfs_file_extent_item *e)
2966{
2967 return btrfs_file_extent_ram_bytes(eb, e);
2968}
2969
2970/* 2993/*
2971 * this returns the number of bytes used by the item on disk, minus the 2994 * this returns the number of bytes used by the item on disk, minus the
2972 * size of any extent headers. If a file is compressed on disk, this is 2995 * size of any extent headers. If a file is compressed on disk, this is
@@ -2980,6 +3003,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2980 return btrfs_item_size(eb, e) - offset; 3003 return btrfs_item_size(eb, e) - offset;
2981} 3004}
2982 3005
3006/* this returns the number of file bytes represented by the inline item.
3007 * If an item is compressed, this is the uncompressed size
3008 */
3009static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
3010 int slot,
3011 struct btrfs_file_extent_item *fi)
3012{
3013 struct btrfs_map_token token;
3014
3015 btrfs_init_map_token(&token);
3016 /*
3017 * return the space used on disk if this item isn't
3018 * compressed or encoded
3019 */
3020 if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
3021 btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
3022 btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
3023 return btrfs_file_extent_inline_item_len(eb,
3024 btrfs_item_nr(slot));
3025 }
3026
3027 /* otherwise use the ram bytes field */
3028 return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
3029}
3030
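
The new btrfs_file_extent_inline_len() encodes a single decision: for a plain inline extent the length is the item payload minus the extent header, while a compressed or otherwise encoded extent must report ram_bytes, the uncompressed size. The same decision as a self-contained sketch, with a simplified stand-in for the on-disk item:

struct inline_extent {
	unsigned char compression;	/* 0 = uncompressed */
	unsigned char encryption;	/* 0 = unencrypted */
	unsigned short other_encoding;	/* 0 = no other encoding */
	unsigned int item_len;		/* total item payload size */
	unsigned int header_len;	/* bytes before the inline data */
	unsigned int ram_bytes;		/* uncompressed byte count */
};

static unsigned int inline_len(const struct inline_extent *e)
{
	/* plain data: the length is what is physically stored */
	if (e->compression == 0 && e->encryption == 0 &&
	    e->other_encoding == 0)
		return e->item_len - e->header_len;
	/* compressed/encoded: report the uncompressed size */
	return e->ram_bytes;
}
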
3031
2983/* btrfs_dev_stats_item */ 3032/* btrfs_dev_stats_item */
2984static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, 3033static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
2985 struct btrfs_dev_stats_item *ptr, 3034 struct btrfs_dev_stats_item *ptr,
@@ -3143,6 +3192,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3143 3192
3144int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 3193int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3145 struct btrfs_root *root); 3194 struct btrfs_root *root);
3195int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
3196 struct btrfs_root *root);
3146void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3197void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3147int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3198int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3148 struct btrfs_root *root, unsigned long count); 3199 struct btrfs_root *root, unsigned long count);
@@ -3163,6 +3214,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3163 struct btrfs_fs_info *info, 3214 struct btrfs_fs_info *info,
3164 u64 bytenr); 3215 u64 bytenr);
3165void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3216void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3217int get_block_group_index(struct btrfs_block_group_cache *cache);
3166struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3218struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3167 struct btrfs_root *root, u32 blocksize, 3219 struct btrfs_root *root, u32 blocksize,
3168 u64 parent, u64 root_objectid, 3220 u64 parent, u64 root_objectid,
@@ -3301,6 +3353,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
3301int btrfs_previous_item(struct btrfs_root *root, 3353int btrfs_previous_item(struct btrfs_root *root,
3302 struct btrfs_path *path, u64 min_objectid, 3354 struct btrfs_path *path, u64 min_objectid,
3303 int type); 3355 int type);
3356int btrfs_previous_extent_item(struct btrfs_root *root,
3357 struct btrfs_path *path, u64 min_objectid);
3304void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, 3358void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3305 struct btrfs_key *new_key); 3359 struct btrfs_key *new_key);
3306struct extent_buffer *btrfs_root_node(struct btrfs_root *root); 3360struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -3350,6 +3404,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3350 struct btrfs_root *root, 3404 struct btrfs_root *root,
3351 struct btrfs_path *path, 3405 struct btrfs_path *path,
3352 struct btrfs_key *new_key); 3406 struct btrfs_key *new_key);
3407int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
3408 u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
3353int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 3409int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
3354 *root, struct btrfs_key *key, struct btrfs_path *p, int 3410 *root, struct btrfs_key *key, struct btrfs_path *p, int
3355 ins_len, int cow); 3411 ins_len, int cow);
@@ -3399,6 +3455,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3399} 3455}
3400 3456
3401int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3457int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3458int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
3402int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3459int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3403 u64 time_seq); 3460 u64 time_seq);
3404static inline int btrfs_next_old_item(struct btrfs_root *root, 3461static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3563,12 +3620,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3563 struct btrfs_root *root, 3620 struct btrfs_root *root,
3564 const char *name, int name_len, 3621 const char *name, int name_len,
3565 u64 inode_objectid, u64 ref_objectid, u64 *index); 3622 u64 inode_objectid, u64 ref_objectid, u64 *index);
3566int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3567 struct btrfs_root *root,
3568 struct btrfs_path *path,
3569 const char *name, int name_len,
3570 u64 inode_objectid, u64 ref_objectid, int mod,
3571 u64 *ret_index);
3572int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3623int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3573 struct btrfs_root *root, 3624 struct btrfs_root *root,
3574 struct btrfs_path *path, u64 objectid); 3625 struct btrfs_path *path, u64 objectid);
@@ -3676,7 +3727,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
3676int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3677 struct extent_state **cached_state); 3728 struct extent_state **cached_state);
3678int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
3679 struct btrfs_root *new_root, u64 new_dirid); 3730 struct btrfs_root *new_root,
3731 struct btrfs_root *parent_root,
3732 u64 new_dirid);
3680int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, 3733int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
3681 size_t size, struct bio *bio, 3734 size_t size, struct bio *bio,
3682 unsigned long bio_flags); 3735 unsigned long bio_flags);
@@ -3745,7 +3798,10 @@ extern const struct file_operations btrfs_file_operations;
3745int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 3798int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3746 struct btrfs_root *root, struct inode *inode, 3799 struct btrfs_root *root, struct inode *inode,
3747 struct btrfs_path *path, u64 start, u64 end, 3800 struct btrfs_path *path, u64 start, u64 end,
3748 u64 *drop_end, int drop_cache); 3801 u64 *drop_end, int drop_cache,
3802 int replace_extent,
3803 u32 extent_item_size,
3804 int *key_inserted);
3749int btrfs_drop_extents(struct btrfs_trans_handle *trans, 3805int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3750 struct btrfs_root *root, struct inode *inode, u64 start, 3806 struct btrfs_root *root, struct inode *inode, u64 start,
3751 u64 end, int drop_cache); 3807 u64 end, int drop_cache);
@@ -3764,6 +3820,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
3764/* sysfs.c */ 3820/* sysfs.c */
3765int btrfs_init_sysfs(void); 3821int btrfs_init_sysfs(void);
3766void btrfs_exit_sysfs(void); 3822void btrfs_exit_sysfs(void);
3823int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
3824void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
3767 3825
3768/* xattr.c */ 3826/* xattr.c */
3769ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 3827ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -3796,14 +3854,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
3796 btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) 3854 btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
3797#define btrfs_info(fs_info, fmt, args...) \ 3855#define btrfs_info(fs_info, fmt, args...) \
3798 btrfs_printk(fs_info, KERN_INFO fmt, ##args) 3856 btrfs_printk(fs_info, KERN_INFO fmt, ##args)
3857
3858#ifdef DEBUG
3799#define btrfs_debug(fs_info, fmt, args...) \ 3859#define btrfs_debug(fs_info, fmt, args...) \
3800 btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) 3860 btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
3861#else
3862#define btrfs_debug(fs_info, fmt, args...) \
3863 no_printk(KERN_DEBUG fmt, ##args)
3864#endif
3801 3865
3802#ifdef CONFIG_BTRFS_ASSERT 3866#ifdef CONFIG_BTRFS_ASSERT
3803 3867
3804static inline void assfail(char *expr, char *file, int line) 3868static inline void assfail(char *expr, char *file, int line)
3805{ 3869{
3806 printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", 3870 pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
3807 expr, file, line); 3871 expr, file, line);
3808 BUG(); 3872 BUG();
3809} 3873}
@@ -3841,7 +3905,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3841 if (!(features & flag)) { 3905 if (!(features & flag)) {
3842 features |= flag; 3906 features |= flag;
3843 btrfs_set_super_incompat_flags(disk_super, features); 3907 btrfs_set_super_incompat_flags(disk_super, features);
3844 printk(KERN_INFO "btrfs: setting %llu feature flag\n", 3908 btrfs_info(fs_info, "setting %llu feature flag",
3845 flag); 3909 flag);
3846 } 3910 }
3847 spin_unlock(&fs_info->super_lock); 3911 spin_unlock(&fs_info->super_lock);
@@ -3899,20 +3963,17 @@ do { \
3899/* acl.c */ 3963/* acl.c */
3900#ifdef CONFIG_BTRFS_FS_POSIX_ACL 3964#ifdef CONFIG_BTRFS_FS_POSIX_ACL
3901struct posix_acl *btrfs_get_acl(struct inode *inode, int type); 3965struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
3966int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
3902int btrfs_init_acl(struct btrfs_trans_handle *trans, 3967int btrfs_init_acl(struct btrfs_trans_handle *trans,
3903 struct inode *inode, struct inode *dir); 3968 struct inode *inode, struct inode *dir);
3904int btrfs_acl_chmod(struct inode *inode);
3905#else 3969#else
3906#define btrfs_get_acl NULL 3970#define btrfs_get_acl NULL
3971#define btrfs_set_acl NULL
3907static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, 3972static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
3908 struct inode *inode, struct inode *dir) 3973 struct inode *inode, struct inode *dir)
3909{ 3974{
3910 return 0; 3975 return 0;
3911} 3976}
3912static inline int btrfs_acl_chmod(struct inode *inode)
3913{
3914 return 0;
3915}
3916#endif 3977#endif
3917 3978
3918/* relocation.c */ 3979/* relocation.c */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 8d292fbae659..451b00c86f6c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node(
55 delayed_node->inode_id = inode_id; 55 delayed_node->inode_id = inode_id;
56 atomic_set(&delayed_node->refs, 0); 56 atomic_set(&delayed_node->refs, 0);
57 delayed_node->count = 0; 57 delayed_node->count = 0;
58 delayed_node->in_list = 0; 58 delayed_node->flags = 0;
59 delayed_node->inode_dirty = 0;
60 delayed_node->ins_root = RB_ROOT; 59 delayed_node->ins_root = RB_ROOT;
61 delayed_node->del_root = RB_ROOT; 60 delayed_node->del_root = RB_ROOT;
62 mutex_init(&delayed_node->mutex); 61 mutex_init(&delayed_node->mutex);
@@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
172 int mod) 171 int mod)
173{ 172{
174 spin_lock(&root->lock); 173 spin_lock(&root->lock);
175 if (node->in_list) { 174 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
176 if (!list_empty(&node->p_list)) 175 if (!list_empty(&node->p_list))
177 list_move_tail(&node->p_list, &root->prepare_list); 176 list_move_tail(&node->p_list, &root->prepare_list);
178 else if (mod) 177 else if (mod)
@@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
182 list_add_tail(&node->p_list, &root->prepare_list); 181 list_add_tail(&node->p_list, &root->prepare_list);
183 atomic_inc(&node->refs); /* inserted into list */ 182 atomic_inc(&node->refs); /* inserted into list */
184 root->nodes++; 183 root->nodes++;
185 node->in_list = 1; 184 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
186 } 185 }
187 spin_unlock(&root->lock); 186 spin_unlock(&root->lock);
188} 187}
@@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
192 struct btrfs_delayed_node *node) 191 struct btrfs_delayed_node *node)
193{ 192{
194 spin_lock(&root->lock); 193 spin_lock(&root->lock);
195 if (node->in_list) { 194 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
196 root->nodes--; 195 root->nodes--;
197 atomic_dec(&node->refs); /* not in the list */ 196 atomic_dec(&node->refs); /* not in the list */
198 list_del_init(&node->n_list); 197 list_del_init(&node->n_list);
199 if (!list_empty(&node->p_list)) 198 if (!list_empty(&node->p_list))
200 list_del_init(&node->p_list); 199 list_del_init(&node->p_list);
201 node->in_list = 0; 200 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
202 } 201 }
203 spin_unlock(&root->lock); 202 spin_unlock(&root->lock);
204} 203}
@@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
231 230
232 delayed_root = node->root->fs_info->delayed_root; 231 delayed_root = node->root->fs_info->delayed_root;
233 spin_lock(&delayed_root->lock); 232 spin_lock(&delayed_root->lock);
234 if (!node->in_list) { /* not in the list */ 233 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
234 /* not in the list */
235 if (list_empty(&delayed_root->node_list)) 235 if (list_empty(&delayed_root->node_list))
236 goto out; 236 goto out;
237 p = delayed_root->node_list.next; 237 p = delayed_root->node_list.next;
@@ -1004,9 +1004,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1004{ 1004{
1005 struct btrfs_delayed_root *delayed_root; 1005 struct btrfs_delayed_root *delayed_root;
1006 1006
1007 if (delayed_node && delayed_node->inode_dirty) { 1007 if (delayed_node &&
1008 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1008 BUG_ON(!delayed_node->root); 1009 BUG_ON(!delayed_node->root);
1009 delayed_node->inode_dirty = 0; 1010 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1010 delayed_node->count--; 1011 delayed_node->count--;
1011 1012
1012 delayed_root = delayed_node->root->fs_info->delayed_root; 1013 delayed_root = delayed_node->root->fs_info->delayed_root;
@@ -1014,6 +1015,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1014 } 1015 }
1015} 1016}
1016 1017
1018static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
1019{
1020 struct btrfs_delayed_root *delayed_root;
1021
1022 ASSERT(delayed_node->root);
1023 clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1024 delayed_node->count--;
1025
1026 delayed_root = delayed_node->root->fs_info->delayed_root;
1027 finish_one_item(delayed_root);
1028}
1029
1017static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1030static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, 1031 struct btrfs_root *root,
1019 struct btrfs_path *path, 1032 struct btrfs_path *path,
@@ -1022,13 +1035,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1022 struct btrfs_key key; 1035 struct btrfs_key key;
1023 struct btrfs_inode_item *inode_item; 1036 struct btrfs_inode_item *inode_item;
1024 struct extent_buffer *leaf; 1037 struct extent_buffer *leaf;
1038 int mod;
1025 int ret; 1039 int ret;
1026 1040
1027 key.objectid = node->inode_id; 1041 key.objectid = node->inode_id;
1028 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1042 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1029 key.offset = 0; 1043 key.offset = 0;
1030 1044
1031 ret = btrfs_lookup_inode(trans, root, path, &key, 1); 1045 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1046 mod = -1;
1047 else
1048 mod = 1;
1049
1050 ret = btrfs_lookup_inode(trans, root, path, &key, mod);
1032 if (ret > 0) { 1051 if (ret > 0) {
1033 btrfs_release_path(path); 1052 btrfs_release_path(path);
1034 return -ENOENT; 1053 return -ENOENT;
@@ -1036,19 +1055,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1036 return ret; 1055 return ret;
1037 } 1056 }
1038 1057
1039 btrfs_unlock_up_safe(path, 1);
1040 leaf = path->nodes[0]; 1058 leaf = path->nodes[0];
1041 inode_item = btrfs_item_ptr(leaf, path->slots[0], 1059 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1042 struct btrfs_inode_item); 1060 struct btrfs_inode_item);
1043 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, 1061 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1044 sizeof(struct btrfs_inode_item)); 1062 sizeof(struct btrfs_inode_item));
1045 btrfs_mark_buffer_dirty(leaf); 1063 btrfs_mark_buffer_dirty(leaf);
1046 btrfs_release_path(path);
1047 1064
1065 if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1066 goto no_iref;
1067
1068 path->slots[0]++;
1069 if (path->slots[0] >= btrfs_header_nritems(leaf))
1070 goto search;
1071again:
1072 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1073 if (key.objectid != node->inode_id)
1074 goto out;
1075
1076 if (key.type != BTRFS_INODE_REF_KEY &&
1077 key.type != BTRFS_INODE_EXTREF_KEY)
1078 goto out;
1079
1080 /*
 1081 * Delayed iref deletion is only done for an inode that has a
 1082 * single link, so there is exactly one iref; the case of several
 1083 * irefs sharing the same item doesn't exist.
1084 */
1085 btrfs_del_item(trans, root, path);
1086out:
1087 btrfs_release_delayed_iref(node);
1088no_iref:
1089 btrfs_release_path(path);
1090err_out:
1048 btrfs_delayed_inode_release_metadata(root, node); 1091 btrfs_delayed_inode_release_metadata(root, node);
1049 btrfs_release_delayed_inode(node); 1092 btrfs_release_delayed_inode(node);
1050 1093
1051 return 0; 1094 return ret;
1095
1096search:
1097 btrfs_release_path(path);
1098
1099 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
1100 key.offset = -1;
1101 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1102 if (ret < 0)
1103 goto err_out;
1104 ASSERT(ret);
1105
1106 ret = 0;
1107 leaf = path->nodes[0];
1108 path->slots[0]--;
1109 goto again;
1052} 1110}
1053 1111
1054static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1112static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1059,7 +1117,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1059 int ret; 1117 int ret;
1060 1118
1061 mutex_lock(&node->mutex); 1119 mutex_lock(&node->mutex);
1062 if (!node->inode_dirty) { 1120 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
1063 mutex_unlock(&node->mutex); 1121 mutex_unlock(&node->mutex);
1064 return 0; 1122 return 0;
1065 } 1123 }
@@ -1203,7 +1261,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
1203 return 0; 1261 return 0;
1204 1262
1205 mutex_lock(&delayed_node->mutex); 1263 mutex_lock(&delayed_node->mutex);
1206 if (!delayed_node->inode_dirty) { 1264 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1207 mutex_unlock(&delayed_node->mutex); 1265 mutex_unlock(&delayed_node->mutex);
1208 btrfs_release_delayed_node(delayed_node); 1266 btrfs_release_delayed_node(delayed_node);
1209 return 0; 1267 return 0;
@@ -1227,7 +1285,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
1227 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; 1285 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1228 1286
1229 mutex_lock(&delayed_node->mutex); 1287 mutex_lock(&delayed_node->mutex);
1230 if (delayed_node->inode_dirty) 1288 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
1231 ret = __btrfs_update_delayed_inode(trans, delayed_node->root, 1289 ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1232 path, delayed_node); 1290 path, delayed_node);
1233 else 1291 else
@@ -1300,36 +1358,9 @@ again:
1300 trans->block_rsv = &root->fs_info->delayed_block_rsv; 1358 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1301 1359
1302 __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1360 __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1303 /*
1304 * Maybe new delayed items have been inserted, so we need requeue
1305 * the work. Besides that, we must dequeue the empty delayed nodes
1306 * to avoid the race between delayed items balance and the worker.
1307 * The race like this:
1308 * Task1 Worker thread
1309 * count == 0, needn't requeue
1310 * also needn't insert the
1311 * delayed node into prepare
1312 * list again.
1313 * add lots of delayed items
1314 * queue the delayed node
1315 * already in the list,
1316 * and not in the prepare
1317 * list, it means the delayed
1318 * node is being dealt with
1319 * by the worker.
1320 * do delayed items balance
1321 * the delayed node is being
1322 * dealt with by the worker
1323 * now, just wait.
1324 * the worker goto idle.
1325 * Task1 will sleep until the transaction is commited.
1326 */
1327 mutex_lock(&delayed_node->mutex);
1328 btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
1329 mutex_unlock(&delayed_node->mutex);
1330 1361
1331 trans->block_rsv = block_rsv; 1362 trans->block_rsv = block_rsv;
1332 btrfs_end_transaction_dmeta(trans, root); 1363 btrfs_end_transaction(trans, root);
1333 btrfs_btree_balance_dirty_nodelay(root); 1364 btrfs_btree_balance_dirty_nodelay(root);
1334 1365
1335release_path: 1366release_path:
@@ -1376,52 +1407,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
1376 WARN_ON(btrfs_first_delayed_node(delayed_root)); 1407 WARN_ON(btrfs_first_delayed_node(delayed_root));
1377} 1408}
1378 1409
1379static int refs_newer(struct btrfs_delayed_root *delayed_root, 1410static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
1380 int seq, int count)
1381{ 1411{
1382 int val = atomic_read(&delayed_root->items_seq); 1412 int val = atomic_read(&delayed_root->items_seq);
1383 1413
1384 if (val < seq || val >= seq + count) 1414 if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
1415 return 1;
1416
1417 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1385 return 1; 1418 return 1;
1419
1386 return 0; 1420 return 0;
1387} 1421}
1388 1422
1389void btrfs_balance_delayed_items(struct btrfs_root *root) 1423void btrfs_balance_delayed_items(struct btrfs_root *root)
1390{ 1424{
1391 struct btrfs_delayed_root *delayed_root; 1425 struct btrfs_delayed_root *delayed_root;
1392 int seq;
1393 1426
1394 delayed_root = btrfs_get_delayed_root(root); 1427 delayed_root = btrfs_get_delayed_root(root);
1395 1428
1396 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) 1429 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1397 return; 1430 return;
1398 1431
1399 seq = atomic_read(&delayed_root->items_seq);
1400
1401 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { 1432 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1433 int seq;
1402 int ret; 1434 int ret;
1403 DEFINE_WAIT(__wait); 1435
1436 seq = atomic_read(&delayed_root->items_seq);
1404 1437
1405 ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); 1438 ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
1406 if (ret) 1439 if (ret)
1407 return; 1440 return;
1408 1441
1409 while (1) { 1442 wait_event_interruptible(delayed_root->wait,
1410 prepare_to_wait(&delayed_root->wait, &__wait, 1443 could_end_wait(delayed_root, seq));
1411 TASK_INTERRUPTIBLE); 1444 return;
1412
1413 if (refs_newer(delayed_root, seq,
1414 BTRFS_DELAYED_BATCH) ||
1415 atomic_read(&delayed_root->items) <
1416 BTRFS_DELAYED_BACKGROUND) {
1417 break;
1418 }
1419 if (!signal_pending(current))
1420 schedule();
1421 else
1422 break;
1423 }
1424 finish_wait(&delayed_root->wait, &__wait);
1425 } 1445 }
1426 1446
1427 btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); 1447 btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
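
The rewrite above replaces an open-coded prepare_to_wait()/schedule() loop with wait_event_interruptible() driven by one predicate, could_end_wait(). A pthread analogue of the same shape, with the predicate checking the two counters the kernel version reads (the constants are illustrative):

#include <pthread.h>

#define DELAYED_BATCH		16
#define DELAYED_BACKGROUND	128

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int items_seq;	/* bumped each time an item completes */
static int items;	/* items still pending */

/* same predicate as could_end_wait(): enough progress since 'seq',
 * or the backlog has dropped below the background threshold */
static int could_end_wait(int seq)
{
	return items_seq < seq || items_seq >= seq + DELAYED_BATCH ||
	       items < DELAYED_BACKGROUND;
}

static void wait_for_progress(int seq)
{
	pthread_mutex_lock(&lock);
	while (!could_end_wait(seq))	/* wait_event_interruptible() */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

Wakers then only need to signal the condition variable after updating the counters, much as the kernel side wakes delayed_root->wait.
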
@@ -1472,9 +1492,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1472 mutex_lock(&delayed_node->mutex); 1492 mutex_lock(&delayed_node->mutex);
1473 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1493 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1474 if (unlikely(ret)) { 1494 if (unlikely(ret)) {
1475 printk(KERN_ERR "err add delayed dir index item(name: %.*s) " 1495 btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
1476 "into the insertion tree of the delayed node" 1496 "into the insertion tree of the delayed node"
1477 "(root id: %llu, inode id: %llu, errno: %d)\n", 1497 "(root id: %llu, inode id: %llu, errno: %d)",
1478 name_len, name, delayed_node->root->objectid, 1498 name_len, name, delayed_node->root->objectid,
1479 delayed_node->inode_id, ret); 1499 delayed_node->inode_id, ret);
1480 BUG(); 1500 BUG();
@@ -1544,9 +1564,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1544 mutex_lock(&node->mutex); 1564 mutex_lock(&node->mutex);
1545 ret = __btrfs_add_delayed_deletion_item(node, item); 1565 ret = __btrfs_add_delayed_deletion_item(node, item);
1546 if (unlikely(ret)) { 1566 if (unlikely(ret)) {
1547 printk(KERN_ERR "err add delayed dir index item(index: %llu) " 1567 btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
1548 "into the deletion tree of the delayed node" 1568 "into the deletion tree of the delayed node"
1549 "(root id: %llu, inode id: %llu, errno: %d)\n", 1569 "(root id: %llu, inode id: %llu, errno: %d)",
1550 index, node->root->objectid, node->inode_id, 1570 index, node->root->objectid, node->inode_id,
1551 ret); 1571 ret);
1552 BUG(); 1572 BUG();
@@ -1759,7 +1779,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1759 return -ENOENT; 1779 return -ENOENT;
1760 1780
1761 mutex_lock(&delayed_node->mutex); 1781 mutex_lock(&delayed_node->mutex);
1762 if (!delayed_node->inode_dirty) { 1782 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1763 mutex_unlock(&delayed_node->mutex); 1783 mutex_unlock(&delayed_node->mutex);
1764 btrfs_release_delayed_node(delayed_node); 1784 btrfs_release_delayed_node(delayed_node);
1765 return -ENOENT; 1785 return -ENOENT;
@@ -1810,7 +1830,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1810 return PTR_ERR(delayed_node); 1830 return PTR_ERR(delayed_node);
1811 1831
1812 mutex_lock(&delayed_node->mutex); 1832 mutex_lock(&delayed_node->mutex);
1813 if (delayed_node->inode_dirty) { 1833 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1814 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1834 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1815 goto release_node; 1835 goto release_node;
1816 } 1836 }
@@ -1821,7 +1841,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1821 goto release_node; 1841 goto release_node;
1822 1842
1823 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1843 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1824 delayed_node->inode_dirty = 1; 1844 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1825 delayed_node->count++; 1845 delayed_node->count++;
1826 atomic_inc(&root->fs_info->delayed_root->items); 1846 atomic_inc(&root->fs_info->delayed_root->items);
1827release_node: 1847release_node:
@@ -1830,6 +1850,41 @@ release_node:
1830 return ret; 1850 return ret;
1831} 1851}
1832 1852
1853int btrfs_delayed_delete_inode_ref(struct inode *inode)
1854{
1855 struct btrfs_delayed_node *delayed_node;
1856
1857 delayed_node = btrfs_get_or_create_delayed_node(inode);
1858 if (IS_ERR(delayed_node))
1859 return PTR_ERR(delayed_node);
1860
 1861	/*
 1862	 * We don't reserve space for inode ref deletion because:
 1863	 * - We ONLY do async inode ref deletion for an inode that has only
 1864	 *   one link (i_nlink == 1), which means there is only one inode ref.
 1865	 *   And in most cases, the inode ref and the inode item are in the
 1866	 *   same leaf, so we deal with them at the same time.
 1867	 *   Since we are sure we will reserve space for the inode item,
 1868	 *   it is unnecessary to reserve space for inode ref deletion.
 1869	 * - If the inode ref and the inode item are not in the same leaf,
 1870	 *   we still needn't worry about an enospc problem, because we reserve
 1871	 *   much more space for the inode update than it needs.
 1872	 * - At worst, we can steal some space from the global reservation.
 1873	 *   That is very rare.
 1874	 */
1875 mutex_lock(&delayed_node->mutex);
1876 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1877 goto release_node;
1878
1879 set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1880 delayed_node->count++;
1881 atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
1882release_node:
1883 mutex_unlock(&delayed_node->mutex);
1884 btrfs_release_delayed_node(delayed_node);
1885 return 0;
1886}
1887
1833static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) 1888static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1834{ 1889{
1835 struct btrfs_root *root = delayed_node->root; 1890 struct btrfs_root *root = delayed_node->root;
@@ -1852,7 +1907,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1852 btrfs_release_delayed_item(prev_item); 1907 btrfs_release_delayed_item(prev_item);
1853 } 1908 }
1854 1909
1855 if (delayed_node->inode_dirty) { 1910 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1911 btrfs_release_delayed_iref(delayed_node);
1912
1913 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1856 btrfs_delayed_inode_release_metadata(root, delayed_node); 1914 btrfs_delayed_inode_release_metadata(root, delayed_node);
1857 btrfs_release_delayed_inode(delayed_node); 1915 btrfs_release_delayed_inode(delayed_node);
1858 } 1916 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index a4b38f934d14..f70119f25421 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -48,6 +48,10 @@ struct btrfs_delayed_root {
48 wait_queue_head_t wait; 48 wait_queue_head_t wait;
49}; 49};
50 50
51#define BTRFS_DELAYED_NODE_IN_LIST 0
52#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
53#define BTRFS_DELAYED_NODE_DEL_IREF 2
54
51struct btrfs_delayed_node { 55struct btrfs_delayed_node {
52 u64 inode_id; 56 u64 inode_id;
53 u64 bytes_reserved; 57 u64 bytes_reserved;
@@ -65,8 +69,7 @@ struct btrfs_delayed_node {
65 struct btrfs_inode_item inode_item; 69 struct btrfs_inode_item inode_item;
66 atomic_t refs; 70 atomic_t refs;
67 u64 index_cnt; 71 u64 index_cnt;
68 bool in_list; 72 unsigned long flags;
69 bool inode_dirty;
70 int count; 73 int count;
71}; 74};
72 75
@@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode);
125int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 128int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
126 struct btrfs_root *root, struct inode *inode); 129 struct btrfs_root *root, struct inode *inode);
127int btrfs_fill_inode(struct inode *inode, u32 *rdev); 130int btrfs_fill_inode(struct inode *inode, u32 *rdev);
131int btrfs_delayed_delete_inode_ref(struct inode *inode);
128 132
129/* Used for drop dead root */ 133/* Used for drop dead root */
130void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); 134void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e4d467be2dd4..f3bff89eecf0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
161 return NULL; 161 return NULL;
162} 162}
163 163
164/* insert a new ref to head ref rbtree */
165static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
166 struct rb_node *node)
167{
168 struct rb_node **p = &root->rb_node;
169 struct rb_node *parent_node = NULL;
170 struct btrfs_delayed_ref_head *entry;
171 struct btrfs_delayed_ref_head *ins;
172 u64 bytenr;
173
174 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
175 bytenr = ins->node.bytenr;
176 while (*p) {
177 parent_node = *p;
178 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
179 href_node);
180
181 if (bytenr < entry->node.bytenr)
182 p = &(*p)->rb_left;
183 else if (bytenr > entry->node.bytenr)
184 p = &(*p)->rb_right;
185 else
186 return entry;
187 }
188
189 rb_link_node(node, parent_node, p);
190 rb_insert_color(node, root);
191 return NULL;
192}
193
164/* 194/*
 165 * find a head entry based on bytenr. This returns the delayed ref 195 * find a head entry based on bytenr. This returns the delayed ref
 166 * head if it was able to find one, or NULL if nothing was in that spot. 196 * head if it was able to find one, or NULL if nothing was in that spot.
167 * If return_bigger is given, the next bigger entry is returned if no exact 197 * If return_bigger is given, the next bigger entry is returned if no exact
168 * match is found. 198 * match is found.
169 */ 199 */
170static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 200static struct btrfs_delayed_ref_head *
171 u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
172 struct btrfs_delayed_ref_node **last, 202 struct btrfs_delayed_ref_head **last, int return_bigger)
173 int return_bigger)
174{ 203{
175 struct rb_node *n; 204 struct rb_node *n;
176 struct btrfs_delayed_ref_node *entry; 205 struct btrfs_delayed_ref_head *entry;
177 int cmp = 0; 206 int cmp = 0;
178 207
179again: 208again:
180 n = root->rb_node; 209 n = root->rb_node;
181 entry = NULL; 210 entry = NULL;
182 while (n) { 211 while (n) {
183 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
184 WARN_ON(!entry->in_tree);
185 if (last) 213 if (last)
186 *last = entry; 214 *last = entry;
187 215
188 if (bytenr < entry->bytenr) 216 if (bytenr < entry->node.bytenr)
189 cmp = -1; 217 cmp = -1;
190 else if (bytenr > entry->bytenr) 218 else if (bytenr > entry->node.bytenr)
191 cmp = 1;
192 else if (!btrfs_delayed_ref_is_head(entry))
193 cmp = 1; 219 cmp = 1;
194 else 220 else
195 cmp = 0; 221 cmp = 0;
@@ -203,12 +229,12 @@ again:
203 } 229 }
204 if (entry && return_bigger) { 230 if (entry && return_bigger) {
205 if (cmp > 0) { 231 if (cmp > 0) {
206 n = rb_next(&entry->rb_node); 232 n = rb_next(&entry->href_node);
207 if (!n) 233 if (!n)
208 n = rb_first(root); 234 n = rb_first(root);
209 entry = rb_entry(n, struct btrfs_delayed_ref_node, 235 entry = rb_entry(n, struct btrfs_delayed_ref_head,
210 rb_node); 236 href_node);
211 bytenr = entry->bytenr; 237 bytenr = entry->node.bytenr;
212 return_bigger = 0; 238 return_bigger = 0;
213 goto again; 239 goto again;
214 } 240 }
@@ -243,33 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
243 269
244static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, 270static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
245 struct btrfs_delayed_ref_root *delayed_refs, 271 struct btrfs_delayed_ref_root *delayed_refs,
272 struct btrfs_delayed_ref_head *head,
246 struct btrfs_delayed_ref_node *ref) 273 struct btrfs_delayed_ref_node *ref)
247{ 274{
248 rb_erase(&ref->rb_node, &delayed_refs->root); 275 if (btrfs_delayed_ref_is_head(ref)) {
276 head = btrfs_delayed_node_to_head(ref);
277 rb_erase(&head->href_node, &delayed_refs->href_root);
278 } else {
279 assert_spin_locked(&head->lock);
280 rb_erase(&ref->rb_node, &head->ref_root);
281 }
249 ref->in_tree = 0; 282 ref->in_tree = 0;
250 btrfs_put_delayed_ref(ref); 283 btrfs_put_delayed_ref(ref);
251 delayed_refs->num_entries--; 284 atomic_dec(&delayed_refs->num_entries);
252 if (trans->delayed_ref_updates) 285 if (trans->delayed_ref_updates)
253 trans->delayed_ref_updates--; 286 trans->delayed_ref_updates--;
254} 287}
255 288
256static int merge_ref(struct btrfs_trans_handle *trans, 289static int merge_ref(struct btrfs_trans_handle *trans,
257 struct btrfs_delayed_ref_root *delayed_refs, 290 struct btrfs_delayed_ref_root *delayed_refs,
291 struct btrfs_delayed_ref_head *head,
258 struct btrfs_delayed_ref_node *ref, u64 seq) 292 struct btrfs_delayed_ref_node *ref, u64 seq)
259{ 293{
260 struct rb_node *node; 294 struct rb_node *node;
261 int merged = 0;
262 int mod = 0; 295 int mod = 0;
263 int done = 0; 296 int done = 0;
264 297
265 node = rb_prev(&ref->rb_node); 298 node = rb_next(&ref->rb_node);
266 while (node) { 299 while (!done && node) {
267 struct btrfs_delayed_ref_node *next; 300 struct btrfs_delayed_ref_node *next;
268 301
269 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 302 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
270 node = rb_prev(node); 303 node = rb_next(node);
271 if (next->bytenr != ref->bytenr)
272 break;
273 if (seq && next->seq >= seq) 304 if (seq && next->seq >= seq)
274 break; 305 break;
275 if (comp_entry(ref, next, 0)) 306 if (comp_entry(ref, next, 0))
@@ -289,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans,
289 mod = -next->ref_mod; 320 mod = -next->ref_mod;
290 } 321 }
291 322
292 merged++; 323 drop_delayed_ref(trans, delayed_refs, head, next);
293 drop_delayed_ref(trans, delayed_refs, next);
294 ref->ref_mod += mod; 324 ref->ref_mod += mod;
295 if (ref->ref_mod == 0) { 325 if (ref->ref_mod == 0) {
296 drop_delayed_ref(trans, delayed_refs, ref); 326 drop_delayed_ref(trans, delayed_refs, head, ref);
297 break; 327 done = 1;
298 } else { 328 } else {
299 /* 329 /*
300 * You can't have multiples of the same ref on a tree 330 * You can't have multiples of the same ref on a tree
@@ -303,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans,
303 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 333 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
304 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 334 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
305 } 335 }
306
307 if (done)
308 break;
309 node = rb_prev(&ref->rb_node);
310 } 336 }
311 337 return done;
312 return merged;
313} 338}
314 339
315void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 340void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -320,6 +345,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
320 struct rb_node *node; 345 struct rb_node *node;
321 u64 seq = 0; 346 u64 seq = 0;
322 347
348 assert_spin_locked(&head->lock);
 349	/*
 350	 * We don't have too many refs to merge in the case of delayed
 351	 * data refs.
 352	 */
353 if (head->is_data)
354 return;
355
323 spin_lock(&fs_info->tree_mod_seq_lock); 356 spin_lock(&fs_info->tree_mod_seq_lock);
324 if (!list_empty(&fs_info->tree_mod_seq_list)) { 357 if (!list_empty(&fs_info->tree_mod_seq_list)) {
325 struct seq_list *elem; 358 struct seq_list *elem;
@@ -330,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
330 } 363 }
331 spin_unlock(&fs_info->tree_mod_seq_lock); 364 spin_unlock(&fs_info->tree_mod_seq_lock);
332 365
333 node = rb_prev(&head->node.rb_node); 366 node = rb_first(&head->ref_root);
334 while (node) { 367 while (node) {
335 struct btrfs_delayed_ref_node *ref; 368 struct btrfs_delayed_ref_node *ref;
336 369
337 ref = rb_entry(node, struct btrfs_delayed_ref_node, 370 ref = rb_entry(node, struct btrfs_delayed_ref_node,
338 rb_node); 371 rb_node);
339 if (ref->bytenr != head->node.bytenr)
340 break;
341
342 /* We can't merge refs that are outside of our seq count */ 372 /* We can't merge refs that are outside of our seq count */
343 if (seq && ref->seq >= seq) 373 if (seq && ref->seq >= seq)
344 break; 374 break;
345 if (merge_ref(trans, delayed_refs, ref, seq)) 375 if (merge_ref(trans, delayed_refs, head, ref, seq))
346 node = rb_prev(&head->node.rb_node); 376 node = rb_first(&head->ref_root);
347 else 377 else
348 node = rb_prev(node); 378 node = rb_next(&ref->rb_node);
349 } 379 }
350} 380}
351 381
@@ -373,71 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
373 return ret; 403 return ret;
374} 404}
375 405
376int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 406struct btrfs_delayed_ref_head *
377 struct list_head *cluster, u64 start) 407btrfs_select_ref_head(struct btrfs_trans_handle *trans)
378{ 408{
379 int count = 0;
380 struct btrfs_delayed_ref_root *delayed_refs; 409 struct btrfs_delayed_ref_root *delayed_refs;
381 struct rb_node *node;
382 struct btrfs_delayed_ref_node *ref;
383 struct btrfs_delayed_ref_head *head; 410 struct btrfs_delayed_ref_head *head;
411 u64 start;
412 bool loop = false;
384 413
385 delayed_refs = &trans->transaction->delayed_refs; 414 delayed_refs = &trans->transaction->delayed_refs;
386 if (start == 0) { 415
387 node = rb_first(&delayed_refs->root);
388 } else {
389 ref = NULL;
390 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
391 if (ref) {
392 node = &ref->rb_node;
393 } else
394 node = rb_first(&delayed_refs->root);
395 }
396again: 416again:
397 while (node && count < 32) { 417 start = delayed_refs->run_delayed_start;
398 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
399 if (btrfs_delayed_ref_is_head(ref)) { 419 if (!head && !loop) {
400 head = btrfs_delayed_node_to_head(ref); 420 delayed_refs->run_delayed_start = 0;
401 if (list_empty(&head->cluster)) {
402 list_add_tail(&head->cluster, cluster);
403 delayed_refs->run_delayed_start =
404 head->node.bytenr;
405 count++;
406
407 WARN_ON(delayed_refs->num_heads_ready == 0);
408 delayed_refs->num_heads_ready--;
409 } else if (count) {
410 /* the goal of the clustering is to find extents
411 * that are likely to end up in the same extent
412 * leaf on disk. So, we don't want them spread
413 * all over the tree. Stop now if we've hit
414 * a head that was already in use
415 */
416 break;
417 }
418 }
419 node = rb_next(node);
420 }
421 if (count) {
422 return 0;
423 } else if (start) {
424 /*
425 * we've gone to the end of the rbtree without finding any
426 * clusters. start from the beginning and try again
427 */
428 start = 0; 421 start = 0;
429 node = rb_first(&delayed_refs->root); 422 loop = true;
430 goto again; 423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
424 if (!head)
425 return NULL;
426 } else if (!head && loop) {
427 return NULL;
431 } 428 }
432 return 1;
433}
434 429
435void btrfs_release_ref_cluster(struct list_head *cluster) 430 while (head->processing) {
436{ 431 struct rb_node *node;
437 struct list_head *pos, *q; 432
433 node = rb_next(&head->href_node);
434 if (!node) {
435 if (loop)
436 return NULL;
437 delayed_refs->run_delayed_start = 0;
438 start = 0;
439 loop = true;
440 goto again;
441 }
442 head = rb_entry(node, struct btrfs_delayed_ref_head,
443 href_node);
444 }
438 445
439 list_for_each_safe(pos, q, cluster) 446 head->processing = 1;
440 list_del_init(pos); 447 WARN_ON(delayed_refs->num_heads_ready == 0);
448 delayed_refs->num_heads_ready--;
449 delayed_refs->run_delayed_start = head->node.bytenr +
450 head->node.num_bytes;
451 return head;
441} 452}
442 453
443/* 454/*
@@ -451,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster)
451static noinline void 462static noinline void
452update_existing_ref(struct btrfs_trans_handle *trans, 463update_existing_ref(struct btrfs_trans_handle *trans,
453 struct btrfs_delayed_ref_root *delayed_refs, 464 struct btrfs_delayed_ref_root *delayed_refs,
465 struct btrfs_delayed_ref_head *head,
454 struct btrfs_delayed_ref_node *existing, 466 struct btrfs_delayed_ref_node *existing,
455 struct btrfs_delayed_ref_node *update) 467 struct btrfs_delayed_ref_node *update)
456{ 468{
@@ -463,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
463 */ 475 */
464 existing->ref_mod--; 476 existing->ref_mod--;
465 if (existing->ref_mod == 0) 477 if (existing->ref_mod == 0)
466 drop_delayed_ref(trans, delayed_refs, existing); 478 drop_delayed_ref(trans, delayed_refs, head, existing);
467 else 479 else
468 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 480 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
469 existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 481 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -533,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
533 } 545 }
534 } 546 }
535 /* 547 /*
536 * update the reference mod on the head to reflect this new operation 548 * update the reference mod on the head to reflect this new operation,
 549	 * we only need the lock in this case because we could be processing it
 550	 * currently; for refs we just added we know we're OK.
537 */ 551 */
552 spin_lock(&existing_ref->lock);
538 existing->ref_mod += update->ref_mod; 553 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock);
539} 555}
540 556
541/* 557/*
@@ -543,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
543 * this does all the dirty work in terms of maintaining the correct 559 * this does all the dirty work in terms of maintaining the correct
544 * overall modification count. 560 * overall modification count.
545 */ 561 */
546static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, 562static noinline struct btrfs_delayed_ref_head *
547 struct btrfs_trans_handle *trans, 563add_delayed_ref_head(struct btrfs_fs_info *fs_info,
548 struct btrfs_delayed_ref_node *ref, 564 struct btrfs_trans_handle *trans,
549 u64 bytenr, u64 num_bytes, 565 struct btrfs_delayed_ref_node *ref, u64 bytenr,
550 int action, int is_data) 566 u64 num_bytes, int action, int is_data)
551{ 567{
552 struct btrfs_delayed_ref_node *existing; 568 struct btrfs_delayed_ref_head *existing;
553 struct btrfs_delayed_ref_head *head_ref = NULL; 569 struct btrfs_delayed_ref_head *head_ref = NULL;
554 struct btrfs_delayed_ref_root *delayed_refs; 570 struct btrfs_delayed_ref_root *delayed_refs;
555 int count_mod = 1; 571 int count_mod = 1;
@@ -596,38 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
596 head_ref = btrfs_delayed_node_to_head(ref); 612 head_ref = btrfs_delayed_node_to_head(ref);
597 head_ref->must_insert_reserved = must_insert_reserved; 613 head_ref->must_insert_reserved = must_insert_reserved;
598 head_ref->is_data = is_data; 614 head_ref->is_data = is_data;
615 head_ref->ref_root = RB_ROOT;
616 head_ref->processing = 0;
599 617
600 INIT_LIST_HEAD(&head_ref->cluster); 618 spin_lock_init(&head_ref->lock);
601 mutex_init(&head_ref->mutex); 619 mutex_init(&head_ref->mutex);
602 620
603 trace_add_delayed_ref_head(ref, head_ref, action); 621 trace_add_delayed_ref_head(ref, head_ref, action);
604 622
605 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 623 existing = htree_insert(&delayed_refs->href_root,
606 624 &head_ref->href_node);
607 if (existing) { 625 if (existing) {
608 update_existing_head_ref(existing, ref); 626 update_existing_head_ref(&existing->node, ref);
609 /* 627 /*
610 * we've updated the existing ref, free the newly 628 * we've updated the existing ref, free the newly
611 * allocated ref 629 * allocated ref
612 */ 630 */
613 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 631 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
632 head_ref = existing;
614 } else { 633 } else {
615 delayed_refs->num_heads++; 634 delayed_refs->num_heads++;
616 delayed_refs->num_heads_ready++; 635 delayed_refs->num_heads_ready++;
617 delayed_refs->num_entries++; 636 atomic_inc(&delayed_refs->num_entries);
618 trans->delayed_ref_updates++; 637 trans->delayed_ref_updates++;
619 } 638 }
639 return head_ref;
620} 640}
621 641
622/* 642/*
623 * helper to insert a delayed tree ref into the rbtree. 643 * helper to insert a delayed tree ref into the rbtree.
624 */ 644 */
625static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, 645static noinline void
626 struct btrfs_trans_handle *trans, 646add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
627 struct btrfs_delayed_ref_node *ref, 647 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, u64 parent, 648 struct btrfs_delayed_ref_head *head_ref,
629 u64 ref_root, int level, int action, 649 struct btrfs_delayed_ref_node *ref, u64 bytenr,
630 int for_cow) 650 u64 num_bytes, u64 parent, u64 ref_root, int level,
651 int action, int for_cow)
631{ 652{
632 struct btrfs_delayed_ref_node *existing; 653 struct btrfs_delayed_ref_node *existing;
633 struct btrfs_delayed_tree_ref *full_ref; 654 struct btrfs_delayed_tree_ref *full_ref;
@@ -663,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
663 684
664 trace_add_delayed_tree_ref(ref, full_ref, action); 685 trace_add_delayed_tree_ref(ref, full_ref, action);
665 686
666 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 687 spin_lock(&head_ref->lock);
667 688 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
668 if (existing) { 689 if (existing) {
669 update_existing_ref(trans, delayed_refs, existing, ref); 690 update_existing_ref(trans, delayed_refs, head_ref, existing,
691 ref);
670 /* 692 /*
671 * we've updated the existing ref, free the newly 693 * we've updated the existing ref, free the newly
672 * allocated ref 694 * allocated ref
673 */ 695 */
674 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); 696 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
675 } else { 697 } else {
676 delayed_refs->num_entries++; 698 atomic_inc(&delayed_refs->num_entries);
677 trans->delayed_ref_updates++; 699 trans->delayed_ref_updates++;
678 } 700 }
701 spin_unlock(&head_ref->lock);
679} 702}
680 703
681/* 704/*
682 * helper to insert a delayed data ref into the rbtree. 705 * helper to insert a delayed data ref into the rbtree.
683 */ 706 */
684static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, 707static noinline void
685 struct btrfs_trans_handle *trans, 708add_delayed_data_ref(struct btrfs_fs_info *fs_info,
686 struct btrfs_delayed_ref_node *ref, 709 struct btrfs_trans_handle *trans,
687 u64 bytenr, u64 num_bytes, u64 parent, 710 struct btrfs_delayed_ref_head *head_ref,
688 u64 ref_root, u64 owner, u64 offset, 711 struct btrfs_delayed_ref_node *ref, u64 bytenr,
689 int action, int for_cow) 712 u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
713 u64 offset, int action, int for_cow)
690{ 714{
691 struct btrfs_delayed_ref_node *existing; 715 struct btrfs_delayed_ref_node *existing;
692 struct btrfs_delayed_data_ref *full_ref; 716 struct btrfs_delayed_data_ref *full_ref;
@@ -724,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
724 748
725 trace_add_delayed_data_ref(ref, full_ref, action); 749 trace_add_delayed_data_ref(ref, full_ref, action);
726 750
727 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 751 spin_lock(&head_ref->lock);
728 752 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
729 if (existing) { 753 if (existing) {
730 update_existing_ref(trans, delayed_refs, existing, ref); 754 update_existing_ref(trans, delayed_refs, head_ref, existing,
755 ref);
731 /* 756 /*
732 * we've updated the existing ref, free the newly 757 * we've updated the existing ref, free the newly
733 * allocated ref 758 * allocated ref
734 */ 759 */
735 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 760 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
736 } else { 761 } else {
737 delayed_refs->num_entries++; 762 atomic_inc(&delayed_refs->num_entries);
738 trans->delayed_ref_updates++; 763 trans->delayed_ref_updates++;
739 } 764 }
765 spin_unlock(&head_ref->lock);
740} 766}
741 767
742/* 768/*
@@ -775,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
775 * insert both the head node and the new ref without dropping 801 * insert both the head node and the new ref without dropping
776 * the spin lock 802 * the spin lock
777 */ 803 */
778 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 804 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
779 num_bytes, action, 0); 805 bytenr, num_bytes, action, 0);
780 806
781 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 807 add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
782 num_bytes, parent, ref_root, level, action, 808 num_bytes, parent, ref_root, level, action,
783 for_cow); 809 for_cow);
784 spin_unlock(&delayed_refs->lock); 810 spin_unlock(&delayed_refs->lock);
@@ -823,10 +849,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
823 * insert both the head node and the new ref without dropping 849 * insert both the head node and the new ref without dropping
824 * the spin lock 850 * the spin lock
825 */ 851 */
826 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 852 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
827 num_bytes, action, 1); 853 bytenr, num_bytes, action, 1);
828 854
829 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 855 add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
830 num_bytes, parent, ref_root, owner, offset, 856 num_bytes, parent, ref_root, owner, offset,
831 action, for_cow); 857 action, for_cow);
832 spin_unlock(&delayed_refs->lock); 858 spin_unlock(&delayed_refs->lock);
@@ -869,14 +895,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
869struct btrfs_delayed_ref_head * 895struct btrfs_delayed_ref_head *
870btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) 896btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
871{ 897{
872 struct btrfs_delayed_ref_node *ref;
873 struct btrfs_delayed_ref_root *delayed_refs; 898 struct btrfs_delayed_ref_root *delayed_refs;
874 899
875 delayed_refs = &trans->transaction->delayed_refs; 900 delayed_refs = &trans->transaction->delayed_refs;
876 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); 901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
877 if (ref)
878 return btrfs_delayed_node_to_head(ref);
879 return NULL;
880} 902}
881 903
882void btrfs_delayed_ref_exit(void) 904void btrfs_delayed_ref_exit(void)
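The delayed-ref rework above splits the single flat rbtree into two levels: href_root holds one head per bytenr, and each head carries its own ref_root of individual refs, protected by the new per-head spinlock. A stripped-down sketch of the head-insert half, with demo_* names standing in for the btrfs types:

	#include <linux/types.h>
	#include <linux/rbtree.h>

	struct demo_head {
		u64 bytenr;
		struct rb_node href_node;
		struct rb_root ref_root;	/* per-head tree of refs */
	};

	/* Insert a head keyed by bytenr; on collision, return the existing
	 * head so the caller can merge into it, as htree_insert() does. */
	static struct demo_head *demo_insert_head(struct rb_root *root,
						  struct demo_head *ins)
	{
		struct rb_node **p = &root->rb_node;
		struct rb_node *parent = NULL;
		struct demo_head *entry;

		while (*p) {
			parent = *p;
			entry = rb_entry(parent, struct demo_head, href_node);
			if (ins->bytenr < entry->bytenr)
				p = &(*p)->rb_left;
			else if (ins->bytenr > entry->bytenr)
				p = &(*p)->rb_right;
			else
				return entry;
		}
		rb_link_node(&ins->href_node, parent, p);
		rb_insert_color(&ins->href_node, root);
		return NULL;
	}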
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 70b962cc177d..4ba9b93022ff 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -81,7 +81,10 @@ struct btrfs_delayed_ref_head {
81 */ 81 */
82 struct mutex mutex; 82 struct mutex mutex;
83 83
84 struct list_head cluster; 84 spinlock_t lock;
85 struct rb_root ref_root;
86
87 struct rb_node href_node;
85 88
86 struct btrfs_delayed_extent_op *extent_op; 89 struct btrfs_delayed_extent_op *extent_op;
87 /* 90 /*
@@ -98,6 +101,7 @@ struct btrfs_delayed_ref_head {
98 */ 101 */
99 unsigned int must_insert_reserved:1; 102 unsigned int must_insert_reserved:1;
100 unsigned int is_data:1; 103 unsigned int is_data:1;
104 unsigned int processing:1;
101}; 105};
102 106
103struct btrfs_delayed_tree_ref { 107struct btrfs_delayed_tree_ref {
@@ -116,7 +120,8 @@ struct btrfs_delayed_data_ref {
116}; 120};
117 121
118struct btrfs_delayed_ref_root { 122struct btrfs_delayed_ref_root {
119 struct rb_root root; 123 /* head ref rbtree */
124 struct rb_root href_root;
120 125
121 /* this spin lock protects the rbtree and the entries inside */ 126 /* this spin lock protects the rbtree and the entries inside */
122 spinlock_t lock; 127 spinlock_t lock;
@@ -124,7 +129,7 @@ struct btrfs_delayed_ref_root {
124 /* how many delayed ref updates we've queued, used by the 129 /* how many delayed ref updates we've queued, used by the
125 * throttling code 130 * throttling code
126 */ 131 */
127 unsigned long num_entries; 132 atomic_t num_entries;
128 133
129 /* total number of head nodes in tree */ 134 /* total number of head nodes in tree */
130 unsigned long num_heads; 135 unsigned long num_heads;
@@ -133,15 +138,6 @@ struct btrfs_delayed_ref_root {
133 unsigned long num_heads_ready; 138 unsigned long num_heads_ready;
134 139
135 /* 140 /*
136 * bumped when someone is making progress on the delayed
137 * refs, so that other procs know they are just adding to
 138	 * contention instead of helping
139 */
140 atomic_t procs_running_refs;
141 atomic_t ref_seq;
142 wait_queue_head_t wait;
143
144 /*
145 * set when the tree is flushing before a transaction commit, 141 * set when the tree is flushing before a transaction commit,
146 * used by the throttling code to decide if new updates need 142 * used by the throttling code to decide if new updates need
147 * to be run right away 143 * to be run right away
@@ -226,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
226 mutex_unlock(&head->mutex); 222 mutex_unlock(&head->mutex);
227} 223}
228 224
229int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 225
230 struct list_head *cluster, u64 search_start); 226struct btrfs_delayed_ref_head *
231void btrfs_release_ref_cluster(struct list_head *cluster); 227btrfs_select_ref_head(struct btrfs_trans_handle *trans);
232 228
233int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 229int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
234 struct btrfs_delayed_ref_root *delayed_refs, 230 struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2cfc3dfff64f..564c92638b20 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -102,7 +102,8 @@ no_valid_dev_replace_entry_found:
102 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 102 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
103 103
104 if (item_size != sizeof(struct btrfs_dev_replace_item)) { 104 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
105 pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); 105 btrfs_warn(fs_info,
106 "dev_replace entry found has unexpected size, ignore entry");
106 goto no_valid_dev_replace_entry_found; 107 goto no_valid_dev_replace_entry_found;
107 } 108 }
108 109
@@ -145,13 +146,19 @@ no_valid_dev_replace_entry_found:
145 if (!dev_replace->srcdev && 146 if (!dev_replace->srcdev &&
146 !btrfs_test_opt(dev_root, DEGRADED)) { 147 !btrfs_test_opt(dev_root, DEGRADED)) {
147 ret = -EIO; 148 ret = -EIO;
148 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", 149 btrfs_warn(fs_info,
149 src_devid); 150 "cannot mount because device replace operation is ongoing and");
151 btrfs_warn(fs_info,
152 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
153 src_devid);
150 } 154 }
151 if (!dev_replace->tgtdev && 155 if (!dev_replace->tgtdev &&
152 !btrfs_test_opt(dev_root, DEGRADED)) { 156 !btrfs_test_opt(dev_root, DEGRADED)) {
153 ret = -EIO; 157 ret = -EIO;
154 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", 158 btrfs_warn(fs_info,
159 "cannot mount because device replace operation is ongoing and");
160 btrfs_warn(fs_info,
161 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
155 BTRFS_DEV_REPLACE_DEVID); 162 BTRFS_DEV_REPLACE_DEVID);
156 } 163 }
157 if (dev_replace->tgtdev) { 164 if (dev_replace->tgtdev) {
@@ -210,7 +217,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
210 } 217 }
211 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 218 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
212 if (ret < 0) { 219 if (ret < 0) {
213 pr_warn("btrfs: error %d while searching for dev_replace item!\n", 220 btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
214 ret); 221 ret);
215 goto out; 222 goto out;
216 } 223 }
@@ -230,7 +237,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
230 */ 237 */
231 ret = btrfs_del_item(trans, dev_root, path); 238 ret = btrfs_del_item(trans, dev_root, path);
232 if (ret != 0) { 239 if (ret != 0) {
233 pr_warn("btrfs: delete too small dev_replace item failed %d!\n", 240 btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
234 ret); 241 ret);
235 goto out; 242 goto out;
236 } 243 }
@@ -243,7 +250,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
243 ret = btrfs_insert_empty_item(trans, dev_root, path, 250 ret = btrfs_insert_empty_item(trans, dev_root, path,
244 &key, sizeof(*ptr)); 251 &key, sizeof(*ptr));
245 if (ret < 0) { 252 if (ret < 0) {
246 pr_warn("btrfs: insert dev_replace item failed %d!\n", 253 btrfs_warn(fs_info, "insert dev_replace item failed %d!",
247 ret); 254 ret);
248 goto out; 255 goto out;
249 } 256 }
@@ -305,7 +312,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
305 struct btrfs_device *src_device = NULL; 312 struct btrfs_device *src_device = NULL;
306 313
307 if (btrfs_fs_incompat(fs_info, RAID56)) { 314 if (btrfs_fs_incompat(fs_info, RAID56)) {
308 pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); 315 btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
309 return -EINVAL; 316 return -EINVAL;
310 } 317 }
311 318
@@ -325,7 +332,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
325 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 332 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
326 &tgt_device); 333 &tgt_device);
327 if (ret) { 334 if (ret) {
328 pr_err("btrfs: target device %s is invalid!\n", 335 btrfs_err(fs_info, "target device %s is invalid!",
329 args->start.tgtdev_name); 336 args->start.tgtdev_name);
330 mutex_unlock(&fs_info->volume_mutex); 337 mutex_unlock(&fs_info->volume_mutex);
331 return -EINVAL; 338 return -EINVAL;
@@ -341,7 +348,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
341 } 348 }
342 349
343 if (tgt_device->total_bytes < src_device->total_bytes) { 350 if (tgt_device->total_bytes < src_device->total_bytes) {
344 pr_err("btrfs: target device is smaller than source device!\n"); 351 btrfs_err(fs_info, "target device is smaller than source device!");
345 ret = -EINVAL; 352 ret = -EINVAL;
346 goto leave_no_lock; 353 goto leave_no_lock;
347 } 354 }
@@ -366,7 +373,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
366 dev_replace->tgtdev = tgt_device; 373 dev_replace->tgtdev = tgt_device;
367 374
368 printk_in_rcu(KERN_INFO 375 printk_in_rcu(KERN_INFO
369 "btrfs: dev_replace from %s (devid %llu) to %s started\n", 376 "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
370 src_device->missing ? "<missing disk>" : 377 src_device->missing ? "<missing disk>" :
371 rcu_str_deref(src_device->name), 378 rcu_str_deref(src_device->name),
372 src_device->devid, 379 src_device->devid,
@@ -489,7 +496,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
489 496
490 if (scrub_ret) { 497 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR 498 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" : 500 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name), 501 rcu_str_deref(src_device->name),
495 src_device->devid, 502 src_device->devid,
@@ -504,7 +511,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
504 } 511 }
505 512
506 printk_in_rcu(KERN_INFO 513 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", 514 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" : 515 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name), 516 rcu_str_deref(src_device->name),
510 src_device->devid, 517 src_device->devid,
@@ -699,7 +706,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 706 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = get_seconds(); 707 dev_replace->time_stopped = get_seconds();
701 dev_replace->item_needs_writeback = 1; 708 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n"); 709 btrfs_info(fs_info, "suspending dev_replace for unmount");
703 break; 710 break;
704 } 711 }
705 712
@@ -728,8 +735,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
728 break; 735 break;
729 } 736 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 737 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" 738 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n"); 739 btrfs_info(fs_info,
740 "you may cancel the operation after 'mount -o degraded'");
733 btrfs_dev_replace_unlock(dev_replace); 741 btrfs_dev_replace_unlock(dev_replace);
734 return 0; 742 return 0;
735 } 743 }
@@ -755,14 +763,14 @@ static int btrfs_dev_replace_kthread(void *data)
755 kfree(status_args); 763 kfree(status_args);
756 do_div(progress, 10); 764 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO 765 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", 766 "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" : 767 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name), 768 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid, 769 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ? 770 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) : 771 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>", 772 "<missing target disk>",
765 (unsigned int)progress); 773 (unsigned int)progress);
766 } 774 }
767 btrfs_dev_replace_continue_on_mount(fs_info); 775 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 776 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
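The dev-replace hunks convert bare pr_warn()/pr_info() calls to fs_info-aware helpers so every message identifies the filesystem it came from. A sketch of the idea under an assumed name; the real btrfs_warn() and friends are declared in ctree.h and route through a shared btrfs_printk() helper:

	#define demo_warn(fs_info, fmt, args...)			\
		printk(KERN_WARNING "BTRFS (device %s): " fmt "\n",	\
		       (fs_info)->sb->s_id, ##args)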
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c031ea3fd70f..a0691df5dcea 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
261 * see if there is room in the item to insert this 261 * see if there is room in the item to insert this
262 * name 262 * name
263 */ 263 */
264 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); 264 data_size = sizeof(*di) + name_len;
265 leaf = path->nodes[0]; 265 leaf = path->nodes[0];
266 slot = path->slots[0]; 266 slot = path->slots[0];
267 if (data_size + btrfs_item_size_nr(leaf, slot) + 267 if (data_size + btrfs_item_size_nr(leaf, slot) +
@@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root,
459 u8 type = btrfs_dir_type(leaf, dir_item); 459 u8 type = btrfs_dir_type(leaf, dir_item);
460 460
461 if (type >= BTRFS_FT_MAX) { 461 if (type >= BTRFS_FT_MAX) {
462 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", 462 btrfs_crit(root->fs_info, "invalid dir item type: %d",
463 (int)type); 463 (int)type);
464 return 1; 464 return 1;
465 } 465 }
@@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_root *root,
468 namelen = XATTR_NAME_MAX; 468 namelen = XATTR_NAME_MAX;
469 469
470 if (btrfs_dir_name_len(leaf, dir_item) > namelen) { 470 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
471 printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", 471 btrfs_crit(root->fs_info, "invalid dir item name len: %u",
472 (unsigned)btrfs_dir_data_len(leaf, dir_item)); 472 (unsigned)btrfs_dir_data_len(leaf, dir_item));
473 return 1; 473 return 1;
474 } 474 }
@@ -476,7 +476,7 @@ int verify_dir_item(struct btrfs_root *root,
476 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ 476 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
477 if ((btrfs_dir_data_len(leaf, dir_item) + 477 if ((btrfs_dir_data_len(leaf, dir_item) +
478 btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { 478 btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
479 printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", 479 btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
480 (unsigned)btrfs_dir_name_len(leaf, dir_item), 480 (unsigned)btrfs_dir_name_len(leaf, dir_item),
481 (unsigned)btrfs_dir_data_len(leaf, dir_item)); 481 (unsigned)btrfs_dir_data_len(leaf, dir_item));
482 return 1; 482 return 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8072cfa8a3b1..81ea55314b1f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,7 +26,6 @@
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/migrate.h> 30#include <linux/migrate.h>
32#include <linux/ratelimit.h> 31#include <linux/ratelimit.h>
@@ -35,6 +34,7 @@
35#include <asm/unaligned.h> 34#include <asm/unaligned.h>
36#include "ctree.h" 35#include "ctree.h"
37#include "disk-io.h" 36#include "disk-io.h"
37#include "hash.h"
38#include "transaction.h" 38#include "transaction.h"
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
@@ -48,6 +48,7 @@
48#include "rcu-string.h" 48#include "rcu-string.h"
49#include "dev-replace.h" 49#include "dev-replace.h"
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h"
51 52
52#ifdef CONFIG_X86 53#ifdef CONFIG_X86
53#include <asm/cpufeature.h> 54#include <asm/cpufeature.h>
@@ -243,7 +244,7 @@ out:
243 244
244u32 btrfs_csum_data(char *data, u32 seed, size_t len) 245u32 btrfs_csum_data(char *data, u32 seed, size_t len)
245{ 246{
246 return crc32c(seed, data, len); 247 return btrfs_crc32c(seed, data, len);
247} 248}
248 249
249void btrfs_csum_final(u32 crc, char *result) 250void btrfs_csum_final(u32 crc, char *result)
@@ -299,11 +300,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
299 memcpy(&found, result, csum_size); 300 memcpy(&found, result, csum_size);
300 301
301 read_extent_buffer(buf, &val, 0, csum_size); 302 read_extent_buffer(buf, &val, 0, csum_size);
302 printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " 303 printk_ratelimited(KERN_INFO
303 "failed on %llu wanted %X found %X " 304 "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
304 "level %d\n", 305 "level %d\n",
305 root->fs_info->sb->s_id, buf->start, 306 root->fs_info->sb->s_id, buf->start,
306 val, found, btrfs_header_level(buf)); 307 val, found, btrfs_header_level(buf));
307 if (result != (char *)&inline_result) 308 if (result != (char *)&inline_result)
308 kfree(result); 309 kfree(result);
309 return 1; 310 return 1;
@@ -382,13 +383,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
382 ret = 1; 383 ret = 1;
383 384
384 if (ret && btrfs_super_generation(disk_sb) < 10) { 385 if (ret && btrfs_super_generation(disk_sb) < 10) {
385 printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); 386 printk(KERN_WARNING
387 "BTRFS: super block crcs don't match, older mkfs detected\n");
386 ret = 0; 388 ret = 0;
387 } 389 }
388 } 390 }
389 391
390 if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { 392 if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
391 printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", 393 printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
392 csum_type); 394 csum_type);
393 ret = 1; 395 ret = 1;
394 } 396 }
@@ -464,13 +466,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
464 466
465static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 467static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
466{ 468{
467 struct extent_io_tree *tree;
468 u64 start = page_offset(page); 469 u64 start = page_offset(page);
469 u64 found_start; 470 u64 found_start;
470 struct extent_buffer *eb; 471 struct extent_buffer *eb;
471 472
472 tree = &BTRFS_I(page->mapping->host)->io_tree;
473
474 eb = (struct extent_buffer *)page->private; 473 eb = (struct extent_buffer *)page->private;
475 if (page != eb->pages[0]) 474 if (page != eb->pages[0])
476 return 0; 475 return 0;
@@ -500,8 +499,8 @@ static int check_tree_block_fsid(struct btrfs_root *root,
500} 499}
501 500
502#define CORRUPT(reason, eb, root, slot) \ 501#define CORRUPT(reason, eb, root, slot) \
503 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ 502 btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
504 "root=%llu, slot=%d\n", reason, \ 503 "root=%llu, slot=%d", reason, \
505 btrfs_header_bytenr(eb), root->objectid, slot) 504 btrfs_header_bytenr(eb), root->objectid, slot)
506 505
507static noinline int check_leaf(struct btrfs_root *root, 506static noinline int check_leaf(struct btrfs_root *root,
@@ -569,7 +568,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
569 u64 phy_offset, struct page *page, 568 u64 phy_offset, struct page *page,
570 u64 start, u64 end, int mirror) 569 u64 start, u64 end, int mirror)
571{ 570{
572 struct extent_io_tree *tree;
573 u64 found_start; 571 u64 found_start;
574 int found_level; 572 int found_level;
575 struct extent_buffer *eb; 573 struct extent_buffer *eb;
@@ -580,7 +578,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
580 if (!page->private) 578 if (!page->private)
581 goto out; 579 goto out;
582 580
583 tree = &BTRFS_I(page->mapping->host)->io_tree;
584 eb = (struct extent_buffer *)page->private; 581 eb = (struct extent_buffer *)page->private;
585 582
586 /* the pending IO might have been the only thing that kept this buffer 583 /* the pending IO might have been the only thing that kept this buffer
@@ -600,21 +597,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
600 597
601 found_start = btrfs_header_bytenr(eb); 598 found_start = btrfs_header_bytenr(eb);
602 if (found_start != eb->start) { 599 if (found_start != eb->start) {
603 printk_ratelimited(KERN_INFO "btrfs bad tree block start " 600 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
604 "%llu %llu\n", 601 "%llu %llu\n",
605 found_start, eb->start); 602 found_start, eb->start);
606 ret = -EIO; 603 ret = -EIO;
607 goto err; 604 goto err;
608 } 605 }
609 if (check_tree_block_fsid(root, eb)) { 606 if (check_tree_block_fsid(root, eb)) {
610 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", 607 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
611 eb->start); 608 eb->start);
612 ret = -EIO; 609 ret = -EIO;
613 goto err; 610 goto err;
614 } 611 }
615 found_level = btrfs_header_level(eb); 612 found_level = btrfs_header_level(eb);
616 if (found_level >= BTRFS_MAX_LEVEL) { 613 if (found_level >= BTRFS_MAX_LEVEL) {
617 btrfs_info(root->fs_info, "bad tree block level %d\n", 614 btrfs_info(root->fs_info, "bad tree block level %d",
618 (int)btrfs_header_level(eb)); 615 (int)btrfs_header_level(eb));
619 ret = -EIO; 616 ret = -EIO;
620 goto err; 617 goto err;
@@ -842,20 +839,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
842 839
843static int btree_csum_one_bio(struct bio *bio) 840static int btree_csum_one_bio(struct bio *bio)
844{ 841{
845 struct bio_vec *bvec = bio->bi_io_vec; 842 struct bio_vec *bvec;
846 int bio_index = 0;
847 struct btrfs_root *root; 843 struct btrfs_root *root;
848 int ret = 0; 844 int i, ret = 0;
849 845
850 WARN_ON(bio->bi_vcnt <= 0); 846 bio_for_each_segment_all(bvec, bio, i) {
851 while (bio_index < bio->bi_vcnt) {
852 root = BTRFS_I(bvec->bv_page->mapping->host)->root; 847 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
853 ret = csum_dirty_buffer(root, bvec->bv_page); 848 ret = csum_dirty_buffer(root, bvec->bv_page);
854 if (ret) 849 if (ret)
855 break; 850 break;
856 bio_index++;
857 bvec++;
858 } 851 }
852
859 return ret; 853 return ret;
860} 854}
861 855
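The hunk above replaces a hand-rolled walk over bi_io_vec/bi_vcnt with the bio_for_each_segment_all() iterator, which hides the bvec pointer arithmetic. A minimal sketch of the adopted idiom, with demo_for_each_page() as a hypothetical caller:

	#include <linux/bio.h>

	/* Apply fn to each page in the bio, stopping on the first error,
	 * mirroring the structure of btree_csum_one_bio() above. */
	static int demo_for_each_page(struct bio *bio,
				      int (*fn)(struct page *page))
	{
		struct bio_vec *bvec;
		int i, ret = 0;

		bio_for_each_segment_all(bvec, bio, i) {
			ret = fn(bvec->bv_page);
			if (ret)
				break;
		}
		return ret;
	}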
@@ -967,11 +961,9 @@ static int btree_migratepage(struct address_space *mapping,
967static int btree_writepages(struct address_space *mapping, 961static int btree_writepages(struct address_space *mapping,
968 struct writeback_control *wbc) 962 struct writeback_control *wbc)
969{ 963{
970 struct extent_io_tree *tree;
971 struct btrfs_fs_info *fs_info; 964 struct btrfs_fs_info *fs_info;
972 int ret; 965 int ret;
973 966
974 tree = &BTRFS_I(mapping->host)->io_tree;
975 if (wbc->sync_mode == WB_SYNC_NONE) { 967 if (wbc->sync_mode == WB_SYNC_NONE) {
976 968
977 if (wbc->for_kupdate) 969 if (wbc->for_kupdate)
@@ -1010,8 +1002,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
1010 extent_invalidatepage(tree, page, offset); 1002 extent_invalidatepage(tree, page, offset);
1011 btree_releasepage(page, GFP_NOFS); 1003 btree_releasepage(page, GFP_NOFS);
1012 if (PagePrivate(page)) { 1004 if (PagePrivate(page)) {
1013 printk(KERN_WARNING "btrfs warning page private not zero " 1005 btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
1014 "on page %llu\n", (unsigned long long)page_offset(page)); 1006 "page private not zero on page %llu",
1007 (unsigned long long)page_offset(page));
1015 ClearPagePrivate(page); 1008 ClearPagePrivate(page);
1016 set_page_private(page, 0); 1009 set_page_private(page, 0);
1017 page_cache_release(page); 1010 page_cache_release(page);
@@ -1095,21 +1088,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1095struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1088struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1096 u64 bytenr, u32 blocksize) 1089 u64 bytenr, u32 blocksize)
1097{ 1090{
1098 struct inode *btree_inode = root->fs_info->btree_inode; 1091 return find_extent_buffer(root->fs_info, bytenr);
1099 struct extent_buffer *eb;
1100 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
1101 return eb;
1102} 1092}
1103 1093
1104struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1094struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1105 u64 bytenr, u32 blocksize) 1095 u64 bytenr, u32 blocksize)
1106{ 1096{
1107 struct inode *btree_inode = root->fs_info->btree_inode; 1097 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1108 struct extent_buffer *eb;
1109
1110 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1111 bytenr, blocksize);
1112 return eb;
1113} 1098}
1114 1099
1115 1100
@@ -1273,7 +1258,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1273 struct btrfs_root *root; 1258 struct btrfs_root *root;
1274 struct btrfs_key key; 1259 struct btrfs_key key;
1275 int ret = 0; 1260 int ret = 0;
1276 u64 bytenr;
1277 uuid_le uuid; 1261 uuid_le uuid;
1278 1262
1279 root = btrfs_alloc_root(fs_info); 1263 root = btrfs_alloc_root(fs_info);
@@ -1295,7 +1279,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1295 goto fail; 1279 goto fail;
1296 } 1280 }
1297 1281
1298 bytenr = leaf->start;
1299 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); 1282 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1300 btrfs_set_header_bytenr(leaf, leaf->start); 1283 btrfs_set_header_bytenr(leaf, leaf->start);
1301 btrfs_set_header_generation(leaf, trans->transid); 1284 btrfs_set_header_generation(leaf, trans->transid);
@@ -1616,7 +1599,8 @@ again:
1616 if (ret) 1599 if (ret)
1617 goto fail; 1600 goto fail;
1618 1601
1619 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
1603 location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
1620 if (ret < 0) 1604 if (ret < 0)
1621 goto fail; 1605 goto fail;
1622 if (ret == 0) 1606 if (ret == 0)
@@ -1684,18 +1668,16 @@ static void end_workqueue_fn(struct btrfs_work *work)
1684{ 1668{
1685 struct bio *bio; 1669 struct bio *bio;
1686 struct end_io_wq *end_io_wq; 1670 struct end_io_wq *end_io_wq;
1687 struct btrfs_fs_info *fs_info;
1688 int error; 1671 int error;
1689 1672
1690 end_io_wq = container_of(work, struct end_io_wq, work); 1673 end_io_wq = container_of(work, struct end_io_wq, work);
1691 bio = end_io_wq->bio; 1674 bio = end_io_wq->bio;
1692 fs_info = end_io_wq->info;
1693 1675
1694 error = end_io_wq->error; 1676 error = end_io_wq->error;
1695 bio->bi_private = end_io_wq->private; 1677 bio->bi_private = end_io_wq->private;
1696 bio->bi_end_io = end_io_wq->end_io; 1678 bio->bi_end_io = end_io_wq->end_io;
1697 kfree(end_io_wq); 1679 kfree(end_io_wq);
1698 bio_endio(bio, error); 1680 bio_endio_nodec(bio, error);
1699} 1681}
1700 1682
1701static int cleaner_kthread(void *arg) 1683static int cleaner_kthread(void *arg)
@@ -2080,6 +2062,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2080 for (i = 0; i < ret; i++) 2062 for (i = 0; i < ret; i++)
2081 btrfs_drop_and_free_fs_root(fs_info, gang[i]); 2063 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2082 } 2064 }
2065
2066 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
2067 btrfs_free_log_root_tree(NULL, fs_info);
2068 btrfs_destroy_pinned_extent(fs_info->tree_root,
2069 fs_info->pinned_extents);
2070 }
2083} 2071}
2084 2072
2085int open_ctree(struct super_block *sb, 2073int open_ctree(struct super_block *sb,
@@ -2154,6 +2142,7 @@ int open_ctree(struct super_block *sb,
2154 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2155 2143
2156 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2144 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2145 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2157 INIT_LIST_HEAD(&fs_info->trans_list); 2146 INIT_LIST_HEAD(&fs_info->trans_list);
2158 INIT_LIST_HEAD(&fs_info->dead_roots); 2147 INIT_LIST_HEAD(&fs_info->dead_roots);
2159 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2148 INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2167,6 +2156,7 @@ int open_ctree(struct super_block *sb,
2167 spin_lock_init(&fs_info->free_chunk_lock); 2156 spin_lock_init(&fs_info->free_chunk_lock);
2168 spin_lock_init(&fs_info->tree_mod_seq_lock); 2157 spin_lock_init(&fs_info->tree_mod_seq_lock);
2169 spin_lock_init(&fs_info->super_lock); 2158 spin_lock_init(&fs_info->super_lock);
2159 spin_lock_init(&fs_info->buffer_lock);
2170 rwlock_init(&fs_info->tree_mod_log_lock); 2160 rwlock_init(&fs_info->tree_mod_log_lock);
2171 mutex_init(&fs_info->reloc_mutex); 2161 mutex_init(&fs_info->reloc_mutex);
2172 seqlock_init(&fs_info->profiles_lock); 2162 seqlock_init(&fs_info->profiles_lock);
@@ -2198,7 +2188,7 @@ int open_ctree(struct super_block *sb,
2198 fs_info->free_chunk_space = 0; 2188 fs_info->free_chunk_space = 0;
2199 fs_info->tree_mod_log = RB_ROOT; 2189 fs_info->tree_mod_log = RB_ROOT;
2200 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 2190 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2201 2191 fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
2202 /* readahead state */ 2192 /* readahead state */
2203 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2193 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2204 spin_lock_init(&fs_info->reada_lock); 2194 spin_lock_init(&fs_info->reada_lock);
@@ -2337,7 +2327,7 @@ int open_ctree(struct super_block *sb,
 	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
 	 */
 	if (btrfs_check_super_csum(bh->b_data)) {
-		printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+		printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2356,7 +2346,7 @@ int open_ctree(struct super_block *sb,
 
 	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 	if (ret) {
-		printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+		printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2421,7 +2411,7 @@ int open_ctree(struct super_block *sb,
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
 
 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-		printk(KERN_ERR "btrfs: has skinny extents\n");
+		printk(KERN_ERR "BTRFS: has skinny extents\n");
 
 	/*
 	 * flag our filesystem as having big metadata blocks if
@@ -2429,7 +2419,7 @@ int open_ctree(struct super_block *sb,
 	 */
 	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
-			printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
 	}
 
@@ -2446,7 +2436,7 @@ int open_ctree(struct super_block *sb,
 	 */
 	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
 	    (sectorsize != leafsize)) {
-		printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+		printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
 				"are not allowed for mixed block groups on %s\n",
 				sb->s_id);
 		goto fail_alloc;
@@ -2583,12 +2573,12 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
 	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+		printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
 	if (sectorsize != PAGE_SIZE) {
-		printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+		printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
 		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2597,7 +2587,7 @@ int open_ctree(struct super_block *sb,
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to read the system "
+		printk(KERN_WARNING "BTRFS: failed to read the system "
 		       "array on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2614,7 +2604,7 @@ int open_ctree(struct super_block *sb,
 					   blocksize, generation);
 	if (!chunk_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
-		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2626,7 +2616,7 @@ int open_ctree(struct super_block *sb,
 
 	ret = btrfs_read_chunk_tree(chunk_root);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2638,7 +2628,7 @@ int open_ctree(struct super_block *sb,
 	btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
 	if (!fs_devices->latest_bdev) {
-		printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+		printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2653,7 +2643,7 @@ retry_root_backup:
 					  blocksize, generation);
 	if (!tree_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
-		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
 		       sb->s_id);
 
 		goto recovery_tree_root;
@@ -2724,50 +2714,56 @@ retry_root_backup:
 
 	ret = btrfs_recover_balance(fs_info);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to recover balance\n");
+		printk(KERN_WARNING "BTRFS: failed to recover balance\n");
 		goto fail_block_groups;
 	}
 
 	ret = btrfs_init_dev_stats(fs_info);
 	if (ret) {
-		printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+		printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
 		       ret);
 		goto fail_block_groups;
 	}
 
 	ret = btrfs_init_dev_replace(fs_info);
 	if (ret) {
-		pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+		pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
 		goto fail_block_groups;
 	}
 
 	btrfs_close_extra_devices(fs_info, fs_devices, 1);
 
-	ret = btrfs_init_space_info(fs_info);
+	ret = btrfs_sysfs_add_one(fs_info);
 	if (ret) {
-		printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
 		goto fail_block_groups;
 	}
 
+	ret = btrfs_init_space_info(fs_info);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+		goto fail_sysfs;
+	}
+
 	ret = btrfs_read_block_groups(extent_root);
 	if (ret) {
-		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
-		goto fail_block_groups;
+		printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+		goto fail_sysfs;
 	}
 	fs_info->num_tolerated_disk_barrier_failures =
 		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	if (fs_info->fs_devices->missing_devices >
 	     fs_info->num_tolerated_disk_barrier_failures &&
 	    !(sb->s_flags & MS_RDONLY)) {
-		printk(KERN_WARNING
-		       "Btrfs: too many missing devices, writeable mount is not allowed\n");
-		goto fail_block_groups;
+		printk(KERN_WARNING "BTRFS: "
+			"too many missing devices, writeable mount is not allowed\n");
+		goto fail_sysfs;
 	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_block_groups;
+		goto fail_sysfs;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2778,11 +2774,15 @@ retry_root_backup:
 	if (!btrfs_test_opt(tree_root, SSD) &&
 	    !btrfs_test_opt(tree_root, NOSSD) &&
 	    !fs_info->fs_devices->rotating) {
-		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+		printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
 		       "mode\n");
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
+	/* Set the real inode map cache flag */
+	if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
+		btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
 		ret = btrfsic_mount(tree_root, fs_devices,
@@ -2791,7 +2791,7 @@ retry_root_backup:
 				    1 : 0,
 				    fs_info->check_integrity_print_mask);
 		if (ret)
-			printk(KERN_WARNING "btrfs: failed to initialize"
+			printk(KERN_WARNING "BTRFS: failed to initialize"
 			       " integrity check module %s\n", sb->s_id);
 	}
 #endif
@@ -2804,7 +2804,7 @@ retry_root_backup:
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
-			printk(KERN_WARNING "Btrfs log replay required "
+			printk(KERN_WARNING "BTRFS: log replay required "
 			       "on RO media\n");
 			err = -EIO;
 			goto fail_qgroup;
@@ -2827,7 +2827,7 @@ retry_root_backup:
 						      generation + 1);
 		if (!log_tree_root->node ||
 		    !extent_buffer_uptodate(log_tree_root->node)) {
-			printk(KERN_ERR "btrfs: failed to read log tree\n");
+			printk(KERN_ERR "BTRFS: failed to read log tree\n");
 			free_extent_buffer(log_tree_root->node);
 			kfree(log_tree_root);
 			goto fail_trans_kthread;
@@ -2861,7 +2861,7 @@ retry_root_backup:
 		ret = btrfs_recover_relocation(tree_root);
 		if (ret < 0) {
 			printk(KERN_WARNING
-			       "btrfs: failed to recover relocation\n");
+			       "BTRFS: failed to recover relocation\n");
 			err = -EINVAL;
 			goto fail_qgroup;
 		}
@@ -2891,14 +2891,14 @@ retry_root_backup:
 
 	ret = btrfs_resume_balance_async(fs_info);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to resume balance\n");
+		printk(KERN_WARNING "BTRFS: failed to resume balance\n");
 		close_ctree(tree_root);
 		return ret;
 	}
 
 	ret = btrfs_resume_dev_replace_async(fs_info);
 	if (ret) {
-		pr_warn("btrfs: failed to resume dev_replace\n");
+		pr_warn("BTRFS: failed to resume dev_replace\n");
 		close_ctree(tree_root);
 		return ret;
 	}
@@ -2906,20 +2906,20 @@ retry_root_backup:
 	btrfs_qgroup_rescan_resume(fs_info);
 
 	if (create_uuid_tree) {
-		pr_info("btrfs: creating UUID tree\n");
+		pr_info("BTRFS: creating UUID tree\n");
 		ret = btrfs_create_uuid_tree(fs_info);
 		if (ret) {
-			pr_warn("btrfs: failed to create the UUID tree %d\n",
+			pr_warn("BTRFS: failed to create the UUID tree %d\n",
 				ret);
 			close_ctree(tree_root);
 			return ret;
 		}
 	} else if (check_uuid_tree ||
 		   btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
-		pr_info("btrfs: checking UUID tree\n");
+		pr_info("BTRFS: checking UUID tree\n");
 		ret = btrfs_check_uuid_tree(fs_info);
 		if (ret) {
-			pr_warn("btrfs: failed to check the UUID tree %d\n",
+			pr_warn("BTRFS: failed to check the UUID tree %d\n",
 				ret);
 			close_ctree(tree_root);
 			return ret;
@@ -2945,6 +2945,9 @@ fail_cleaner:
 	 */
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
+fail_sysfs:
+	btrfs_sysfs_remove_one(fs_info);
+
 fail_block_groups:
 	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
@@ -3000,7 +3003,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	struct btrfs_device *device = (struct btrfs_device *)
 		bh->b_private;
 
-	printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+	printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
 				  "I/O error on %s\n",
 				  rcu_str_deref(device->name));
 	/* note, we dont' set_buffer_write_io_error because we have
@@ -3119,7 +3122,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			if (!bh) {
-				printk(KERN_ERR "btrfs: couldn't get super "
+				printk(KERN_ERR "BTRFS: couldn't get super "
 				       "buffer head for bytenr %Lu\n", bytenr);
 				errors++;
 				continue;
@@ -3140,7 +3143,10 @@ static int write_dev_supers(struct btrfs_device *device,
 		 * we fua the first super. The others we allow
 		 * to go down lazy.
 		 */
-		ret = btrfsic_submit_bh(WRITE_FUA, bh);
+		if (i == 0)
+			ret = btrfsic_submit_bh(WRITE_FUA, bh);
+		else
+			ret = btrfsic_submit_bh(WRITE_SYNC, bh);
 		if (ret)
 			errors++;
 	}
@@ -3186,7 +3192,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	wait_for_completion(&device->flush_wait);
 
 	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-		printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+		printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
 			      rcu_str_deref(device->name));
 		device->nobarriers = 1;
 	} else if (!bio_flagged(bio, BIO_UPTODATE)) {
@@ -3407,7 +3413,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		btrfs_err(root->fs_info, "%d errors while writing supers",
 		       total_errors);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
@@ -3455,10 +3461,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 	if (btrfs_root_refs(&root->root_item) == 0)
 		synchronize_srcu(&fs_info->subvol_srcu);
 
-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_free_log(NULL, root);
-		btrfs_free_log_root_tree(NULL, fs_info);
-	}
 
 	__btrfs_remove_free_space_cache(root->free_ino_pinned);
 	__btrfs_remove_free_space_cache(root->free_ino_ctl);
@@ -3563,14 +3567,12 @@ int close_ctree(struct btrfs_root *root)
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
-			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+			btrfs_err(root->fs_info, "commit super ret %d", ret);
 	}
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_error_commit_super(root);
 
-	btrfs_put_block_group_cache(fs_info);
-
 	kthread_stop(fs_info->transaction_kthread);
 	kthread_stop(fs_info->cleaner_kthread);
 
@@ -3580,12 +3582,16 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_free_qgroup_config(root->fs_info);
 
 	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
-		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+		btrfs_info(root->fs_info, "at unmount delalloc count %lld",
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
+	btrfs_sysfs_remove_one(fs_info);
+
 	del_fs_roots(fs_info);
 
+	btrfs_put_block_group_cache(fs_info);
+
 	btrfs_free_block_groups(fs_info);
 
 	btrfs_stop_all_workers(fs_info);
@@ -3803,55 +3809,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	delayed_refs = &trans->delayed_refs;
 
 	spin_lock(&delayed_refs->lock);
-	if (delayed_refs->num_entries == 0) {
+	if (atomic_read(&delayed_refs->num_entries) == 0) {
 		spin_unlock(&delayed_refs->lock);
-		printk(KERN_INFO "delayed_refs has NO entry\n");
+		btrfs_info(root->fs_info, "delayed_refs has NO entry");
 		return ret;
 	}
 
-	while ((node = rb_first(&delayed_refs->root)) != NULL) {
-		struct btrfs_delayed_ref_head *head = NULL;
+	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+		struct btrfs_delayed_ref_head *head;
 		bool pin_bytes = false;
 
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		atomic_set(&ref->refs, 1);
-		if (btrfs_delayed_ref_is_head(ref)) {
-
-			head = btrfs_delayed_node_to_head(ref);
-			if (!mutex_trylock(&head->mutex)) {
-				atomic_inc(&ref->refs);
-				spin_unlock(&delayed_refs->lock);
-
-				/* Need to wait for the delayed ref to run */
-				mutex_lock(&head->mutex);
-				mutex_unlock(&head->mutex);
-				btrfs_put_delayed_ref(ref);
-
-				spin_lock(&delayed_refs->lock);
-				continue;
-			}
+		head = rb_entry(node, struct btrfs_delayed_ref_head,
+				href_node);
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
 
-			if (head->must_insert_reserved)
-				pin_bytes = true;
-			btrfs_free_delayed_extent_op(head->extent_op);
-			delayed_refs->num_heads--;
-			if (list_empty(&head->cluster))
-				delayed_refs->num_heads_ready--;
-			list_del_init(&head->cluster);
-		}
-
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		delayed_refs->num_entries--;
-		spin_unlock(&delayed_refs->lock);
-		if (head) {
-			if (pin_bytes)
-				btrfs_pin_extent(root, ref->bytenr,
-						 ref->num_bytes, 1);
+			mutex_lock(&head->mutex);
 			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+		spin_lock(&head->lock);
+		while ((node = rb_first(&head->ref_root)) != NULL) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &head->ref_root);
+			atomic_dec(&delayed_refs->num_entries);
+			btrfs_put_delayed_ref(ref);
 		}
-		btrfs_put_delayed_ref(ref);
+		if (head->must_insert_reserved)
+			pin_bytes = true;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		delayed_refs->num_heads--;
+		if (head->processing == 0)
+			delayed_refs->num_heads_ready--;
+		atomic_dec(&delayed_refs->num_entries);
+		head->node.in_tree = 0;
+		rb_erase(&head->href_node, &delayed_refs->href_root);
+		spin_unlock(&head->lock);
+		spin_unlock(&delayed_refs->lock);
+		mutex_unlock(&head->mutex);
 
+		if (pin_bytes)
+			btrfs_pin_extent(root, head->node.bytenr,
+					 head->node.num_bytes, 1);
+		btrfs_put_delayed_ref(&head->node);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
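
Aside on the hunk above: it reflects the merge's new two-level delayed-ref layout, in which ref heads live in their own rbtree (delayed_refs->href_root, keyed by bytenr) and each head keeps its pending refs in a private rbtree (head->ref_root) guarded by head->lock, so teardown drains one head at a time instead of scanning one flat tree. A minimal user-space model of that shape, with linked lists standing in for the kernel's rb_root/rb_node and no locking; struct ref, struct ref_head and destroy_all() are illustrative names, not kernel identifiers:

	#include <stdio.h>

	struct ref {				/* models btrfs_delayed_ref_node */
		unsigned long long seq;
		struct ref *next;		/* stand-in for rb_node in head->ref_root */
	};

	struct ref_head {			/* models btrfs_delayed_ref_head */
		unsigned long long bytenr;
		struct ref *refs;		/* per-head tree: head->ref_root */
		struct ref_head *next;		/* stand-in for href_node in href_root */
	};

	/* Two-level teardown in the shape of btrfs_destroy_delayed_refs():
	 * drain each head's private ref list, then drop the head itself. */
	static void destroy_all(struct ref_head *heads)
	{
		for (struct ref_head *h = heads; h; h = h->next) {
			for (struct ref *r = h->refs; r; r = r->next)
				printf("  drop ref seq=%llu on bytenr %llu\n",
				       r->seq, h->bytenr);
			printf("drop head bytenr=%llu\n", h->bytenr);
		}
	}

	int main(void)
	{
		struct ref r2 = { 2, NULL }, r1 = { 1, &r2 };
		struct ref_head h2 = { 8192, NULL, NULL };
		struct ref_head h1 = { 4096, &r1, &h2 };

		destroy_all(&h1);
		return 0;
	}
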
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9c01509dd8ab..32312e09f0f5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,6 +35,7 @@
35#include "locking.h" 35#include "locking.h"
36#include "free-space-cache.h" 36#include "free-space-cache.h"
37#include "math.h" 37#include "math.h"
38#include "sysfs.h"
38 39
39#undef SCRAMBLE_DELAYED_REFS 40#undef SCRAMBLE_DELAYED_REFS
40 41
@@ -441,7 +442,8 @@ next:
 		if (ret)
 			break;
 
-		if (need_resched()) {
+		if (need_resched() ||
+		    rwsem_is_contended(&fs_info->extent_commit_sem)) {
 			caching_ctl->progress = last;
 			btrfs_release_path(path);
 			up_read(&fs_info->extent_commit_sem);
@@ -855,12 +857,14 @@ again:
 			btrfs_put_delayed_ref(&head->node);
 			goto search_again;
 		}
+		spin_lock(&head->lock);
 		if (head->extent_op && head->extent_op->update_flags)
 			extent_flags |= head->extent_op->flags_to_set;
 		else
 			BUG_ON(num_refs == 0);
 
 		num_refs += head->node.ref_mod;
+		spin_unlock(&head->lock);
 		mutex_unlock(&head->mutex);
 	}
 	spin_unlock(&delayed_refs->lock);
@@ -1070,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -2285,64 +2289,62 @@ static noinline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
 	struct rb_node *node;
-	struct btrfs_delayed_ref_node *ref;
-	int action = BTRFS_ADD_DELAYED_REF;
-again:
+	struct btrfs_delayed_ref_node *ref, *last = NULL;;
+
 	/*
 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
 	 * this prevents ref count from going down to zero when
 	 * there still are pending delayed ref.
 	 */
-	node = rb_prev(&head->node.rb_node);
-	while (1) {
-		if (!node)
-			break;
+	node = rb_first(&head->ref_root);
+	while (node) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
 				rb_node);
-		if (ref->bytenr != head->node.bytenr)
-			break;
-		if (ref->action == action)
+		if (ref->action == BTRFS_ADD_DELAYED_REF)
 			return ref;
-		node = rb_prev(node);
-	}
-	if (action == BTRFS_ADD_DELAYED_REF) {
-		action = BTRFS_DROP_DELAYED_REF;
-		goto again;
+		else if (last == NULL)
+			last = ref;
+		node = rb_next(node);
 	}
-	return NULL;
+	return last;
 }
 
 /*
  * Returns 0 on success or if called with an already aborted transaction.
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
-static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
-				       struct list_head *cluster)
+				       unsigned long nr)
 {
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	ktime_t start = ktime_get();
 	int ret;
-	int count = 0;
+	unsigned long count = 0;
+	unsigned long actual_count = 0;
 	int must_insert_reserved = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	while (1) {
 		if (!locked_ref) {
-			/* pick a new head ref from the cluster list */
-			if (list_empty(cluster))
+			if (count >= nr)
 				break;
 
-			locked_ref = list_entry(cluster->next,
-				     struct btrfs_delayed_ref_head, cluster);
+			spin_lock(&delayed_refs->lock);
+			locked_ref = btrfs_select_ref_head(trans);
+			if (!locked_ref) {
+				spin_unlock(&delayed_refs->lock);
+				break;
+			}
 
 			/* grab the lock that says we are going to process
 			 * all the refs for this head */
 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
-
+			spin_unlock(&delayed_refs->lock);
 			/*
 			 * we may have dropped the spin lock to get the head
 			 * mutex lock, and that might have given someone else
@@ -2363,6 +2365,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		 * finish. If we merged anything we need to re-loop so we can
 		 * get a good ref.
 		 */
+		spin_lock(&locked_ref->lock);
 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
 					 locked_ref);
 
@@ -2374,17 +2377,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 		if (ref && ref->seq &&
 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
-			/*
-			 * there are still refs with lower seq numbers in the
-			 * process of being added. Don't run this ref yet.
-			 */
-			list_del_init(&locked_ref->cluster);
+			spin_unlock(&locked_ref->lock);
 			btrfs_delayed_ref_unlock(locked_ref);
-			locked_ref = NULL;
+			spin_lock(&delayed_refs->lock);
+			locked_ref->processing = 0;
 			delayed_refs->num_heads_ready++;
 			spin_unlock(&delayed_refs->lock);
+			locked_ref = NULL;
 			cond_resched();
-			spin_lock(&delayed_refs->lock);
+			count++;
 			continue;
 		}
 
@@ -2399,6 +2400,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		locked_ref->extent_op = NULL;
 
 		if (!ref) {
+
+
 			/* All delayed refs have been processed, Go ahead
 			 * and send the head node to run_one_delayed_ref,
 			 * so that any accounting fixes can happen
@@ -2411,8 +2414,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			}
 
 			if (extent_op) {
-				spin_unlock(&delayed_refs->lock);
-
+				spin_unlock(&locked_ref->lock);
 				ret = run_delayed_extent_op(trans, root,
 							    ref, extent_op);
 				btrfs_free_delayed_extent_op(extent_op);
@@ -2426,19 +2428,39 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 					 */
 					if (must_insert_reserved)
 						locked_ref->must_insert_reserved = 1;
+					locked_ref->processing = 0;
 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
-					spin_lock(&delayed_refs->lock);
 					btrfs_delayed_ref_unlock(locked_ref);
 					return ret;
 				}
+				continue;
+			}
 
-				goto next;
+			/*
+			 * Need to drop our head ref lock and re-aqcuire the
+			 * delayed ref lock and then re-check to make sure
+			 * nobody got added.
+			 */
+			spin_unlock(&locked_ref->lock);
+			spin_lock(&delayed_refs->lock);
+			spin_lock(&locked_ref->lock);
+			if (rb_first(&locked_ref->ref_root)) {
+				spin_unlock(&locked_ref->lock);
+				spin_unlock(&delayed_refs->lock);
+				continue;
 			}
+			ref->in_tree = 0;
+			delayed_refs->num_heads--;
+			rb_erase(&locked_ref->href_node,
+				 &delayed_refs->href_root);
+			spin_unlock(&delayed_refs->lock);
+		} else {
+			actual_count++;
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &locked_ref->ref_root);
 		}
+		atomic_dec(&delayed_refs->num_entries);
 
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		delayed_refs->num_entries--;
 		if (!btrfs_delayed_ref_is_head(ref)) {
 			/*
 			 * when we play the delayed ref, also correct the
@@ -2455,20 +2477,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			default:
 				WARN_ON(1);
 			}
-		} else {
-			list_del_init(&locked_ref->cluster);
 		}
-		spin_unlock(&delayed_refs->lock);
+		spin_unlock(&locked_ref->lock);
 
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
 
 		btrfs_free_delayed_extent_op(extent_op);
 		if (ret) {
+			locked_ref->processing = 0;
 			btrfs_delayed_ref_unlock(locked_ref);
 			btrfs_put_delayed_ref(ref);
 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
-			spin_lock(&delayed_refs->lock);
 			return ret;
 		}
 
@@ -2484,11 +2504,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 		btrfs_put_delayed_ref(ref);
 		count++;
-next:
 		cond_resched();
+	}
+
+	/*
+	 * We don't want to include ref heads since we can have empty ref heads
+	 * and those will drastically skew our runtime down since we just do
+	 * accounting, no actual extent tree updates.
+	 */
+	if (actual_count > 0) {
+		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+		u64 avg;
+
+		/*
+		 * We weigh the current average higher than our current runtime
+		 * to avoid large swings in the average.
+		 */
 		spin_lock(&delayed_refs->lock);
+		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+		avg = div64_u64(avg, 4);
+		fs_info->avg_delayed_ref_runtime = avg;
+		spin_unlock(&delayed_refs->lock);
 	}
-	return count;
+	return 0;
 }
 
 #ifdef SCRAMBLE_DELAYED_REFS
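
Aside on the hunk above: the actual_count bookkeeping feeds a 3:1 weighted moving average, where the stored avg_delayed_ref_runtime counts three times and the just-measured batch runtime once, so avg' = (3 * avg + runtime) / 4 and a single slow batch only moves the estimate a quarter of the way. A stand-alone sketch of the same update rule (user-space C; update_avg() is an illustrative name, not kernel code):

	#include <stdio.h>

	/* Same weighting as the hunk above: new_avg = (3 * avg + sample) / 4 */
	static unsigned long long update_avg(unsigned long long avg,
					     unsigned long long sample)
	{
		return (avg * 3 + sample) / 4;
	}

	int main(void)
	{
		/* start from the open_ctree() default of NSEC_PER_SEC / 64 */
		unsigned long long avg = 1000000000ULL / 64;
		int i;

		/* a run of slow 100ms batches pulls the average up gradually */
		for (i = 0; i < 4; i++) {
			avg = update_avg(avg, 100000000ULL);
			printf("avg after sample %d: %llu ns\n", i + 1, avg);
		}
		return 0;
	}
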
@@ -2570,16 +2608,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
-		      int count)
-{
-	int val = atomic_read(&delayed_refs->ref_seq);
-
-	if (val < seq || val >= seq + count)
-		return 1;
-	return 0;
-}
-
 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 {
 	u64 num_bytes;
@@ -2596,7 +2624,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *global_rsv;
@@ -2625,6 +2653,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 num_entries =
+		atomic_read(&trans->transaction->delayed_refs.num_entries);
+	u64 avg_runtime;
+
+	smp_mb();
+	avg_runtime = fs_info->avg_delayed_ref_runtime;
+	if (num_entries * avg_runtime >= NSEC_PER_SEC)
+		return 1;
+
+	return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far. count can be
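
Worked numbers for the new throttle test above: open_ctree() seeds avg_delayed_ref_runtime with NSEC_PER_SEC / 64, roughly 15.6 ms, so num_entries * avg_runtime reaches the one-second budget at about 64 queued entries. If measured batches later average 1 ms, the same inequality tolerates about 1000 entries before returning 1; the permitted backlog is simply however many refs are estimated to take one second to run, with btrfs_check_space_for_delayed_refs() as the fallback check.
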
@@ -2640,13 +2684,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 {
 	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-	struct list_head cluster;
+	struct btrfs_delayed_ref_head *head;
 	int ret;
-	u64 delayed_start;
 	int run_all = count == (unsigned long)-1;
 	int run_most = 0;
-	int loops;
 
 	/* We'll clean this up in btrfs_cleanup_transaction */
 	if (trans->aborted)
@@ -2658,130 +2699,40 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	INIT_LIST_HEAD(&cluster);
 	if (count == 0) {
-		count = delayed_refs->num_entries * 2;
+		count = atomic_read(&delayed_refs->num_entries) * 2;
 		run_most = 1;
 	}
 
-	if (!run_all && !run_most) {
-		int old;
-		int seq = atomic_read(&delayed_refs->ref_seq);
-
-progress:
-		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-		if (old) {
-			DEFINE_WAIT(__wait);
-			if (delayed_refs->flushing ||
-			    !btrfs_should_throttle_delayed_refs(trans, root))
-				return 0;
-
-			prepare_to_wait(&delayed_refs->wait, &__wait,
-					TASK_UNINTERRUPTIBLE);
-
-			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-			if (old) {
-				schedule();
-				finish_wait(&delayed_refs->wait, &__wait);
-
-				if (!refs_newer(delayed_refs, seq, 256))
-					goto progress;
-				else
-					return 0;
-			} else {
-				finish_wait(&delayed_refs->wait, &__wait);
-				goto again;
-			}
-		}
-
-	} else {
-		atomic_inc(&delayed_refs->procs_running_refs);
-	}
-
 again:
-	loops = 0;
-	spin_lock(&delayed_refs->lock);
-
 #ifdef SCRAMBLE_DELAYED_REFS
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
-
-	while (1) {
-		if (!(run_all || run_most) &&
-		    !btrfs_should_throttle_delayed_refs(trans, root))
-			break;
-
-		/*
-		 * go find something we can process in the rbtree. We start at
-		 * the beginning of the tree, and then build a cluster
-		 * of refs to process starting at the first one we are able to
-		 * lock
-		 */
-		delayed_start = delayed_refs->run_delayed_start;
-		ret = btrfs_find_ref_cluster(trans, &cluster,
-					     delayed_refs->run_delayed_start);
-		if (ret)
-			break;
-
-		ret = run_clustered_refs(trans, root, &cluster);
-		if (ret < 0) {
-			btrfs_release_ref_cluster(&cluster);
-			spin_unlock(&delayed_refs->lock);
-			btrfs_abort_transaction(trans, root, ret);
-			atomic_dec(&delayed_refs->procs_running_refs);
-			wake_up(&delayed_refs->wait);
-			return ret;
-		}
-
-		atomic_add(ret, &delayed_refs->ref_seq);
-
-		count -= min_t(unsigned long, ret, count);
-
-		if (count == 0)
-			break;
-
-		if (delayed_start >= delayed_refs->run_delayed_start) {
-			if (loops == 0) {
-				/*
-				 * btrfs_find_ref_cluster looped. let's do one
-				 * more cycle. if we don't run any delayed ref
-				 * during that cycle (because we can't because
-				 * all of them are blocked), bail out.
-				 */
-				loops = 1;
-			} else {
-				/*
-				 * no runnable refs left, stop trying
-				 */
-				BUG_ON(run_all);
-				break;
-			}
-		}
-		if (ret) {
-			/* refs were run, let's reset staleness detection */
-			loops = 0;
-		}
+	ret = __btrfs_run_delayed_refs(trans, root, count);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
 	}
 
 	if (run_all) {
-		if (!list_empty(&trans->new_bgs)) {
-			spin_unlock(&delayed_refs->lock);
+		if (!list_empty(&trans->new_bgs))
 			btrfs_create_pending_block_groups(trans, root);
-			spin_lock(&delayed_refs->lock);
-		}
 
-		node = rb_first(&delayed_refs->root);
-		if (!node)
+		spin_lock(&delayed_refs->lock);
+		node = rb_first(&delayed_refs->href_root);
+		if (!node) {
+			spin_unlock(&delayed_refs->lock);
 			goto out;
+		}
 		count = (unsigned long)-1;
 
 		while (node) {
-			ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				       rb_node);
-			if (btrfs_delayed_ref_is_head(ref)) {
-				struct btrfs_delayed_ref_head *head;
+			head = rb_entry(node, struct btrfs_delayed_ref_head,
+					href_node);
+			if (btrfs_delayed_ref_is_head(&head->node)) {
+				struct btrfs_delayed_ref_node *ref;
 
-				head = btrfs_delayed_node_to_head(ref);
+				ref = &head->node;
 				atomic_inc(&ref->refs);
 
 				spin_unlock(&delayed_refs->lock);
@@ -2795,20 +2746,16 @@ again:
 				btrfs_put_delayed_ref(ref);
 				cond_resched();
 				goto again;
+			} else {
+				WARN_ON(1);
 			}
 			node = rb_next(node);
 		}
 		spin_unlock(&delayed_refs->lock);
-		schedule_timeout(1);
+		cond_resched();
 		goto again;
 	}
 out:
-	atomic_dec(&delayed_refs->procs_running_refs);
-	smp_mb();
-	if (waitqueue_active(&delayed_refs->wait))
-		wake_up(&delayed_refs->wait);
-
-	spin_unlock(&delayed_refs->lock);
 	assert_qgroups_uptodate(trans);
 	return 0;
 }
@@ -2850,12 +2797,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	struct rb_node *node;
 	int ret = 0;
 
-	ret = -ENOENT;
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 	head = btrfs_find_delayed_ref_head(trans, bytenr);
-	if (!head)
-		goto out;
+	if (!head) {
+		spin_unlock(&delayed_refs->lock);
+		return 0;
+	}
 
 	if (!mutex_trylock(&head->mutex)) {
 		atomic_inc(&head->node.refs);
@@ -2872,40 +2820,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 		btrfs_put_delayed_ref(&head->node);
 		return -EAGAIN;
 	}
+	spin_unlock(&delayed_refs->lock);
 
-	node = rb_prev(&head->node.rb_node);
-	if (!node)
-		goto out_unlock;
-
-	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-	if (ref->bytenr != bytenr)
-		goto out_unlock;
-
-	ret = 1;
-	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
-		goto out_unlock;
+	spin_lock(&head->lock);
+	node = rb_first(&head->ref_root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
 
-	data_ref = btrfs_delayed_node_to_data_ref(ref);
+		/* If it's a shared ref we know a cross reference exists */
+		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+			ret = 1;
+			break;
+		}
 
-	node = rb_prev(node);
-	if (node) {
-		int seq = ref->seq;
+		data_ref = btrfs_delayed_node_to_data_ref(ref);
 
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		if (ref->bytenr == bytenr && ref->seq == seq)
-			goto out_unlock;
+		/*
+		 * If our ref doesn't match the one we're currently looking at
+		 * then we have a cross reference.
+		 */
+		if (data_ref->root != root->root_key.objectid ||
+		    data_ref->objectid != objectid ||
+		    data_ref->offset != offset) {
+			ret = 1;
+			break;
+		}
 	}
-
-	if (data_ref->root != root->root_key.objectid ||
-	    data_ref->objectid != objectid || data_ref->offset != offset)
-		goto out_unlock;
-
-	ret = 0;
-out_unlock:
+	spin_unlock(&head->lock);
 	mutex_unlock(&head->mutex);
-out:
-	spin_unlock(&delayed_refs->lock);
 	return ret;
 }
 
@@ -3402,6 +3345,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	return readonly;
 }
 
+static const char *alloc_name(u64 flags)
+{
+	switch (flags) {
+	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+		return "mixed";
+	case BTRFS_BLOCK_GROUP_METADATA:
+		return "metadata";
+	case BTRFS_BLOCK_GROUP_DATA:
+		return "data";
+	case BTRFS_BLOCK_GROUP_SYSTEM:
+		return "system";
+	default:
+		WARN_ON(1);
+		return "invalid-combination";
+	};
+}
+
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
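
Aside: alloc_name() above supplies the directory name for the per-space_info kobject that a later hunk registers via kobject_init_and_add(). Assuming the sysfs layout this merge introduces in fs/btrfs/sysfs.c (an "allocation" parent kobject per filesystem), the space infos then show up as /sys/fs/btrfs/<fsid>/allocation/data, metadata, system, or mixed; verify against sysfs.c if the exact paths matter.
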
@@ -3439,8 +3399,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		return ret;
 	}
 
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
 		INIT_LIST_HEAD(&found->block_groups[i]);
+		kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
+	}
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -3457,11 +3419,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->chunk_alloc = 0;
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
+
+	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+				    info->space_info_kobj, "%s",
+				    alloc_name(found->flags));
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		info->data_sinfo = found;
-	return 0;
+
+	return ret;
 }
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
@@ -4637,7 +4609,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
 			     u64 num_bytes)
 {
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-	if (global_rsv->full || global_rsv == block_rsv ||
+	if (global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
@@ -5916,24 +5888,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-	struct rb_node *node;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 	head = btrfs_find_delayed_ref_head(trans, bytenr);
 	if (!head)
-		goto out;
+		goto out_delayed_unlock;
 
-	node = rb_prev(&head->node.rb_node);
-	if (!node)
-		goto out;
-
-	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-	/* there are still entries for this ref, we can't drop it */
-	if (ref->bytenr == bytenr)
+	spin_lock(&head->lock);
+	if (rb_first(&head->ref_root))
 		goto out;
 
 	if (head->extent_op) {
@@ -5955,19 +5919,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	 * ahead and process it.
 	 */
 	head->node.in_tree = 0;
-	rb_erase(&head->node.rb_node, &delayed_refs->root);
+	rb_erase(&head->href_node, &delayed_refs->href_root);
 
-	delayed_refs->num_entries--;
+	atomic_dec(&delayed_refs->num_entries);
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
 	 * tree, so we just steal the ref the tree was holding.
 	 */
 	delayed_refs->num_heads--;
-	if (list_empty(&head->cluster))
+	if (head->processing == 0)
 		delayed_refs->num_heads_ready--;
-
-	list_del_init(&head->cluster);
+	head->processing = 0;
+	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
 
 	BUG_ON(head->extent_op);
@@ -5978,6 +5942,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	btrfs_put_delayed_ref(&head->node);
 	return ret;
 out:
+	spin_unlock(&head->lock);
+
+out_delayed_unlock:
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -6145,11 +6112,29 @@ int __get_raid_index(u64 flags)
 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
 	return __get_raid_index(cache->flags);
 }
 
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10]	= "raid10",
+	[BTRFS_RAID_RAID1]	= "raid1",
+	[BTRFS_RAID_DUP]	= "dup",
+	[BTRFS_RAID_RAID0]	= "raid0",
+	[BTRFS_RAID_SINGLE]	= "single",
+	[BTRFS_RAID_RAID5]	= "raid5",
+	[BTRFS_RAID_RAID6]	= "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+	if (type >= BTRFS_NR_RAID_TYPES)
+		return NULL;
+
+	return btrfs_raid_type_names[type];
+}
+
 enum btrfs_loop_type {
 	LOOP_CACHING_NOWAIT = 0,
 	LOOP_CACHING_WAIT = 1,
@@ -6177,7 +6162,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
 	u64 max_extent_size = 0;
 	int empty_cluster = 2 * 1024 * 1024;
@@ -6186,7 +6170,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	int index = __get_raid_index(flags);
 	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
-	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
@@ -6239,7 +6222,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
-		used_block_group = block_group;
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -6276,7 +6258,6 @@ search:
 		u64 offset;
 		int cached;
 
-		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -6304,7 +6285,6 @@ search:
 have_block_group:
 		cached = block_group_cache_done(block_group);
 		if (unlikely(!cached)) {
-			found_uncached_bg = true;
 			ret = cache_block_group(block_group, 0);
 			BUG_ON(ret < 0);
 			ret = 0;
@@ -6320,6 +6300,7 @@ have_block_group:
 		 * lets look there
 		 */
 		if (last_ptr) {
+			struct btrfs_block_group_cache *used_block_group;
 			unsigned long aligned_cluster;
 			/*
 			 * the refill lock keeps out other
@@ -6330,10 +6311,8 @@ have_block_group:
6330 if (used_block_group != block_group && 6311 if (used_block_group != block_group &&
6331 (!used_block_group || 6312 (!used_block_group ||
6332 used_block_group->ro || 6313 used_block_group->ro ||
6333 !block_group_bits(used_block_group, flags))) { 6314 !block_group_bits(used_block_group, flags)))
6334 used_block_group = block_group;
6335 goto refill_cluster; 6315 goto refill_cluster;
6336 }
6337 6316
6338 if (used_block_group != block_group) 6317 if (used_block_group != block_group)
6339 btrfs_get_block_group(used_block_group); 6318 btrfs_get_block_group(used_block_group);
@@ -6347,17 +6326,19 @@ have_block_group:
6347 /* we have a block, we're done */ 6326 /* we have a block, we're done */
6348 spin_unlock(&last_ptr->refill_lock); 6327 spin_unlock(&last_ptr->refill_lock);
6349 trace_btrfs_reserve_extent_cluster(root, 6328 trace_btrfs_reserve_extent_cluster(root,
6350 block_group, search_start, num_bytes); 6329 used_block_group,
6330 search_start, num_bytes);
6331 if (used_block_group != block_group) {
6332 btrfs_put_block_group(block_group);
6333 block_group = used_block_group;
6334 }
6351 goto checks; 6335 goto checks;
6352 } 6336 }
6353 6337
6354 WARN_ON(last_ptr->block_group != used_block_group); 6338 WARN_ON(last_ptr->block_group != used_block_group);
6355 if (used_block_group != block_group) { 6339 if (used_block_group != block_group)
6356 btrfs_put_block_group(used_block_group); 6340 btrfs_put_block_group(used_block_group);
6357 used_block_group = block_group;
6358 }
6359refill_cluster: 6341refill_cluster:
6360 BUG_ON(used_block_group != block_group);
6361 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6342 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6362 * set up a new clusters, so lets just skip it 6343 * set up a new clusters, so lets just skip it
6363 * and let the allocator find whatever block 6344 * and let the allocator find whatever block
@@ -6476,25 +6457,25 @@ unclustered_alloc:
6476 goto loop; 6457 goto loop;
6477 } 6458 }
6478checks: 6459checks:
6479 search_start = stripe_align(root, used_block_group, 6460 search_start = stripe_align(root, block_group,
6480 offset, num_bytes); 6461 offset, num_bytes);
6481 6462
6482 /* move on to the next group */ 6463 /* move on to the next group */
6483 if (search_start + num_bytes > 6464 if (search_start + num_bytes >
6484 used_block_group->key.objectid + used_block_group->key.offset) { 6465 block_group->key.objectid + block_group->key.offset) {
6485 btrfs_add_free_space(used_block_group, offset, num_bytes); 6466 btrfs_add_free_space(block_group, offset, num_bytes);
6486 goto loop; 6467 goto loop;
6487 } 6468 }
6488 6469
6489 if (offset < search_start) 6470 if (offset < search_start)
6490 btrfs_add_free_space(used_block_group, offset, 6471 btrfs_add_free_space(block_group, offset,
6491 search_start - offset); 6472 search_start - offset);
6492 BUG_ON(offset > search_start); 6473 BUG_ON(offset > search_start);
6493 6474
6494 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6475 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6495 alloc_type); 6476 alloc_type);
6496 if (ret == -EAGAIN) { 6477 if (ret == -EAGAIN) {
6497 btrfs_add_free_space(used_block_group, offset, num_bytes); 6478 btrfs_add_free_space(block_group, offset, num_bytes);
6498 goto loop; 6479 goto loop;
6499 } 6480 }
6500 6481
@@ -6504,16 +6485,12 @@ checks:
6504 6485
6505 trace_btrfs_reserve_extent(orig_root, block_group, 6486 trace_btrfs_reserve_extent(orig_root, block_group,
6506 search_start, num_bytes); 6487 search_start, num_bytes);
6507 if (used_block_group != block_group)
6508 btrfs_put_block_group(used_block_group);
6509 btrfs_put_block_group(block_group); 6488 btrfs_put_block_group(block_group);
6510 break; 6489 break;
6511loop: 6490loop:
6512 failed_cluster_refill = false; 6491 failed_cluster_refill = false;
6513 failed_alloc = false; 6492 failed_alloc = false;
6514 BUG_ON(index != get_block_group_index(block_group)); 6493 BUG_ON(index != get_block_group_index(block_group));
6515 if (used_block_group != block_group)
6516 btrfs_put_block_group(used_block_group);
6517 btrfs_put_block_group(block_group); 6494 btrfs_put_block_group(block_group);
6518 } 6495 }
6519 up_read(&space_info->groups_sem); 6496 up_read(&space_info->groups_sem);
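[editor's note] The find_free_extent hunks above narrow used_block_group from function scope into the cluster branch and make a cluster hit hand its reference over to block_group immediately, so the tail paths (checks:/loop:) only ever manage a single reference. A hedged sketch of that reference-handoff idiom, using a toy refcount rather than btrfs's types:

#include <stdio.h>

/* Toy refcounted object standing in for btrfs_block_group_cache. */
struct group { int refs; const char *name; };

static void put_group(struct group *g) { g->refs--; }

/*
 * Cluster hit: adopt `used` as the single tracked reference.  Dropping the
 * old ref and reassigning the pointer here means every later exit path
 * releases exactly one object, instead of remembering whether two are held.
 */
static struct group *adopt(struct group *block_group, struct group *used)
{
	if (used != block_group) {
		put_group(block_group);   /* release the ref held on the old group */
		block_group = used;       /* `used` already carries its own ref */
	}
	return block_group;
}

int main(void)
{
	struct group a = { 1, "a" }, b = { 1, "b" };
	struct group *bg = adopt(&a, &b);

	printf("%s refs: a=%d b=%d\n", bg->name, a.refs, b.refs); /* b refs: a=0 b=1 */
	put_group(bg);
	return 0;
}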
@@ -6584,12 +6561,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6584 int index = 0; 6561 int index = 0;
6585 6562
6586 spin_lock(&info->lock); 6563 spin_lock(&info->lock);
6587 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6564 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
6588 info->flags, 6565 info->flags,
6589 info->total_bytes - info->bytes_used - info->bytes_pinned - 6566 info->total_bytes - info->bytes_used - info->bytes_pinned -
6590 info->bytes_reserved - info->bytes_readonly, 6567 info->bytes_reserved - info->bytes_readonly,
6591 (info->full) ? "" : "not "); 6568 (info->full) ? "" : "not ");
6592 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6569 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
6593 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6570 "reserved=%llu, may_use=%llu, readonly=%llu\n",
6594 info->total_bytes, info->bytes_used, info->bytes_pinned, 6571 info->total_bytes, info->bytes_used, info->bytes_pinned,
6595 info->bytes_reserved, info->bytes_may_use, 6572 info->bytes_reserved, info->bytes_may_use,
@@ -6603,7 +6580,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6603again: 6580again:
6604 list_for_each_entry(cache, &info->block_groups[index], list) { 6581 list_for_each_entry(cache, &info->block_groups[index], list) {
6605 spin_lock(&cache->lock); 6582 spin_lock(&cache->lock);
6606 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6583 printk(KERN_INFO "BTRFS: "
6584 "block group %llu has %llu bytes, "
6585 "%llu used %llu pinned %llu reserved %s\n",
6607 cache->key.objectid, cache->key.offset, 6586 cache->key.objectid, cache->key.offset,
6608 btrfs_block_group_used(&cache->item), cache->pinned, 6587 btrfs_block_group_used(&cache->item), cache->pinned,
6609 cache->reserved, cache->ro ? "[readonly]" : ""); 6588 cache->reserved, cache->ro ? "[readonly]" : "");
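[editor's note] The message hunks here and below hard-code a "BTRFS: " prefix into each printk; in the kernel the same consistency is often achieved with a per-file pr_fmt define, though this patch opts for explicit literals. A minimal userspace sketch of the single-prefix-macro alternative (the macro names are mine, not from the patch):

#include <stdio.h>

/* One place to change the subsystem prefix, instead of editing every call. */
#define LOG_PREFIX "BTRFS: "
#define log_info(fmt, ...) fprintf(stderr, LOG_PREFIX fmt, ##__VA_ARGS__)

int main(void)
{
	unsigned long long flags = 4, free_bytes = 1048576;

	log_info("space_info %llu has %llu free, is %sfull\n",
		 flags, free_bytes, "not ");
	return 0;
}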
@@ -6966,7 +6945,7 @@ again:
6966 /*DEFAULT_RATELIMIT_BURST*/ 1); 6945 /*DEFAULT_RATELIMIT_BURST*/ 1);
6967 if (__ratelimit(&_rs)) 6946 if (__ratelimit(&_rs))
6968 WARN(1, KERN_DEBUG 6947 WARN(1, KERN_DEBUG
6969 "btrfs: block rsv returned %d\n", ret); 6948 "BTRFS: block rsv returned %d\n", ret);
6970 } 6949 }
6971try_reserve: 6950try_reserve:
6972 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6951 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
@@ -7714,7 +7693,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7714 7693
7715 btrfs_end_transaction_throttle(trans, tree_root); 7694 btrfs_end_transaction_throttle(trans, tree_root);
7716 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 7695 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7717 pr_debug("btrfs: drop snapshot early exit\n"); 7696 pr_debug("BTRFS: drop snapshot early exit\n");
7718 err = -EAGAIN; 7697 err = -EAGAIN;
7719 goto out_free; 7698 goto out_free;
7720 } 7699 }
@@ -7779,7 +7758,7 @@ out:
7779 */ 7758 */
7780 if (!for_reloc && root_dropped == false) 7759 if (!for_reloc && root_dropped == false)
7781 btrfs_add_dead_root(root); 7760 btrfs_add_dead_root(root);
7782 if (err) 7761 if (err && err != -EAGAIN)
7783 btrfs_std_error(root->fs_info, err); 7762 btrfs_std_error(root->fs_info, err);
7784 return err; 7763 return err;
7785} 7764}
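[editor's note] The one-line change above stops btrfs_drop_snapshot from routing its own intentional early exit (-EAGAIN, returned a few hunks earlier when the cleaner should sleep) into btrfs_std_error. Filtering an expected errno before escalating is a small but load-bearing pattern; a sketch under those assumptions:

#include <errno.h>
#include <stdio.h>

static void fatal_error(int err) { fprintf(stderr, "fatal: %d\n", err); }

/* Returns -EAGAIN to mean "retry later", which callers expect and handle. */
static int drop_work(int should_yield)
{
	if (should_yield)
		return -EAGAIN;
	return 0;
}

int main(void)
{
	int err = drop_work(1);

	/* Escalate real failures only; -EAGAIN is part of the protocol. */
	if (err && err != -EAGAIN)
		fatal_error(err);
	printf("err=%d (not fatal)\n", err);
	return 0;
}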
@@ -8333,6 +8312,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8333 release_global_block_rsv(info); 8312 release_global_block_rsv(info);
8334 8313
8335 while (!list_empty(&info->space_info)) { 8314 while (!list_empty(&info->space_info)) {
8315 int i;
8316
8336 space_info = list_entry(info->space_info.next, 8317 space_info = list_entry(info->space_info.next,
8337 struct btrfs_space_info, 8318 struct btrfs_space_info,
8338 list); 8319 list);
@@ -8343,9 +8324,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8343 dump_space_info(space_info, 0, 0); 8324 dump_space_info(space_info, 0, 0);
8344 } 8325 }
8345 } 8326 }
8346 percpu_counter_destroy(&space_info->total_bytes_pinned);
8347 list_del(&space_info->list); 8327 list_del(&space_info->list);
8348 kfree(space_info); 8328 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8329 struct kobject *kobj;
8330 kobj = &space_info->block_group_kobjs[i];
8331 if (kobj->parent) {
8332 kobject_del(kobj);
8333 kobject_put(kobj);
8334 }
8335 }
8336 kobject_del(&space_info->kobj);
8337 kobject_put(&space_info->kobj);
8349 } 8338 }
8350 return 0; 8339 return 0;
8351} 8340}
@@ -8356,10 +8345,57 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8356 int index = get_block_group_index(cache); 8345 int index = get_block_group_index(cache);
8357 8346
8358 down_write(&space_info->groups_sem); 8347 down_write(&space_info->groups_sem);
8348 if (list_empty(&space_info->block_groups[index])) {
8349 struct kobject *kobj = &space_info->block_group_kobjs[index];
8350 int ret;
8351
8352 kobject_get(&space_info->kobj); /* put in release */
8353 ret = kobject_add(kobj, &space_info->kobj, "%s",
8354 get_raid_name(index));
8355 if (ret) {
8356 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8357 kobject_put(&space_info->kobj);
8358 }
8359 }
8359 list_add_tail(&cache->list, &space_info->block_groups[index]); 8360 list_add_tail(&cache->list, &space_info->block_groups[index]);
8360 up_write(&space_info->groups_sem); 8361 up_write(&space_info->groups_sem);
8361} 8362}
8362 8363
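[editor's note] __link_block_group now publishes a per-RAID-type kobject the first time a list becomes non-empty, and the teardown hunks (btrfs_free_block_groups, btrfs_remove_block_group) pair that with kobject_del followed by kobject_put; the extra kobject_get on the space_info parent is balanced when the child is released. A compilable userspace sketch of that get/put discipline with toy types, not the real kobject API:

#include <stdio.h>

/* Toy object mimicking the kobject get/put contract. */
struct kobj { int refs; char name[16]; struct kobj *parent; };

static void kobj_get(struct kobj *k) { k->refs++; }

static void kobj_put(struct kobj *k)
{
	if (--k->refs == 0) {
		/* Release drops the ref pinned on the parent at add time. */
		if (k->parent)
			kobj_put(k->parent);
		printf("freeing %s\n", k->name);
	}
}

static void kobj_add(struct kobj *child, struct kobj *parent, const char *name)
{
	kobj_get(parent);            /* parent must outlive the child */
	child->parent = parent;
	child->refs = 1;
	snprintf(child->name, sizeof(child->name), "%s", name);
}

int main(void)
{
	struct kobj space = { .refs = 1 }, raid1;

	snprintf(space.name, sizeof(space.name), "space_info");
	kobj_add(&raid1, &space, "raid1");

	/* Drop the last ref; in the kernel this follows kobject_del(). */
	kobj_put(&raid1);            /* frees raid1, drops the ref on space_info */
	kobj_put(&space);            /* frees space_info */
	return 0;
}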
8364static struct btrfs_block_group_cache *
8365btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8366{
8367 struct btrfs_block_group_cache *cache;
8368
8369 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8370 if (!cache)
8371 return NULL;
8372
8373 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8374 GFP_NOFS);
8375 if (!cache->free_space_ctl) {
8376 kfree(cache);
8377 return NULL;
8378 }
8379
8380 cache->key.objectid = start;
8381 cache->key.offset = size;
8382 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8383
8384 cache->sectorsize = root->sectorsize;
8385 cache->fs_info = root->fs_info;
8386 cache->full_stripe_len = btrfs_full_stripe_len(root,
8387 &root->fs_info->mapping_tree,
8388 start);
8389 atomic_set(&cache->count, 1);
8390 spin_lock_init(&cache->lock);
8391 INIT_LIST_HEAD(&cache->list);
8392 INIT_LIST_HEAD(&cache->cluster_list);
8393 INIT_LIST_HEAD(&cache->new_bg_list);
8394 btrfs_init_free_space_ctl(cache);
8395
8396 return cache;
8397}
8398
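[editor's note] The new constructor above collects the two-stage allocation (the cache, then its free_space_ctl) and the field initialization that btrfs_read_block_groups and btrfs_make_block_group previously duplicated; on partial failure it unwinds what it already allocated and returns NULL. A sketch of the shape, with invented stand-in types:

#include <stdlib.h>

/* Stand-ins for the two btrfs allocations; fields are illustrative. */
struct free_space_ctl { unsigned unit; };
struct block_group { unsigned long long start, size; struct free_space_ctl *ctl; };

static struct block_group *create_block_group(unsigned long long start,
					      unsigned long long size)
{
	struct block_group *bg = calloc(1, sizeof(*bg));

	if (!bg)
		return NULL;

	bg->ctl = calloc(1, sizeof(*bg->ctl));
	if (!bg->ctl) {
		free(bg);            /* unwind the first allocation */
		return NULL;
	}

	bg->start = start;
	bg->size = size;
	return bg;
}

int main(void)
{
	struct block_group *bg = create_block_group(0, 1 << 20);

	if (!bg)
		return 1;            /* a single NULL check at each call site */
	free(bg->ctl);
	free(bg);
	return 0;
}

As the later hunks show, the callers' error paths then shrink from freeing two objects by hand to a single btrfs_put_block_group(cache).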
8363int btrfs_read_block_groups(struct btrfs_root *root) 8399int btrfs_read_block_groups(struct btrfs_root *root)
8364{ 8400{
8365 struct btrfs_path *path; 8401 struct btrfs_path *path;
@@ -8395,26 +8431,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8395 break; 8431 break;
8396 if (ret != 0) 8432 if (ret != 0)
8397 goto error; 8433 goto error;
8434
8398 leaf = path->nodes[0]; 8435 leaf = path->nodes[0];
8399 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8436 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8400 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8437
8438 cache = btrfs_create_block_group_cache(root, found_key.objectid,
8439 found_key.offset);
8401 if (!cache) { 8440 if (!cache) {
8402 ret = -ENOMEM; 8441 ret = -ENOMEM;
8403 goto error; 8442 goto error;
8404 } 8443 }
8405 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8406 GFP_NOFS);
8407 if (!cache->free_space_ctl) {
8408 kfree(cache);
8409 ret = -ENOMEM;
8410 goto error;
8411 }
8412
8413 atomic_set(&cache->count, 1);
8414 spin_lock_init(&cache->lock);
8415 cache->fs_info = info;
8416 INIT_LIST_HEAD(&cache->list);
8417 INIT_LIST_HEAD(&cache->cluster_list);
8418 8444
8419 if (need_clear) { 8445 if (need_clear) {
8420 /* 8446 /*
@@ -8435,16 +8461,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8435 read_extent_buffer(leaf, &cache->item, 8461 read_extent_buffer(leaf, &cache->item,
8436 btrfs_item_ptr_offset(leaf, path->slots[0]), 8462 btrfs_item_ptr_offset(leaf, path->slots[0]),
8437 sizeof(cache->item)); 8463 sizeof(cache->item));
8438 memcpy(&cache->key, &found_key, sizeof(found_key)); 8464 cache->flags = btrfs_block_group_flags(&cache->item);
8439 8465
8440 key.objectid = found_key.objectid + found_key.offset; 8466 key.objectid = found_key.objectid + found_key.offset;
8441 btrfs_release_path(path); 8467 btrfs_release_path(path);
8442 cache->flags = btrfs_block_group_flags(&cache->item);
8443 cache->sectorsize = root->sectorsize;
8444 cache->full_stripe_len = btrfs_full_stripe_len(root,
8445 &root->fs_info->mapping_tree,
8446 found_key.objectid);
8447 btrfs_init_free_space_ctl(cache);
8448 8468
8449 /* 8469 /*
8450 * We need to exclude the super stripes now so that the space 8470 * We need to exclude the super stripes now so that the space
@@ -8458,8 +8478,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8458 * case. 8478 * case.
8459 */ 8479 */
8460 free_excluded_extents(root, cache); 8480 free_excluded_extents(root, cache);
8461 kfree(cache->free_space_ctl); 8481 btrfs_put_block_group(cache);
8462 kfree(cache);
8463 goto error; 8482 goto error;
8464 } 8483 }
8465 8484
@@ -8590,38 +8609,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8590 8609
8591 root->fs_info->last_trans_log_full_commit = trans->transid; 8610 root->fs_info->last_trans_log_full_commit = trans->transid;
8592 8611
8593 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8612 cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8594 if (!cache) 8613 if (!cache)
8595 return -ENOMEM; 8614 return -ENOMEM;
8596 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8597 GFP_NOFS);
8598 if (!cache->free_space_ctl) {
8599 kfree(cache);
8600 return -ENOMEM;
8601 }
8602
8603 cache->key.objectid = chunk_offset;
8604 cache->key.offset = size;
8605 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8606 cache->sectorsize = root->sectorsize;
8607 cache->fs_info = root->fs_info;
8608 cache->full_stripe_len = btrfs_full_stripe_len(root,
8609 &root->fs_info->mapping_tree,
8610 chunk_offset);
8611
8612 atomic_set(&cache->count, 1);
8613 spin_lock_init(&cache->lock);
8614 INIT_LIST_HEAD(&cache->list);
8615 INIT_LIST_HEAD(&cache->cluster_list);
8616 INIT_LIST_HEAD(&cache->new_bg_list);
8617
8618 btrfs_init_free_space_ctl(cache);
8619 8615
8620 btrfs_set_block_group_used(&cache->item, bytes_used); 8616 btrfs_set_block_group_used(&cache->item, bytes_used);
8621 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8617 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8622 cache->flags = type;
8623 btrfs_set_block_group_flags(&cache->item, type); 8618 btrfs_set_block_group_flags(&cache->item, type);
8624 8619
8620 cache->flags = type;
8625 cache->last_byte_to_unpin = (u64)-1; 8621 cache->last_byte_to_unpin = (u64)-1;
8626 cache->cached = BTRFS_CACHE_FINISHED; 8622 cache->cached = BTRFS_CACHE_FINISHED;
8627 ret = exclude_super_stripes(root, cache); 8623 ret = exclude_super_stripes(root, cache);
@@ -8631,8 +8627,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8631 * case. 8627 * case.
8632 */ 8628 */
8633 free_excluded_extents(root, cache); 8629 free_excluded_extents(root, cache);
8634 kfree(cache->free_space_ctl); 8630 btrfs_put_block_group(cache);
8635 kfree(cache);
8636 return ret; 8631 return ret;
8637 } 8632 }
8638 8633
@@ -8796,8 +8791,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8796 * are still on the list after taking the semaphore 8791 * are still on the list after taking the semaphore
8797 */ 8792 */
8798 list_del_init(&block_group->list); 8793 list_del_init(&block_group->list);
8799 if (list_empty(&block_group->space_info->block_groups[index])) 8794 if (list_empty(&block_group->space_info->block_groups[index])) {
8795 kobject_del(&block_group->space_info->block_group_kobjs[index]);
8796 kobject_put(&block_group->space_info->block_group_kobjs[index]);
8800 clear_avail_alloc_bits(root->fs_info, block_group->flags); 8797 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8798 }
8801 up_write(&block_group->space_info->groups_sem); 8799 up_write(&block_group->space_info->groups_sem);
8802 8800
8803 if (block_group->cached == BTRFS_CACHE_STARTED) 8801 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ff43802a7c88..85bbd01f1271 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void)
59 59
60 while (!list_empty(&states)) { 60 while (!list_empty(&states)) {
61 state = list_entry(states.next, struct extent_state, leak_list); 61 state = list_entry(states.next, struct extent_state, leak_list);
62 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 62 printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
63 "state %lu in tree %p refs %d\n", 63 "state %lu in tree %p refs %d\n",
64 state->start, state->end, state->state, state->tree, 64 state->start, state->end, state->state, state->tree,
65 atomic_read(&state->refs)); 65 atomic_read(&state->refs));
@@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void)
69 69
70 while (!list_empty(&buffers)) { 70 while (!list_empty(&buffers)) {
71 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 71 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
72 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 72 printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
73 "refs %d\n", 73 "refs %d\n",
74 eb->start, eb->len, atomic_read(&eb->refs)); 74 eb->start, eb->len, atomic_read(&eb->refs));
75 list_del(&eb->leak_list); 75 list_del(&eb->leak_list);
@@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void)
77 } 77 }
78} 78}
79 79
80#define btrfs_debug_check_extent_io_range(inode, start, end) \ 80#define btrfs_debug_check_extent_io_range(tree, start, end) \
81 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) 81 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
82static inline void __btrfs_debug_check_extent_io_range(const char *caller, 82static inline void __btrfs_debug_check_extent_io_range(const char *caller,
83 struct inode *inode, u64 start, u64 end) 83 struct extent_io_tree *tree, u64 start, u64 end)
84{ 84{
85 u64 isize = i_size_read(inode); 85 struct inode *inode;
86 u64 isize;
87
88 if (!tree->mapping)
89 return;
86 90
91 inode = tree->mapping->host;
92 isize = i_size_read(inode);
87 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 93 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
88 printk_ratelimited(KERN_DEBUG 94 printk_ratelimited(KERN_DEBUG
89 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", 95 "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
90 caller, btrfs_ino(inode), isize, start, end); 96 caller, btrfs_ino(inode), isize, start, end);
91 } 97 }
92} 98}
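[editor's note] The debug helper now takes the extent_io_tree itself and bails out when tree->mapping is NULL, since after the companion changes some trees no longer have a backing address_space from which to derive an inode. Guarding the dereference chain at the top is the whole fix; a trivial sketch (4096 stands in for PAGE_SIZE):

#include <stdio.h>

struct inode { unsigned long long isize; };
struct mapping { struct inode *host; };
struct io_tree { struct mapping *mapping; };

static void check_range(const char *caller, struct io_tree *tree,
			unsigned long long start, unsigned long long end)
{
	/* Trees without a backing mapping have no inode to report against. */
	if (!tree->mapping)
		return;

	if (end >= 4096 && (end % 2) == 0 && end != tree->mapping->host->isize - 1)
		fprintf(stderr, "%s: odd range [%llu,%llu]\n", caller, start, end);
}

int main(void)
{
	struct io_tree bare = { .mapping = NULL };

	check_range(__func__, &bare, 0, 8192);   /* safe: returns early */
	return 0;
}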
@@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data);
124static inline struct btrfs_fs_info * 130static inline struct btrfs_fs_info *
125tree_fs_info(struct extent_io_tree *tree) 131tree_fs_info(struct extent_io_tree *tree)
126{ 132{
133 if (!tree->mapping)
134 return NULL;
127 return btrfs_sb(tree->mapping->host->i_sb); 135 return btrfs_sb(tree->mapping->host->i_sb);
128} 136}
129 137
@@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree,
186 struct address_space *mapping) 194 struct address_space *mapping)
187{ 195{
188 tree->state = RB_ROOT; 196 tree->state = RB_ROOT;
189 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
190 tree->ops = NULL; 197 tree->ops = NULL;
191 tree->dirty_bytes = 0; 198 tree->dirty_bytes = 0;
192 spin_lock_init(&tree->lock); 199 spin_lock_init(&tree->lock);
193 spin_lock_init(&tree->buffer_lock);
194 tree->mapping = mapping; 200 tree->mapping = mapping;
195} 201}
196 202
@@ -224,12 +230,20 @@ void free_extent_state(struct extent_state *state)
224} 230}
225 231
226static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
227 struct rb_node *node) 233 struct rb_node *node,
234 struct rb_node ***p_in,
235 struct rb_node **parent_in)
228{ 236{
229 struct rb_node **p = &root->rb_node; 237 struct rb_node **p = &root->rb_node;
230 struct rb_node *parent = NULL; 238 struct rb_node *parent = NULL;
231 struct tree_entry *entry; 239 struct tree_entry *entry;
232 240
241 if (p_in && parent_in) {
242 p = *p_in;
243 parent = *parent_in;
244 goto do_insert;
245 }
246
233 while (*p) { 247 while (*p) {
234 parent = *p; 248 parent = *p;
235 entry = rb_entry(parent, struct tree_entry, rb_node); 249 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -242,35 +256,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
242 return parent; 256 return parent;
243 } 257 }
244 258
259do_insert:
245 rb_link_node(node, parent, p); 260 rb_link_node(node, parent, p);
246 rb_insert_color(node, root); 261 rb_insert_color(node, root);
247 return NULL; 262 return NULL;
248} 263}
249 264
250static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 265static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
251 struct rb_node **prev_ret, 266 struct rb_node **prev_ret,
252 struct rb_node **next_ret) 267 struct rb_node **next_ret,
268 struct rb_node ***p_ret,
269 struct rb_node **parent_ret)
253{ 270{
254 struct rb_root *root = &tree->state; 271 struct rb_root *root = &tree->state;
255 struct rb_node *n = root->rb_node; 272 struct rb_node **n = &root->rb_node;
256 struct rb_node *prev = NULL; 273 struct rb_node *prev = NULL;
257 struct rb_node *orig_prev = NULL; 274 struct rb_node *orig_prev = NULL;
258 struct tree_entry *entry; 275 struct tree_entry *entry;
259 struct tree_entry *prev_entry = NULL; 276 struct tree_entry *prev_entry = NULL;
260 277
261 while (n) { 278 while (*n) {
262 entry = rb_entry(n, struct tree_entry, rb_node); 279 prev = *n;
263 prev = n; 280 entry = rb_entry(prev, struct tree_entry, rb_node);
264 prev_entry = entry; 281 prev_entry = entry;
265 282
266 if (offset < entry->start) 283 if (offset < entry->start)
267 n = n->rb_left; 284 n = &(*n)->rb_left;
268 else if (offset > entry->end) 285 else if (offset > entry->end)
269 n = n->rb_right; 286 n = &(*n)->rb_right;
270 else 287 else
271 return n; 288 return *n;
272 } 289 }
273 290
291 if (p_ret)
292 *p_ret = n;
293 if (parent_ret)
294 *parent_ret = prev;
295
274 if (prev_ret) { 296 if (prev_ret) {
275 orig_prev = prev; 297 orig_prev = prev;
276 while (prev && offset > prev_entry->end) { 298 while (prev && offset > prev_entry->end) {
@@ -292,18 +314,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
292 return NULL; 314 return NULL;
293} 315}
294 316
295static inline struct rb_node *tree_search(struct extent_io_tree *tree, 317static inline struct rb_node *
296 u64 offset) 318tree_search_for_insert(struct extent_io_tree *tree,
319 u64 offset,
320 struct rb_node ***p_ret,
321 struct rb_node **parent_ret)
297{ 322{
298 struct rb_node *prev = NULL; 323 struct rb_node *prev = NULL;
299 struct rb_node *ret; 324 struct rb_node *ret;
300 325
301 ret = __etree_search(tree, offset, &prev, NULL); 326 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
302 if (!ret) 327 if (!ret)
303 return prev; 328 return prev;
304 return ret; 329 return ret;
305} 330}
306 331
332static inline struct rb_node *tree_search(struct extent_io_tree *tree,
333 u64 offset)
334{
335 return tree_search_for_insert(tree, offset, NULL, NULL);
336}
337
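[editor's note] The reworked search walks the tree through a struct rb_node ** cursor so that, on a miss, it can report via p_ret/parent_ret exactly where the new key would link; insert_state then reuses that slot instead of re-walking, and the two call sites that previously lost the freshly inserted state now also cache_state() it. A self-contained sketch of the search-once/insert-with-hint idiom over a plain binary search tree (not the kernel rbtree API):

#include <stdio.h>
#include <stdlib.h>

struct node { long key; struct node *left, *right; };

/*
 * Search that, on a miss, reports the exact link pointer (&parent->left or
 * &parent->right) where the new node belongs, so insert needn't re-walk.
 */
static struct node *search_for_insert(struct node **root, long key,
				      struct node ***link_ret)
{
	struct node **n = root;

	while (*n) {
		if (key < (*n)->key)
			n = &(*n)->left;
		else if (key > (*n)->key)
			n = &(*n)->right;
		else
			return *n;       /* hit */
	}
	*link_ret = n;                   /* miss: remembered insertion slot */
	return NULL;
}

static struct node *insert_at(struct node **link, long key)
{
	struct node *nn = calloc(1, sizeof(*nn));

	nn->key = key;
	*link = nn;                      /* O(1): reuse the cached slot */
	return nn;
}

int main(void)
{
	struct node *root = NULL, **link;

	if (!search_for_insert(&root, 42, &link))
		insert_at(link, 42);     /* one traversal total */
	if (search_for_insert(&root, 42, &link))
		printf("found 42\n");
	return 0;
}

A balanced rbtree additionally needs the parent node to recolor after linking, which is why the patch threads parent_ret alongside the link pointer.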
307static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 338static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
308 struct extent_state *other) 339 struct extent_state *other)
309{ 340{
@@ -385,23 +416,25 @@ static void set_state_bits(struct extent_io_tree *tree,
385 */ 416 */
386static int insert_state(struct extent_io_tree *tree, 417static int insert_state(struct extent_io_tree *tree,
387 struct extent_state *state, u64 start, u64 end, 418 struct extent_state *state, u64 start, u64 end,
419 struct rb_node ***p,
420 struct rb_node **parent,
388 unsigned long *bits) 421 unsigned long *bits)
389{ 422{
390 struct rb_node *node; 423 struct rb_node *node;
391 424
392 if (end < start) 425 if (end < start)
393 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 426 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
394 end, start); 427 end, start);
395 state->start = start; 428 state->start = start;
396 state->end = end; 429 state->end = end;
397 430
398 set_state_bits(tree, state, bits); 431 set_state_bits(tree, state, bits);
399 432
400 node = tree_insert(&tree->state, end, &state->rb_node); 433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
401 if (node) { 434 if (node) {
402 struct extent_state *found; 435 struct extent_state *found;
403 found = rb_entry(node, struct extent_state, rb_node); 436 found = rb_entry(node, struct extent_state, rb_node);
404 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 437 printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
405 "%llu %llu\n", 438 "%llu %llu\n",
406 found->start, found->end, start, end); 439 found->start, found->end, start, end);
407 return -EEXIST; 440 return -EEXIST;
@@ -444,7 +477,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
444 prealloc->state = orig->state; 477 prealloc->state = orig->state;
445 orig->start = split; 478 orig->start = split;
446 479
447 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
481 NULL, NULL);
448 if (node) { 482 if (node) {
449 free_extent_state(prealloc); 483 free_extent_state(prealloc);
450 return -EEXIST; 484 return -EEXIST;
@@ -542,7 +576,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
542 int err; 576 int err;
543 int clear = 0; 577 int clear = 0;
544 578
545 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 579 btrfs_debug_check_extent_io_range(tree, start, end);
546 580
547 if (bits & EXTENT_DELALLOC) 581 if (bits & EXTENT_DELALLOC)
548 bits |= EXTENT_NORESERVE; 582 bits |= EXTENT_NORESERVE;
@@ -702,7 +736,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
702 struct extent_state *state; 736 struct extent_state *state;
703 struct rb_node *node; 737 struct rb_node *node;
704 738
705 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 739 btrfs_debug_check_extent_io_range(tree, start, end);
706 740
707 spin_lock(&tree->lock); 741 spin_lock(&tree->lock);
708again: 742again:
@@ -783,11 +817,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
783 struct extent_state *state; 817 struct extent_state *state;
784 struct extent_state *prealloc = NULL; 818 struct extent_state *prealloc = NULL;
785 struct rb_node *node; 819 struct rb_node *node;
820 struct rb_node **p;
821 struct rb_node *parent;
786 int err = 0; 822 int err = 0;
787 u64 last_start; 823 u64 last_start;
788 u64 last_end; 824 u64 last_end;
789 825
790 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 826 btrfs_debug_check_extent_io_range(tree, start, end);
791 827
792 bits |= EXTENT_FIRST_DELALLOC; 828 bits |= EXTENT_FIRST_DELALLOC;
793again: 829again:
@@ -809,14 +845,16 @@ again:
809 * this search will find all the extents that end after 845 * this search will find all the extents that end after
810 * our range starts. 846 * our range starts.
811 */ 847 */
812 node = tree_search(tree, start); 848 node = tree_search_for_insert(tree, start, &p, &parent);
813 if (!node) { 849 if (!node) {
814 prealloc = alloc_extent_state_atomic(prealloc); 850 prealloc = alloc_extent_state_atomic(prealloc);
815 BUG_ON(!prealloc); 851 BUG_ON(!prealloc);
816 err = insert_state(tree, prealloc, start, end, &bits); 852 err = insert_state(tree, prealloc, start, end,
853 &p, &parent, &bits);
817 if (err) 854 if (err)
818 extent_io_tree_panic(tree, err); 855 extent_io_tree_panic(tree, err);
819 856
857 cache_state(prealloc, cached_state);
820 prealloc = NULL; 858 prealloc = NULL;
821 goto out; 859 goto out;
822 } 860 }
@@ -919,7 +957,7 @@ hit_next:
919 * the later extent. 957 * the later extent.
920 */ 958 */
921 err = insert_state(tree, prealloc, start, this_end, 959 err = insert_state(tree, prealloc, start, this_end,
922 &bits); 960 NULL, NULL, &bits);
923 if (err) 961 if (err)
924 extent_io_tree_panic(tree, err); 962 extent_io_tree_panic(tree, err);
925 963
@@ -1005,11 +1043,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1005 struct extent_state *state; 1043 struct extent_state *state;
1006 struct extent_state *prealloc = NULL; 1044 struct extent_state *prealloc = NULL;
1007 struct rb_node *node; 1045 struct rb_node *node;
1046 struct rb_node **p;
1047 struct rb_node *parent;
1008 int err = 0; 1048 int err = 0;
1009 u64 last_start; 1049 u64 last_start;
1010 u64 last_end; 1050 u64 last_end;
1011 1051
1012 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 1052 btrfs_debug_check_extent_io_range(tree, start, end);
1013 1053
1014again: 1054again:
1015 if (!prealloc && (mask & __GFP_WAIT)) { 1055 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -1032,17 +1072,19 @@ again:
1032 * this search will find all the extents that end after 1072 * this search will find all the extents that end after
1033 * our range starts. 1073 * our range starts.
1034 */ 1074 */
1035 node = tree_search(tree, start); 1075 node = tree_search_for_insert(tree, start, &p, &parent);
1036 if (!node) { 1076 if (!node) {
1037 prealloc = alloc_extent_state_atomic(prealloc); 1077 prealloc = alloc_extent_state_atomic(prealloc);
1038 if (!prealloc) { 1078 if (!prealloc) {
1039 err = -ENOMEM; 1079 err = -ENOMEM;
1040 goto out; 1080 goto out;
1041 } 1081 }
1042 err = insert_state(tree, prealloc, start, end, &bits); 1082 err = insert_state(tree, prealloc, start, end,
1043 prealloc = NULL; 1083 &p, &parent, &bits);
1044 if (err) 1084 if (err)
1045 extent_io_tree_panic(tree, err); 1085 extent_io_tree_panic(tree, err);
1086 cache_state(prealloc, cached_state);
1087 prealloc = NULL;
1046 goto out; 1088 goto out;
1047 } 1089 }
1048 state = rb_entry(node, struct extent_state, rb_node); 1090 state = rb_entry(node, struct extent_state, rb_node);
@@ -1135,7 +1177,7 @@ hit_next:
1135 * the later extent. 1177 * the later extent.
1136 */ 1178 */
1137 err = insert_state(tree, prealloc, start, this_end, 1179 err = insert_state(tree, prealloc, start, this_end,
1138 &bits); 1180 NULL, NULL, &bits);
1139 if (err) 1181 if (err)
1140 extent_io_tree_panic(tree, err); 1182 extent_io_tree_panic(tree, err);
1141 cache_state(prealloc, cached_state); 1183 cache_state(prealloc, cached_state);
@@ -1984,7 +2026,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1984 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2026 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1985 if (!bio) 2027 if (!bio)
1986 return -EIO; 2028 return -EIO;
1987 bio->bi_size = 0; 2029 bio->bi_iter.bi_size = 0;
1988 map_length = length; 2030 map_length = length;
1989 2031
1990 ret = btrfs_map_block(fs_info, WRITE, logical, 2032 ret = btrfs_map_block(fs_info, WRITE, logical,
@@ -1995,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1995 } 2037 }
1996 BUG_ON(mirror_num != bbio->mirror_num); 2038 BUG_ON(mirror_num != bbio->mirror_num);
1997 sector = bbio->stripes[mirror_num-1].physical >> 9; 2039 sector = bbio->stripes[mirror_num-1].physical >> 9;
1998 bio->bi_sector = sector; 2040 bio->bi_iter.bi_sector = sector;
1999 dev = bbio->stripes[mirror_num-1].dev; 2041 dev = bbio->stripes[mirror_num-1].dev;
2000 kfree(bbio); 2042 kfree(bbio);
2001 if (!dev || !dev->bdev || !dev->writeable) { 2043 if (!dev || !dev->bdev || !dev->writeable) {
@@ -2012,9 +2054,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2012 return -EIO; 2054 return -EIO;
2013 } 2055 }
2014 2056
2015 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 2057 printk_ratelimited_in_rcu(KERN_INFO
2016 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2058 "BTRFS: read error corrected: ino %lu off %llu "
2017 start, rcu_str_deref(dev->name), sector); 2059 "(dev %s sector %llu)\n", page->mapping->host->i_ino,
2060 start, rcu_str_deref(dev->name), sector);
2018 2061
2019 bio_put(bio); 2062 bio_put(bio);
2020 return 0; 2063 return 0;
@@ -2156,7 +2199,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2156 return -EIO; 2199 return -EIO;
2157 } 2200 }
2158 2201
2159 if (em->start > start || em->start + em->len < start) { 2202 if (em->start > start || em->start + em->len <= start) {
2160 free_extent_map(em); 2203 free_extent_map(em);
2161 em = NULL; 2204 em = NULL;
2162 } 2205 }
@@ -2268,9 +2311,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2268 return -EIO; 2311 return -EIO;
2269 } 2312 }
2270 bio->bi_end_io = failed_bio->bi_end_io; 2313 bio->bi_end_io = failed_bio->bi_end_io;
2271 bio->bi_sector = failrec->logical >> 9; 2314 bio->bi_iter.bi_sector = failrec->logical >> 9;
2272 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2315 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2273 bio->bi_size = 0; 2316 bio->bi_iter.bi_size = 0;
2274 2317
2275 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2318 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2276 if (btrfs_failed_bio->csum) { 2319 if (btrfs_failed_bio->csum) {
@@ -2332,37 +2375,39 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2332 */ 2375 */
2333static void end_bio_extent_writepage(struct bio *bio, int err) 2376static void end_bio_extent_writepage(struct bio *bio, int err)
2334{ 2377{
2335 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2378 struct bio_vec *bvec;
2336 struct extent_io_tree *tree;
2337 u64 start; 2379 u64 start;
2338 u64 end; 2380 u64 end;
2381 int i;
2339 2382
2340 do { 2383 bio_for_each_segment_all(bvec, bio, i) {
2341 struct page *page = bvec->bv_page; 2384 struct page *page = bvec->bv_page;
2342 tree = &BTRFS_I(page->mapping->host)->io_tree;
2343 2385
2344 /* We always issue full-page reads, but if some block 2386 /* We always issue full-page reads, but if some block
2345 * in a page fails to read, blk_update_request() will 2387 * in a page fails to read, blk_update_request() will
2346 * advance bv_offset and adjust bv_len to compensate. 2388 * advance bv_offset and adjust bv_len to compensate.
2347 * Print a warning for nonzero offsets, and an error 2389 * Print a warning for nonzero offsets, and an error
2348 * if they don't add up to a full page. */ 2390 * if they don't add up to a full page. */
2349 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2391 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2350 printk("%s page write in btrfs with offset %u and length %u\n", 2392 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2351 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2393 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2352 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2394 "partial page write in btrfs with offset %u and length %u",
2353 bvec->bv_offset, bvec->bv_len); 2395 bvec->bv_offset, bvec->bv_len);
2396 else
2397 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2398 "incomplete page write in btrfs with offset %u and "
2399 "length %u",
2400 bvec->bv_offset, bvec->bv_len);
2401 }
2354 2402
2355 start = page_offset(page); 2403 start = page_offset(page);
2356 end = start + bvec->bv_offset + bvec->bv_len - 1; 2404 end = start + bvec->bv_offset + bvec->bv_len - 1;
2357 2405
2358 if (--bvec >= bio->bi_io_vec)
2359 prefetchw(&bvec->bv_page->flags);
2360
2361 if (end_extent_writepage(page, err, start, end)) 2406 if (end_extent_writepage(page, err, start, end))
2362 continue; 2407 continue;
2363 2408
2364 end_page_writeback(page); 2409 end_page_writeback(page);
2365 } while (bvec >= bio->bi_io_vec); 2410 }
2366 2411
2367 bio_put(bio); 2412 bio_put(bio);
2368} 2413}
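[editor's note] Both endio handlers drop the hand-rolled backwards walk over bi_io_vec (with its prefetchw and the bvec >= bio->bi_io_vec bound) in favor of bio_for_each_segment_all(), and the sector/size fields move under bio->bi_iter, in line with the block layer's immutable-biovec rework in this release. A userspace sketch of trading manual cursor arithmetic for a for-each macro; the types here are mine:

#include <stdio.h>

struct segment { unsigned offset, len; };
struct io { struct segment segs[4]; int cnt; };

/* The macro hides the cursor bookkeeping the old open-coded loop carried. */
#define io_for_each_segment(seg, io, i) \
	for ((i) = 0; (i) < (io)->cnt && ((seg) = &(io)->segs[i], 1); (i)++)

int main(void)
{
	struct io io = { .segs = { {0, 4096}, {0, 2048} }, .cnt = 2 };
	struct segment *seg;
	int i;

	io_for_each_segment(seg, &io, i)
		printf("seg %d: offset=%u len=%u\n", i, seg->offset, seg->len);
	return 0;
}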
@@ -2392,9 +2437,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2392 */ 2437 */
2393static void end_bio_extent_readpage(struct bio *bio, int err) 2438static void end_bio_extent_readpage(struct bio *bio, int err)
2394{ 2439{
2440 struct bio_vec *bvec;
2395 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2441 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2396 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2397 struct bio_vec *bvec = bio->bi_io_vec;
2398 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2442 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2399 struct extent_io_tree *tree; 2443 struct extent_io_tree *tree;
2400 u64 offset = 0; 2444 u64 offset = 0;
@@ -2405,16 +2449,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2405 u64 extent_len = 0; 2449 u64 extent_len = 0;
2406 int mirror; 2450 int mirror;
2407 int ret; 2451 int ret;
2452 int i;
2408 2453
2409 if (err) 2454 if (err)
2410 uptodate = 0; 2455 uptodate = 0;
2411 2456
2412 do { 2457 bio_for_each_segment_all(bvec, bio, i) {
2413 struct page *page = bvec->bv_page; 2458 struct page *page = bvec->bv_page;
2414 struct inode *inode = page->mapping->host; 2459 struct inode *inode = page->mapping->host;
2415 2460
2416 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2461 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2417 "mirror=%lu\n", (u64)bio->bi_sector, err, 2462 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
2418 io_bio->mirror_num); 2463 io_bio->mirror_num);
2419 tree = &BTRFS_I(inode)->io_tree; 2464 tree = &BTRFS_I(inode)->io_tree;
2420 2465
@@ -2423,19 +2468,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2423 * advance bv_offset and adjust bv_len to compensate. 2468 * advance bv_offset and adjust bv_len to compensate.
2424 * Print a warning for nonzero offsets, and an error 2469 * Print a warning for nonzero offsets, and an error
2425 * if they don't add up to a full page. */ 2470 * if they don't add up to a full page. */
2426 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2471 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2427 printk("%s page read in btrfs with offset %u and length %u\n", 2472 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2428 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2473 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2429 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2474 "partial page read in btrfs with offset %u and length %u",
2430 bvec->bv_offset, bvec->bv_len); 2475 bvec->bv_offset, bvec->bv_len);
2476 else
2477 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2478 "incomplete page read in btrfs with offset %u and "
2479 "length %u",
2480 bvec->bv_offset, bvec->bv_len);
2481 }
2431 2482
2432 start = page_offset(page); 2483 start = page_offset(page);
2433 end = start + bvec->bv_offset + bvec->bv_len - 1; 2484 end = start + bvec->bv_offset + bvec->bv_len - 1;
2434 len = bvec->bv_len; 2485 len = bvec->bv_len;
2435 2486
2436 if (++bvec <= bvec_end)
2437 prefetchw(&bvec->bv_page->flags);
2438
2439 mirror = io_bio->mirror_num; 2487 mirror = io_bio->mirror_num;
2440 if (likely(uptodate && tree->ops && 2488 if (likely(uptodate && tree->ops &&
2441 tree->ops->readpage_end_io_hook)) { 2489 tree->ops->readpage_end_io_hook)) {
@@ -2516,7 +2564,7 @@ readpage_ok:
2516 extent_start = start; 2564 extent_start = start;
2517 extent_len = end + 1 - start; 2565 extent_len = end + 1 - start;
2518 } 2566 }
2519 } while (bvec <= bvec_end); 2567 }
2520 2568
2521 if (extent_len) 2569 if (extent_len)
2522 endio_readpage_release_extent(tree, extent_start, extent_len, 2570 endio_readpage_release_extent(tree, extent_start, extent_len,
@@ -2547,9 +2595,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2547 } 2595 }
2548 2596
2549 if (bio) { 2597 if (bio) {
2550 bio->bi_size = 0;
2551 bio->bi_bdev = bdev; 2598 bio->bi_bdev = bdev;
2552 bio->bi_sector = first_sector; 2599 bio->bi_iter.bi_sector = first_sector;
2553 btrfs_bio = btrfs_io_bio(bio); 2600 btrfs_bio = btrfs_io_bio(bio);
2554 btrfs_bio->csum = NULL; 2601 btrfs_bio->csum = NULL;
2555 btrfs_bio->csum_allocated = NULL; 2602 btrfs_bio->csum_allocated = NULL;
@@ -2643,7 +2690,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2643 if (bio_ret && *bio_ret) { 2690 if (bio_ret && *bio_ret) {
2644 bio = *bio_ret; 2691 bio = *bio_ret;
2645 if (old_compressed) 2692 if (old_compressed)
2646 contig = bio->bi_sector == sector; 2693 contig = bio->bi_iter.bi_sector == sector;
2647 else 2694 else
2648 contig = bio_end_sector(bio) == sector; 2695 contig = bio_end_sector(bio) == sector;
2649 2696
@@ -3287,8 +3334,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3287 3334
3288 set_range_writeback(tree, cur, cur + iosize - 1); 3335 set_range_writeback(tree, cur, cur + iosize - 1);
3289 if (!PageWriteback(page)) { 3336 if (!PageWriteback(page)) {
3290 printk(KERN_ERR "btrfs warning page %lu not " 3337 btrfs_err(BTRFS_I(inode)->root->fs_info,
3291 "writeback, cur %llu end %llu\n", 3338 "page %lu not writeback, cur %llu end %llu",
3292 page->index, cur, end); 3339 page->index, cur, end);
3293 } 3340 }
3294 3341
@@ -3410,20 +3457,18 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
3410 3457
3411static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3458static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3412{ 3459{
3413 int uptodate = err == 0; 3460 struct bio_vec *bvec;
3414 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3415 struct extent_buffer *eb; 3461 struct extent_buffer *eb;
3416 int done; 3462 int i, done;
3417 3463
3418 do { 3464 bio_for_each_segment_all(bvec, bio, i) {
3419 struct page *page = bvec->bv_page; 3465 struct page *page = bvec->bv_page;
3420 3466
3421 bvec--;
3422 eb = (struct extent_buffer *)page->private; 3467 eb = (struct extent_buffer *)page->private;
3423 BUG_ON(!eb); 3468 BUG_ON(!eb);
3424 done = atomic_dec_and_test(&eb->io_pages); 3469 done = atomic_dec_and_test(&eb->io_pages);
3425 3470
3426 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3471 if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3427 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3472 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3428 ClearPageUptodate(page); 3473 ClearPageUptodate(page);
3429 SetPageError(page); 3474 SetPageError(page);
@@ -3435,10 +3480,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3435 continue; 3480 continue;
3436 3481
3437 end_extent_buffer_writeback(eb); 3482 end_extent_buffer_writeback(eb);
3438 } while (bvec >= bio->bi_io_vec); 3483 }
3439 3484
3440 bio_put(bio); 3485 bio_put(bio);
3441
3442} 3486}
3443 3487
3444static int write_one_eb(struct extent_buffer *eb, 3488static int write_one_eb(struct extent_buffer *eb,
@@ -3447,6 +3491,7 @@ static int write_one_eb(struct extent_buffer *eb,
3447 struct extent_page_data *epd) 3491 struct extent_page_data *epd)
3448{ 3492{
3449 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3493 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3494 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3450 u64 offset = eb->start; 3495 u64 offset = eb->start;
3451 unsigned long i, num_pages; 3496 unsigned long i, num_pages;
3452 unsigned long bio_flags = 0; 3497 unsigned long bio_flags = 0;
@@ -3464,7 +3509,7 @@ static int write_one_eb(struct extent_buffer *eb,
3464 3509
3465 clear_page_dirty_for_io(p); 3510 clear_page_dirty_for_io(p);
3466 set_page_writeback(p); 3511 set_page_writeback(p);
3467 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3512 ret = submit_extent_page(rw, tree, p, offset >> 9,
3468 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3513 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3469 -1, end_bio_extent_buffer_writepage, 3514 -1, end_bio_extent_buffer_writepage,
3470 0, epd->bio_flags, bio_flags); 3515 0, epd->bio_flags, bio_flags);
@@ -4082,12 +4127,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4082 struct extent_map *em = NULL; 4127 struct extent_map *em = NULL;
4083 struct extent_state *cached_state = NULL; 4128 struct extent_state *cached_state = NULL;
4084 struct btrfs_path *path; 4129 struct btrfs_path *path;
4085 struct btrfs_file_extent_item *item;
4086 int end = 0; 4130 int end = 0;
4087 u64 em_start = 0; 4131 u64 em_start = 0;
4088 u64 em_len = 0; 4132 u64 em_len = 0;
4089 u64 em_end = 0; 4133 u64 em_end = 0;
4090 unsigned long emflags;
4091 4134
4092 if (len == 0) 4135 if (len == 0)
4093 return -EINVAL; 4136 return -EINVAL;
@@ -4112,8 +4155,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4112 } 4155 }
4113 WARN_ON(!ret); 4156 WARN_ON(!ret);
4114 path->slots[0]--; 4157 path->slots[0]--;
4115 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4116 struct btrfs_file_extent_item);
4117 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4158 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4118 found_type = btrfs_key_type(&found_key); 4159 found_type = btrfs_key_type(&found_key);
4119 4160
@@ -4181,7 +4222,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4181 offset_in_extent = em_start - em->start; 4222 offset_in_extent = em_start - em->start;
4182 em_end = extent_map_end(em); 4223 em_end = extent_map_end(em);
4183 em_len = em_end - em_start; 4224 em_len = em_end - em_start;
4184 emflags = em->flags;
4185 disko = 0; 4225 disko = 0;
4186 flags = 0; 4226 flags = 0;
4187 4227
@@ -4333,10 +4373,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4333 __free_extent_buffer(eb); 4373 __free_extent_buffer(eb);
4334} 4374}
4335 4375
4336static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4376static struct extent_buffer *
4337 u64 start, 4377__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4338 unsigned long len, 4378 unsigned long len, gfp_t mask)
4339 gfp_t mask)
4340{ 4379{
4341 struct extent_buffer *eb = NULL; 4380 struct extent_buffer *eb = NULL;
4342 4381
@@ -4345,7 +4384,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4345 return NULL; 4384 return NULL;
4346 eb->start = start; 4385 eb->start = start;
4347 eb->len = len; 4386 eb->len = len;
4348 eb->tree = tree; 4387 eb->fs_info = fs_info;
4349 eb->bflags = 0; 4388 eb->bflags = 0;
4350 rwlock_init(&eb->lock); 4389 rwlock_init(&eb->lock);
4351 atomic_set(&eb->write_locks, 0); 4390 atomic_set(&eb->write_locks, 0);
@@ -4477,13 +4516,14 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4477 } 4516 }
4478} 4517}
4479 4518
4480struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 4519struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4481 u64 start) 4520 u64 start)
4482{ 4521{
4483 struct extent_buffer *eb; 4522 struct extent_buffer *eb;
4484 4523
4485 rcu_read_lock(); 4524 rcu_read_lock();
4486 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4525 eb = radix_tree_lookup(&fs_info->buffer_radix,
4526 start >> PAGE_CACHE_SHIFT);
4487 if (eb && atomic_inc_not_zero(&eb->refs)) { 4527 if (eb && atomic_inc_not_zero(&eb->refs)) {
4488 rcu_read_unlock(); 4528 rcu_read_unlock();
4489 mark_extent_buffer_accessed(eb); 4529 mark_extent_buffer_accessed(eb);
@@ -4494,7 +4534,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4494 return NULL; 4534 return NULL;
4495} 4535}
4496 4536
4497struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 4537struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4498 u64 start, unsigned long len) 4538 u64 start, unsigned long len)
4499{ 4539{
4500 unsigned long num_pages = num_extent_pages(start, len); 4540 unsigned long num_pages = num_extent_pages(start, len);
@@ -4503,16 +4543,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4503 struct extent_buffer *eb; 4543 struct extent_buffer *eb;
4504 struct extent_buffer *exists = NULL; 4544 struct extent_buffer *exists = NULL;
4505 struct page *p; 4545 struct page *p;
4506 struct address_space *mapping = tree->mapping; 4546 struct address_space *mapping = fs_info->btree_inode->i_mapping;
4507 int uptodate = 1; 4547 int uptodate = 1;
4508 int ret; 4548 int ret;
4509 4549
4510 4550 eb = find_extent_buffer(fs_info, start);
4511 eb = find_extent_buffer(tree, start);
4512 if (eb) 4551 if (eb)
4513 return eb; 4552 return eb;
4514 4553
4515 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 4554 eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
4516 if (!eb) 4555 if (!eb)
4517 return NULL; 4556 return NULL;
4518 4557
@@ -4567,12 +4606,13 @@ again:
4567 if (ret) 4606 if (ret)
4568 goto free_eb; 4607 goto free_eb;
4569 4608
4570 spin_lock(&tree->buffer_lock); 4609 spin_lock(&fs_info->buffer_lock);
4571 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 4610 ret = radix_tree_insert(&fs_info->buffer_radix,
4572 spin_unlock(&tree->buffer_lock); 4611 start >> PAGE_CACHE_SHIFT, eb);
4612 spin_unlock(&fs_info->buffer_lock);
4573 radix_tree_preload_end(); 4613 radix_tree_preload_end();
4574 if (ret == -EEXIST) { 4614 if (ret == -EEXIST) {
4575 exists = find_extent_buffer(tree, start); 4615 exists = find_extent_buffer(fs_info, start);
4576 if (exists) 4616 if (exists)
4577 goto free_eb; 4617 goto free_eb;
4578 else 4618 else
@@ -4580,6 +4620,7 @@ again:
4580 } 4620 }
4581 /* add one reference for the tree */ 4621 /* add one reference for the tree */
4582 check_buffer_tree_ref(eb); 4622 check_buffer_tree_ref(eb);
4623 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4583 4624
4584 /* 4625 /*
4585 * there is a race where release page may have 4626 * there is a race where release page may have
@@ -4623,17 +4664,17 @@ static int release_extent_buffer(struct extent_buffer *eb)
4623{ 4664{
4624 WARN_ON(atomic_read(&eb->refs) == 0); 4665 WARN_ON(atomic_read(&eb->refs) == 0);
4625 if (atomic_dec_and_test(&eb->refs)) { 4666 if (atomic_dec_and_test(&eb->refs)) {
4626 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { 4667 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
4627 spin_unlock(&eb->refs_lock); 4668 struct btrfs_fs_info *fs_info = eb->fs_info;
4628 } else {
4629 struct extent_io_tree *tree = eb->tree;
4630 4669
4631 spin_unlock(&eb->refs_lock); 4670 spin_unlock(&eb->refs_lock);
4632 4671
4633 spin_lock(&tree->buffer_lock); 4672 spin_lock(&fs_info->buffer_lock);
4634 radix_tree_delete(&tree->buffer, 4673 radix_tree_delete(&fs_info->buffer_radix,
4635 eb->start >> PAGE_CACHE_SHIFT); 4674 eb->start >> PAGE_CACHE_SHIFT);
4636 spin_unlock(&tree->buffer_lock); 4675 spin_unlock(&fs_info->buffer_lock);
4676 } else {
4677 spin_unlock(&eb->refs_lock);
4637 } 4678 }
4638 4679
4639 /* Should be safe to release our pages at this point */ 4680 /* Should be safe to release our pages at this point */
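[editor's note] The extent-buffer cache moves from a per-extent_io_tree radix tree to a single fs_info->buffer_radix, lookups take fs_info, and the new EXTENT_BUFFER_IN_TREE bit (replacing the inverted DUMMY test) records whether release must also delete the radix entry. The core pattern is a refcounted registry where lookup only succeeds by winning an atomic_inc_not_zero on the refcount. A hedged sketch with a tiny fixed-size table standing in for the radix tree, and plain integers standing in for the atomics:

#include <stdio.h>
#include <stdlib.h>

/* Toy registry entry; the real code uses a radix tree with RCU lookups. */
struct buffer { unsigned long start; int refs; int in_tree; };

#define SLOTS 8
static struct buffer *table[SLOTS];

static struct buffer *lookup(unsigned long start)
{
	struct buffer *b = table[start % SLOTS];

	/* Only hand out a reference if the buffer is not already dying. */
	if (b && b->start == start && b->refs > 0) {
		b->refs++;
		return b;
	}
	return NULL;
}

static void release(struct buffer *b)
{
	if (--b->refs == 0) {
		if (b->in_tree)              /* mirrors EXTENT_BUFFER_IN_TREE */
			table[b->start % SLOTS] = NULL;
		free(b);
	}
}

int main(void)
{
	struct buffer *b = calloc(1, sizeof(*b));

	b->start = 3; b->refs = 1; b->in_tree = 1;
	table[3 % SLOTS] = b;

	struct buffer *hit = lookup(3);
	printf("hit refs=%d\n", hit->refs);      /* 2 */
	release(hit);
	release(b);                              /* last ref: unlink + free */
	return 0;
}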
@@ -5112,12 +5153,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5112 unsigned long src_i; 5153 unsigned long src_i;
5113 5154
5114 if (src_offset + len > dst->len) { 5155 if (src_offset + len > dst->len) {
5115 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 5156 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5116 "len %lu dst len %lu\n", src_offset, len, dst->len); 5157 "len %lu dst len %lu\n", src_offset, len, dst->len);
5117 BUG_ON(1); 5158 BUG_ON(1);
5118 } 5159 }
5119 if (dst_offset + len > dst->len) { 5160 if (dst_offset + len > dst->len) {
5120 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 5161 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5121 "len %lu dst len %lu\n", dst_offset, len, dst->len); 5162 "len %lu dst len %lu\n", dst_offset, len, dst->len);
5122 BUG_ON(1); 5163 BUG_ON(1);
5123 } 5164 }
@@ -5159,12 +5200,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5159 unsigned long src_i; 5200 unsigned long src_i;
5160 5201
5161 if (src_offset + len > dst->len) { 5202 if (src_offset + len > dst->len) {
5162 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 5203 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5163 "len %lu len %lu\n", src_offset, len, dst->len); 5204 "len %lu len %lu\n", src_offset, len, dst->len);
5164 BUG_ON(1); 5205 BUG_ON(1);
5165 } 5206 }
5166 if (dst_offset + len > dst->len) { 5207 if (dst_offset + len > dst->len) {
5167 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 5208 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5168 "len %lu len %lu\n", dst_offset, len, dst->len); 5209 "len %lu len %lu\n", dst_offset, len, dst->len);
5169 BUG_ON(1); 5210 BUG_ON(1);
5170 } 5211 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 19620c58f096..58b27e5ab521 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -43,6 +43,7 @@
43#define EXTENT_BUFFER_WRITEBACK 7 43#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 44#define EXTENT_BUFFER_IOERR 8
45#define EXTENT_BUFFER_DUMMY 9 45#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10
46 47
47/* these are flags for extent_clear_unlock_delalloc */ 48/* these are flags for extent_clear_unlock_delalloc */
48#define PAGE_UNLOCK (1 << 0) 49#define PAGE_UNLOCK (1 << 0)
@@ -94,12 +95,10 @@ struct extent_io_ops {
94 95
95struct extent_io_tree { 96struct extent_io_tree {
96 struct rb_root state; 97 struct rb_root state;
97 struct radix_tree_root buffer;
98 struct address_space *mapping; 98 struct address_space *mapping;
99 u64 dirty_bytes; 99 u64 dirty_bytes;
100 int track_uptodate; 100 int track_uptodate;
101 spinlock_t lock; 101 spinlock_t lock;
102 spinlock_t buffer_lock;
103 struct extent_io_ops *ops; 102 struct extent_io_ops *ops;
104}; 103};
105 104
@@ -130,7 +129,7 @@ struct extent_buffer {
130 unsigned long map_start; 129 unsigned long map_start;
131 unsigned long map_len; 130 unsigned long map_len;
132 unsigned long bflags; 131 unsigned long bflags;
133 struct extent_io_tree *tree; 132 struct btrfs_fs_info *fs_info;
134 spinlock_t refs_lock; 133 spinlock_t refs_lock;
135 atomic_t refs; 134 atomic_t refs;
136 atomic_t io_pages; 135 atomic_t io_pages;
@@ -266,11 +265,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
266int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 265int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
267void set_page_extent_mapped(struct page *page); 266void set_page_extent_mapped(struct page *page);
268 267
269struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 268struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
270 u64 start, unsigned long len); 269 u64 start, unsigned long len);
271struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); 270struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
272struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); 271struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
273struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 272struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
274 u64 start); 273 u64 start);
275void free_extent_buffer(struct extent_buffer *eb); 274void free_extent_buffer(struct extent_buffer *eb);
276void free_extent_buffer_stale(struct extent_buffer *eb); 275void free_extent_buffer_stale(struct extent_buffer *eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a4a7a1a8da95..996ad56b57db 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -79,12 +79,21 @@ void free_extent_map(struct extent_map *em)
 	}
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
-				   struct rb_node *node)
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+	if (start + len < start)
+		return (u64)-1;
+	return start + len;
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
-	struct extent_map *entry;
+	struct extent_map *entry = NULL;
+	struct rb_node *orig_parent = NULL;
+	u64 end = range_end(em->start, em->len);
 
 	while (*p) {
 		parent = *p;
@@ -92,19 +101,37 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 
 		WARN_ON(!entry->in_tree);
 
-		if (offset < entry->start)
+		if (em->start < entry->start)
 			p = &(*p)->rb_left;
-		else if (offset >= extent_map_end(entry))
+		else if (em->start >= extent_map_end(entry))
 			p = &(*p)->rb_right;
 		else
-			return parent;
+			return -EEXIST;
 	}
 
-	entry = rb_entry(node, struct extent_map, rb_node);
-	entry->in_tree = 1;
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-	return NULL;
+	orig_parent = parent;
+	while (parent && em->start >= extent_map_end(entry)) {
+		parent = rb_next(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	parent = orig_parent;
+	entry = rb_entry(parent, struct extent_map, rb_node);
+	while (parent && em->start < entry->start) {
+		parent = rb_prev(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	em->in_tree = 1;
+	rb_link_node(&em->rb_node, orig_parent, p);
+	rb_insert_color(&em->rb_node, root);
+	return 0;
 }
 
 /*
@@ -228,7 +255,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		merge = rb_entry(rb, struct extent_map, rb_node);
 		if (rb && mergable_maps(em, merge)) {
 			em->len += merge->len;
-			em->block_len += merge->len;
+			em->block_len += merge->block_len;
 			rb_erase(&merge->rb_node, &tree->map);
 			merge->in_tree = 0;
 			em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
@@ -310,20 +337,11 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em, int modified)
 {
 	int ret = 0;
-	struct rb_node *rb;
-	struct extent_map *exist;
 
-	exist = lookup_extent_mapping(tree, em->start, em->len);
-	if (exist) {
-		free_extent_map(exist);
-		ret = -EEXIST;
-		goto out;
-	}
-	rb = tree_insert(&tree->map, em->start, &em->rb_node);
-	if (rb) {
-		ret = -EEXIST;
+	ret = tree_insert(&tree->map, em);
+	if (ret)
 		goto out;
-	}
+
 	atomic_inc(&em->refs);
 
 	em->mod_start = em->start;
@@ -337,14 +355,6 @@ out:
 	return ret;
 }
 
-/* simple helper to do math around the end of an extent, handling wrap */
-static u64 range_end(u64 start, u64 len)
-{
-	if (start + len < start)
-		return (u64)-1;
-	return start + len;
-}
-
 static struct extent_map *
 __lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
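Editor's note: tree_insert now performs the overlap check itself and reports -EEXIST, which lets add_extent_mapping drop its separate lookup_extent_mapping pass. The heart of it is the wrap-safe end computation plus a symmetric overlap test against both rb-tree neighbors; a standalone illustration of that predicate (plain C, not kernel code):

	#include <stdint.h>

	/* clamp start + len to the maximum on wrap, as range_end() above does */
	static uint64_t range_end(uint64_t start, uint64_t len)
	{
		if (start + len < start)
			return UINT64_MAX;
		return start + len;
	}

	/* two half-open ranges overlap iff each starts before the other ends */
	static int ranges_overlap(uint64_t s1, uint64_t l1, uint64_t s2, uint64_t l2)
	{
		return range_end(s1, l1) > s2 && s1 < range_end(s2, l2);
	}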
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6f3848860283..127555b29f58 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	if (!path)
 		return -ENOMEM;
 
-	nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits;
+	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
 	if (!dst) {
 		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
 			btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
@@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		csum = (u8 *)dst;
 	}
 
-	if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+	if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
 		path->reada = 2;
 
 	WARN_ON(bio->bi_vcnt <= 0);
@@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		path->skip_locking = 1;
 	}
 
-	disk_bytenr = (u64)bio->bi_sector << 9;
+	disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
 	if (dio)
 		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
@@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
						 offset + bvec->bv_len - 1,
						 EXTENT_NODATASUM, GFP_NOFS);
 			} else {
-				printk(KERN_INFO "btrfs no csum found "
-				       "for inode %llu start %llu\n",
+				btrfs_info(BTRFS_I(inode)->root->fs_info,
+					   "no csum found for inode %llu start %llu",
				       btrfs_ino(inode), offset);
 			}
 			item = NULL;
@@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
			      struct btrfs_dio_private *dip, struct bio *bio,
			      u64 offset)
 {
-	int len = (bio->bi_sector << 9) - dip->disk_bytenr;
+	int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int ret;
 
@@ -447,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	u64 offset;
 
 	WARN_ON(bio->bi_vcnt <= 0);
-	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+		       GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
 
-	sums->len = bio->bi_size;
+	sums->len = bio->bi_iter.bi_size;
 	INIT_LIST_HEAD(&sums->list);
 
 	if (contig)
@@ -461,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered); /* Logic error */
-	sums->bytenr = (u64)bio->bi_sector << 9;
+	sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
 	index = 0;
 
 	while (bio_index < bio->bi_vcnt) {
@@ -476,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			btrfs_add_ordered_sum(inode, ordered, sums);
 			btrfs_put_ordered_extent(ordered);
 
-			bytes_left = bio->bi_size - total_bytes;
+			bytes_left = bio->bi_iter.bi_size - total_bytes;
 
 			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
				       GFP_NOFS);
@@ -484,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			sums->len = bytes_left;
 			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered); /* Logic error */
-			sums->bytenr = ((u64)bio->bi_sector << 9) +
+			sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
				       total_bytes;
 			index = 0;
 		}
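Editor's note: every hunk in this file is the same mechanical conversion; with the immutable-biovec work merged for v3.14, a bio's position and remaining size live in the embedded iterator (bio->bi_iter) rather than directly in struct bio. Two hypothetical helpers make the before/after shape explicit (these are illustrative, not part of the patch):

	static inline u64 bio_start_bytes(const struct bio *bio)
	{
		return (u64)bio->bi_iter.bi_sector << 9;	/* was bio->bi_sector */
	}

	static inline u32 bio_byte_count(const struct bio *bio)
	{
		return bio->bi_iter.bi_size;			/* was bio->bi_size */
	}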
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 82d0342763c5..0165b8672f09 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -692,7 +692,10 @@ next:
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *drop_end, int drop_cache)
+			 u64 *drop_end, int drop_cache,
+			 int replace_extent,
+			 u32 extent_item_size,
+			 int *key_inserted)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int modify_tree = -1;
 	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 	int found = 0;
+	int leafs_visited = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			path->slots[0]--;
 		}
 		ret = 0;
+		leafs_visited++;
 next_slot:
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -744,6 +749,7 @@ next_slot:
				ret = 0;
				break;
 			}
+			leafs_visited++;
 			leaf = path->nodes[0];
 			recow = 1;
 		}
@@ -766,7 +772,8 @@ next_slot:
				btrfs_file_extent_num_bytes(leaf, fi);
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
+				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
 		} else {
 			WARN_ON(1);
 			extent_end = search_start;
@@ -927,14 +934,44 @@ next_slot:
 	}
 
 	if (!ret && del_nr > 0) {
+		/*
+		 * Set path->slots[0] to first slot, so that after the delete
+		 * if items are move off from our leaf to its immediate left or
+		 * right neighbor leafs, we end up with a correct and adjusted
+		 * path->slots[0] for our insertion.
+		 */
+		path->slots[0] = del_slot;
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret)
			btrfs_abort_transaction(trans, root, ret);
+
+		leaf = path->nodes[0];
+		/*
+		 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
+		 * is, its contents got pushed to its neighbors), in which case
+		 * it means path->locks[0] == 0
+		 */
+		if (!ret && replace_extent && leafs_visited == 1 &&
+		    path->locks[0] &&
+		    btrfs_leaf_free_space(root, leaf) >=
+		    sizeof(struct btrfs_item) + extent_item_size) {
+
+			key.objectid = ino;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = start;
+			setup_items_for_insert(root, path, &key,
+					       &extent_item_size,
+					       extent_item_size,
+					       sizeof(struct btrfs_item) +
+					       extent_item_size, 1);
+			*key_inserted = 1;
+		}
 	}
 
+	if (!replace_extent || !(*key_inserted))
+		btrfs_release_path(path);
 	if (drop_end)
		*drop_end = found ? min(end, extent_end) : end;
-	btrfs_release_path(path);
 	return ret;
 }
 
@@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
-				   drop_cache);
+				   drop_cache, 0, 0, NULL);
 	btrfs_free_path(path);
 	return ret;
 }
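Editor's note: the three new parameters let one tree search both drop a range and insert the replacement file extent item when the surviving leaf has enough free space; plain callers pass (0, 0, NULL) as in the wrapper above. A hedged sketch of a replacing caller, following the pattern the inode.c hunks further below adopt:

	int extent_inserted = 0;

	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
				   NULL /* drop_end */, 1 /* drop_cache */,
				   1 /* replace_extent */,
				   sizeof(struct btrfs_file_extent_item),
				   &extent_inserted);
	if (ret)
		goto out;
	if (!extent_inserted) {
		/* more than one leaf was visited or the leaf was full:
		 * fall back to a normal btrfs_insert_empty_item() */
	}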
@@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
 }
 
 /*
- * this gets pages into the page cache and locks them down, it also properly
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
+ * this just gets pages into the page cache and locks them down.
  */
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
-				  struct page **pages, size_t num_pages,
-				  loff_t pos, unsigned long first_index,
-				  size_t write_bytes, bool force_uptodate)
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+				  size_t num_pages, loff_t pos,
+				  size_t write_bytes, bool force_uptodate)
 {
-	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-	struct inode *inode = file_inode(file);
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int err = 0;
-	int faili = 0;
-	u64 start_pos;
-	u64 last_pos;
-
-	start_pos = pos & ~((u64)root->sectorsize - 1);
-	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+	int faili;
 
-again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
@@ -1280,57 +1306,85 @@ again:
 		}
 		wait_on_page_writeback(pages[i]);
 	}
-	faili = num_pages - 1;
-	err = 0;
+
+	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+				size_t num_pages, loff_t pos,
+				u64 *lockstart, u64 *lockend,
+				struct extent_state **cached_state)
+{
+	u64 start_pos;
+	u64 last_pos;
+	int i;
+	int ret = 0;
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
-				 start_pos, last_pos - 1, 0, &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode,
-							    last_pos - 1);
+				 start_pos, last_pos, 0, cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
 		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
-		    ordered->file_offset < last_pos) {
+		    ordered->file_offset <= last_pos) {
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-					     start_pos, last_pos - 1,
-					     &cached_state, GFP_NOFS);
+					     start_pos, last_pos,
+					     cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
 			}
-			err = btrfs_wait_ordered_range(inode, start_pos,
-						       last_pos - start_pos);
-			if (err)
-				goto fail;
-			goto again;
+			ret = btrfs_wait_ordered_range(inode, start_pos,
						       last_pos - start_pos + 1);
+			if (ret)
+				return ret;
+			else
+				return -EAGAIN;
 		}
 		if (ordered)
			btrfs_put_ordered_extent(ordered);
 
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
-				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+				 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-				 0, 0, &cached_state, GFP_NOFS);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-				     start_pos, last_pos - 1, &cached_state,
-				     GFP_NOFS);
+				 0, 0, cached_state, GFP_NOFS);
+		*lockstart = start_pos;
+		*lockend = last_pos;
+		ret = 1;
 	}
+
 	for (i = 0; i < num_pages; i++) {
 		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
-	return 0;
-fail:
-	while (faili >= 0) {
-		unlock_page(pages[faili]);
-		page_cache_release(pages[faili]);
-		faili--;
-	}
-	return err;
 
+	return ret;
 }
 
 static noinline int check_can_nocow(struct inode *inode, loff_t pos,
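Editor's note: because the ordered-extent wait now lives in its own function, it can no longer "goto again" back into page preparation; it returns -EAGAIN and the buffered-write loop redoes prepare_pages, as the hunks below show. The caller-side skeleton, reduced to its shape:

	again:
		ret = prepare_pages(inode, pages, num_pages, pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;
		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, pos,
						      &lockstart, &lockend,
						      &cached_state);
		if (ret == -EAGAIN)
			goto again;	/* pages were unlocked and released */
		else if (ret < 0)
			break;
		need_unlock = (ret > 0);	/* 1: extent range is now locked */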
@@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
+	struct extent_state *cached_state = NULL;
 	u64 release_bytes = 0;
+	u64 lockstart;
+	u64 lockend;
 	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
 	bool only_release_metadata = false;
 	bool force_page_uptodate = false;
+	bool need_unlock;
 
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		release_bytes = reserve_bytes;
-
+		need_unlock = false;
+again:
 		/*
 		 * This is going to setup the pages array with the number of
 		 * pages we want, so we don't really need to worry about the
 		 * contents of pages from loop to loop
 		 */
-		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, write_bytes,
+		ret = prepare_pages(inode, pages, num_pages,
+				    pos, write_bytes,
				    force_page_uptodate);
 		if (ret)
			break;
 
+		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+						      pos, &lockstart, &lockend,
+						      &cached_state);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				goto again;
+			break;
+		} else if (ret > 0) {
+			need_unlock = true;
+			ret = 0;
+		}
+
 		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);
 
@@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
-		if (copied > 0) {
+
+		if (copied > 0)
 			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
-			if (ret) {
-				btrfs_drop_pages(pages, num_pages);
-				break;
-			}
+		if (need_unlock)
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     lockstart, lockend, &cached_state,
+					     GFP_NOFS);
+		if (ret) {
+			btrfs_drop_pages(pages, num_pages);
+			break;
 		}
 
 		release_bytes = 0;
-		btrfs_drop_pages(pages, num_pages);
-
 		if (only_release_metadata && copied > 0) {
 			u64 lockstart = round_down(pos, root->sectorsize);
 			u64 lockend = lockstart +
@@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			only_release_metadata = false;
 		}
 
+		btrfs_drop_pages(pages, num_pages);
+
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (file->private_data)
 		btrfs_ioctl_trans_end(file);
 
+	/*
+	 * We use start here because we will need to wait on the IO to complete
+	 * in btrfs_sync_log, which could require joining a transaction (for
+	 * example checking cross references in the nocow path). If we use join
+	 * here we could get into a situation where we're waiting on IO to
+	 * happen that is blocked on a transaction trying to commit. With start
+	 * we inc the extwriter counter, so we wait for all extwriters to exit
+	 * before we start blocking join'ers. This comment is to keep somebody
+	 * from thinking they are super smart and changing this to
+	 * btrfs_join_transaction *cough*Josef*cough*.
+	 */
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
+	trans->sync = true;
 
 	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0) {
@@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 	struct btrfs_key key;
 	int ret;
 
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		goto out;
+
 	key.objectid = btrfs_ino(inode);
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = offset;
 
-
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0)
 		return ret;
@@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
+	int rsv_count;
 	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 
 	ret = btrfs_wait_ordered_range(inode, offset, len);
 	if (ret)
@@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		 * we need to try again.
 		 */
 		if ((!ordered ||
-		    (ordered->file_offset + ordered->len < lockstart ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
		     ordered->file_offset > lockend)) &&
		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, EXTENT_UPTODATE, 0,
@@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	/*
 	 * 1 - update the inode
 	 * 1 - removing the extents in the range
-	 * 1 - adding the hole extent
+	 * 1 - adding the hole extent if no_holes isn't set
 	 */
-	trans = btrfs_start_transaction(root, 3);
+	rsv_count = no_holes ? 2 : 3;
+	trans = btrfs_start_transaction(root, rsv_count);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_free;
@@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	while (cur_offset < lockend) {
 		ret = __btrfs_drop_extents(trans, root, inode, path,
					   cur_offset, lockend + 1,
-					   &drop_end, 1);
+					   &drop_end, 1, 0, 0, NULL);
 		if (ret != -ENOSPC)
			break;
 
@@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(root);
 
-		trans = btrfs_start_transaction(root, 3);
+		trans = btrfs_start_transaction(root, rsv_count);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
 			trans = NULL;
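Editor's note: two independent fixes ride along in the hole-punch hunks. The `<` to `<=` change closes an off-by-one: an ordered extent that ends exactly at lockstart (file_offset + len == lockstart) does not overlap the locked range, so it must not force another wait-and-retry pass. The overlap predicate being negated there, written out:

	/* ordered [file_offset, file_offset + len) vs locked [lockstart, lockend] */
	static bool ordered_overlaps(u64 file_offset, u64 len,
				     u64 lockstart, u64 lockend)
	{
		return file_offset + len > lockstart && file_offset <= lockend;
	}

The second fix is the rsv_count change: on a NO_HOLES filesystem no explicit hole extent is inserted, so only two transaction units (extent removal plus inode update) need reserving instead of three.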
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 057be95b1e1e..73f3de7a083c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -347,8 +347,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 			btrfs_readpage(NULL, page);
 			lock_page(page);
 			if (!PageUptodate(page)) {
-				printk(KERN_ERR "btrfs: error reading free "
-				       "space cache\n");
+				btrfs_err(BTRFS_I(inode)->root->fs_info,
+					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
 			}
@@ -405,7 +405,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 
 	gen = io_ctl->cur;
 	if (le64_to_cpu(*gen) != generation) {
-		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
				   "(%Lu) does not match inode (%Lu)\n", *gen,
				   generation);
 		io_ctl_unmap_page(io_ctl);
@@ -463,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
-		printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
				   "space cache\n");
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
@@ -1902,7 +1902,7 @@ out:
 	spin_unlock(&ctl->tree_lock);
 
 	if (ret) {
-		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
+		printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
 		ASSERT(ret != -EEXIST);
 	}
 
@@ -2011,14 +2011,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes && !block_group->ro)
			count++;
-		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
-		       info->offset, info->bytes,
+		btrfs_crit(block_group->fs_info,
+			   "entry offset %llu, bytes %llu, bitmap %s",
+			   info->offset, info->bytes,
		       (info->bitmap) ? "yes" : "no");
 	}
-	printk(KERN_INFO "block group has cluster?: %s\n",
+	btrfs_info(block_group->fs_info, "block group has cluster?: %s",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
-	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
-	       "\n", count);
+	btrfs_info(block_group->fs_info,
+		   "%d blocks of free space at or bigger than bytes is", count);
 }
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
@@ -2421,7 +2422,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	struct btrfs_free_space *last;
 	struct rb_node *node;
-	u64 window_start;
 	u64 window_free;
 	u64 max_extent;
 	u64 total_size = 0;
@@ -2443,7 +2443,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	}
 
-	window_start = entry->offset;
 	window_free = entry->bytes;
 	max_extent = entry->bytes;
 	first = entry;
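Editor's note: the logging hunks here and in file-item.c follow one convention: messages that can name a filesystem move to the fs_info-aware helpers (btrfs_err, btrfs_info, btrfs_crit), which prefix the device identity and drop the trailing newline, while the remaining bare printk sites switch to the capitalized "BTRFS:" tag. The before/after shape, with a made-up message:

	/* old: impossible to tell which of several mounted btrfs volumes spoke */
	printk(KERN_ERR "btrfs: free space cache is corrupt\n");

	/* new: tagged with this filesystem's identity, no trailing \n */
	btrfs_err(fs_info, "free space cache is corrupt");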
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 000000000000..85889aa82c62
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+	tfm = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	return 0;
+}
+
+void btrfs_hash_exit(void)
+{
+	crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(tfm)];
+	} desc;
+	int err;
+
+	desc.shash.tfm = tfm;
+	desc.shash.flags = 0;
+	*(u32 *)desc.ctx = crc;
+
+	err = crypto_shash_update(&desc.shash, address, length);
+	BUG_ON(err);
+
+	return *(u32 *)desc.ctx;
+}
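Editor's note: routing crc32c through the crypto API lets the kernel substitute an accelerated backend (for example an SSE4.2-based crc32c driver) without btrfs knowing. The tradeoff is explicit init/exit; a sketch of how the module presumably wires it up and how call sites look afterwards (the init function name is illustrative):

	static int __init btrfs_module_init(void)
	{
		int err = btrfs_hash_init();	/* must precede any hashing */
		if (err)
			return err;
		/* ... remaining module initialization ... */
		return 0;
	}

	/* a call site: identical to the old crc32c() usage */
	u32 crc = btrfs_crc32c((u32)~1, name, name_len);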
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 1d982812ab67..118a2316e5d3 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,10 +19,15 @@
 #ifndef __HASH__
 #define __HASH__
 
-#include <linux/crc32c.h>
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
 static inline u64 btrfs_name_hash(const char *name, int len)
 {
-	return crc32c((u32)~1, name, len);
+	return btrfs_crc32c((u32)~1, name, len);
 }
 
 /*
@@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
				    int len)
 {
-	return (u64) crc32c(parent_objectid, name, len);
+	return (u64) btrfs_crc32c(parent_objectid, name, len);
 }
 
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index ec82fae07097..2be38df703c9 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
 	return 0;
 }
 
-static struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       struct btrfs_path *path,
-		       const char *name, int name_len,
-		       u64 inode_objectid, u64 ref_objectid, int ins_len,
-		       int cow)
-{
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_inode_ref *ref;
-
-	key.objectid = inode_objectid;
-	key.type = BTRFS_INODE_REF_KEY;
-	key.offset = ref_objectid;
-
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (ret > 0)
-		return NULL;
-	if (!find_name_in_backref(path, name, name_len, &ref))
-		return NULL;
-	return ref;
-}
-
 /* Returns NULL if no extref found */
 struct btrfs_inode_extref *
 btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
@@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
 	return extref;
 }
 
-int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len,
-			      u64 inode_objectid, u64 ref_objectid, int mod,
-			      u64 *ret_index)
-{
-	struct btrfs_inode_ref *ref;
-	struct btrfs_inode_extref *extref;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-
-	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
-				     inode_objectid, ref_objectid, ins_len,
-				     cow);
-	if (IS_ERR(ref))
-		return PTR_ERR(ref);
-
-	if (ref != NULL) {
-		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
-		return 0;
-	}
-
-	btrfs_release_path(path);
-
-	extref = btrfs_lookup_inode_extref(trans, root, path, name,
-					   name_len, inode_objectid,
-					   ref_objectid, ins_len, cow);
-	if (IS_ERR(extref))
-		return PTR_ERR(extref);
-
-	if (extref) {
-		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
-		return 0;
-	}
-
-	return -ENOENT;
-}
-
 static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..d3d44486290b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -58,9 +58,10 @@
 #include "inode-map.h"
 #include "backref.h"
 #include "hash.h"
+#include "props.h"
 
 struct btrfs_iget_args {
-	u64 ino;
+	struct btrfs_key *location;
 	struct btrfs_root *root;
 };
 
@@ -125,13 +126,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  * no overlapping inline items exist in the btree
  */
 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
 {
-	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct page *page = NULL;
 	char *kaddr;
@@ -140,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int ret;
 	size_t cur_size = size;
-	size_t datasize;
 	unsigned long offset;
 
 	if (compressed_size && compressed_pages)
 		cur_size = compressed_size;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	path->leave_spinning = 1;
-
-	key.objectid = btrfs_ino(inode);
-	key.offset = start;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	datasize = btrfs_file_extent_calc_inline_size(cur_size);
-
-	inode_add_bytes(inode, size);
-	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      datasize);
-	if (ret) {
-		err = ret;
-		goto fail;
+	inode_add_bytes(inode, size);
+
+	if (!extent_inserted) {
+		struct btrfs_key key;
+		size_t datasize;
+
+		key.objectid = btrfs_ino(inode);
+		key.offset = start;
+		btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+
+		datasize = btrfs_file_extent_calc_inline_size(cur_size);
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      datasize);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -203,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 		page_cache_release(page);
 	}
 	btrfs_mark_buffer_dirty(leaf);
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 
 	/*
 	 * we're an inline extent, so nobody can
@@ -219,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 
 	return ret;
 fail:
-	btrfs_free_path(path);
 	return err;
 }
 
@@ -242,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	u64 aligned_end = ALIGN(end, root->sectorsize);
 	u64 data_len = inline_len;
 	int ret;
+	struct btrfs_path *path;
+	int extent_inserted = 0;
+	u32 extent_item_size;
 
 	if (compressed_size)
 		data_len = compressed_size;
@@ -256,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 		return 1;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
 		return PTR_ERR(trans);
+	}
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
-	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
+	if (compressed_size && compressed_pages)
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		   compressed_size);
+	else
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		    inline_len);
+
+	ret = __btrfs_drop_extents(trans, root, inode, path,
+				   start, aligned_end, NULL,
+				   1, 1, extent_item_size, &extent_inserted);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -269,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 
 	if (isize > actual_end)
 		inline_len = min_t(u64, isize, actual_end);
-	ret = insert_inline_extent(trans, root, inode, start,
+	ret = insert_inline_extent(trans, path, extent_inserted,
+				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
 	if (ret && ret != -ENOSPC) {
@@ -284,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
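Editor's note: the inline-extent path is the first user of the replace_extent mode added to __btrfs_drop_extents in file.c: the replacement item's size is computed up front, the drop reuses the caller's path, and insert_inline_extent only performs its own insertion when extent_inserted is still 0. Condensed to its flow (a sketch, with data_len standing in for whichever length applies):

	extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
	ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
				   NULL, 1, 1, extent_item_size,
				   &extent_inserted);
	if (!ret)
		ret = insert_inline_extent(trans, path, extent_inserted,
					   root, inode, start, inline_len,
					   compressed_size, compress_type,
					   compressed_pages);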
@@ -1262,7 +1281,8 @@ next_slot:
			nocow = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = found_key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
+				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
 			extent_end = ALIGN(extent_end, root->sectorsize);
 		} else {
 			BUG_ON(1);
@@ -1577,7 +1597,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	u64 logical = (u64)bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	int ret;
@@ -1585,7 +1605,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 	if (bio_flags & EXTENT_BIO_COMPRESSED)
 		return 0;
 
-	length = bio->bi_size;
+	length = bio->bi_iter.bi_size;
 	map_length = length;
 	ret = btrfs_map_block(root->fs_info, rw, logical,
			      &map_length, NULL, 0);
@@ -1841,14 +1861,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
+	int extent_inserted = 0;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->leave_spinning = 1;
-
 	/*
 	 * we may be replacing one extent in the tree with another.
 	 * The new extent is pinned in the extent map, and we don't want
@@ -1858,17 +1877,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, 0);
+	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+				   file_pos + num_bytes, NULL, 0,
+				   1, sizeof(*fi), &extent_inserted);
 	if (ret)
		goto out;
 
-	ins.objectid = btrfs_ino(inode);
-	ins.offset = file_pos;
-	ins.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
-	if (ret)
-		goto out;
+	if (!extent_inserted) {
+		ins.objectid = btrfs_ino(inode);
+		ins.offset = file_pos;
+		ins.type = BTRFS_EXTENT_DATA_KEY;
+
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &ins,
+					      sizeof(*fi));
+		if (ret)
+			goto out;
+	}
 	leaf = path->nodes[0];
 	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
@@ -2290,7 +2315,7 @@ again:
 		u64 extent_len;
 		struct btrfs_key found_key;
 
-		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 		if (ret < 0)
			goto out_free_path;
 
@@ -2543,12 +2568,6 @@ out_kfree:
 	return NULL;
 }
 
-/*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction.  It limits the amount of btree
- * reads required while inside the transaction.
- */
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -2610,7 +2629,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
			 EXTENT_DEFRAG, 1, cached_state);
 	if (ret) {
 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
-		if (last_snapshot >= BTRFS_I(inode)->generation)
+		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
 			/* the inode is shared */
			new = record_old_file_extents(inode, ordered_extent);
 
@@ -3248,7 +3267,8 @@ out:
  * slot is the slot the inode is in, objectid is the objectid of the inode
  */
 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
-					  int slot, u64 objectid)
+					  int slot, u64 objectid,
+					  int *first_xattr_slot)
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
@@ -3264,6 +3284,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	}
 
 	slot++;
+	*first_xattr_slot = -1;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
@@ -3273,6 +3294,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 
 		/* we found an xattr, assume we've got an acl */
 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (*first_xattr_slot == -1)
+				*first_xattr_slot = slot;
 			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
@@ -3301,6 +3324,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	 * something larger than an xattr.  We have to assume the inode
 	 * has acls
 	 */
+	if (*first_xattr_slot == -1)
+		*first_xattr_slot = slot;
 	return 1;
 }
 
@@ -3315,10 +3340,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	unsigned long ptr;
 	int maybe_acls;
 	u32 rdev;
 	int ret;
 	bool filled = false;
+	int first_xattr_slot;
 
 	ret = btrfs_fill_inode(inode, &rdev);
 	if (!ret)
@@ -3328,7 +3355,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	if (!path)
 		goto make_bad;
 
-	path->leave_spinning = 1;
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -3338,7 +3364,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	leaf = path->nodes[0];
 
 	if (filled)
-		goto cache_acl;
+		goto cache_index;
 
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
@@ -3381,18 +3407,51 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+	path->slots[0]++;
+	if (inode->i_nlink != 1 ||
+	    path->slots[0] >= btrfs_header_nritems(leaf))
+		goto cache_acl;
+
+	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+	if (location.objectid != btrfs_ino(inode))
+		goto cache_acl;
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	if (location.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *ref;
+
+		ref = (struct btrfs_inode_ref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *extref;
+
+		extref = (struct btrfs_inode_extref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+								     extref);
+	}
 cache_acl:
 	/*
 	 * try to precache a NULL acl entry for files that don't have
 	 * any xattrs or acls
 	 */
 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
-					   btrfs_ino(inode));
+					   btrfs_ino(inode), &first_xattr_slot);
+	if (first_xattr_slot != -1) {
+		path->slots[0] = first_xattr_slot;
+		ret = btrfs_load_inode_props(inode, path);
+		if (ret)
+			btrfs_err(root->fs_info,
+				  "error loading props for ino %llu (root %llu): %d\n",
+				  btrfs_ino(inode),
+				  root->root_key.objectid, ret);
+	}
+	btrfs_free_path(path);
+
 	if (!maybe_acls)
 		cache_no_acl(inode);
 
-	btrfs_free_path(path);
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
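Editor's note: the cache_index block works because of btree key ordering; for an inode with a single link, the item right after the inode item in the leaf is its INODE_REF (or EXTREF), which carries the directory index, and any xattrs follow after that. Sketched as the leaf layout this code walks:

	/*
	 * items for a one-link file, in key order inside the leaf:
	 *   (ino, BTRFS_INODE_ITEM_KEY,  0)          <- slot found by lookup
	 *   (ino, BTRFS_INODE_REF_KEY,   parent_ino) <- next slot: dir_index
	 *   (ino, BTRFS_XATTR_ITEM_KEY,  name_hash)  <- first_xattr_slot, if any
	 */

__btrfs_unlink_inode (next hunks) consumes the cached dir_index to skip the backref search entirely.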
@@ -3496,7 +3555,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
-	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
@@ -3593,6 +3651,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	btrfs_release_path(path);
 
+	/*
+	 * If we don't have dir index, we have to get it by looking up
+	 * the inode ref, since we get the inode ref, remove it directly,
+	 * it is unnecessary to do delayed deletion.
+	 *
+	 * But if we have dir index, needn't search inode ref to get it.
+	 * Since the inode ref is close to the inode item, it is better
+	 * that we delay to delete it, and just do this deletion when
+	 * we update the inode item.
+	 */
+	if (BTRFS_I(inode)->dir_index) {
+		ret = btrfs_delayed_delete_inode_ref(inode);
+		if (!ret) {
+			index = BTRFS_I(inode)->dir_index;
+			goto skip_backref;
+		}
+	}
+
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
				  dir_ino, &index);
 	if (ret) {
@@ -3602,7 +3678,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
-
+skip_backref:
 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
@@ -3948,7 +4024,7 @@ search_again:
				btrfs_file_extent_num_bytes(leaf, fi);
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			item_end += btrfs_file_extent_inline_len(leaf,
-								 fi);
+							 path->slots[0], fi);
 		}
 		item_end--;
 	}
@@ -4018,6 +4094,12 @@ search_again:
				inode_sub_bytes(inode, item_end + 1 -
						new_size);
 			}
+
+			/*
+			 * update the ram bytes to properly reflect
+			 * the new size of our item
+			 */
+			btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 			size =
			    btrfs_file_extent_calc_inline_size(size);
 			btrfs_truncate_item(root, path, size, 1);
@@ -4203,6 +4285,49 @@ out:
 	return ret;
 }
 
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
 /*
  * This function puts in dummy file extents for the area we're creating a hole
  * for. So if we are truncating this file to a larger size we need to insert
@@ -4211,7 +4336,6 @@ out:
4211 */ 4336 */
4212int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 4337int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4213{ 4338{
4214 struct btrfs_trans_handle *trans;
4215 struct btrfs_root *root = BTRFS_I(inode)->root; 4339 struct btrfs_root *root = BTRFS_I(inode)->root;
4216 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4340 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4217 struct extent_map *em = NULL; 4341 struct extent_map *em = NULL;
@@ -4266,31 +4390,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4266 struct extent_map *hole_em; 4390 struct extent_map *hole_em;
4267 hole_size = last_byte - cur_offset; 4391 hole_size = last_byte - cur_offset;
4268 4392
4269 trans = btrfs_start_transaction(root, 3); 4393 err = maybe_insert_hole(root, inode, cur_offset,
4270 if (IS_ERR(trans)) { 4394 hole_size);
4271 err = PTR_ERR(trans); 4395 if (err)
4272 break;
4273 }
4274
4275 err = btrfs_drop_extents(trans, root, inode,
4276 cur_offset,
4277 cur_offset + hole_size, 1);
4278 if (err) {
4279 btrfs_abort_transaction(trans, root, err);
4280 btrfs_end_transaction(trans, root);
4281 break;
4282 }
4283
4284 err = btrfs_insert_file_extent(trans, root,
4285 btrfs_ino(inode), cur_offset, 0,
4286 0, hole_size, 0, hole_size,
4287 0, 0, 0);
4288 if (err) {
4289 btrfs_abort_transaction(trans, root, err);
4290 btrfs_end_transaction(trans, root);
4291 break; 4396 break;
4292 }
4293
4294 btrfs_drop_extent_cache(inode, cur_offset, 4397 btrfs_drop_extent_cache(inode, cur_offset,
4295 cur_offset + hole_size - 1, 0); 4398 cur_offset + hole_size - 1, 0);
4296 hole_em = alloc_extent_map(); 4399 hole_em = alloc_extent_map();
@@ -4309,7 +4412,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4309 hole_em->ram_bytes = hole_size; 4412 hole_em->ram_bytes = hole_size;
4310 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 4413 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4311 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4414 hole_em->compress_type = BTRFS_COMPRESS_NONE;
4312 hole_em->generation = trans->transid; 4415 hole_em->generation = root->fs_info->generation;
4313 4416
4314 while (1) { 4417 while (1) {
4315 write_lock(&em_tree->lock); 4418 write_lock(&em_tree->lock);
@@ -4322,17 +4425,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4322 hole_size - 1, 0); 4425 hole_size - 1, 0);
4323 } 4426 }
4324 free_extent_map(hole_em); 4427 free_extent_map(hole_em);
4325next:
4326 btrfs_update_inode(trans, root, inode);
4327 btrfs_end_transaction(trans, root);
4328 } 4428 }
4429next:
4329 free_extent_map(em); 4430 free_extent_map(em);
4330 em = NULL; 4431 em = NULL;
4331 cur_offset = last_byte; 4432 cur_offset = last_byte;
4332 if (cur_offset >= block_end) 4433 if (cur_offset >= block_end)
4333 break; 4434 break;
4334 } 4435 }
4335
4336 free_extent_map(em); 4436 free_extent_map(em);
4337 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 4437 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4338 GFP_NOFS); 4438 GFP_NOFS);
@@ -4354,8 +4454,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4354 * these flags set. For all other operations the VFS set these flags 4454 * these flags set. For all other operations the VFS set these flags
4355 * explicitly if it wants a timestamp update. 4455 * explicitly if it wants a timestamp update.
4356 */ 4456 */
4357 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) 4457 if (newsize != oldsize) {
4358 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); 4458 inode_inc_iversion(inode);
4459 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4460 inode->i_ctime = inode->i_mtime =
4461 current_fs_time(inode->i_sb);
4462 }
4359 4463
4360 if (newsize > oldsize) { 4464 if (newsize > oldsize) {
4361 truncate_pagecache(inode, newsize); 4465 truncate_pagecache(inode, newsize);
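
The rule this hunk encodes is worth stating on its own: every size change bumps the inode version, while the timestamps are only touched when the caller did not already request an explicit time update. A small self-contained model of that rule, with toy types standing in for the VFS structures:

	#include <stdio.h>
	#include <time.h>

	#define ATTR_CTIME 0x1
	#define ATTR_MTIME 0x2

	struct toy_inode { unsigned long i_version; time_t ctime, mtime; };

	static void toy_setsize(struct toy_inode *inode, long oldsize,
				long newsize, unsigned int mask)
	{
		if (newsize != oldsize) {
			inode->i_version++;	/* always bump on a size change */
			if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
				inode->ctime = inode->mtime = time(NULL);
		}
	}

	int main(void)
	{
		struct toy_inode ino = { 0 };

		toy_setsize(&ino, 100, 200, 0);
		printf("version %lu, times set implicitly\n", ino.i_version);
		toy_setsize(&ino, 200, 300, ATTR_CTIME);
		printf("version %lu, times left to the caller\n", ino.i_version);
		return 0;
	}
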
@@ -4464,12 +4568,70 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4464 err = btrfs_dirty_inode(inode); 4568 err = btrfs_dirty_inode(inode);
4465 4569
4466 if (!err && attr->ia_valid & ATTR_MODE) 4570 if (!err && attr->ia_valid & ATTR_MODE)
4467 err = btrfs_acl_chmod(inode); 4571 err = posix_acl_chmod(inode, inode->i_mode);
4468 } 4572 }
4469 4573
4470 return err; 4574 return err;
4471} 4575}
4472 4576
4577/*
4578 * While truncating the inode pages during eviction, we get the VFS calling
4579 * btrfs_invalidatepage() against each page of the inode. This is slow because
4580 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4581 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4582 * extent_state structures over and over, wasting lots of time.
4583 *
4584 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4585 * those expensive operations on a per page basis and do only the ordered io
4586 * finishing, while we release here the extent_map and extent_state structures,
4587 * without the excessive merging and splitting.
4588 */
4589static void evict_inode_truncate_pages(struct inode *inode)
4590{
4591 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4592 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4593 struct rb_node *node;
4594
4595 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0);
4597
4598 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) {
4600 struct extent_map *em;
4601
4602 node = rb_first(&map_tree->map);
4603 em = rb_entry(node, struct extent_map, rb_node);
4604 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4605 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4606 remove_extent_mapping(map_tree, em);
4607 free_extent_map(em);
4608 }
4609 write_unlock(&map_tree->lock);
4610
4611 spin_lock(&io_tree->lock);
4612 while (!RB_EMPTY_ROOT(&io_tree->state)) {
4613 struct extent_state *state;
4614 struct extent_state *cached_state = NULL;
4615
4616 node = rb_first(&io_tree->state);
4617 state = rb_entry(node, struct extent_state, rb_node);
4618 atomic_inc(&state->refs);
4619 spin_unlock(&io_tree->lock);
4620
4621 lock_extent_bits(io_tree, state->start, state->end,
4622 0, &cached_state);
4623 clear_extent_bit(io_tree, state->start, state->end,
4624 EXTENT_LOCKED | EXTENT_DIRTY |
4625 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4626 EXTENT_DEFRAG, 1, 1,
4627 &cached_state, GFP_NOFS);
4628 free_extent_state(state);
4629
4630 spin_lock(&io_tree->lock);
4631 }
4632 spin_unlock(&io_tree->lock);
4633}
4634
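
Both loops in evict_inode_truncate_pages() follow the same drain pattern: grab the lock, pick the first tree node, and do the expensive teardown with the lock dropped (the io_tree loop pins the state with a reference before unlocking). The sketch below is a simplified userspace analogue of that shape, using a list and a pthread mutex instead of an rbtree and a spinlock, and detaching the entry up front rather than pinning it:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct state { struct state *next; int id; };

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct state *tree;

	static void drain(void)
	{
		pthread_mutex_lock(&tree_lock);
		while (tree) {
			struct state *s = tree;

			tree = s->next;			/* detach under the lock */
			pthread_mutex_unlock(&tree_lock);
			printf("clearing state %d\n", s->id);	/* expensive work */
			free(s);
			pthread_mutex_lock(&tree_lock);
		}
		pthread_mutex_unlock(&tree_lock);
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {
			struct state *s = malloc(sizeof(*s));

			if (!s)
				return 1;
			s->id = i;
			s->next = tree;
			tree = s;
		}
		drain();
		return 0;
	}
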
4473void btrfs_evict_inode(struct inode *inode) 4635void btrfs_evict_inode(struct inode *inode)
4474{ 4636{
4475 struct btrfs_trans_handle *trans; 4637 struct btrfs_trans_handle *trans;
@@ -4480,7 +4642,8 @@ void btrfs_evict_inode(struct inode *inode)
4480 4642
4481 trace_btrfs_inode_evict(inode); 4643 trace_btrfs_inode_evict(inode);
4482 4644
4483 truncate_inode_pages(&inode->i_data, 0); 4645 evict_inode_truncate_pages(inode);
4646
4484 if (inode->i_nlink && 4647 if (inode->i_nlink &&
4485 ((btrfs_root_refs(&root->root_item) != 0 && 4648 ((btrfs_root_refs(&root->root_item) != 0 &&
4486 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 4649 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
@@ -4655,9 +4818,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4655 } 4818 }
4656 4819
4657 err = -ENOENT; 4820 err = -ENOENT;
4658 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 4821 ret = btrfs_find_item(root->fs_info->tree_root, path,
4659 BTRFS_I(dir)->root->root_key.objectid, 4822 BTRFS_I(dir)->root->root_key.objectid,
4660 location->objectid); 4823 location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4661 if (ret) { 4824 if (ret) {
4662 if (ret < 0) 4825 if (ret < 0)
4663 err = ret; 4826 err = ret;
@@ -4818,7 +4981,9 @@ again:
4818static int btrfs_init_locked_inode(struct inode *inode, void *p) 4981static int btrfs_init_locked_inode(struct inode *inode, void *p)
4819{ 4982{
4820 struct btrfs_iget_args *args = p; 4983 struct btrfs_iget_args *args = p;
4821 inode->i_ino = args->ino; 4984 inode->i_ino = args->location->objectid;
4985 memcpy(&BTRFS_I(inode)->location, args->location,
4986 sizeof(*args->location));
4822 BTRFS_I(inode)->root = args->root; 4987 BTRFS_I(inode)->root = args->root;
4823 return 0; 4988 return 0;
4824} 4989}
@@ -4826,19 +4991,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
4826static int btrfs_find_actor(struct inode *inode, void *opaque) 4991static int btrfs_find_actor(struct inode *inode, void *opaque)
4827{ 4992{
4828 struct btrfs_iget_args *args = opaque; 4993 struct btrfs_iget_args *args = opaque;
4829 return args->ino == btrfs_ino(inode) && 4994 return args->location->objectid == BTRFS_I(inode)->location.objectid &&
4830 args->root == BTRFS_I(inode)->root; 4995 args->root == BTRFS_I(inode)->root;
4831} 4996}
4832 4997
4833static struct inode *btrfs_iget_locked(struct super_block *s, 4998static struct inode *btrfs_iget_locked(struct super_block *s,
4834 u64 objectid, 4999 struct btrfs_key *location,
4835 struct btrfs_root *root) 5000 struct btrfs_root *root)
4836{ 5001{
4837 struct inode *inode; 5002 struct inode *inode;
4838 struct btrfs_iget_args args; 5003 struct btrfs_iget_args args;
4839 unsigned long hashval = btrfs_inode_hash(objectid, root); 5004 unsigned long hashval = btrfs_inode_hash(location->objectid, root);
4840 5005
4841 args.ino = objectid; 5006 args.location = location;
4842 args.root = root; 5007 args.root = root;
4843 5008
4844 inode = iget5_locked(s, hashval, btrfs_find_actor, 5009 inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -4855,13 +5020,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4855{ 5020{
4856 struct inode *inode; 5021 struct inode *inode;
4857 5022
4858 inode = btrfs_iget_locked(s, location->objectid, root); 5023 inode = btrfs_iget_locked(s, location, root);
4859 if (!inode) 5024 if (!inode)
4860 return ERR_PTR(-ENOMEM); 5025 return ERR_PTR(-ENOMEM);
4861 5026
4862 if (inode->i_state & I_NEW) { 5027 if (inode->i_state & I_NEW) {
4863 BTRFS_I(inode)->root = root;
4864 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4865 btrfs_read_locked_inode(inode); 5028 btrfs_read_locked_inode(inode);
4866 if (!is_bad_inode(inode)) { 5029 if (!is_bad_inode(inode)) {
4867 inode_tree_add(inode); 5030 inode_tree_add(inode);
@@ -4917,7 +5080,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4917 return ERR_PTR(ret); 5080 return ERR_PTR(ret);
4918 5081
4919 if (location.objectid == 0) 5082 if (location.objectid == 0)
4920 return NULL; 5083 return ERR_PTR(-ENOENT);
4921 5084
4922 if (location.type == BTRFS_INODE_ITEM_KEY) { 5085 if (location.type == BTRFS_INODE_ITEM_KEY) {
4923 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 5086 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
@@ -4981,10 +5144,17 @@ static void btrfs_dentry_release(struct dentry *dentry)
4981static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5144static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4982 unsigned int flags) 5145 unsigned int flags)
4983{ 5146{
4984 struct dentry *ret; 5147 struct inode *inode;
4985 5148
4986 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 5149 inode = btrfs_lookup_dentry(dir, dentry);
4987 return ret; 5150 if (IS_ERR(inode)) {
5151 if (PTR_ERR(inode) == -ENOENT)
5152 inode = NULL;
5153 else
5154 return ERR_CAST(inode);
5155 }
5156
5157 return d_materialise_unique(dentry, inode);
4988} 5158}
4989 5159
4990unsigned char btrfs_filetype_table[] = { 5160unsigned char btrfs_filetype_table[] = {
@@ -5354,7 +5524,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5354 u32 sizes[2]; 5524 u32 sizes[2];
5355 unsigned long ptr; 5525 unsigned long ptr;
5356 int ret; 5526 int ret;
5357 int owner;
5358 5527
5359 path = btrfs_alloc_path(); 5528 path = btrfs_alloc_path();
5360 if (!path) 5529 if (!path)
@@ -5388,6 +5557,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5388 * number 5557 * number
5389 */ 5558 */
5390 BTRFS_I(inode)->index_cnt = 2; 5559 BTRFS_I(inode)->index_cnt = 2;
5560 BTRFS_I(inode)->dir_index = *index;
5391 BTRFS_I(inode)->root = root; 5561 BTRFS_I(inode)->root = root;
5392 BTRFS_I(inode)->generation = trans->transid; 5562 BTRFS_I(inode)->generation = trans->transid;
5393 inode->i_generation = BTRFS_I(inode)->generation; 5563 inode->i_generation = BTRFS_I(inode)->generation;
@@ -5400,11 +5570,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5400 */ 5570 */
5401 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5571 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5402 5572
5403 if (S_ISDIR(mode))
5404 owner = 0;
5405 else
5406 owner = 1;
5407
5408 key[0].objectid = objectid; 5573 key[0].objectid = objectid;
5409 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5574 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5410 key[0].offset = 0; 5575 key[0].offset = 0;
@@ -5469,6 +5634,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5469 5634
5470 btrfs_update_root_times(trans, root); 5635 btrfs_update_root_times(trans, root);
5471 5636
5637 ret = btrfs_inode_inherit_props(trans, inode, dir);
5638 if (ret)
5639 btrfs_err(root->fs_info,
5640 "error inheriting props for ino %llu (root %llu): %d",
5641 btrfs_ino(inode), root->root_key.objectid, ret);
5642
5472 return inode; 5643 return inode;
5473fail: 5644fail:
5474 if (dir) 5645 if (dir)
@@ -5737,6 +5908,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5737 goto fail; 5908 goto fail;
5738 } 5909 }
5739 5910
5911 /* The inode now has several dir indexes, so clear the cached one. */
5912 BTRFS_I(inode)->dir_index = 0ULL;
5740 inc_nlink(inode); 5913 inc_nlink(inode);
5741 inode_inc_iversion(inode); 5914 inode_inc_iversion(inode);
5742 inode->i_ctime = CURRENT_TIME; 5915 inode->i_ctime = CURRENT_TIME;
@@ -6000,7 +6173,7 @@ again:
6000 btrfs_file_extent_num_bytes(leaf, item); 6173 btrfs_file_extent_num_bytes(leaf, item);
6001 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6174 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6002 size_t size; 6175 size_t size;
6003 size = btrfs_file_extent_inline_len(leaf, item); 6176 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6004 extent_end = ALIGN(extent_start + size, root->sectorsize); 6177 extent_end = ALIGN(extent_start + size, root->sectorsize);
6005 } 6178 }
6006next: 6179next:
@@ -6069,7 +6242,7 @@ next:
6069 goto out; 6242 goto out;
6070 } 6243 }
6071 6244
6072 size = btrfs_file_extent_inline_len(leaf, item); 6245 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6073 extent_offset = page_offset(page) + pg_offset - extent_start; 6246 extent_offset = page_offset(page) + pg_offset - extent_start;
6074 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6247 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6075 size - extent_offset); 6248 size - extent_offset);
@@ -6386,6 +6559,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6386 int slot; 6559 int slot;
6387 int found_type; 6560 int found_type;
6388 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 6561 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6562
6389 path = btrfs_alloc_path(); 6563 path = btrfs_alloc_path();
6390 if (!path) 6564 if (!path)
6391 return -ENOMEM; 6565 return -ENOMEM;
@@ -6429,6 +6603,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6429 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 6603 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6430 goto out; 6604 goto out;
6431 6605
6606 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6607 if (extent_end <= offset)
6608 goto out;
6609
6432 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6610 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6433 if (disk_bytenr == 0) 6611 if (disk_bytenr == 0)
6434 goto out; 6612 goto out;
@@ -6446,8 +6624,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6446 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6624 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6447 } 6625 }
6448 6626
6449 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6450
6451 if (btrfs_extent_readonly(root, disk_bytenr)) 6627 if (btrfs_extent_readonly(root, disk_bytenr))
6452 goto out; 6628 goto out;
6453 btrfs_release_path(path); 6629 btrfs_release_path(path);
@@ -6779,17 +6955,16 @@ unlock_err:
6779static void btrfs_endio_direct_read(struct bio *bio, int err) 6955static void btrfs_endio_direct_read(struct bio *bio, int err)
6780{ 6956{
6781 struct btrfs_dio_private *dip = bio->bi_private; 6957 struct btrfs_dio_private *dip = bio->bi_private;
6782 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 6958 struct bio_vec *bvec;
6783 struct bio_vec *bvec = bio->bi_io_vec;
6784 struct inode *inode = dip->inode; 6959 struct inode *inode = dip->inode;
6785 struct btrfs_root *root = BTRFS_I(inode)->root; 6960 struct btrfs_root *root = BTRFS_I(inode)->root;
6786 struct bio *dio_bio; 6961 struct bio *dio_bio;
6787 u32 *csums = (u32 *)dip->csum; 6962 u32 *csums = (u32 *)dip->csum;
6788 int index = 0;
6789 u64 start; 6963 u64 start;
6964 int i;
6790 6965
6791 start = dip->logical_offset; 6966 start = dip->logical_offset;
6792 do { 6967 bio_for_each_segment_all(bvec, bio, i) {
6793 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 6968 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6794 struct page *page = bvec->bv_page; 6969 struct page *page = bvec->bv_page;
6795 char *kaddr; 6970 char *kaddr;
@@ -6805,18 +6980,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
6805 local_irq_restore(flags); 6980 local_irq_restore(flags);
6806 6981
6807 flush_dcache_page(bvec->bv_page); 6982 flush_dcache_page(bvec->bv_page);
6808 if (csum != csums[index]) { 6983 if (csum != csums[i]) {
6809 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 6984 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
6810 btrfs_ino(inode), start, csum, 6985 btrfs_ino(inode), start, csum,
6811 csums[index]); 6986 csums[i]);
6812 err = -EIO; 6987 err = -EIO;
6813 } 6988 }
6814 } 6989 }
6815 6990
6816 start += bvec->bv_len; 6991 start += bvec->bv_len;
6817 bvec++; 6992 }
6818 index++;
6819 } while (bvec <= bvec_end);
6820 6993
6821 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 6994 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6822 dip->logical_offset + dip->bytes - 1); 6995 dip->logical_offset + dip->bytes - 1);
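
The conversion above replaces manual bvec pointer and index bookkeeping with bio_for_each_segment_all(), which walks every segment and hands back the loop index that the csum array is indexed by. The sketch below imitates that shape on a plain array; the macro here is a hypothetical stand-in, not the kernel's definition:

	#include <stdio.h>

	struct vec { int len; };

	/* hypothetical for-each, modeled on bio_for_each_segment_all():
	 * it yields the element and the running index in one construct */
	#define for_each_vec(v, arr, n, i) \
		for ((i) = 0; (i) < (n) && ((v) = &(arr)[(i)]); (i)++)

	int main(void)
	{
		struct vec segs[3] = { {512}, {1024}, {512} };
		struct vec *v;
		long start = 0;
		int i;

		for_each_vec(v, segs, 3, i) {
			printf("segment %d at offset %ld, len %d\n",
			       i, start, v->len);
			start += v->len;	/* same running offset as `start` above */
		}
		return 0;
	}
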
@@ -6894,10 +7067,11 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
6894 struct btrfs_dio_private *dip = bio->bi_private; 7067 struct btrfs_dio_private *dip = bio->bi_private;
6895 7068
6896 if (err) { 7069 if (err) {
6897 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 7070 btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
6898 "sector %#Lx len %u err no %d\n", 7071 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
6899 btrfs_ino(dip->inode), bio->bi_rw, 7072 btrfs_ino(dip->inode), bio->bi_rw,
6900 (unsigned long long)bio->bi_sector, bio->bi_size, err); 7073 (unsigned long long)bio->bi_iter.bi_sector,
7074 bio->bi_iter.bi_size, err);
6901 dip->errors = 1; 7075 dip->errors = 1;
6902 7076
6903 /* 7077 /*
@@ -6988,7 +7162,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6988 struct bio *bio; 7162 struct bio *bio;
6989 struct bio *orig_bio = dip->orig_bio; 7163 struct bio *orig_bio = dip->orig_bio;
6990 struct bio_vec *bvec = orig_bio->bi_io_vec; 7164 struct bio_vec *bvec = orig_bio->bi_io_vec;
6991 u64 start_sector = orig_bio->bi_sector; 7165 u64 start_sector = orig_bio->bi_iter.bi_sector;
6992 u64 file_offset = dip->logical_offset; 7166 u64 file_offset = dip->logical_offset;
6993 u64 submit_len = 0; 7167 u64 submit_len = 0;
6994 u64 map_length; 7168 u64 map_length;
@@ -6996,7 +7170,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6996 int ret = 0; 7170 int ret = 0;
6997 int async_submit = 0; 7171 int async_submit = 0;
6998 7172
6999 map_length = orig_bio->bi_size; 7173 map_length = orig_bio->bi_iter.bi_size;
7000 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 7174 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7001 &map_length, NULL, 0); 7175 &map_length, NULL, 0);
7002 if (ret) { 7176 if (ret) {
@@ -7004,7 +7178,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7004 return -EIO; 7178 return -EIO;
7005 } 7179 }
7006 7180
7007 if (map_length >= orig_bio->bi_size) { 7181 if (map_length >= orig_bio->bi_iter.bi_size) {
7008 bio = orig_bio; 7182 bio = orig_bio;
7009 goto submit; 7183 goto submit;
7010 } 7184 }
@@ -7056,7 +7230,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7056 bio->bi_private = dip; 7230 bio->bi_private = dip;
7057 bio->bi_end_io = btrfs_end_dio_bio; 7231 bio->bi_end_io = btrfs_end_dio_bio;
7058 7232
7059 map_length = orig_bio->bi_size; 7233 map_length = orig_bio->bi_iter.bi_size;
7060 ret = btrfs_map_block(root->fs_info, rw, 7234 ret = btrfs_map_block(root->fs_info, rw,
7061 start_sector << 9, 7235 start_sector << 9,
7062 &map_length, NULL, 0); 7236 &map_length, NULL, 0);
@@ -7114,7 +7288,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7114 7288
7115 if (!skip_sum && !write) { 7289 if (!skip_sum && !write) {
7116 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 7290 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7117 sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; 7291 sum_len = dio_bio->bi_iter.bi_size >>
7292 inode->i_sb->s_blocksize_bits;
7118 sum_len *= csum_size; 7293 sum_len *= csum_size;
7119 } else { 7294 } else {
7120 sum_len = 0; 7295 sum_len = 0;
@@ -7129,8 +7304,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7129 dip->private = dio_bio->bi_private; 7304 dip->private = dio_bio->bi_private;
7130 dip->inode = inode; 7305 dip->inode = inode;
7131 dip->logical_offset = file_offset; 7306 dip->logical_offset = file_offset;
7132 dip->bytes = dio_bio->bi_size; 7307 dip->bytes = dio_bio->bi_iter.bi_size;
7133 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7308 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7134 io_bio->bi_private = dip; 7309 io_bio->bi_private = dip;
7135 dip->errors = 0; 7310 dip->errors = 0;
7136 dip->orig_bio = io_bio; 7311 dip->orig_bio = io_bio;
@@ -7367,6 +7542,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7367 struct extent_state *cached_state = NULL; 7542 struct extent_state *cached_state = NULL;
7368 u64 page_start = page_offset(page); 7543 u64 page_start = page_offset(page);
7369 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 7544 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7545 int inode_evicting = inode->i_state & I_FREEING;
7370 7546
7371 /* 7547 /*
7372 * we have the page locked, so new writeback can't start, 7548 * we have the page locked, so new writeback can't start,
@@ -7382,17 +7558,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7382 btrfs_releasepage(page, GFP_NOFS); 7558 btrfs_releasepage(page, GFP_NOFS);
7383 return; 7559 return;
7384 } 7560 }
7385 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7561
7386 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); 7562 if (!inode_evicting)
7563 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7564 ordered = btrfs_lookup_ordered_extent(inode, page_start);
7387 if (ordered) { 7565 if (ordered) {
7388 /* 7566 /*
7389 * IO on this page will never be started, so we need 7567 * IO on this page will never be started, so we need
7390 * to account for any ordered extents now 7568 * to account for any ordered extents now
7391 */ 7569 */
7392 clear_extent_bit(tree, page_start, page_end, 7570 if (!inode_evicting)
7393 EXTENT_DIRTY | EXTENT_DELALLOC | 7571 clear_extent_bit(tree, page_start, page_end,
7394 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7572 EXTENT_DIRTY | EXTENT_DELALLOC |
7395 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); 7573 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7574 EXTENT_DEFRAG, 1, 0, &cached_state,
7575 GFP_NOFS);
7396 /* 7576 /*
7397 * whoever cleared the private bit is responsible 7577 * whoever cleared the private bit is responsible
7398 * for the finish_ordered_io 7578 * for the finish_ordered_io
@@ -7416,14 +7596,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7416 btrfs_finish_ordered_io(ordered); 7596 btrfs_finish_ordered_io(ordered);
7417 } 7597 }
7418 btrfs_put_ordered_extent(ordered); 7598 btrfs_put_ordered_extent(ordered);
7419 cached_state = NULL; 7599 if (!inode_evicting) {
7420 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7600 cached_state = NULL;
7601 lock_extent_bits(tree, page_start, page_end, 0,
7602 &cached_state);
7603 }
7604 }
7605
7606 if (!inode_evicting) {
7607 clear_extent_bit(tree, page_start, page_end,
7608 EXTENT_LOCKED | EXTENT_DIRTY |
7609 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7610 EXTENT_DEFRAG, 1, 1,
7611 &cached_state, GFP_NOFS);
7612
7613 __btrfs_releasepage(page, GFP_NOFS);
7421 } 7614 }
7422 clear_extent_bit(tree, page_start, page_end,
7423 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7424 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7425 &cached_state, GFP_NOFS);
7426 __btrfs_releasepage(page, GFP_NOFS);
7427 7615
7428 ClearPageChecked(page); 7616 ClearPageChecked(page);
7429 if (PagePrivate(page)) { 7617 if (PagePrivate(page)) {
@@ -7733,7 +7921,9 @@ out:
7733 * create a new subvolume directory/inode (helper for the ioctl). 7921 * create a new subvolume directory/inode (helper for the ioctl).
7734 */ 7922 */
7735int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 7923int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7736 struct btrfs_root *new_root, u64 new_dirid) 7924 struct btrfs_root *new_root,
7925 struct btrfs_root *parent_root,
7926 u64 new_dirid)
7737{ 7927{
7738 struct inode *inode; 7928 struct inode *inode;
7739 int err; 7929 int err;
@@ -7751,6 +7941,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7751 set_nlink(inode, 1); 7941 set_nlink(inode, 1);
7752 btrfs_i_size_write(inode, 0); 7942 btrfs_i_size_write(inode, 0);
7753 7943
7944 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
7945 if (err)
7946 btrfs_err(new_root->fs_info,
7947 "error inheriting subvolume %llu properties: %d\n",
7948 new_root->root_key.objectid, err);
7949
7754 err = btrfs_update_inode(trans, new_root, inode); 7950 err = btrfs_update_inode(trans, new_root, inode);
7755 7951
7756 iput(inode); 7952 iput(inode);
@@ -7776,6 +7972,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7776 ei->flags = 0; 7972 ei->flags = 0;
7777 ei->csum_bytes = 0; 7973 ei->csum_bytes = 0;
7778 ei->index_cnt = (u64)-1; 7974 ei->index_cnt = (u64)-1;
7975 ei->dir_index = 0;
7779 ei->last_unlink_trans = 0; 7976 ei->last_unlink_trans = 0;
7780 ei->last_log_commit = 0; 7977 ei->last_log_commit = 0;
7781 7978
@@ -8063,6 +8260,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8063 if (ret) 8260 if (ret)
8064 goto out_fail; 8261 goto out_fail;
8065 8262
8263 BTRFS_I(old_inode)->dir_index = 0ULL;
8066 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8264 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8067 /* force full log commit if subvolume involved. */ 8265 /* force full log commit if subvolume involved. */
8068 root->fs_info->last_trans_log_full_commit = trans->transid; 8266 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -8151,6 +8349,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8151 goto out_fail; 8349 goto out_fail;
8152 } 8350 }
8153 8351
8352 if (old_inode->i_nlink == 1)
8353 BTRFS_I(old_inode)->dir_index = index;
8354
8154 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 8355 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8155 struct dentry *parent = new_dentry->d_parent; 8356 struct dentry *parent = new_dentry->d_parent;
8156 btrfs_log_new_name(trans, old_inode, old_dir, parent); 8357 btrfs_log_new_name(trans, old_inode, old_dir, parent);
@@ -8286,7 +8487,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8286{ 8487{
8287 int ret; 8488 int ret;
8288 8489
8289 if (root->fs_info->sb->s_flags & MS_RDONLY) 8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8290 return -EROFS; 8491 return -EROFS;
8291 8492
8292 ret = __start_delalloc_inodes(root, delay_iput); 8493 ret = __start_delalloc_inodes(root, delay_iput);
@@ -8312,7 +8513,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8312 struct list_head splice; 8513 struct list_head splice;
8313 int ret; 8514 int ret;
8314 8515
8315 if (fs_info->sb->s_flags & MS_RDONLY) 8516 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
8316 return -EROFS; 8517 return -EROFS;
8317 8518
8318 INIT_LIST_HEAD(&splice); 8519 INIT_LIST_HEAD(&splice);
@@ -8649,12 +8850,14 @@ static const struct inode_operations btrfs_dir_inode_operations = {
8649 .removexattr = btrfs_removexattr, 8850 .removexattr = btrfs_removexattr,
8650 .permission = btrfs_permission, 8851 .permission = btrfs_permission,
8651 .get_acl = btrfs_get_acl, 8852 .get_acl = btrfs_get_acl,
8853 .set_acl = btrfs_set_acl,
8652 .update_time = btrfs_update_time, 8854 .update_time = btrfs_update_time,
8653}; 8855};
8654static const struct inode_operations btrfs_dir_ro_inode_operations = { 8856static const struct inode_operations btrfs_dir_ro_inode_operations = {
8655 .lookup = btrfs_lookup, 8857 .lookup = btrfs_lookup,
8656 .permission = btrfs_permission, 8858 .permission = btrfs_permission,
8657 .get_acl = btrfs_get_acl, 8859 .get_acl = btrfs_get_acl,
8860 .set_acl = btrfs_set_acl,
8658 .update_time = btrfs_update_time, 8861 .update_time = btrfs_update_time,
8659}; 8862};
8660 8863
@@ -8724,6 +8927,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
8724 .permission = btrfs_permission, 8927 .permission = btrfs_permission,
8725 .fiemap = btrfs_fiemap, 8928 .fiemap = btrfs_fiemap,
8726 .get_acl = btrfs_get_acl, 8929 .get_acl = btrfs_get_acl,
8930 .set_acl = btrfs_set_acl,
8727 .update_time = btrfs_update_time, 8931 .update_time = btrfs_update_time,
8728}; 8932};
8729static const struct inode_operations btrfs_special_inode_operations = { 8933static const struct inode_operations btrfs_special_inode_operations = {
@@ -8735,6 +8939,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
8735 .listxattr = btrfs_listxattr, 8939 .listxattr = btrfs_listxattr,
8736 .removexattr = btrfs_removexattr, 8940 .removexattr = btrfs_removexattr,
8737 .get_acl = btrfs_get_acl, 8941 .get_acl = btrfs_get_acl,
8942 .set_acl = btrfs_set_acl,
8738 .update_time = btrfs_update_time, 8943 .update_time = btrfs_update_time,
8739}; 8944};
8740static const struct inode_operations btrfs_symlink_inode_operations = { 8945static const struct inode_operations btrfs_symlink_inode_operations = {
@@ -8748,7 +8953,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
8748 .getxattr = btrfs_getxattr, 8953 .getxattr = btrfs_getxattr,
8749 .listxattr = btrfs_listxattr, 8954 .listxattr = btrfs_listxattr,
8750 .removexattr = btrfs_removexattr, 8955 .removexattr = btrfs_removexattr,
8751 .get_acl = btrfs_get_acl,
8752 .update_time = btrfs_update_time, 8956 .update_time = btrfs_update_time,
8753}; 8957};
8754 8958
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 21da5762b0b1..a6d8efa46bfe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -56,6 +56,8 @@
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h" 58#include "dev-replace.h"
59#include "props.h"
60#include "sysfs.h"
59 61
60static int btrfs_clone(struct inode *src, struct inode *inode, 62static int btrfs_clone(struct inode *src, struct inode *inode,
61 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 63 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -190,6 +192,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
190 unsigned int i_oldflags; 192 unsigned int i_oldflags;
191 umode_t mode; 193 umode_t mode;
192 194
195 if (!inode_owner_or_capable(inode))
196 return -EPERM;
197
193 if (btrfs_root_readonly(root)) 198 if (btrfs_root_readonly(root))
194 return -EROFS; 199 return -EROFS;
195 200
@@ -200,9 +205,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
200 if (ret) 205 if (ret)
201 return ret; 206 return ret;
202 207
203 if (!inode_owner_or_capable(inode))
204 return -EACCES;
205
206 ret = mnt_want_write_file(file); 208 ret = mnt_want_write_file(file);
207 if (ret) 209 if (ret)
208 return ret; 210 return ret;
@@ -280,9 +282,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
280 if (flags & FS_NOCOMP_FL) { 282 if (flags & FS_NOCOMP_FL) {
281 ip->flags &= ~BTRFS_INODE_COMPRESS; 283 ip->flags &= ~BTRFS_INODE_COMPRESS;
282 ip->flags |= BTRFS_INODE_NOCOMPRESS; 284 ip->flags |= BTRFS_INODE_NOCOMPRESS;
285
286 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
287 if (ret && ret != -ENODATA)
288 goto out_drop;
283 } else if (flags & FS_COMPR_FL) { 289 } else if (flags & FS_COMPR_FL) {
290 const char *comp;
291
284 ip->flags |= BTRFS_INODE_COMPRESS; 292 ip->flags |= BTRFS_INODE_COMPRESS;
285 ip->flags &= ~BTRFS_INODE_NOCOMPRESS; 293 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
294
295 if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
296 comp = "lzo";
297 else
298 comp = "zlib";
299 ret = btrfs_set_prop(inode, "btrfs.compression",
300 comp, strlen(comp), 0);
301 if (ret)
302 goto out_drop;
303
286 } else { 304 } else {
287 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 305 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
288 } 306 }
@@ -392,6 +410,7 @@ static noinline int create_subvol(struct inode *dir,
392 struct btrfs_root *new_root; 410 struct btrfs_root *new_root;
393 struct btrfs_block_rsv block_rsv; 411 struct btrfs_block_rsv block_rsv;
394 struct timespec cur_time = CURRENT_TIME; 412 struct timespec cur_time = CURRENT_TIME;
413 struct inode *inode;
395 int ret; 414 int ret;
396 int err; 415 int err;
397 u64 objectid; 416 u64 objectid;
@@ -417,7 +436,9 @@ static noinline int create_subvol(struct inode *dir,
417 trans = btrfs_start_transaction(root, 0); 436 trans = btrfs_start_transaction(root, 0);
418 if (IS_ERR(trans)) { 437 if (IS_ERR(trans)) {
419 ret = PTR_ERR(trans); 438 ret = PTR_ERR(trans);
420 goto out; 439 btrfs_subvolume_release_metadata(root, &block_rsv,
440 qgroup_reserved);
441 return ret;
421 } 442 }
422 trans->block_rsv = &block_rsv; 443 trans->block_rsv = &block_rsv;
423 trans->bytes_reserved = block_rsv.size; 444 trans->bytes_reserved = block_rsv.size;
@@ -500,7 +521,7 @@ static noinline int create_subvol(struct inode *dir,
500 521
501 btrfs_record_root_in_trans(trans, new_root); 522 btrfs_record_root_in_trans(trans, new_root);
502 523
503 ret = btrfs_create_subvol_root(trans, new_root, new_dirid); 524 ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
504 if (ret) { 525 if (ret) {
505 /* We potentially lose an unused inode item here */ 526 /* We potentially lose an unused inode item here */
506 btrfs_abort_transaction(trans, root, ret); 527 btrfs_abort_transaction(trans, root, ret);
@@ -542,6 +563,8 @@ static noinline int create_subvol(struct inode *dir,
542fail: 563fail:
543 trans->block_rsv = NULL; 564 trans->block_rsv = NULL;
544 trans->bytes_reserved = 0; 565 trans->bytes_reserved = 0;
566 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
567
545 if (async_transid) { 568 if (async_transid) {
546 *async_transid = trans->transid; 569 *async_transid = trans->transid;
547 err = btrfs_commit_transaction_async(trans, root, 1); 570 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -553,10 +576,12 @@ fail:
553 if (err && !ret) 576 if (err && !ret)
554 ret = err; 577 ret = err;
555 578
556 if (!ret) 579 if (!ret) {
557 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 580 inode = btrfs_lookup_dentry(dir, dentry);
558out: 581 if (IS_ERR(inode))
559 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 582 return PTR_ERR(inode);
583 d_instantiate(dentry, inode);
584 }
560 return ret; 585 return ret;
561} 586}
562 587
@@ -642,7 +667,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
642 ret = PTR_ERR(inode); 667 ret = PTR_ERR(inode);
643 goto fail; 668 goto fail;
644 } 669 }
645 BUG_ON(!inode); 670
646 d_instantiate(dentry, inode); 671 d_instantiate(dentry, inode);
647 ret = 0; 672 ret = 0;
648fail: 673fail:
@@ -1011,7 +1036,7 @@ out:
1011static int cluster_pages_for_defrag(struct inode *inode, 1036static int cluster_pages_for_defrag(struct inode *inode,
1012 struct page **pages, 1037 struct page **pages,
1013 unsigned long start_index, 1038 unsigned long start_index,
1014 int num_pages) 1039 unsigned long num_pages)
1015{ 1040{
1016 unsigned long file_end; 1041 unsigned long file_end;
1017 u64 isize = i_size_read(inode); 1042 u64 isize = i_size_read(inode);
@@ -1169,8 +1194,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1169 int defrag_count = 0; 1194 int defrag_count = 0;
1170 int compress_type = BTRFS_COMPRESS_ZLIB; 1195 int compress_type = BTRFS_COMPRESS_ZLIB;
1171 int extent_thresh = range->extent_thresh; 1196 int extent_thresh = range->extent_thresh;
1172 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1197 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1173 int cluster = max_cluster; 1198 unsigned long cluster = max_cluster;
1174 u64 new_align = ~((u64)128 * 1024 - 1); 1199 u64 new_align = ~((u64)128 * 1024 - 1);
1175 struct page **pages = NULL; 1200 struct page **pages = NULL;
1176 1201
@@ -1254,7 +1279,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1254 break; 1279 break;
1255 1280
1256 if (btrfs_defrag_cancelled(root->fs_info)) { 1281 if (btrfs_defrag_cancelled(root->fs_info)) {
1257 printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); 1282 printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
1258 ret = -EAGAIN; 1283 ret = -EAGAIN;
1259 break; 1284 break;
1260 } 1285 }
@@ -1416,20 +1441,20 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1416 ret = -EINVAL; 1441 ret = -EINVAL;
1417 goto out_free; 1442 goto out_free;
1418 } 1443 }
1419 printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); 1444 btrfs_info(root->fs_info, "resizing devid %llu", devid);
1420 } 1445 }
1421 1446
1422 device = btrfs_find_device(root->fs_info, devid, NULL, NULL); 1447 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1423 if (!device) { 1448 if (!device) {
1424 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1449 btrfs_info(root->fs_info, "resizer unable to find device %llu",
1425 devid); 1450 devid);
1426 ret = -ENODEV; 1451 ret = -ENODEV;
1427 goto out_free; 1452 goto out_free;
1428 } 1453 }
1429 1454
1430 if (!device->writeable) { 1455 if (!device->writeable) {
1431 printk(KERN_INFO "btrfs: resizer unable to apply on " 1456 btrfs_info(root->fs_info,
1432 "readonly device %llu\n", 1457 "resizer unable to apply on readonly device %llu",
1433 devid); 1458 devid);
1434 ret = -EPERM; 1459 ret = -EPERM;
1435 goto out_free; 1460 goto out_free;
@@ -1466,6 +1491,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1466 } 1491 }
1467 new_size = old_size - new_size; 1492 new_size = old_size - new_size;
1468 } else if (mod > 0) { 1493 } else if (mod > 0) {
1494 if (new_size > ULLONG_MAX - old_size) {
1495 ret = -EINVAL;
1496 goto out_free;
1497 }
1469 new_size = old_size + new_size; 1498 new_size = old_size + new_size;
1470 } 1499 }
1471 1500
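
The new check guards the grow case against u64 wrap-around: old_size + new_size could overflow and sail past the later validation with a tiny bogus size. The standard test, demonstrated standalone (the helper name is invented for the example):

	#include <limits.h>
	#include <stdio.h>

	/* hypothetical helper mirroring the guard: reject an addition
	 * that would wrap instead of letting it silently truncate */
	static int add_u64_checked(unsigned long long a, unsigned long long b,
				   unsigned long long *sum)
	{
		if (b > ULLONG_MAX - a)
			return -1;	/* mirrors the -EINVAL path above */
		*sum = a + b;
		return 0;
	}

	int main(void)
	{
		unsigned long long size;

		if (add_u64_checked(ULLONG_MAX - 5, 10, &size))
			printf("rejected: grow amount would overflow u64\n");
		if (!add_u64_checked(100, 10, &size))
			printf("ok: new size %llu\n", size);
		return 0;
	}
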
@@ -1481,7 +1510,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1481 do_div(new_size, root->sectorsize); 1510 do_div(new_size, root->sectorsize);
1482 new_size *= root->sectorsize; 1511 new_size *= root->sectorsize;
1483 1512
1484 printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", 1513 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
1485 rcu_str_deref(device->name), new_size); 1514 rcu_str_deref(device->name), new_size);
1486 1515
1487 if (new_size > old_size) { 1516 if (new_size > old_size) {
@@ -1542,9 +1571,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1542 1571
1543 src_inode = file_inode(src.file); 1572 src_inode = file_inode(src.file);
1544 if (src_inode->i_sb != file_inode(file)->i_sb) { 1573 if (src_inode->i_sb != file_inode(file)->i_sb) {
1545 printk(KERN_INFO "btrfs: Snapshot src from " 1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1546 "another FS\n"); 1575 "Snapshot src from another FS");
1547 ret = -EINVAL; 1576 ret = -EINVAL;
1577 } else if (!inode_owner_or_capable(src_inode)) {
1578 /*
1579 * Subvolume creation is not restricted, but snapshots
1580 * are limited to own subvolumes only
1581 */
1582 ret = -EPERM;
1548 } else { 1583 } else {
1549 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1584 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1550 BTRFS_I(src_inode)->root, 1585 BTRFS_I(src_inode)->root,
@@ -1662,6 +1697,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1662 u64 flags; 1697 u64 flags;
1663 int ret = 0; 1698 int ret = 0;
1664 1699
1700 if (!inode_owner_or_capable(inode))
1701 return -EPERM;
1702
1665 ret = mnt_want_write_file(file); 1703 ret = mnt_want_write_file(file);
1666 if (ret) 1704 if (ret)
1667 goto out; 1705 goto out;
@@ -1686,11 +1724,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1686 goto out_drop_write; 1724 goto out_drop_write;
1687 } 1725 }
1688 1726
1689 if (!inode_owner_or_capable(inode)) {
1690 ret = -EACCES;
1691 goto out_drop_write;
1692 }
1693
1694 down_write(&root->fs_info->subvol_sem); 1727 down_write(&root->fs_info->subvol_sem);
1695 1728
1696 /* nothing to do */ 1729 /* nothing to do */
@@ -1698,12 +1731,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1698 goto out_drop_sem; 1731 goto out_drop_sem;
1699 1732
1700 root_flags = btrfs_root_flags(&root->root_item); 1733 root_flags = btrfs_root_flags(&root->root_item);
1701 if (flags & BTRFS_SUBVOL_RDONLY) 1734 if (flags & BTRFS_SUBVOL_RDONLY) {
1702 btrfs_set_root_flags(&root->root_item, 1735 btrfs_set_root_flags(&root->root_item,
1703 root_flags | BTRFS_ROOT_SUBVOL_RDONLY); 1736 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1704 else 1737 } else {
1705 btrfs_set_root_flags(&root->root_item, 1738 /*
1739 * Block RO -> RW transition if this subvolume is involved in
1740 * send
1741 */
1742 spin_lock(&root->root_item_lock);
1743 if (root->send_in_progress == 0) {
1744 btrfs_set_root_flags(&root->root_item,
1706 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); 1745 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1746 spin_unlock(&root->root_item_lock);
1747 } else {
1748 spin_unlock(&root->root_item_lock);
1749 btrfs_warn(root->fs_info,
1750 "Attempt to set subvolume %llu read-write during send",
1751 root->root_key.objectid);
1752 ret = -EPERM;
1753 goto out_drop_sem;
1754 }
1755 }
1707 1756
1708 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 1);
1709 if (IS_ERR(trans)) { 1758 if (IS_ERR(trans)) {
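
The RO -> RW gate only works because the send_in_progress check and the flag update happen under the same root_item_lock that the send code holds while bumping its counter; checking outside the lock would race with a sender starting up. A compact model of that check-then-commit pattern (toy globals, a pthread mutex in place of the spinlock):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t root_item_lock = PTHREAD_MUTEX_INITIALIZER;
	static int send_in_progress;
	static int rdonly = 1;

	static int set_rw(void)
	{
		int ret = 0;

		pthread_mutex_lock(&root_item_lock);
		if (send_in_progress == 0)
			rdonly = 0;	/* no sender can begin while we hold the lock */
		else
			ret = -1;	/* mirrors the -EPERM path above */
		pthread_mutex_unlock(&root_item_lock);
		return ret;
	}

	int main(void)
	{
		int ret;

		send_in_progress = 1;
		printf("set_rw during send: %d\n", set_rw());
		send_in_progress = 0;
		ret = set_rw();
		printf("set_rw when idle: %d (rdonly=%d)\n", ret, rdonly);
		return 0;
	}
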
@@ -1910,7 +1959,7 @@ static noinline int search_ioctl(struct inode *inode,
1910 key.offset = (u64)-1; 1959 key.offset = (u64)-1;
1911 root = btrfs_read_fs_root_no_name(info, &key); 1960 root = btrfs_read_fs_root_no_name(info, &key);
1912 if (IS_ERR(root)) { 1961 if (IS_ERR(root)) {
1913 printk(KERN_ERR "could not find root %llu\n", 1962 printk(KERN_ERR "BTRFS: could not find root %llu\n",
1914 sk->tree_id); 1963 sk->tree_id);
1915 btrfs_free_path(path); 1964 btrfs_free_path(path);
1916 return -ENOENT; 1965 return -ENOENT;
@@ -2000,7 +2049,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2000 key.offset = (u64)-1; 2049 key.offset = (u64)-1;
2001 root = btrfs_read_fs_root_no_name(info, &key); 2050 root = btrfs_read_fs_root_no_name(info, &key);
2002 if (IS_ERR(root)) { 2051 if (IS_ERR(root)) {
2003 printk(KERN_ERR "could not find root %llu\n", tree_id); 2052 printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
2004 ret = -ENOENT; 2053 ret = -ENOENT;
2005 goto out; 2054 goto out;
2006 } 2055 }
@@ -2686,14 +2735,11 @@ out_unlock:
2686#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) 2735#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
2687 2736
2688static long btrfs_ioctl_file_extent_same(struct file *file, 2737static long btrfs_ioctl_file_extent_same(struct file *file,
2689 void __user *argp) 2738 struct btrfs_ioctl_same_args __user *argp)
2690{ 2739{
2691 struct btrfs_ioctl_same_args tmp;
2692 struct btrfs_ioctl_same_args *same; 2740 struct btrfs_ioctl_same_args *same;
2693 struct btrfs_ioctl_same_extent_info *info; 2741 struct btrfs_ioctl_same_extent_info *info;
2694 struct inode *src = file->f_dentry->d_inode; 2742 struct inode *src = file_inode(file);
2695 struct file *dst_file = NULL;
2696 struct inode *dst;
2697 u64 off; 2743 u64 off;
2698 u64 len; 2744 u64 len;
2699 int i; 2745 int i;
@@ -2701,6 +2747,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2701 unsigned long size; 2747 unsigned long size;
2702 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 2748 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
2703 bool is_admin = capable(CAP_SYS_ADMIN); 2749 bool is_admin = capable(CAP_SYS_ADMIN);
2750 u16 count;
2704 2751
2705 if (!(file->f_mode & FMODE_READ)) 2752 if (!(file->f_mode & FMODE_READ))
2706 return -EINVAL; 2753 return -EINVAL;
@@ -2709,17 +2756,14 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2709 if (ret) 2756 if (ret)
2710 return ret; 2757 return ret;
2711 2758
2712 if (copy_from_user(&tmp, 2759 if (get_user(count, &argp->dest_count)) {
2713 (struct btrfs_ioctl_same_args __user *)argp,
2714 sizeof(tmp))) {
2715 ret = -EFAULT; 2760 ret = -EFAULT;
2716 goto out; 2761 goto out;
2717 } 2762 }
2718 2763
2719 size = sizeof(tmp) + 2764 size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
2720 tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
2721 2765
2722 same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); 2766 same = memdup_user(argp, size);
2723 2767
2724 if (IS_ERR(same)) { 2768 if (IS_ERR(same)) {
2725 ret = PTR_ERR(same); 2769 ret = PTR_ERR(same);
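
The rewritten size computation first reads only dest_count with get_user(), then sizes the whole variable-length argument with offsetof(..., info[count]) instead of the hand-rolled sizeof-plus-multiply. A standalone illustration with stand-in struct layouts (the field set is trimmed, and the variable index inside offsetof() relies on the common GCC/Clang behavior the kernel also assumes):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* stand-in layouts; the real structs live in the btrfs ioctl ABI */
	struct same_extent_info {
		int64_t fd;
		uint64_t logical_offset;
		int32_t status;
	};
	struct same_args {
		uint64_t logical_offset;
		uint64_t length;
		uint16_t dest_count;
		struct same_extent_info info[];	/* flexible array member */
	};

	int main(void)
	{
		uint16_t count = 3;
		/* header plus `count` trailing elements, in one expression */
		size_t size = offsetof(struct same_args, info[count]);

		printf("copy %zu bytes for %u destinations\n",
		       size, (unsigned)count);
		return 0;
	}
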
@@ -2756,52 +2800,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2756 goto out; 2800 goto out;
2757 2801
2758 /* pre-format output fields to sane values */ 2802 /* pre-format output fields to sane values */
2759 for (i = 0; i < same->dest_count; i++) { 2803 for (i = 0; i < count; i++) {
2760 same->info[i].bytes_deduped = 0ULL; 2804 same->info[i].bytes_deduped = 0ULL;
2761 same->info[i].status = 0; 2805 same->info[i].status = 0;
2762 } 2806 }
2763 2807
2764 ret = 0; 2808 for (i = 0, info = same->info; i < count; i++, info++) {
2765 for (i = 0; i < same->dest_count; i++) { 2809 struct inode *dst;
2766 info = &same->info[i]; 2810 struct fd dst_file = fdget(info->fd);
2767 2811 if (!dst_file.file) {
2768 dst_file = fget(info->fd);
2769 if (!dst_file) {
2770 info->status = -EBADF; 2812 info->status = -EBADF;
2771 goto next; 2813 continue;
2772 } 2814 }
2815 dst = file_inode(dst_file.file);
2773 2816
2774 if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { 2817 if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
2775 info->status = -EINVAL; 2818 info->status = -EINVAL;
2776 goto next; 2819 } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
2777 } 2820 info->status = -EXDEV;
2778 2821 } else if (S_ISDIR(dst->i_mode)) {
2779 info->status = -EXDEV;
2780 if (file->f_path.mnt != dst_file->f_path.mnt)
2781 goto next;
2782
2783 dst = dst_file->f_dentry->d_inode;
2784 if (src->i_sb != dst->i_sb)
2785 goto next;
2786
2787 if (S_ISDIR(dst->i_mode)) {
2788 info->status = -EISDIR; 2822 info->status = -EISDIR;
2789 goto next; 2823 } else if (!S_ISREG(dst->i_mode)) {
2790 }
2791
2792 if (!S_ISREG(dst->i_mode)) {
2793 info->status = -EACCES; 2824 info->status = -EACCES;
2794 goto next; 2825 } else {
2826 info->status = btrfs_extent_same(src, off, len, dst,
2827 info->logical_offset);
2828 if (info->status == 0)
2829 info->bytes_deduped += len;
2795 } 2830 }
2796 2831 fdput(dst_file);
2797 info->status = btrfs_extent_same(src, off, len, dst,
2798 info->logical_offset);
2799 if (info->status == 0)
2800 info->bytes_deduped += len;
2801
2802next:
2803 if (dst_file)
2804 fput(dst_file);
2805 } 2832 }
2806 2833
2807 ret = copy_to_user(argp, same, size); 2834 ret = copy_to_user(argp, same, size);
@@ -2860,12 +2887,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2860 * note the key will change type as we walk through the 2887 * note the key will change type as we walk through the
2861 * tree. 2888 * tree.
2862 */ 2889 */
2890 path->leave_spinning = 1;
2863 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 2891 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
2864 0, 0); 2892 0, 0);
2865 if (ret < 0) 2893 if (ret < 0)
2866 goto out; 2894 goto out;
2867 2895
2868 nritems = btrfs_header_nritems(path->nodes[0]); 2896 nritems = btrfs_header_nritems(path->nodes[0]);
2897process_slot:
2869 if (path->slots[0] >= nritems) { 2898 if (path->slots[0] >= nritems) {
2870 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 2899 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2871 if (ret < 0) 2900 if (ret < 0)
@@ -2892,11 +2921,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2892 u8 comp; 2921 u8 comp;
2893 u64 endoff; 2922 u64 endoff;
2894 2923
2895 size = btrfs_item_size_nr(leaf, slot);
2896 read_extent_buffer(leaf, buf,
2897 btrfs_item_ptr_offset(leaf, slot),
2898 size);
2899
2900 extent = btrfs_item_ptr(leaf, slot, 2924 extent = btrfs_item_ptr(leaf, slot,
2901 struct btrfs_file_extent_item); 2925 struct btrfs_file_extent_item);
2902 comp = btrfs_file_extent_compression(leaf, extent); 2926 comp = btrfs_file_extent_compression(leaf, extent);
@@ -2915,11 +2939,20 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2915 datal = btrfs_file_extent_ram_bytes(leaf, 2939 datal = btrfs_file_extent_ram_bytes(leaf,
2916 extent); 2940 extent);
2917 } 2941 }
2918 btrfs_release_path(path);
2919 2942
2920 if (key.offset + datal <= off || 2943 if (key.offset + datal <= off ||
2921 key.offset >= off + len - 1) 2944 key.offset >= off + len - 1) {
2922 goto next; 2945 path->slots[0]++;
2946 goto process_slot;
2947 }
2948
2949 size = btrfs_item_size_nr(leaf, slot);
2950 read_extent_buffer(leaf, buf,
2951 btrfs_item_ptr_offset(leaf, slot),
2952 size);
2953
2954 btrfs_release_path(path);
2955 path->leave_spinning = 0;
2923 2956
2924 memcpy(&new_key, &key, sizeof(new_key)); 2957 memcpy(&new_key, &key, sizeof(new_key));
2925 new_key.objectid = btrfs_ino(inode); 2958 new_key.objectid = btrfs_ino(inode);
@@ -3090,7 +3123,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3090 } 3123 }
3091 ret = btrfs_end_transaction(trans, root); 3124 ret = btrfs_end_transaction(trans, root);
3092 } 3125 }
3093next:
3094 btrfs_release_path(path); 3126 btrfs_release_path(path);
3095 key.offset++; 3127 key.offset++;
3096 } 3128 }
@@ -3218,9 +3250,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3218 3250
3219 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3251 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
3220out_unlock: 3252out_unlock:
3221 mutex_unlock(&src->i_mutex); 3253 if (!same_inode) {
3222 if (!same_inode) 3254 if (inode < src) {
3223 mutex_unlock(&inode->i_mutex); 3255 mutex_unlock(&src->i_mutex);
3256 mutex_unlock(&inode->i_mutex);
3257 } else {
3258 mutex_unlock(&inode->i_mutex);
3259 mutex_unlock(&src->i_mutex);
3260 }
3261 } else {
3262 mutex_unlock(&src->i_mutex);
3263 }
3224out_fput: 3264out_fput:
3225 fdput(src_file); 3265 fdput(src_file);
3226out_drop_write: 3266out_drop_write:
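
The unlock rework mirrors a lock-ordering rule: when two distinct inodes are involved, their mutexes are taken and released in a fixed order derived from the pointer values, so two clones racing over the same pair cannot deadlock. A minimal sketch of the convention (ordering unrelated pointers with < is the usual kernel idiom, though technically unspecified in portable C):

	#include <pthread.h>
	#include <stdio.h>

	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a > b) { pthread_mutex_t *t = a; a = b; b = t; }
		pthread_mutex_lock(a);
		if (a != b)
			pthread_mutex_lock(b);
	}

	static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a > b) { pthread_mutex_t *t = a; a = b; b = t; }
		if (a != b)
			pthread_mutex_unlock(b);	/* reverse order */
		pthread_mutex_unlock(a);
	}

	int main(void)
	{
		pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
		pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

		lock_pair(&m2, &m1);	/* argument order no longer matters */
		unlock_pair(&m2, &m1);
		printf("locked and unlocked without deadlock risk\n");
		return 0;
	}
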
@@ -3343,8 +3383,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
3343 if (IS_ERR_OR_NULL(di)) { 3383 if (IS_ERR_OR_NULL(di)) {
3344 btrfs_free_path(path); 3384 btrfs_free_path(path);
3345 btrfs_end_transaction(trans, root); 3385 btrfs_end_transaction(trans, root);
3346 printk(KERN_ERR "Umm, you don't have the default dir item, " 3386 btrfs_err(new_root->fs_info, "Umm, you don't have the default dir"
3347 "this isn't going to work\n"); 3387 "item, this isn't going to work");
3348 ret = -ENOENT; 3388 ret = -ENOENT;
3349 goto out; 3389 goto out;
3350 } 3390 }
@@ -4325,6 +4365,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4325 int ret = 0; 4365 int ret = 0;
4326 int received_uuid_changed; 4366 int received_uuid_changed;
4327 4367
4368 if (!inode_owner_or_capable(inode))
4369 return -EPERM;
4370
4328 ret = mnt_want_write_file(file); 4371 ret = mnt_want_write_file(file);
4329 if (ret < 0) 4372 if (ret < 0)
4330 return ret; 4373 return ret;
@@ -4341,11 +4384,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4341 goto out; 4384 goto out;
4342 } 4385 }
4343 4386
4344 if (!inode_owner_or_capable(inode)) {
4345 ret = -EACCES;
4346 goto out;
4347 }
4348
4349 sa = memdup_user(arg, sizeof(*sa)); 4387 sa = memdup_user(arg, sizeof(*sa));
4350 if (IS_ERR(sa)) { 4388 if (IS_ERR(sa)) {
4351 ret = PTR_ERR(sa); 4389 ret = PTR_ERR(sa);
@@ -4431,8 +4469,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4431 len = strnlen(label, BTRFS_LABEL_SIZE); 4469 len = strnlen(label, BTRFS_LABEL_SIZE);
4432 4470
4433 if (len == BTRFS_LABEL_SIZE) { 4471 if (len == BTRFS_LABEL_SIZE) {
4434 pr_warn("btrfs: label is too long, return the first %zu bytes\n", 4472 btrfs_warn(root->fs_info,
4435 --len); 4473 "label is too long, return the first %zu bytes", --len);
4436 } 4474 }
4437 4475
4438 ret = copy_to_user(arg, label, len); 4476 ret = copy_to_user(arg, label, len);
@@ -4455,7 +4493,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4455 return -EFAULT; 4493 return -EFAULT;
4456 4494
4457 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { 4495 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4458 pr_err("btrfs: unable to set label with more than %d bytes\n", 4496 btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
4459 BTRFS_LABEL_SIZE - 1); 4497 BTRFS_LABEL_SIZE - 1);
4460 return -EINVAL; 4498 return -EINVAL;
4461 } 4499 }
@@ -4473,13 +4511,173 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4473 spin_lock(&root->fs_info->super_lock); 4511 spin_lock(&root->fs_info->super_lock);
4474 strcpy(super_block->label, label); 4512 strcpy(super_block->label, label);
4475 spin_unlock(&root->fs_info->super_lock); 4513 spin_unlock(&root->fs_info->super_lock);
4476 ret = btrfs_end_transaction(trans, root); 4514 ret = btrfs_commit_transaction(trans, root);
4477 4515
4478out_unlock: 4516out_unlock:
4479 mnt_drop_write_file(file); 4517 mnt_drop_write_file(file);
4480 return ret; 4518 return ret;
4481} 4519}
4482 4520
4521#define INIT_FEATURE_FLAGS(suffix) \
4522 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
4523 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
4524 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
4525
4526static int btrfs_ioctl_get_supported_features(struct file *file,
4527 void __user *arg)
4528{
4529 static struct btrfs_ioctl_feature_flags features[3] = {
4530 INIT_FEATURE_FLAGS(SUPP),
4531 INIT_FEATURE_FLAGS(SAFE_SET),
4532 INIT_FEATURE_FLAGS(SAFE_CLEAR)
4533 };
4534
4535 if (copy_to_user(arg, &features, sizeof(features)))
4536 return -EFAULT;
4537
4538 return 0;
4539}
4540
4541static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
4542{
4543 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4544 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4545 struct btrfs_ioctl_feature_flags features;
4546
4547 features.compat_flags = btrfs_super_compat_flags(super_block);
4548 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
4549 features.incompat_flags = btrfs_super_incompat_flags(super_block);
4550
4551 if (copy_to_user(arg, &features, sizeof(features)))
4552 return -EFAULT;
4553
4554 return 0;
4555}
4556
4557static int check_feature_bits(struct btrfs_root *root,
4558 enum btrfs_feature_set set,
4559 u64 change_mask, u64 flags, u64 supported_flags,
4560 u64 safe_set, u64 safe_clear)
4561{
4562 const char *type = btrfs_feature_set_names[set];
4563 char *names;
4564 u64 disallowed, unsupported;
4565 u64 set_mask = flags & change_mask;
4566 u64 clear_mask = ~flags & change_mask;
4567
4568 unsupported = set_mask & ~supported_flags;
4569 if (unsupported) {
4570 names = btrfs_printable_features(set, unsupported);
4571 if (names) {
4572 btrfs_warn(root->fs_info,
4573 "this kernel does not support the %s feature bit%s",
4574 names, strchr(names, ',') ? "s" : "");
4575 kfree(names);
4576 } else
4577 btrfs_warn(root->fs_info,
4578 "this kernel does not support %s bits 0x%llx",
4579 type, unsupported);
4580 return -EOPNOTSUPP;
4581 }
4582
4583 disallowed = set_mask & ~safe_set;
4584 if (disallowed) {
4585 names = btrfs_printable_features(set, disallowed);
4586 if (names) {
4587 btrfs_warn(root->fs_info,
4588 "can't set the %s feature bit%s while mounted",
4589 names, strchr(names, ',') ? "s" : "");
4590 kfree(names);
4591 } else
4592 btrfs_warn(root->fs_info,
4593 "can't set %s bits 0x%llx while mounted",
4594 type, disallowed);
4595 return -EPERM;
4596 }
4597
4598 disallowed = clear_mask & ~safe_clear;
4599 if (disallowed) {
4600 names = btrfs_printable_features(set, disallowed);
4601 if (names) {
4602 btrfs_warn(root->fs_info,
4603 "can't clear the %s feature bit%s while mounted",
4604 names, strchr(names, ',') ? "s" : "");
4605 kfree(names);
4606 } else
4607 btrfs_warn(root->fs_info,
4608 "can't clear %s bits 0x%llx while mounted",
4609 type, disallowed);
4610 return -EPERM;
4611 }
4612
4613 return 0;
4614}
4615
4616#define check_feature(root, change_mask, flags, mask_base) \
4617check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
4618 BTRFS_FEATURE_ ## mask_base ## _SUPP, \
4619 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
4620 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
4621
4622static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
4623{
4624 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4625 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4626 struct btrfs_ioctl_feature_flags flags[2];
4627 struct btrfs_trans_handle *trans;
4628 u64 newflags;
4629 int ret;
4630
4631 if (!capable(CAP_SYS_ADMIN))
4632 return -EPERM;
4633
4634 if (copy_from_user(flags, arg, sizeof(flags)))
4635 return -EFAULT;
4636
4637 /* Nothing to do */
4638 if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
4639 !flags[0].incompat_flags)
4640 return 0;
4641
4642 ret = check_feature(root, flags[0].compat_flags,
4643 flags[1].compat_flags, COMPAT);
4644 if (ret)
4645 return ret;
4646
4647 ret = check_feature(root, flags[0].compat_ro_flags,
4648 flags[1].compat_ro_flags, COMPAT_RO);
4649 if (ret)
4650 return ret;
4651
4652 ret = check_feature(root, flags[0].incompat_flags,
4653 flags[1].incompat_flags, INCOMPAT);
4654 if (ret)
4655 return ret;
4656
4657 trans = btrfs_start_transaction(root, 0);
4658 if (IS_ERR(trans))
4659 return PTR_ERR(trans);
4660
4661 spin_lock(&root->fs_info->super_lock);
4662 newflags = btrfs_super_compat_flags(super_block);
4663 newflags |= flags[0].compat_flags & flags[1].compat_flags;
4664 newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
4665 btrfs_set_super_compat_flags(super_block, newflags);
4666
4667 newflags = btrfs_super_compat_ro_flags(super_block);
4668 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
4669 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
4670 btrfs_set_super_compat_ro_flags(super_block, newflags);
4671
4672 newflags = btrfs_super_incompat_flags(super_block);
4673 newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
4674 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
4675 btrfs_set_super_incompat_flags(super_block, newflags);
4676 spin_unlock(&root->fs_info->super_lock);
4677
4678 return btrfs_commit_transaction(trans, root);
4679}
4680
4483long btrfs_ioctl(struct file *file, unsigned int 4681long btrfs_ioctl(struct file *file, unsigned int
4484 cmd, unsigned long arg) 4682 cmd, unsigned long arg)
4485{ 4683{
@@ -4598,6 +4796,12 @@ long btrfs_ioctl(struct file *file, unsigned int
4598 return btrfs_ioctl_set_fslabel(file, argp); 4796 return btrfs_ioctl_set_fslabel(file, argp);
4599 case BTRFS_IOC_FILE_EXTENT_SAME: 4797 case BTRFS_IOC_FILE_EXTENT_SAME:
4600 return btrfs_ioctl_file_extent_same(file, argp); 4798 return btrfs_ioctl_file_extent_same(file, argp);
4799 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
4800 return btrfs_ioctl_get_supported_features(file, argp);
4801 case BTRFS_IOC_GET_FEATURES:
4802 return btrfs_ioctl_get_features(file, argp);
4803 case BTRFS_IOC_SET_FEATURES:
4804 return btrfs_ioctl_set_features(file, argp);
4601 } 4805 }
4602 4806
4603 return -ENOTTY; 4807 return -ENOTTY;
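In btrfs_ioctl_set_features() above, flags[0] acts as the mask of bits to change and flags[1] as their requested values; under super_lock each superblock field is rewritten so that masked bits take the new value and all other bits are preserved. The update rule, pulled out as a sketch (helper name invented):

	#include <linux/types.h>

	/* Bits set in 'mask' are forced to the corresponding bit of
	 * 'vals'; everything else keeps its current state. */
	static inline u64 apply_feature_mask(u64 cur, u64 mask, u64 vals)
	{
		cur |= mask & vals;		/* requested set bits */
		cur &= ~(mask & ~vals);		/* requested clear bits */
		return cur;
	}

This is equivalent to (cur & ~mask) | (mask & vals).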
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a6f07c5ce2..b47f669aca75 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws,
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, 141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem); 142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) { 143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
145 ret); 145 ret);
146 ret = -1; 146 ret = -1;
147 goto out; 147 goto out;
@@ -357,7 +357,7 @@ cont:
357 if (need_unmap) 357 if (need_unmap)
358 kunmap(pages_in[page_in_index - 1]); 358 kunmap(pages_in[page_in_index - 1]);
359 if (ret != LZO_E_OK) { 359 if (ret != LZO_E_OK) {
360 printk(KERN_WARNING "btrfs decompress failed\n"); 360 printk(KERN_WARNING "BTRFS: decompress failed\n");
361 ret = -1; 361 ret = -1;
362 break; 362 break;
363 } 363 }
@@ -401,7 +401,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
401 out_len = PAGE_CACHE_SIZE; 401 out_len = PAGE_CACHE_SIZE;
402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); 402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
403 if (ret != LZO_E_OK) { 403 if (ret != LZO_E_OK) {
404 printk(KERN_WARNING "btrfs decompress failed!\n"); 404 printk(KERN_WARNING "BTRFS: decompress failed!\n");
405 ret = -1; 405 ret = -1;
406 goto out; 406 goto out;
407 } 407 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 69582d5b69d1..b16450b840e7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -336,13 +336,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
336 entry->len); 336 entry->len);
337 *file_offset = dec_end; 337 *file_offset = dec_end;
338 if (dec_start > dec_end) { 338 if (dec_start > dec_end) {
339 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", 339 btrfs_crit(BTRFS_I(inode)->root->fs_info,
340 dec_start, dec_end); 340 "bad ordering dec_start %llu end %llu", dec_start, dec_end);
341 } 341 }
342 to_dec = dec_end - dec_start; 342 to_dec = dec_end - dec_start;
343 if (to_dec > entry->bytes_left) { 343 if (to_dec > entry->bytes_left) {
344 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", 344 btrfs_crit(BTRFS_I(inode)->root->fs_info,
345 entry->bytes_left, to_dec); 345 "bad ordered accounting left %llu size %llu",
346 entry->bytes_left, to_dec);
346 } 347 }
347 entry->bytes_left -= to_dec; 348 entry->bytes_left -= to_dec;
348 if (!uptodate) 349 if (!uptodate)
@@ -401,7 +402,8 @@ have_entry:
401 } 402 }
402 403
403 if (io_size > entry->bytes_left) { 404 if (io_size > entry->bytes_left) {
404 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", 405 btrfs_crit(BTRFS_I(inode)->root->fs_info,
406 "bad ordered accounting left %llu size %llu",
405 entry->bytes_left, io_size); 407 entry->bytes_left, io_size);
406 } 408 }
407 entry->bytes_left -= io_size; 409 entry->bytes_left -= io_size;
@@ -520,7 +522,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
520 spin_lock_irq(&tree->lock); 522 spin_lock_irq(&tree->lock);
521 node = &entry->rb_node; 523 node = &entry->rb_node;
522 rb_erase(node, &tree->tree); 524 rb_erase(node, &tree->tree);
523 tree->last = NULL; 525 if (tree->last == node)
526 tree->last = NULL;
524 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 527 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
525 spin_unlock_irq(&tree->lock); 528 spin_unlock_irq(&tree->lock);
526 529
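The ordered-extent hunk above replaces an unconditional cache wipe: tree->last is a one-entry lookup cache, and it is now cleared only when the node being erased is the cached one, so removing unrelated entries no longer throws a warm cache away. Sketched generically (type and helper names invented):

	#include <linux/rbtree.h>

	struct cached_tree {
		struct rb_root root;
		struct rb_node *last;	/* most recently returned node */
	};

	static void cached_tree_erase(struct cached_tree *t, struct rb_node *n)
	{
		rb_erase(n, &t->root);
		if (t->last == n)	/* invalidate only if it points here */
			t->last = NULL;
	}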
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 24cad1695af7..65793edb38ca 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -69,23 +69,3 @@ out:
69 btrfs_free_path(path); 69 btrfs_free_path(path);
70 return ret; 70 return ret;
71} 71}
72
73int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
74{
75 struct btrfs_path *path;
76 struct btrfs_key key;
77 int ret;
78
79 key.objectid = BTRFS_ORPHAN_OBJECTID;
80 key.type = BTRFS_ORPHAN_ITEM_KEY;
81 key.offset = offset;
82
83 path = btrfs_alloc_path();
84 if (!path)
85 return -ENOMEM;
86
87 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
88
89 btrfs_free_path(path);
90 return ret;
91}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 417053b17181..6efd70d3b64f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
154 u32 item_size) 154 u32 item_size)
155{ 155{
156 if (!IS_ALIGNED(item_size, sizeof(u64))) { 156 if (!IS_ALIGNED(item_size, sizeof(u64))) {
157 pr_warn("btrfs: uuid item with illegal size %lu!\n", 157 pr_warn("BTRFS: uuid item with illegal size %lu!\n",
158 (unsigned long)item_size); 158 (unsigned long)item_size);
159 return; 159 return;
160 } 160 }
@@ -249,7 +249,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
249 BTRFS_FILE_EXTENT_INLINE) { 249 BTRFS_FILE_EXTENT_INLINE) {
250 printk(KERN_INFO "\t\tinline extent data " 250 printk(KERN_INFO "\t\tinline extent data "
251 "size %u\n", 251 "size %u\n",
252 btrfs_file_extent_inline_len(l, fi)); 252 btrfs_file_extent_inline_len(l, i, fi));
253 break; 253 break;
254 } 254 }
255 printk(KERN_INFO "\t\textent data disk bytenr %llu " 255 printk(KERN_INFO "\t\textent data disk bytenr %llu "
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 000000000000..129b1dd28527
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,427 @@
1/*
2 * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/hashtable.h>
20#include "props.h"
21#include "btrfs_inode.h"
22#include "hash.h"
23#include "transaction.h"
24#include "xattr.h"
25
26#define BTRFS_PROP_HANDLERS_HT_BITS 8
27static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
28
29struct prop_handler {
30 struct hlist_node node;
31 const char *xattr_name;
32 int (*validate)(const char *value, size_t len);
33 int (*apply)(struct inode *inode, const char *value, size_t len);
34 const char *(*extract)(struct inode *inode);
35 int inheritable;
36};
37
38static int prop_compression_validate(const char *value, size_t len);
39static int prop_compression_apply(struct inode *inode,
40 const char *value,
41 size_t len);
42static const char *prop_compression_extract(struct inode *inode);
43
44static struct prop_handler prop_handlers[] = {
45 {
46 .xattr_name = XATTR_BTRFS_PREFIX "compression",
47 .validate = prop_compression_validate,
48 .apply = prop_compression_apply,
49 .extract = prop_compression_extract,
50 .inheritable = 1
51 },
52 {
53 .xattr_name = NULL
54 }
55};
56
57void __init btrfs_props_init(void)
58{
59 struct prop_handler *p;
60
61 hash_init(prop_handlers_ht);
62
63 for (p = &prop_handlers[0]; p->xattr_name; p++) {
64 u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
65
66 hash_add(prop_handlers_ht, &p->node, h);
67 }
68}
69
70static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
71{
72 struct hlist_head *h;
73
74 h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
75 if (hlist_empty(h))
76 return NULL;
77
78 return h;
79}
80
81static const struct prop_handler *
82find_prop_handler(const char *name,
83 const struct hlist_head *handlers)
84{
85 struct prop_handler *h;
86
87 if (!handlers) {
88 u64 hash = btrfs_name_hash(name, strlen(name));
89
90 handlers = find_prop_handlers_by_hash(hash);
91 if (!handlers)
92 return NULL;
93 }
94
95 hlist_for_each_entry(h, handlers, node)
96 if (!strcmp(h->xattr_name, name))
97 return h;
98
99 return NULL;
100}
101
102static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
103 struct inode *inode,
104 const char *name,
105 const char *value,
106 size_t value_len,
107 int flags)
108{
109 const struct prop_handler *handler;
110 int ret;
111
112 if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
113 return -EINVAL;
114
115 handler = find_prop_handler(name, NULL);
116 if (!handler)
117 return -EINVAL;
118
119 if (value_len == 0) {
120 ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
121 NULL, 0, flags);
122 if (ret)
123 return ret;
124
125 ret = handler->apply(inode, NULL, 0);
126 ASSERT(ret == 0);
127
128 return ret;
129 }
130
131 ret = handler->validate(value, value_len);
132 if (ret)
133 return ret;
134 ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
135 value, value_len, flags);
136 if (ret)
137 return ret;
138 ret = handler->apply(inode, value, value_len);
139 if (ret) {
140 __btrfs_setxattr(trans, inode, handler->xattr_name,
141 NULL, 0, flags);
142 return ret;
143 }
144
145 set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
146
147 return 0;
148}
149
150int btrfs_set_prop(struct inode *inode,
151 const char *name,
152 const char *value,
153 size_t value_len,
154 int flags)
155{
156 return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
157}
158
159static int iterate_object_props(struct btrfs_root *root,
160 struct btrfs_path *path,
161 u64 objectid,
162 void (*iterator)(void *,
163 const struct prop_handler *,
164 const char *,
165 size_t),
166 void *ctx)
167{
168 int ret;
169 char *name_buf = NULL;
170 char *value_buf = NULL;
171 int name_buf_len = 0;
172 int value_buf_len = 0;
173
174 while (1) {
175 struct btrfs_key key;
176 struct btrfs_dir_item *di;
177 struct extent_buffer *leaf;
178 u32 total_len, cur, this_len;
179 int slot;
180 const struct hlist_head *handlers;
181
182 slot = path->slots[0];
183 leaf = path->nodes[0];
184
185 if (slot >= btrfs_header_nritems(leaf)) {
186 ret = btrfs_next_leaf(root, path);
187 if (ret < 0)
188 goto out;
189 else if (ret > 0)
190 break;
191 continue;
192 }
193
194 btrfs_item_key_to_cpu(leaf, &key, slot);
195 if (key.objectid != objectid)
196 break;
197 if (key.type != BTRFS_XATTR_ITEM_KEY)
198 break;
199
200 handlers = find_prop_handlers_by_hash(key.offset);
201 if (!handlers)
202 goto next_slot;
203
204 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
205 cur = 0;
206 total_len = btrfs_item_size_nr(leaf, slot);
207
208 while (cur < total_len) {
209 u32 name_len = btrfs_dir_name_len(leaf, di);
210 u32 data_len = btrfs_dir_data_len(leaf, di);
211 unsigned long name_ptr, data_ptr;
212 const struct prop_handler *handler;
213
214 this_len = sizeof(*di) + name_len + data_len;
215 name_ptr = (unsigned long)(di + 1);
216 data_ptr = name_ptr + name_len;
217
218 if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
219 memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
220 name_ptr,
221 XATTR_BTRFS_PREFIX_LEN))
222 goto next_dir_item;
223
224 if (name_len >= name_buf_len) {
225 kfree(name_buf);
226 name_buf_len = name_len + 1;
227 name_buf = kmalloc(name_buf_len, GFP_NOFS);
228 if (!name_buf) {
229 ret = -ENOMEM;
230 goto out;
231 }
232 }
233 read_extent_buffer(leaf, name_buf, name_ptr, name_len);
234 name_buf[name_len] = '\0';
235
236 handler = find_prop_handler(name_buf, handlers);
237 if (!handler)
238 goto next_dir_item;
239
240 if (data_len > value_buf_len) {
241 kfree(value_buf);
242 value_buf_len = data_len;
243 value_buf = kmalloc(data_len, GFP_NOFS);
244 if (!value_buf) {
245 ret = -ENOMEM;
246 goto out;
247 }
248 }
249 read_extent_buffer(leaf, value_buf, data_ptr, data_len);
250
251 iterator(ctx, handler, value_buf, data_len);
252next_dir_item:
253 cur += this_len;
254 di = (struct btrfs_dir_item *)((char *) di + this_len);
255 }
256
257next_slot:
258 path->slots[0]++;
259 }
260
261 ret = 0;
262out:
263 btrfs_release_path(path);
264 kfree(name_buf);
265 kfree(value_buf);
266
267 return ret;
268}
269
270static void inode_prop_iterator(void *ctx,
271 const struct prop_handler *handler,
272 const char *value,
273 size_t len)
274{
275 struct inode *inode = ctx;
276 struct btrfs_root *root = BTRFS_I(inode)->root;
277 int ret;
278
279 ret = handler->apply(inode, value, len);
280 if (unlikely(ret))
281 btrfs_warn(root->fs_info,
282 "error applying prop %s to ino %llu (root %llu): %d",
283 handler->xattr_name, btrfs_ino(inode),
284 root->root_key.objectid, ret);
285 else
286 set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
287}
288
289int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
290{
291 struct btrfs_root *root = BTRFS_I(inode)->root;
292 u64 ino = btrfs_ino(inode);
293 int ret;
294
295 ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
296
297 return ret;
298}
299
300static int inherit_props(struct btrfs_trans_handle *trans,
301 struct inode *inode,
302 struct inode *parent)
303{
304 const struct prop_handler *h;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
306 int ret;
307
308 if (!test_bit(BTRFS_INODE_HAS_PROPS,
309 &BTRFS_I(parent)->runtime_flags))
310 return 0;
311
312 for (h = &prop_handlers[0]; h->xattr_name; h++) {
313 const char *value;
314 u64 num_bytes;
315
316 if (!h->inheritable)
317 continue;
318
319 value = h->extract(parent);
320 if (!value)
321 continue;
322
323 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
324 ret = btrfs_block_rsv_add(root, trans->block_rsv,
325 num_bytes, BTRFS_RESERVE_NO_FLUSH);
326 if (ret)
327 goto out;
328 ret = __btrfs_set_prop(trans, inode, h->xattr_name,
329 value, strlen(value), 0);
330 btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
331 if (ret)
332 goto out;
333 }
334 ret = 0;
335out:
336 return ret;
337}
338
339int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
340 struct inode *inode,
341 struct inode *dir)
342{
343 if (!dir)
344 return 0;
345
346 return inherit_props(trans, inode, dir);
347}
348
349int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
350 struct btrfs_root *root,
351 struct btrfs_root *parent_root)
352{
353 struct btrfs_key key;
354 struct inode *parent_inode, *child_inode;
355 int ret;
356
357 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
358 key.type = BTRFS_INODE_ITEM_KEY;
359 key.offset = 0;
360
361 parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
362 parent_root, NULL);
363 if (IS_ERR(parent_inode))
364 return PTR_ERR(parent_inode);
365
366 child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
367 if (IS_ERR(child_inode)) {
368 iput(parent_inode);
369 return PTR_ERR(child_inode);
370 }
371
372 ret = inherit_props(trans, child_inode, parent_inode);
373 iput(child_inode);
374 iput(parent_inode);
375
376 return ret;
377}
378
379static int prop_compression_validate(const char *value, size_t len)
380{
381 if (!strncmp("lzo", value, len))
382 return 0;
383 else if (!strncmp("zlib", value, len))
384 return 0;
385
386 return -EINVAL;
387}
388
389static int prop_compression_apply(struct inode *inode,
390 const char *value,
391 size_t len)
392{
393 int type;
394
395 if (len == 0) {
396 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
397 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
398 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
399
400 return 0;
401 }
402
403 if (!strncmp("lzo", value, len))
404 type = BTRFS_COMPRESS_LZO;
405 else if (!strncmp("zlib", value, len))
406 type = BTRFS_COMPRESS_ZLIB;
407 else
408 return -EINVAL;
409
410 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
411 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
412 BTRFS_I(inode)->force_compress = type;
413
414 return 0;
415}
416
417static const char *prop_compression_extract(struct inode *inode)
418{
419 switch (BTRFS_I(inode)->force_compress) {
420 case BTRFS_COMPRESS_ZLIB:
421 return "zlib";
422 case BTRFS_COMPRESS_LZO:
423 return "lzo";
424 }
425
426 return NULL;
427}
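props.c keys its handler table by btrfs_name_hash() of the xattr name, which is the same hash btrfs stores in the offset of on-disk XATTR_ITEM keys; that is what lets iterate_object_props() pick candidate handlers from a key before reading any names out of the leaf. A minimal sketch of the linux/hashtable.h pattern in use, with the structures simplified:

	#include <linux/hashtable.h>
	#include <linux/string.h>

	static DEFINE_HASHTABLE(handlers, 8);	/* 2^8 = 256 buckets */

	struct handler {
		struct hlist_node node;
		const char *name;
	};

	/* probe one bucket; collisions are resolved by comparing names */
	static struct handler *handler_lookup(const char *name, u64 hash)
	{
		struct handler *h;

		hash_for_each_possible(handlers, h, node, hash)
			if (!strcmp(h->name, name))
				return h;
		return NULL;
	}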
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 000000000000..100f18829d50
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_PROPS_H
20#define __BTRFS_PROPS_H
21
22#include "ctree.h"
23
24void __init btrfs_props_init(void);
25
26int btrfs_set_prop(struct inode *inode,
27 const char *name,
28 const char *value,
29 size_t value_len,
30 int flags);
31
32int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
33
34int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
35 struct inode *inode,
36 struct inode *dir);
37
38int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
39 struct btrfs_root *root,
40 struct btrfs_root *parent_root);
41
42#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e6ef490619e..472302a2d745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -301,16 +301,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
301 301
302 if (btrfs_qgroup_status_version(l, ptr) != 302 if (btrfs_qgroup_status_version(l, ptr) !=
303 BTRFS_QGROUP_STATUS_VERSION) { 303 BTRFS_QGROUP_STATUS_VERSION) {
304 printk(KERN_ERR 304 btrfs_err(fs_info,
305 "btrfs: old qgroup version, quota disabled\n"); 305 "old qgroup version, quota disabled");
306 goto out; 306 goto out;
307 } 307 }
308 if (btrfs_qgroup_status_generation(l, ptr) != 308 if (btrfs_qgroup_status_generation(l, ptr) !=
309 fs_info->generation) { 309 fs_info->generation) {
310 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 310 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
311 printk(KERN_ERR 311 btrfs_err(fs_info,
312 "btrfs: qgroup generation mismatch, " 312 "qgroup generation mismatch, "
313 "marked as inconsistent\n"); 313 "marked as inconsistent");
314 } 314 }
315 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 315 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
316 ptr); 316 ptr);
@@ -325,7 +325,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
325 qgroup = find_qgroup_rb(fs_info, found_key.offset); 325 qgroup = find_qgroup_rb(fs_info, found_key.offset);
326 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 326 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
327 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 327 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
328			printk(KERN_ERR "btrfs: inconsistent qgroup config\n");	328			btrfs_err(fs_info, "inconsistent qgroup config");
329 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 329 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
330 } 330 }
331 if (!qgroup) { 331 if (!qgroup) {
@@ -396,8 +396,8 @@ next1:
396 ret = add_relation_rb(fs_info, found_key.objectid, 396 ret = add_relation_rb(fs_info, found_key.objectid,
397 found_key.offset); 397 found_key.offset);
398 if (ret == -ENOENT) { 398 if (ret == -ENOENT) {
399 printk(KERN_WARNING 399 btrfs_warn(fs_info,
400 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", 400 "orphan qgroup relation 0x%llx->0x%llx",
401 found_key.objectid, found_key.offset); 401 found_key.objectid, found_key.offset);
402 ret = 0; /* ignore the error */ 402 ret = 0; /* ignore the error */
403 } 403 }
@@ -644,8 +644,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
644 644
645 l = path->nodes[0]; 645 l = path->nodes[0];
646 slot = path->slots[0]; 646 slot = path->slots[0];
647 qgroup_limit = btrfs_item_ptr(l, path->slots[0], 647 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
648 struct btrfs_qgroup_limit_item);
649 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); 648 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
650 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); 649 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
651 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); 650 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
@@ -687,8 +686,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
687 686
688 l = path->nodes[0]; 687 l = path->nodes[0];
689 slot = path->slots[0]; 688 slot = path->slots[0];
690 qgroup_info = btrfs_item_ptr(l, path->slots[0], 689 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
691 struct btrfs_qgroup_info_item);
692 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 690 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
693 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 691 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
694 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 692 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
@@ -1161,7 +1159,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1161 limit->rsv_excl); 1159 limit->rsv_excl);
1162 if (ret) { 1160 if (ret) {
1163 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1161 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1164 printk(KERN_INFO "unable to update quota limit for %llu\n", 1162 btrfs_info(fs_info, "unable to update quota limit for %llu",
1165 qgroupid); 1163 qgroupid);
1166 } 1164 }
1167 1165
@@ -1349,7 +1347,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_delayed_ref_node *node, 1347 struct btrfs_delayed_ref_node *node,
1350 struct btrfs_delayed_extent_op *extent_op) 1348 struct btrfs_delayed_extent_op *extent_op)
1351{ 1349{
1352 struct btrfs_key ins;
1353 struct btrfs_root *quota_root; 1350 struct btrfs_root *quota_root;
1354 u64 ref_root; 1351 u64 ref_root;
1355 struct btrfs_qgroup *qgroup; 1352 struct btrfs_qgroup *qgroup;
@@ -1363,10 +1360,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1363 1360
1364 BUG_ON(!fs_info->quota_root); 1361 BUG_ON(!fs_info->quota_root);
1365 1362
1366 ins.objectid = node->bytenr;
1367 ins.offset = node->num_bytes;
1368 ins.type = BTRFS_EXTENT_ITEM_KEY;
1369
1370 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1363 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1371 node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 1364 node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
1372 struct btrfs_delayed_tree_ref *ref; 1365 struct btrfs_delayed_tree_ref *ref;
@@ -1840,7 +1833,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1840{ 1833{
1841 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) 1834 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1842 return; 1835 return;
1843 pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", 1836 btrfs_err(trans->root->fs_info,
1837 "qgroups not uptodate in trans handle %p: list is%s empty, "
1838 "seq is %#x.%x",
1844 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", 1839 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1845 (u32)(trans->delayed_ref_elem.seq >> 32), 1840 (u32)(trans->delayed_ref_elem.seq >> 32),
1846 (u32)trans->delayed_ref_elem.seq); 1841 (u32)trans->delayed_ref_elem.seq);
@@ -1902,9 +1897,17 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1902 mutex_unlock(&fs_info->qgroup_rescan_lock); 1897 mutex_unlock(&fs_info->qgroup_rescan_lock);
1903 1898
1904 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 1899 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
1900 u64 num_bytes;
1901
1905 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 1902 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
1906 if (found.type != BTRFS_EXTENT_ITEM_KEY) 1903 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
1904 found.type != BTRFS_METADATA_ITEM_KEY)
1907 continue; 1905 continue;
1906 if (found.type == BTRFS_METADATA_ITEM_KEY)
1907 num_bytes = fs_info->extent_root->leafsize;
1908 else
1909 num_bytes = found.offset;
1910
1908 ret = btrfs_find_all_roots(trans, fs_info, found.objectid, 1911 ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
1909 tree_mod_seq_elem.seq, &roots); 1912 tree_mod_seq_elem.seq, &roots);
1910 if (ret < 0) 1913 if (ret < 0)
@@ -1949,12 +1952,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1949 struct btrfs_qgroup_list *glist; 1952 struct btrfs_qgroup_list *glist;
1950 1953
1951 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; 1954 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
1952 qg->rfer += found.offset; 1955 qg->rfer += num_bytes;
1953 qg->rfer_cmpr += found.offset; 1956 qg->rfer_cmpr += num_bytes;
1954 WARN_ON(qg->tag >= seq); 1957 WARN_ON(qg->tag >= seq);
1955 if (qg->refcnt - seq == roots->nnodes) { 1958 if (qg->refcnt - seq == roots->nnodes) {
1956 qg->excl += found.offset; 1959 qg->excl += num_bytes;
1957 qg->excl_cmpr += found.offset; 1960 qg->excl_cmpr += num_bytes;
1958 } 1961 }
1959 qgroup_dirty(fs_info, qg); 1962 qgroup_dirty(fs_info, qg);
1960 1963
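The qgroup rescan fix above distinguishes the two extent item flavours: for BTRFS_EXTENT_ITEM_KEY the key offset is the extent size in bytes, while for skinny BTRFS_METADATA_ITEM_KEY items the offset encodes the tree level, so the accounted byte count has to come from the block size instead. The same decision as a standalone helper (name invented, matching the 3.14-era fields):

	static u64 accounted_bytes(struct btrfs_fs_info *fs_info,
				   struct btrfs_key *key)
	{
		/* skinny metadata: offset is a level, not a size */
		if (key->type == BTRFS_METADATA_ITEM_KEY)
			return fs_info->extent_root->leafsize;
		return key->offset;	/* extent item: size in bytes */
	}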
@@ -2037,10 +2040,10 @@ out:
2037 mutex_unlock(&fs_info->qgroup_rescan_lock); 2040 mutex_unlock(&fs_info->qgroup_rescan_lock);
2038 2041
2039 if (err >= 0) { 2042 if (err >= 0) {
2040 pr_info("btrfs: qgroup scan completed%s\n", 2043 btrfs_info(fs_info, "qgroup scan completed%s",
2041 err == 2 ? " (inconsistency flag cleared)" : ""); 2044 err == 2 ? " (inconsistency flag cleared)" : "");
2042 } else { 2045 } else {
2043 pr_err("btrfs: qgroup scan failed with %d\n", err); 2046 btrfs_err(fs_info, "qgroup scan failed with %d", err);
2044 } 2047 }
2045 2048
2046 complete_all(&fs_info->qgroup_rescan_completion); 2049 complete_all(&fs_info->qgroup_rescan_completion);
@@ -2096,7 +2099,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2096 2099
2097 if (ret) { 2100 if (ret) {
2098err: 2101err:
2099 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); 2102 btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
2100 return ret; 2103 return ret;
2101 } 2104 }
2102 2105
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 24ac21840a9a..9af0b25d991a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1032,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1032 1032
1033 /* see if we can add this page onto our existing bio */ 1033 /* see if we can add this page onto our existing bio */
1034 if (last) { 1034 if (last) {
1035 last_end = (u64)last->bi_sector << 9; 1035 last_end = (u64)last->bi_iter.bi_sector << 9;
1036 last_end += last->bi_size; 1036 last_end += last->bi_iter.bi_size;
1037 1037
1038 /* 1038 /*
1039 * we can't merge these if they are from different 1039 * we can't merge these if they are from different
@@ -1053,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1053 if (!bio) 1053 if (!bio)
1054 return -ENOMEM; 1054 return -ENOMEM;
1055 1055
1056 bio->bi_size = 0; 1056 bio->bi_iter.bi_size = 0;
1057 bio->bi_bdev = stripe->dev->bdev; 1057 bio->bi_bdev = stripe->dev->bdev;
1058 bio->bi_sector = disk_start >> 9; 1058 bio->bi_iter.bi_sector = disk_start >> 9;
1059 set_bit(BIO_UPTODATE, &bio->bi_flags); 1059 set_bit(BIO_UPTODATE, &bio->bi_flags);
1060 1060
1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -1111,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1111 1111
1112 spin_lock_irq(&rbio->bio_list_lock); 1112 spin_lock_irq(&rbio->bio_list_lock);
1113 bio_list_for_each(bio, &rbio->bio_list) { 1113 bio_list_for_each(bio, &rbio->bio_list) {
1114 start = (u64)bio->bi_sector << 9; 1114 start = (u64)bio->bi_iter.bi_sector << 9;
1115 stripe_offset = start - rbio->raid_map[0]; 1115 stripe_offset = start - rbio->raid_map[0];
1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117 1117
@@ -1272,7 +1272,7 @@ cleanup:
1272static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1272static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 struct bio *bio) 1273 struct bio *bio)
1274{ 1274{
1275 u64 physical = bio->bi_sector; 1275 u64 physical = bio->bi_iter.bi_sector;
1276 u64 stripe_start; 1276 u64 stripe_start;
1277 int i; 1277 int i;
1278 struct btrfs_bio_stripe *stripe; 1278 struct btrfs_bio_stripe *stripe;
@@ -1298,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 struct bio *bio) 1299 struct bio *bio)
1300{ 1300{
1301 u64 logical = bio->bi_sector; 1301 u64 logical = bio->bi_iter.bi_sector;
1302 u64 stripe_start; 1302 u64 stripe_start;
1303 int i; 1303 int i;
1304 1304
@@ -1602,8 +1602,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1602 plug_list); 1602 plug_list);
1603 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1603 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1604 plug_list); 1604 plug_list);
1605 u64 a_sector = ra->bio_list.head->bi_sector; 1605 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1606 u64 b_sector = rb->bio_list.head->bi_sector; 1606 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1607 1607
1608 if (a_sector < b_sector) 1608 if (a_sector < b_sector)
1609 return -1; 1609 return -1;
@@ -1691,7 +1691,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1691 if (IS_ERR(rbio)) 1691 if (IS_ERR(rbio))
1692 return PTR_ERR(rbio); 1692 return PTR_ERR(rbio);
1693 bio_list_add(&rbio->bio_list, bio); 1693 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_size; 1694 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1695 1695
1696 /* 1696 /*
1697 * don't plug on full rbios, just get them out the door 1697 * don't plug on full rbios, just get them out the door
@@ -2044,7 +2044,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2044 2044
2045 rbio->read_rebuild = 1; 2045 rbio->read_rebuild = 1;
2046 bio_list_add(&rbio->bio_list, bio); 2046 bio_list_add(&rbio->bio_list, bio);
2047 rbio->bio_list_bytes = bio->bi_size; 2047 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2048 2048
2049 rbio->faila = find_logical_bio_stripe(rbio, bio); 2049 rbio->faila = find_logical_bio_stripe(rbio, bio);
2050 if (rbio->faila == -1) { 2050 if (rbio->faila == -1) {
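The raid56 hunks above belong to the tree-wide immutable-biovec conversion in 3.14: a bio's current sector and remaining byte count moved from bio->bi_sector and bio->bi_size into the bio->bi_iter iterator, so code that only reads a bio's position now goes through bi_iter. A small sketch of the new access pattern (helper name invented):

	#include <linux/bio.h>

	/* byte offset just past the end of the bio; bi_sector counts
	 * 512-byte units, bi_size is in bytes */
	static u64 bio_end_byte(struct bio *bio)
	{
		return ((u64)bio->bi_iter.bi_sector << 9) +
			bio->bi_iter.bi_size;
	}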
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 1031b69252c5..31c797c48c3e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
189 */ 189 */
190#ifdef DEBUG 190#ifdef DEBUG
191 if (rec->generation != generation) { 191 if (rec->generation != generation) {
192 printk(KERN_DEBUG "generation mismatch for " 192 btrfs_debug(root->fs_info,
193 "(%llu,%d,%llu) %llu != %llu\n", 193 "generation mismatch for (%llu,%d,%llu) %llu != %llu",
194 key.objectid, key.type, key.offset, 194 key.objectid, key.type, key.offset,
195 rec->generation, generation); 195 rec->generation, generation);
196 } 196 }
@@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
365 goto error; 365 goto error;
366 366
367 if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { 367 if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
368 printk(KERN_ERR "btrfs readahead: more than %d copies not " 368 btrfs_err(root->fs_info,
369 "supported", BTRFS_MAX_MIRRORS); 369 "readahead: more than %d copies not supported",
370 BTRFS_MAX_MIRRORS);
370 goto error; 371 goto error;
371 } 372 }
372 373
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 429c73c374b8..07b3b36f40ee 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -94,6 +94,7 @@ struct backref_edge {
94 94
95#define LOWER 0 95#define LOWER 0
96#define UPPER 1 96#define UPPER 1
97#define RELOCATION_RESERVED_NODES 256
97 98
98struct backref_cache { 99struct backref_cache {
99 /* red black tree of all backref nodes in the cache */ 100 /* red black tree of all backref nodes in the cache */
@@ -176,6 +177,8 @@ struct reloc_control {
176 u64 merging_rsv_size; 177 u64 merging_rsv_size;
177 /* size of relocated tree nodes */ 178 /* size of relocated tree nodes */
178 u64 nodes_relocated; 179 u64 nodes_relocated;
180 /* reserved size for block group relocation*/
181 u64 reserved_bytes;
179 182
180 u64 search_start; 183 u64 search_start;
181 u64 extents_found; 184 u64 extents_found;
@@ -184,7 +187,6 @@ struct reloc_control {
184 unsigned int create_reloc_tree:1; 187 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 188 unsigned int merge_reloc_tree:1;
186 unsigned int found_file_extent:1; 189 unsigned int found_file_extent:1;
187 unsigned int commit_transaction:1;
188}; 190};
189 191
190/* stages of data relocation */ 192/* stages of data relocation */
@@ -2309,9 +2311,6 @@ void free_reloc_roots(struct list_head *list)
2309 reloc_root = list_entry(list->next, struct btrfs_root, 2311 reloc_root = list_entry(list->next, struct btrfs_root,
2310 root_list); 2312 root_list);
2311 __del_reloc_root(reloc_root); 2313 __del_reloc_root(reloc_root);
2312 free_extent_buffer(reloc_root->node);
2313 free_extent_buffer(reloc_root->commit_root);
2314 kfree(reloc_root);
2315 } 2314 }
2316} 2315}
2317 2316
@@ -2353,10 +2352,9 @@ again:
2353 2352
2354 ret = merge_reloc_root(rc, root); 2353 ret = merge_reloc_root(rc, root);
2355 if (ret) { 2354 if (ret) {
2356 __del_reloc_root(reloc_root); 2355 if (list_empty(&reloc_root->root_list))
2357 free_extent_buffer(reloc_root->node); 2356 list_add_tail(&reloc_root->root_list,
2358 free_extent_buffer(reloc_root->commit_root); 2357 &reloc_roots);
2359 kfree(reloc_root);
2360 goto out; 2358 goto out;
2361 } 2359 }
2362 } else { 2360 } else {
@@ -2452,7 +2450,7 @@ static noinline_for_stack
2452struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2450struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2453 struct reloc_control *rc, 2451 struct reloc_control *rc,
2454 struct backref_node *node, 2452 struct backref_node *node,
2455 struct backref_edge *edges[], int *nr) 2453 struct backref_edge *edges[])
2456{ 2454{
2457 struct backref_node *next; 2455 struct backref_node *next;
2458 struct btrfs_root *root; 2456 struct btrfs_root *root;
@@ -2494,7 +2492,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2494 if (!root) 2492 if (!root)
2495 return NULL; 2493 return NULL;
2496 2494
2497 *nr = index;
2498 next = node; 2495 next = node;
2499 /* setup backref node path for btrfs_reloc_cow_block */ 2496 /* setup backref node path for btrfs_reloc_cow_block */
2500 while (1) { 2497 while (1) {
@@ -2590,28 +2587,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2590 struct btrfs_root *root = rc->extent_root; 2587 struct btrfs_root *root = rc->extent_root;
2591 u64 num_bytes; 2588 u64 num_bytes;
2592 int ret; 2589 int ret;
2590 u64 tmp;
2593 2591
2594 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2592 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2595 2593
2596 trans->block_rsv = rc->block_rsv; 2594 trans->block_rsv = rc->block_rsv;
2597 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2595 rc->reserved_bytes += num_bytes;
2598 BTRFS_RESERVE_FLUSH_ALL); 2596 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2597 BTRFS_RESERVE_FLUSH_ALL);
2599 if (ret) { 2598 if (ret) {
2600 if (ret == -EAGAIN) 2599 if (ret == -EAGAIN) {
2601 rc->commit_transaction = 1; 2600 tmp = rc->extent_root->nodesize *
2601 RELOCATION_RESERVED_NODES;
2602 while (tmp <= rc->reserved_bytes)
2603 tmp <<= 1;
2604 /*
2605 * only one thread can access block_rsv at this point,
 2606			 * so we don't need to hold a lock to protect block_rsv.
 2607			 * we expand the reservation size here to allow enough
 2608			 * space for relocation, and we will return earlier in
 2609			 * the enospc case.
2610 */
2611 rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2612 RELOCATION_RESERVED_NODES;
2613 }
2602 return ret; 2614 return ret;
2603 } 2615 }
2604 2616
2605 return 0; 2617 return 0;
2606} 2618}
2607 2619
2608static void release_metadata_space(struct reloc_control *rc,
2609 struct backref_node *node)
2610{
2611 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2612 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
2613}
2614
2615/* 2620/*
2616 * relocate a block tree, and then update pointers in upper level 2621 * relocate a block tree, and then update pointers in upper level
2617 * blocks that reference the block to point to the new location. 2622 * blocks that reference the block to point to the new location.
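Rather than committing the transaction on -EAGAIN, the reworked reserve_metadata_space() grows the relocation reserve geometrically: starting from a batch of RELOCATION_RESERVED_NODES nodes, the target doubles until it exceeds what the current pass has already consumed, plus one extra batch of headroom. That sizing rule as a standalone sketch (helper name invented):

	static u64 grow_reloc_rsv_size(u64 nodesize, u64 reserved_bytes)
	{
		u64 tmp = nodesize * RELOCATION_RESERVED_NODES;

		/* double until the target covers what was consumed */
		while (tmp <= reserved_bytes)
			tmp <<= 1;
		return tmp + nodesize * RELOCATION_RESERVED_NODES;
	}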
@@ -2633,7 +2638,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2633 u32 blocksize; 2638 u32 blocksize;
2634 u64 bytenr; 2639 u64 bytenr;
2635 u64 generation; 2640 u64 generation;
2636 int nr;
2637 int slot; 2641 int slot;
2638 int ret; 2642 int ret;
2639 int err = 0; 2643 int err = 0;
@@ -2646,7 +2650,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2646 cond_resched(); 2650 cond_resched();
2647 2651
2648 upper = edge->node[UPPER]; 2652 upper = edge->node[UPPER];
2649 root = select_reloc_root(trans, rc, upper, edges, &nr); 2653 root = select_reloc_root(trans, rc, upper, edges);
2650 BUG_ON(!root); 2654 BUG_ON(!root);
2651 2655
2652 if (upper->eb && !upper->locked) { 2656 if (upper->eb && !upper->locked) {
@@ -2898,7 +2902,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2898 struct btrfs_path *path) 2902 struct btrfs_path *path)
2899{ 2903{
2900 struct btrfs_root *root; 2904 struct btrfs_root *root;
2901 int release = 0;
2902 int ret = 0; 2905 int ret = 0;
2903 2906
2904 if (!node) 2907 if (!node)
@@ -2915,7 +2918,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2915 ret = reserve_metadata_space(trans, rc, node); 2918 ret = reserve_metadata_space(trans, rc, node);
2916 if (ret) 2919 if (ret)
2917 goto out; 2920 goto out;
2918 release = 1;
2919 } 2921 }
2920 2922
2921 if (root) { 2923 if (root) {
@@ -2940,11 +2942,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2940 ret = do_relocation(trans, rc, node, key, path, 1); 2942 ret = do_relocation(trans, rc, node, key, path, 1);
2941 } 2943 }
2942out: 2944out:
2943 if (ret || node->level == 0 || node->cowonly) { 2945 if (ret || node->level == 0 || node->cowonly)
2944 if (release)
2945 release_metadata_space(rc, node);
2946 remove_backref_node(&rc->backref_cache, node); 2946 remove_backref_node(&rc->backref_cache, node);
2947 }
2948 return ret; 2947 return ret;
2949} 2948}
2950 2949
@@ -3867,29 +3866,20 @@ static noinline_for_stack
3867int prepare_to_relocate(struct reloc_control *rc) 3866int prepare_to_relocate(struct reloc_control *rc)
3868{ 3867{
3869 struct btrfs_trans_handle *trans; 3868 struct btrfs_trans_handle *trans;
3870 int ret;
3871 3869
3872 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, 3870 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3873 BTRFS_BLOCK_RSV_TEMP); 3871 BTRFS_BLOCK_RSV_TEMP);
3874 if (!rc->block_rsv) 3872 if (!rc->block_rsv)
3875 return -ENOMEM; 3873 return -ENOMEM;
3876 3874
3877 /*
3878 * reserve some space for creating reloc trees.
3879 * btrfs_init_reloc_root will use them when there
3880 * is no reservation in transaction handle.
3881 */
3882 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3883 rc->extent_root->nodesize * 256,
3884 BTRFS_RESERVE_FLUSH_ALL);
3885 if (ret)
3886 return ret;
3887
3888 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3875 memset(&rc->cluster, 0, sizeof(rc->cluster));
3889 rc->search_start = rc->block_group->key.objectid; 3876 rc->search_start = rc->block_group->key.objectid;
3890 rc->extents_found = 0; 3877 rc->extents_found = 0;
3891 rc->nodes_relocated = 0; 3878 rc->nodes_relocated = 0;
3892 rc->merging_rsv_size = 0; 3879 rc->merging_rsv_size = 0;
3880 rc->reserved_bytes = 0;
3881 rc->block_rsv->size = rc->extent_root->nodesize *
3882 RELOCATION_RESERVED_NODES;
3893 3883
3894 rc->create_reloc_tree = 1; 3884 rc->create_reloc_tree = 1;
3895 set_reloc_control(rc); 3885 set_reloc_control(rc);
@@ -3933,6 +3923,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3933 } 3923 }
3934 3924
3935 while (1) { 3925 while (1) {
3926 rc->reserved_bytes = 0;
3927 ret = btrfs_block_rsv_refill(rc->extent_root,
3928 rc->block_rsv, rc->block_rsv->size,
3929 BTRFS_RESERVE_FLUSH_ALL);
3930 if (ret) {
3931 err = ret;
3932 break;
3933 }
3936 progress++; 3934 progress++;
3937 trans = btrfs_start_transaction(rc->extent_root, 0); 3935 trans = btrfs_start_transaction(rc->extent_root, 0);
3938 if (IS_ERR(trans)) { 3936 if (IS_ERR(trans)) {
@@ -4011,6 +4009,12 @@ restart:
4011 if (!RB_EMPTY_ROOT(&blocks)) { 4009 if (!RB_EMPTY_ROOT(&blocks)) {
4012 ret = relocate_tree_blocks(trans, rc, &blocks); 4010 ret = relocate_tree_blocks(trans, rc, &blocks);
4013 if (ret < 0) { 4011 if (ret < 0) {
4012 /*
 4013			 * if we fail to relocate tree blocks, force an update of
 4014			 * the backref cache when committing the transaction.
4015 */
4016 rc->backref_cache.last_trans = trans->transid - 1;
4017
4014 if (ret != -EAGAIN) { 4018 if (ret != -EAGAIN) {
4015 err = ret; 4019 err = ret;
4016 break; 4020 break;
@@ -4020,14 +4024,8 @@ restart:
4020 } 4024 }
4021 } 4025 }
4022 4026
4023 if (rc->commit_transaction) { 4027 btrfs_end_transaction_throttle(trans, rc->extent_root);
4024 rc->commit_transaction = 0; 4028 btrfs_btree_balance_dirty(rc->extent_root);
4025 ret = btrfs_commit_transaction(trans, rc->extent_root);
4026 BUG_ON(ret);
4027 } else {
4028 btrfs_end_transaction_throttle(trans, rc->extent_root);
4029 btrfs_btree_balance_dirty(rc->extent_root);
4030 }
4031 trans = NULL; 4029 trans = NULL;
4032 4030
4033 if (rc->stage == MOVE_DATA_EXTENTS && 4031 if (rc->stage == MOVE_DATA_EXTENTS &&
@@ -4247,7 +4245,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4247 goto out; 4245 goto out;
4248 } 4246 }
4249 4247
4250 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", 4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4251 rc->block_group->key.objectid, rc->block_group->flags); 4249 rc->block_group->key.objectid, rc->block_group->flags);
4252 4250
4253 ret = btrfs_start_delalloc_roots(fs_info, 0); 4251 ret = btrfs_start_delalloc_roots(fs_info, 0);
@@ -4269,7 +4267,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4269 if (rc->extents_found == 0) 4267 if (rc->extents_found == 0)
4270 break; 4268 break;
4271 4269
4272 printk(KERN_INFO "btrfs: found %llu extents\n", 4270 btrfs_info(extent_root->fs_info, "found %llu extents",
4273 rc->extents_found); 4271 rc->extents_found);
4274 4272
4275 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4273 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
@@ -4285,11 +4283,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4285 } 4283 }
4286 } 4284 }
4287 4285
4288 filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
4289 rc->block_group->key.objectid,
4290 rc->block_group->key.objectid +
4291 rc->block_group->key.offset - 1);
4292
4293 WARN_ON(rc->block_group->pinned > 0); 4286 WARN_ON(rc->block_group->pinned > 0);
4294 WARN_ON(rc->block_group->reserved > 0); 4287 WARN_ON(rc->block_group->reserved > 0);
4295 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 4288 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ec71ea44d2b4..1389b69059de 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -44,7 +44,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
44 if (!need_reset && btrfs_root_generation(item) 44 if (!need_reset && btrfs_root_generation(item)
45 != btrfs_root_generation_v2(item)) { 45 != btrfs_root_generation_v2(item)) {
46 if (btrfs_root_generation_v2(item) != 0) { 46 if (btrfs_root_generation_v2(item) != 0) {
47 printk(KERN_WARNING "btrfs: mismatching " 47 printk(KERN_WARNING "BTRFS: mismatching "
48 "generation and generation_v2 " 48 "generation and generation_v2 "
49 "found in root item. This root " 49 "found in root item. This root "
50 "was probably mounted with an " 50 "was probably mounted with an "
@@ -154,7 +154,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
154 154
155 if (ret != 0) { 155 if (ret != 0) {
156 btrfs_print_leaf(root, path->nodes[0]); 156 btrfs_print_leaf(root, path->nodes[0]);
157 printk(KERN_CRIT "unable to update root key %llu %u %llu\n", 157 btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
158 key->objectid, key->type, key->offset); 158 key->objectid, key->type, key->offset);
159 BUG_ON(1); 159 BUG_ON(1);
160 } 160 }
@@ -400,21 +400,6 @@ out:
400 return err; 400 return err;
401} 401}
402 402
403int btrfs_find_root_ref(struct btrfs_root *tree_root,
404 struct btrfs_path *path,
405 u64 root_id, u64 ref_id)
406{
407 struct btrfs_key key;
408 int ret;
409
410 key.objectid = root_id;
411 key.type = BTRFS_ROOT_REF_KEY;
412 key.offset = ref_id;
413
414 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
415 return ret;
416}
417
418/* 403/*
419 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY 404 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
420 * or BTRFS_ROOT_BACKREF_KEY. 405 * or BTRFS_ROOT_BACKREF_KEY.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1fd3f33c330a..efba5d1282ee 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
256static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 256static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
257 int mirror_num, u64 physical_for_dev_replace); 257 int mirror_num, u64 physical_for_dev_replace);
258static void copy_nocow_pages_worker(struct btrfs_work *work); 258static void copy_nocow_pages_worker(struct btrfs_work *work);
259static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
260static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
259 261
260 262
261static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 263static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -269,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
269 wake_up(&sctx->list_wait); 271 wake_up(&sctx->list_wait);
270} 272}
271 273
274static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
275{
276 while (atomic_read(&fs_info->scrub_pause_req)) {
277 mutex_unlock(&fs_info->scrub_lock);
278 wait_event(fs_info->scrub_pause_wait,
279 atomic_read(&fs_info->scrub_pause_req) == 0);
280 mutex_lock(&fs_info->scrub_lock);
281 }
282}
283
284static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
285{
286 atomic_inc(&fs_info->scrubs_paused);
287 wake_up(&fs_info->scrub_pause_wait);
288
289 mutex_lock(&fs_info->scrub_lock);
290 __scrub_blocked_if_needed(fs_info);
291 atomic_dec(&fs_info->scrubs_paused);
292 mutex_unlock(&fs_info->scrub_lock);
293
294 wake_up(&fs_info->scrub_pause_wait);
295}
296
272/* 297/*
273 * used for workers that require transaction commits (i.e., for the 298 * used for workers that require transaction commits (i.e., for the
274 * NOCOW case) 299 * NOCOW case)
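scrub_blocked_if_needed() above packages the scrub pause handshake: a worker counts itself into scrubs_paused, waits for scrub_pause_req to drop to zero while temporarily giving up scrub_lock, then counts itself back in. The core wait idiom — drop the lock across the sleep so the pausing side can make progress, retake it before rechecking — sketched generically with invented names:

	#include <linux/atomic.h>
	#include <linux/mutex.h>
	#include <linux/wait.h>

	static void wait_while_paused(struct mutex *lock, atomic_t *pause_req,
				      wait_queue_head_t *wq)
	{
		while (atomic_read(pause_req)) {
			mutex_unlock(lock);
			wait_event(*wq, atomic_read(pause_req) == 0);
			mutex_lock(lock);
		}
	}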
@@ -480,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
480 * hold all of the paths here 505 * hold all of the paths here
481 */ 506 */
482 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 507 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
483 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 508 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
484 "%s, sector %llu, root %llu, inode %llu, offset %llu, " 509 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
485 "length %llu, links %u (path: %s)\n", swarn->errstr, 510 "length %llu, links %u (path: %s)\n", swarn->errstr,
486 swarn->logical, rcu_str_deref(swarn->dev->name), 511 swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -492,7 +517,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
492 return 0; 517 return 0;
493 518
494err: 519err:
495 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 520 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
496 "%s, sector %llu, root %llu, inode %llu, offset %llu: path " 521 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
497 "resolving failed with ret=%d\n", swarn->errstr, 522 "resolving failed with ret=%d\n", swarn->errstr,
498 swarn->logical, rcu_str_deref(swarn->dev->name), 523 swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -555,7 +580,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
555 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 580 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
556 &ref_root, &ref_level); 581 &ref_root, &ref_level);
557 printk_in_rcu(KERN_WARNING 582 printk_in_rcu(KERN_WARNING
558 "btrfs: %s at logical %llu on dev %s, " 583 "BTRFS: %s at logical %llu on dev %s, "
559 "sector %llu: metadata %s (level %d) in tree " 584 "sector %llu: metadata %s (level %d) in tree "
560 "%llu\n", errstr, swarn.logical, 585 "%llu\n", errstr, swarn.logical,
561 rcu_str_deref(dev->name), 586 rcu_str_deref(dev->name),
@@ -704,13 +729,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
704 struct scrub_fixup_nodatasum *fixup; 729 struct scrub_fixup_nodatasum *fixup;
705 struct scrub_ctx *sctx; 730 struct scrub_ctx *sctx;
706 struct btrfs_trans_handle *trans = NULL; 731 struct btrfs_trans_handle *trans = NULL;
707 struct btrfs_fs_info *fs_info;
708 struct btrfs_path *path; 732 struct btrfs_path *path;
709 int uncorrectable = 0; 733 int uncorrectable = 0;
710 734
711 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 735 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
712 sctx = fixup->sctx; 736 sctx = fixup->sctx;
713 fs_info = fixup->root->fs_info;
714 737
715 path = btrfs_alloc_path(); 738 path = btrfs_alloc_path();
716 if (!path) { 739 if (!path) {
@@ -759,8 +782,8 @@ out:
759 btrfs_dev_replace_stats_inc( 782 btrfs_dev_replace_stats_inc(
760 &sctx->dev_root->fs_info->dev_replace. 783 &sctx->dev_root->fs_info->dev_replace.
761 num_uncorrectable_read_errors); 784 num_uncorrectable_read_errors);
762 printk_ratelimited_in_rcu(KERN_ERR
763 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
785 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
786 "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
764 fixup->logical, rcu_str_deref(fixup->dev->name)); 787 fixup->logical, rcu_str_deref(fixup->dev->name));
765 } 788 }
766 789
@@ -1161,7 +1184,7 @@ corrected_error:
1161 sctx->stat.corrected_errors++; 1184 sctx->stat.corrected_errors++;
1162 spin_unlock(&sctx->stat_lock); 1185 spin_unlock(&sctx->stat_lock);
1163 printk_ratelimited_in_rcu(KERN_ERR 1186 printk_ratelimited_in_rcu(KERN_ERR
1164 "btrfs: fixed up error at logical %llu on dev %s\n", 1187 "BTRFS: fixed up error at logical %llu on dev %s\n",
1165 logical, rcu_str_deref(dev->name)); 1188 logical, rcu_str_deref(dev->name));
1166 } 1189 }
1167 } else { 1190 } else {
@@ -1170,7 +1193,7 @@ did_not_correct_error:
1170 sctx->stat.uncorrectable_errors++; 1193 sctx->stat.uncorrectable_errors++;
1171 spin_unlock(&sctx->stat_lock); 1194 spin_unlock(&sctx->stat_lock);
1172 printk_ratelimited_in_rcu(KERN_ERR 1195 printk_ratelimited_in_rcu(KERN_ERR
1173 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1196 "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1174 logical, rcu_str_deref(dev->name)); 1197 logical, rcu_str_deref(dev->name));
1175 } 1198 }
1176 1199
@@ -1308,7 +1331,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1308 continue; 1331 continue;
1309 } 1332 }
1310 bio->bi_bdev = page->dev->bdev; 1333 bio->bi_bdev = page->dev->bdev;
1311 bio->bi_sector = page->physical >> 9;
1334 bio->bi_iter.bi_sector = page->physical >> 9;
1312 1335
1313 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1336 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1314 if (btrfsic_submit_bio_wait(READ, bio)) 1337 if (btrfsic_submit_bio_wait(READ, bio))
@@ -1418,8 +1441,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1418 int ret; 1441 int ret;
1419 1442
1420 if (!page_bad->dev->bdev) { 1443 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1444 printk_ratelimited(KERN_WARNING "BTRFS: "
1445 "scrub_repair_page_from_good_copy(bdev == NULL) "
1446 "is unexpected!\n");
1423 return -EIO; 1447 return -EIO;
1424 } 1448 }
1425 1449
@@ -1427,7 +1451,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1427 if (!bio) 1451 if (!bio)
1428 return -EIO; 1452 return -EIO;
1429 bio->bi_bdev = page_bad->dev->bdev; 1453 bio->bi_bdev = page_bad->dev->bdev;
1430 bio->bi_sector = page_bad->physical >> 9;
1454 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1431 1455
1432 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1456 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1433 if (PAGE_SIZE != ret) { 1457 if (PAGE_SIZE != ret) {
@@ -1520,7 +1544,7 @@ again:
1520 bio->bi_private = sbio; 1544 bio->bi_private = sbio;
1521 bio->bi_end_io = scrub_wr_bio_end_io; 1545 bio->bi_end_io = scrub_wr_bio_end_io;
1522 bio->bi_bdev = sbio->dev->bdev; 1546 bio->bi_bdev = sbio->dev->bdev;
1523 bio->bi_sector = sbio->physical >> 9;
1547 bio->bi_iter.bi_sector = sbio->physical >> 9;
1524 sbio->err = 0; 1548 sbio->err = 0;
1525 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1549 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1526 spage->physical_for_dev_replace || 1550 spage->physical_for_dev_replace ||
@@ -1877,7 +1901,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
1877 * This case is handled correctly (but _very_ slowly). 1901 * This case is handled correctly (but _very_ slowly).
1878 */ 1902 */
1879 printk_ratelimited(KERN_WARNING 1903 printk_ratelimited(KERN_WARNING
1880 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); 1904 "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
1881 bio_endio(sbio->bio, -EIO); 1905 bio_endio(sbio->bio, -EIO);
1882 } else { 1906 } else {
1883 btrfsic_submit_bio(READ, sbio->bio); 1907 btrfsic_submit_bio(READ, sbio->bio);
@@ -1926,7 +1950,7 @@ again:
1926 bio->bi_private = sbio; 1950 bio->bi_private = sbio;
1927 bio->bi_end_io = scrub_bio_end_io; 1951 bio->bi_end_io = scrub_bio_end_io;
1928 bio->bi_bdev = sbio->dev->bdev; 1952 bio->bi_bdev = sbio->dev->bdev;
1929 bio->bi_sector = sbio->physical >> 9;
1953 bio->bi_iter.bi_sector = sbio->physical >> 9;
1930 sbio->err = 0; 1954 sbio->err = 0;
1931 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1955 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1932 spage->physical || 1956 spage->physical ||
@@ -2286,8 +2310,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2286 2310
2287 wait_event(sctx->list_wait, 2311 wait_event(sctx->list_wait,
2288 atomic_read(&sctx->bios_in_flight) == 0); 2312 atomic_read(&sctx->bios_in_flight) == 0);
2289 atomic_inc(&fs_info->scrubs_paused);
2290 wake_up(&fs_info->scrub_pause_wait);
2313 scrub_blocked_if_needed(fs_info);
2291 2314
2292 /* FIXME it might be better to start readahead at commit root */ 2315 /* FIXME it might be better to start readahead at commit root */
2293 key_start.objectid = logical; 2316 key_start.objectid = logical;
@@ -2311,16 +2334,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2311 if (!IS_ERR(reada2)) 2334 if (!IS_ERR(reada2))
2312 btrfs_reada_wait(reada2); 2335 btrfs_reada_wait(reada2);
2313 2336
2314 mutex_lock(&fs_info->scrub_lock);
2315 while (atomic_read(&fs_info->scrub_pause_req)) {
2316 mutex_unlock(&fs_info->scrub_lock);
2317 wait_event(fs_info->scrub_pause_wait,
2318 atomic_read(&fs_info->scrub_pause_req) == 0);
2319 mutex_lock(&fs_info->scrub_lock);
2320 }
2321 atomic_dec(&fs_info->scrubs_paused);
2322 mutex_unlock(&fs_info->scrub_lock);
2323 wake_up(&fs_info->scrub_pause_wait);
2324 2337
2325 /* 2338 /*
2326 * collect all data csums for the stripe to avoid seeking during 2339 * collect all data csums for the stripe to avoid seeking during
@@ -2357,22 +2370,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 wait_event(sctx->list_wait, 2370 wait_event(sctx->list_wait,
2358 atomic_read(&sctx->bios_in_flight) == 0); 2371 atomic_read(&sctx->bios_in_flight) == 0);
2359 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2372 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2360 atomic_inc(&fs_info->scrubs_paused);
2361 wake_up(&fs_info->scrub_pause_wait);
2362 mutex_lock(&fs_info->scrub_lock);
2363 while (atomic_read(&fs_info->scrub_pause_req)) {
2364 mutex_unlock(&fs_info->scrub_lock);
2365 wait_event(fs_info->scrub_pause_wait,
2366 atomic_read(&fs_info->scrub_pause_req) == 0);
2367 mutex_lock(&fs_info->scrub_lock);
2368 }
2369 atomic_dec(&fs_info->scrubs_paused);
2370 mutex_unlock(&fs_info->scrub_lock);
2371 wake_up(&fs_info->scrub_pause_wait);
2373 scrub_blocked_if_needed(fs_info);
2372 } 2374 }
2373 2375
2376 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2377 key.type = BTRFS_METADATA_ITEM_KEY;
2378 else
2379 key.type = BTRFS_EXTENT_ITEM_KEY;
2374 key.objectid = logical; 2380 key.objectid = logical;
2375 key.type = BTRFS_EXTENT_ITEM_KEY;
2376 key.offset = (u64)-1; 2381 key.offset = (u64)-1;
2377 2382
2378 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2383 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2380,8 +2385,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2380 goto out; 2385 goto out;
2381 2386
2382 if (ret > 0) { 2387 if (ret > 0) {
2383 ret = btrfs_previous_item(root, path, 0,
2384 BTRFS_EXTENT_ITEM_KEY);
2388 ret = btrfs_previous_extent_item(root, path, 0);
2385 if (ret < 0) 2389 if (ret < 0)
2386 goto out; 2390 goto out;
2387 if (ret > 0) { 2391 if (ret > 0) {
@@ -2439,9 +2443,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2439 2443
2440 if (key.objectid < logical && 2444 if (key.objectid < logical &&
2441 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 2445 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2442 printk(KERN_ERR
2443 "btrfs scrub: tree block %llu spanning "
2444 "stripes, ignored. logical=%llu\n",
2446 btrfs_err(fs_info,
2447 "scrub: tree block %llu spanning "
2448 "stripes, ignored. logical=%llu",
2445 key.objectid, logical); 2449 key.objectid, logical);
2446 goto next; 2450 goto next;
2447 } 2451 }
@@ -2683,21 +2687,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2683 wait_event(sctx->list_wait, 2687 wait_event(sctx->list_wait,
2684 atomic_read(&sctx->bios_in_flight) == 0); 2688 atomic_read(&sctx->bios_in_flight) == 0);
2685 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2686 atomic_inc(&fs_info->scrubs_paused);
2687 wake_up(&fs_info->scrub_pause_wait);
2688 wait_event(sctx->list_wait, 2690 wait_event(sctx->list_wait,
2689 atomic_read(&sctx->workers_pending) == 0); 2691 atomic_read(&sctx->workers_pending) == 0);
2690
2692 scrub_blocked_if_needed(fs_info);
2691 mutex_lock(&fs_info->scrub_lock);
2692 while (atomic_read(&fs_info->scrub_pause_req)) {
2693 mutex_unlock(&fs_info->scrub_lock);
2694 wait_event(fs_info->scrub_pause_wait,
2695 atomic_read(&fs_info->scrub_pause_req) == 0);
2696 mutex_lock(&fs_info->scrub_lock);
2697 }
2698 atomic_dec(&fs_info->scrubs_paused);
2699 mutex_unlock(&fs_info->scrub_lock);
2700 wake_up(&fs_info->scrub_pause_wait);
2701 2693
2702 btrfs_put_block_group(cache); 2694 btrfs_put_block_group(cache);
2703 if (ret) 2695 if (ret)
@@ -2823,8 +2815,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2823 * check some assumptions 2815 * check some assumptions
2824 */ 2816 */
2825 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { 2817 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2826 printk(KERN_ERR
2827 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2818 btrfs_err(fs_info,
2819 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2828 fs_info->chunk_root->nodesize, 2820 fs_info->chunk_root->nodesize,
2829 fs_info->chunk_root->leafsize); 2821 fs_info->chunk_root->leafsize);
2830 return -EINVAL; 2822 return -EINVAL;
@@ -2836,16 +2828,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2836 * the way scrub is implemented. Do not handle this 2828 * the way scrub is implemented. Do not handle this
2837 * situation at all because it won't ever happen. 2829 * situation at all because it won't ever happen.
2838 */ 2830 */
2839 printk(KERN_ERR
2840 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2831 btrfs_err(fs_info,
2832 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
2841 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 2833 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2842 return -EINVAL; 2834 return -EINVAL;
2843 } 2835 }
2844 2836
2845 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 2837 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2846 /* not supported for data w/o checksums */ 2838 /* not supported for data w/o checksums */
2847 printk(KERN_ERR
2848 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
2839 btrfs_err(fs_info,
2840 "scrub: size assumption sectorsize != PAGE_SIZE "
2841 "(%d != %lu) fails",
2849 fs_info->chunk_root->sectorsize, PAGE_SIZE); 2842 fs_info->chunk_root->sectorsize, PAGE_SIZE);
2850 return -EINVAL; 2843 return -EINVAL;
2851 } 2844 }
@@ -2858,7 +2851,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2858 * would exhaust the array bounds of pagev member in 2851 * would exhaust the array bounds of pagev member in
2859 * struct scrub_block 2852 * struct scrub_block
2860 */ 2853 */
2861 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", 2854 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
2855 "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
2862 fs_info->chunk_root->nodesize, 2856 fs_info->chunk_root->nodesize,
2863 SCRUB_MAX_PAGES_PER_BLOCK, 2857 SCRUB_MAX_PAGES_PER_BLOCK,
2864 fs_info->chunk_root->sectorsize, 2858 fs_info->chunk_root->sectorsize,
@@ -2908,7 +2902,13 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2908 } 2902 }
2909 sctx->readonly = readonly; 2903 sctx->readonly = readonly;
2910 dev->scrub_device = sctx; 2904 dev->scrub_device = sctx;
2905 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2911 2906
2907 /*
2908 * By checking @scrub_pause_req here, we avoid a race
2909 * between transaction commit and scrubbing.
2910 */
2911 __scrub_blocked_if_needed(fs_info);
2912 atomic_inc(&fs_info->scrubs_running); 2912 atomic_inc(&fs_info->scrubs_running);
2913 mutex_unlock(&fs_info->scrub_lock); 2913 mutex_unlock(&fs_info->scrub_lock);
2914 2914
@@ -2917,9 +2917,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2917 * by holding device list mutex, we can 2917 * by holding device list mutex, we can
2918 * kick off writing super in log tree sync. 2918 * kick off writing super in log tree sync.
2919 */ 2919 */
2920 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2920 ret = scrub_supers(sctx, dev); 2921 ret = scrub_supers(sctx, dev);
2922 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2921 } 2923 }
2922 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2923 2924
2924 if (!ret) 2925 if (!ret)
2925 ret = scrub_enumerate_chunks(sctx, dev, start, end, 2926 ret = scrub_enumerate_chunks(sctx, dev, start, end,
@@ -3167,7 +3168,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
3167 ret = iterate_inodes_from_logical(logical, fs_info, path, 3168 ret = iterate_inodes_from_logical(logical, fs_info, path,
3168 record_inode_for_nocow, nocow_ctx); 3169 record_inode_for_nocow, nocow_ctx);
3169 if (ret != 0 && ret != -ENOENT) { 3170 if (ret != 0 && ret != -ENOENT) {
3170 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
3171 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3172 "phys %llu, len %llu, mir %u, ret %d",
3171 logical, physical_for_dev_replace, len, mirror_num, 3173 logical, physical_for_dev_replace, len, mirror_num,
3172 ret); 3174 ret);
3173 not_written = 1; 3175 not_written = 1;
@@ -3289,7 +3291,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3289again: 3291again:
3290 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3292 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3291 if (!page) { 3293 if (!page) {
3292 pr_err("find_or_create_page() failed\n");
3294 btrfs_err(fs_info, "find_or_create_page() failed");
3293 ret = -ENOMEM; 3295 ret = -ENOMEM;
3294 goto out; 3296 goto out;
3295 } 3297 }
@@ -3361,7 +3363,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3361 return -EIO; 3363 return -EIO;
3362 if (!dev->bdev) { 3364 if (!dev->bdev) {
3363 printk_ratelimited(KERN_WARNING 3365 printk_ratelimited(KERN_WARNING
3364 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 3366 "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3365 return -EIO; 3367 return -EIO;
3366 } 3368 }
3367 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 3369 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
@@ -3371,8 +3373,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3371 spin_unlock(&sctx->stat_lock); 3373 spin_unlock(&sctx->stat_lock);
3372 return -ENOMEM; 3374 return -ENOMEM;
3373 } 3375 }
3374 bio->bi_size = 0;
3375 bio->bi_sector = physical_for_dev_replace >> 9;
3376 bio->bi_iter.bi_size = 0;
3377 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
3376 bio->bi_bdev = dev->bdev; 3378 bio->bi_bdev = dev->bdev;
3377 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 3379 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3378 if (ret != PAGE_CACHE_SIZE) { 3380 if (ret != PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 945d1db98f26..9dde9717c1b9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,12 +24,12 @@
24#include <linux/xattr.h> 24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h> 25#include <linux/posix_acl_xattr.h>
26#include <linux/radix-tree.h> 26#include <linux/radix-tree.h>
27#include <linux/crc32c.h>
28#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
29#include <linux/string.h> 28#include <linux/string.h>
30 29
31#include "send.h" 30#include "send.h"
32#include "backref.h" 31#include "backref.h"
32#include "hash.h"
33#include "locking.h" 33#include "locking.h"
34#include "disk-io.h" 34#include "disk-io.h"
35#include "btrfs_inode.h" 35#include "btrfs_inode.h"
@@ -88,8 +88,6 @@ struct send_ctx {
88 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 88 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
89 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ 89 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
90 90
91 struct vfsmount *mnt;
92
93 struct btrfs_root *send_root; 91 struct btrfs_root *send_root;
94 struct btrfs_root *parent_root; 92 struct btrfs_root *parent_root;
95 struct clone_root *clone_roots; 93 struct clone_root *clone_roots;
@@ -111,6 +109,7 @@ struct send_ctx {
111 int cur_inode_deleted; 109 int cur_inode_deleted;
112 u64 cur_inode_size; 110 u64 cur_inode_size;
113 u64 cur_inode_mode; 111 u64 cur_inode_mode;
112 u64 cur_inode_last_extent;
114 113
115 u64 send_progress; 114 u64 send_progress;
116 115
@@ -122,6 +121,74 @@ struct send_ctx {
122 int name_cache_size; 121 int name_cache_size;
123 122
124 char *read_buf; 123 char *read_buf;
124
125 /*
126 * We process inodes in increasing order, so if before an
127 * incremental send we reverse the parent/child relationship of
128 * directories such that a directory with a lower inode number was
129 * the parent of a directory with a higher inode number, and the one
130 * becoming the new parent got renamed too, we can't rename/move the
131 * directory with lower inode number when we finish processing it - we
132 * must process the directory with higher inode number first, then
133 * rename/move it and then rename/move the directory with lower inode
134 * number. Example follows.
135 *
136 * Tree state when the first send was performed:
137 *
138 * .
139 * |-- a (ino 257)
140 * |-- b (ino 258)
141 * |
142 * |
143 * |-- c (ino 259)
144 * | |-- d (ino 260)
145 * |
146 * |-- c2 (ino 261)
147 *
148 * Tree state when the second (incremental) send is performed:
149 *
150 * .
151 * |-- a (ino 257)
152 * |-- b (ino 258)
153 * |-- c2 (ino 261)
154 * |-- d2 (ino 260)
155 * |-- cc (ino 259)
156 *
157 * The sequence of steps that lead to the second state was:
158 *
159 * mv /a/b/c/d /a/b/c2/d2
160 * mv /a/b/c /a/b/c2/d2/cc
161 *
162 * "c" has lower inode number, but we can't move it (2nd mv operation)
163 * before we move "d", which has higher inode number.
164 *
165 * So we just memorize which move/rename operations must be performed
166 * later when their respective parent is processed and moved/renamed.
167 */
168
169 /* Indexed by parent directory inode number. */
170 struct rb_root pending_dir_moves;
171
172 /*
173 * Reverse index, indexed by the inode number of a directory that
174 * is waiting for the move/rename of its immediate parent before its
175 * own move/rename can be performed.
176 */
177 struct rb_root waiting_dir_moves;
178};
179
180struct pending_dir_move {
181 struct rb_node node;
182 struct list_head list;
183 u64 parent_ino;
184 u64 ino;
185 u64 gen;
186 struct list_head update_refs;
187};
188
189struct waiting_dir_move {
190 struct rb_node node;
191 u64 ino;
125}; 192};
126 193
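The comment above motivates the two rb-trees: when an incremental send meets a directory whose new parent has a higher inode number and was itself renamed, the rename is parked and replayed once that parent finishes. A toy model of the deferral using the inode numbers from the example (a plain array instead of rb-trees; every name here is hypothetical, not the kernel's):

#include <stdio.h>

struct pending_move {
	unsigned long long parent_ino; /* rename parked until this dir is done */
	unsigned long long ino;        /* directory whose rename is deferred */
	const char *from, *to;
	int done;
};

/* Stand-in for sctx->pending_dir_moves; the kernel keys an rb-tree by
 * parent_ino. Here, ino 259 ("c") must wait for ino 260 ("d"). */
static struct pending_move pending[] = {
	{ 260, 259, "/a/b/c", "/a/b/c2/d2/cc", 0 },
};

static void finish_inode(unsigned long long ino)
{
	unsigned int i;

	printf("finished inode %llu\n", ino);
	/* Like apply_children_dir_moves(): replay renames parked on 'ino'. */
	for (i = 0; i < sizeof(pending) / sizeof(pending[0]); i++) {
		if (!pending[i].done && pending[i].parent_ino == ino) {
			printf("  mv %s %s (deferred rename of inode %llu)\n",
			       pending[i].from, pending[i].to, pending[i].ino);
			pending[i].done = 1;
		}
	}
}

int main(void)
{
	/* Inodes are visited in increasing order, as in the send code. When
	 * 259 is visited, its own rename is parked (recorded above); once
	 * 260 is finished, the parked rename can finally be emitted. */
	finish_inode(259);
	finish_inode(260);
	return 0;
}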
127struct name_cache_entry { 194struct name_cache_entry {
@@ -145,6 +212,15 @@ struct name_cache_entry {
145 char name[]; 212 char name[];
146}; 213};
147 214
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216
217static int need_send_hole(struct send_ctx *sctx)
218{
219 return (sctx->parent_root && !sctx->cur_inode_new &&
220 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
221 S_ISREG(sctx->cur_inode_mode));
222}
223
148static void fs_path_reset(struct fs_path *p) 224static void fs_path_reset(struct fs_path *p)
149{ 225{
150 if (p->reversed) { 226 if (p->reversed) {
@@ -336,16 +412,6 @@ out:
336 return ret; 412 return ret;
337} 413}
338 414
339#if 0
340static void fs_path_remove(struct fs_path *p)
341{
342 BUG_ON(p->reversed);
343 while (p->start != p->end && *p->end != '/')
344 p->end--;
345 *p->end = 0;
346}
347#endif
348
349static int fs_path_copy(struct fs_path *p, struct fs_path *from) 415static int fs_path_copy(struct fs_path *p, struct fs_path *from)
350{ 416{
351 int ret; 417 int ret;
@@ -436,30 +502,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
436 return 0; 502 return 0;
437} 503}
438 504
439#if 0
440static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
441{
442 return tlv_put(sctx, attr, &value, sizeof(value));
443}
444
445static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
446{
447 __le16 tmp = cpu_to_le16(value);
448 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
449}
450
451static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
452{
453 __le32 tmp = cpu_to_le32(value);
454 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
455}
456#endif
457
458static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
459{
460 __le64 tmp = cpu_to_le64(value);
461 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
462}
505#define TLV_PUT_DEFINE_INT(bits) \
506 static int tlv_put_u##bits(struct send_ctx *sctx, \
507 u##bits attr, u##bits value) \
508 { \
509 __le##bits __tmp = cpu_to_le##bits(value); \
510 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
511 }
512
513TLV_PUT_DEFINE_INT(64)
463 514
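TLV_PUT_DEFINE_INT() collapses the four hand-written (and mostly #if 0'd) tlv_put_u8/u16/u32/u64 helpers into one token-pasting template, so TLV_PUT_DEFINE_INT(64) expands to a tlv_put_u64() equivalent to the old one. A stand-alone sketch of the technique, with tlv_put() stubbed out and kernel types replaced by stdint equivalents (cpu_to_le##bits() is deliberately dropped, so byte order is a simplifying assumption):

#include <stdint.h>
#include <stdio.h>

/* Stub for the real tlv_put(); just reports what would be serialized. */
static int tlv_put(int attr, const void *data, int len)
{
	(void)data;
	printf("attr %d: %d byte(s)\n", attr, len);
	return 0;
}

/* Same shape as TLV_PUT_DEFINE_INT() in send.c, minus the endian swap. */
#define TLV_PUT_DEFINE_INT(bits)					\
	static int tlv_put_u##bits(int attr, uint##bits##_t value)	\
	{								\
		uint##bits##_t __tmp = value;				\
		return tlv_put(attr, &__tmp, sizeof(__tmp));		\
	}

TLV_PUT_DEFINE_INT(16)
TLV_PUT_DEFINE_INT(64)

int main(void)
{
	tlv_put_u16(1, 0x1234);            /* prints: attr 1: 2 byte(s) */
	tlv_put_u64(2, 0x123456789abcULL); /* prints: attr 2: 8 byte(s) */
	return 0;
}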
464static int tlv_put_string(struct send_ctx *sctx, u16 attr, 515static int tlv_put_string(struct send_ctx *sctx, u16 attr,
465 const char *str, int len) 516 const char *str, int len)
@@ -475,17 +526,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
475 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); 526 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
476} 527}
477 528
478#if 0
479static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
480 struct timespec *ts)
481{
482 struct btrfs_timespec bts;
483 bts.sec = cpu_to_le64(ts->tv_sec);
484 bts.nsec = cpu_to_le32(ts->tv_nsec);
485 return tlv_put(sctx, attr, &bts, sizeof(bts));
486}
487#endif
488
489static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, 529static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
490 struct extent_buffer *eb, 530 struct extent_buffer *eb,
491 struct btrfs_timespec *ts) 531 struct btrfs_timespec *ts)
@@ -533,12 +573,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
533 if (ret < 0) \ 573 if (ret < 0) \
534 goto tlv_put_failure; \ 574 goto tlv_put_failure; \
535 } while (0) 575 } while (0)
536#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
537 do { \
538 ret = tlv_put_timespec(sctx, attrtype, ts); \
539 if (ret < 0) \
540 goto tlv_put_failure; \
541 } while (0)
542#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ 576#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
543 do { \ 577 do { \
544 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ 578 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
@@ -586,7 +620,7 @@ static int send_cmd(struct send_ctx *sctx)
586 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); 620 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
587 hdr->crc = 0; 621 hdr->crc = 0;
588 622
589 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
623 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
590 hdr->crc = cpu_to_le32(crc); 624 hdr->crc = cpu_to_le32(crc);
591 625
592 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 626 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -1270,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1270 if (!backref_ctx->found_itself) { 1304 if (!backref_ctx->found_itself) {
1271 /* found a bug in backref code? */ 1305 /* found a bug in backref code? */
1272 ret = -EIO; 1306 ret = -EIO;
1273 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1307 btrfs_err(sctx->send_root->fs_info, "did not find backref in "
1274 "send_root. inode=%llu, offset=%llu, " 1308 "send_root. inode=%llu, offset=%llu, "
1275 "disk_byte=%llu found extent=%llu\n", 1309 "disk_byte=%llu found extent=%llu\n",
1276 ino, data_offset, disk_byte, found_key.objectid); 1310 ino, data_offset, disk_byte, found_key.objectid);
@@ -1298,6 +1332,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1298 } 1332 }
1299 1333
1300 if (cur_clone_root) { 1334 if (cur_clone_root) {
1335 if (compressed != BTRFS_COMPRESS_NONE) {
1336 /*
1337 * Offsets given by iterate_extent_inodes() are relative
1338 * to the start of the extent; we need to add the logical
1339 * offset from the file extent item.
1340 * (See why at backref.c:check_extent_in_eb())
1341 */
1342 cur_clone_root->offset += btrfs_file_extent_offset(eb,
1343 fi);
1344 }
1301 *found = cur_clone_root; 1345 *found = cur_clone_root;
1302 ret = 0; 1346 ret = 0;
1303 } else { 1347 } else {
@@ -1343,7 +1387,7 @@ static int read_symlink(struct btrfs_root *root,
1343 BUG_ON(compression); 1387 BUG_ON(compression);
1344 1388
1345 off = btrfs_file_extent_inline_start(ei); 1389 off = btrfs_file_extent_inline_start(ei);
1346 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1390 len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
1347 1391
1348 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1392 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1349 1393
@@ -1372,7 +1416,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1372 return -ENOMEM; 1416 return -ENOMEM;
1373 1417
1374 while (1) { 1418 while (1) {
1375 len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1376 ino, gen, idx); 1420 ino, gen, idx);
1377 if (len >= sizeof(tmp)) { 1421 if (len >= sizeof(tmp)) {
1378 /* should really not happen */ 1422 /* should really not happen */
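The snprintf() change in gen_unique_name() works because C99 snprintf() counts the terminating NUL inside the size argument and returns the length it would have produced without truncation; passing sizeof(tmp) - 1 wasted a byte and left the len >= sizeof(tmp) check off by one. A small demonstration of that contract (buffer shrunk to force truncation):

#include <stdio.h>

int main(void)
{
	char tmp[8]; /* deliberately tiny; the kernel buffer is larger */
	int len;

	/* snprintf() writes at most sizeof(tmp) - 1 characters plus a NUL
	 * and returns the untruncated length, so len >= sizeof(tmp) is an
	 * exact truncation test - the pattern gen_unique_name() relies on. */
	len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 257ULL, 1ULL, 0ULL);
	if (len >= (int)sizeof(tmp))
		printf("truncated: needed %d chars, buffer holds %d\n",
		       len, (int)sizeof(tmp) - 1);
	else
		printf("fits: \"%s\" (%d chars)\n", tmp, len);
	return 0;
}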
@@ -1933,6 +1977,7 @@ static void name_cache_free(struct send_ctx *sctx)
1933 */ 1977 */
1934static int __get_cur_name_and_parent(struct send_ctx *sctx, 1978static int __get_cur_name_and_parent(struct send_ctx *sctx,
1935 u64 ino, u64 gen, 1979 u64 ino, u64 gen,
1980 int skip_name_cache,
1936 u64 *parent_ino, 1981 u64 *parent_ino,
1937 u64 *parent_gen, 1982 u64 *parent_gen,
1938 struct fs_path *dest) 1983 struct fs_path *dest)
@@ -1942,6 +1987,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1942 struct btrfs_path *path = NULL; 1987 struct btrfs_path *path = NULL;
1943 struct name_cache_entry *nce = NULL; 1988 struct name_cache_entry *nce = NULL;
1944 1989
1990 if (skip_name_cache)
1991 goto get_ref;
1945 /* 1992 /*
1946 * First check if we already did a call to this function with the same 1993 * First check if we already did a call to this function with the same
1947 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -1986,11 +2033,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1986 goto out_cache; 2033 goto out_cache;
1987 } 2034 }
1988 2035
2036get_ref:
1989 /* 2037 /*
1990 * Depending on whether the inode was already processed or not, use 2038 * Depending on whether the inode was already processed or not, use
1991 * send_root or parent_root for ref lookup. 2039 * send_root or parent_root for ref lookup.
1992 */ 2040 */
1993 if (ino < sctx->send_progress)
2041 if (ino < sctx->send_progress && !skip_name_cache)
1994 ret = get_first_ref(sctx->send_root, ino, 2042 ret = get_first_ref(sctx->send_root, ino,
1995 parent_ino, parent_gen, dest); 2043 parent_ino, parent_gen, dest);
1996 else 2044 else
@@ -2014,6 +2062,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2014 goto out; 2062 goto out;
2015 ret = 1; 2063 ret = 1;
2016 } 2064 }
2065 if (skip_name_cache)
2066 goto out;
2017 2067
2018out_cache: 2068out_cache:
2019 /* 2069 /*
@@ -2081,6 +2131,9 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2081 u64 parent_inode = 0; 2131 u64 parent_inode = 0;
2082 u64 parent_gen = 0; 2132 u64 parent_gen = 0;
2083 int stop = 0; 2133 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2084 2137
2085 name = fs_path_alloc(); 2138 name = fs_path_alloc();
2086 if (!name) { 2139 if (!name) {
@@ -2088,19 +2141,32 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2088 goto out; 2141 goto out;
2089 } 2142 }
2090 2143
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2091 dest->reversed = 1; 2148 dest->reversed = 1;
2092 fs_path_reset(dest); 2149 fs_path_reset(dest);
2093 2150
2094 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2095 fs_path_reset(name); 2152 fs_path_reset(name);
2096 2153
2097 ret = __get_cur_name_and_parent(sctx, ino, gen,
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
2098 &parent_inode, &parent_gen, name); 2155 &parent_inode, &parent_gen, name);
2099 if (ret < 0) 2156 if (ret < 0)
2100 goto out; 2157 goto out;
2101 if (ret) 2158 if (ret)
2102 stop = 1; 2159 stop = 1;
2103 2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169
2104 ret = fs_path_add_path(dest, name); 2170 ret = fs_path_add_path(dest, name);
2105 if (ret < 0) 2171 if (ret < 0)
2106 goto out; 2172 goto out;
@@ -2131,7 +2197,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
2131 char *name = NULL; 2197 char *name = NULL;
2132 int namelen; 2198 int namelen;
2133 2199
2134 path = alloc_path_for_send();
2200 path = btrfs_alloc_path();
2135 if (!path) 2201 if (!path)
2136 return -ENOMEM; 2202 return -ENOMEM;
2137 2203
@@ -2180,12 +2246,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
2180 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2246 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2181 sctx->send_root->root_item.uuid); 2247 sctx->send_root->root_item.uuid);
2182 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2248 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2183 sctx->send_root->root_item.ctransid);
2249 le64_to_cpu(sctx->send_root->root_item.ctransid));
2184 if (parent_root) { 2250 if (parent_root) {
2185 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2251 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2186 sctx->parent_root->root_item.uuid); 2252 sctx->parent_root->root_item.uuid);
2187 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2253 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2188 sctx->parent_root->root_item.ctransid);
2254 le64_to_cpu(sctx->parent_root->root_item.ctransid));
2189 } 2255 }
2190 2256
2191 ret = send_cmd(sctx); 2257 ret = send_cmd(sctx);
@@ -2672,10 +2738,347 @@ out:
2672 return ret; 2738 return ret;
2673} 2739}
2674 2740
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2744 struct waiting_dir_move *entry;
2745
2746 while (n) {
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756}
2757
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2759{
2760 struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
2761 struct rb_node *parent = NULL;
2762 struct waiting_dir_move *entry, *dm;
2763
2764 dm = kmalloc(sizeof(*dm), GFP_NOFS);
2765 if (!dm)
2766 return -ENOMEM;
2767 dm->ino = ino;
2768
2769 while (*p) {
2770 parent = *p;
2771 entry = rb_entry(parent, struct waiting_dir_move, node);
2772 if (ino < entry->ino) {
2773 p = &(*p)->rb_left;
2774 } else if (ino > entry->ino) {
2775 p = &(*p)->rb_right;
2776 } else {
2777 kfree(dm);
2778 return -EEXIST;
2779 }
2780 }
2781
2782 rb_link_node(&dm->node, parent, p);
2783 rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
2784 return 0;
2785}
2786
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry;
2791
2792 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) {
2795 n = n->rb_left;
2796 } else if (ino > entry->ino) {
2797 n = n->rb_right;
2798 } else {
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves);
2800 kfree(entry);
2801 return 0;
2802 }
2803 }
2804 return -ENOENT;
2805}
2806
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2808{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm;
2812 struct recorded_ref *cur;
2813 int exists = 0;
2814 int ret;
2815
2816 pm = kmalloc(sizeof(*pm), GFP_NOFS);
2817 if (!pm)
2818 return -ENOMEM;
2819 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino;
2821 pm->gen = sctx->cur_inode_gen;
2822 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node);
2825
2826 while (*p) {
2827 parent = *p;
2828 entry = rb_entry(parent, struct pending_dir_move, node);
2829 if (parent_ino < entry->parent_ino) {
2830 p = &(*p)->rb_left;
2831 } else if (parent_ino > entry->parent_ino) {
2832 p = &(*p)->rb_right;
2833 } else {
2834 exists = 1;
2835 break;
2836 }
2837 }
2838
2839 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2840 ret = dup_ref(cur, &pm->update_refs);
2841 if (ret < 0)
2842 goto out;
2843 }
2844 list_for_each_entry(cur, &sctx->new_refs, list) {
2845 ret = dup_ref(cur, &pm->update_refs);
2846 if (ret < 0)
2847 goto out;
2848 }
2849
2850 ret = add_waiting_dir_move(sctx, pm->ino);
2851 if (ret)
2852 goto out;
2853
2854 if (exists) {
2855 list_add_tail(&pm->list, &entry->list);
2856 } else {
2857 rb_link_node(&pm->node, parent, p);
2858 rb_insert_color(&pm->node, &sctx->pending_dir_moves);
2859 }
2860 ret = 0;
2861out:
2862 if (ret) {
2863 __free_recorded_refs(&pm->update_refs);
2864 kfree(pm);
2865 }
2866 return ret;
2867}
2868
2869static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
2870 u64 parent_ino)
2871{
2872 struct rb_node *n = sctx->pending_dir_moves.rb_node;
2873 struct pending_dir_move *entry;
2874
2875 while (n) {
2876 entry = rb_entry(n, struct pending_dir_move, node);
2877 if (parent_ino < entry->parent_ino)
2878 n = n->rb_left;
2879 else if (parent_ino > entry->parent_ino)
2880 n = n->rb_right;
2881 else
2882 return entry;
2883 }
2884 return NULL;
2885}
2886
2887static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{
2889 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL;
2891 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur;
2893 int ret;
2894
2895 from_path = fs_path_alloc();
2896 if (!from_path)
2897 return -ENOMEM;
2898
2899 sctx->send_progress = pm->ino;
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
2901 if (ret < 0)
2902 goto out;
2903
2904 to_path = fs_path_alloc();
2905 if (!to_path) {
2906 ret = -ENOMEM;
2907 goto out;
2908 }
2909
2910 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0)
2916 goto out;
2917
2918 ret = send_rename(sctx, from_path, to_path);
2919 if (ret < 0)
2920 goto out;
2921
2922 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0)
2924 goto out;
2925
2926 /*
2927 * After rename/move, need to update the utimes of both new parent(s)
2928 * and old parent(s).
2929 */
2930 list_for_each_entry(cur, &pm->update_refs, list) {
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0)
2933 goto out;
2934 }
2935
2936out:
2937 fs_path_free(from_path);
2938 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress;
2940
2941 return ret;
2942}
2943
2944static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
2945{
2946 if (!list_empty(&m->list))
2947 list_del(&m->list);
2948 if (!RB_EMPTY_NODE(&m->node))
2949 rb_erase(&m->node, &sctx->pending_dir_moves);
2950 __free_recorded_refs(&m->update_refs);
2951 kfree(m);
2952}
2953
2954static void tail_append_pending_moves(struct pending_dir_move *moves,
2955 struct list_head *stack)
2956{
2957 if (list_empty(&moves->list)) {
2958 list_add_tail(&moves->list, stack);
2959 } else {
2960 LIST_HEAD(list);
2961 list_splice_init(&moves->list, &list);
2962 list_add_tail(&moves->list, stack);
2963 list_splice_tail(&list, stack);
2964 }
2965}
2966
2967static int apply_children_dir_moves(struct send_ctx *sctx)
2968{
2969 struct pending_dir_move *pm;
2970 struct list_head stack;
2971 u64 parent_ino = sctx->cur_ino;
2972 int ret = 0;
2973
2974 pm = get_pending_dir_moves(sctx, parent_ino);
2975 if (!pm)
2976 return 0;
2977
2978 INIT_LIST_HEAD(&stack);
2979 tail_append_pending_moves(pm, &stack);
2980
2981 while (!list_empty(&stack)) {
2982 pm = list_first_entry(&stack, struct pending_dir_move, list);
2983 parent_ino = pm->ino;
2984 ret = apply_dir_move(sctx, pm);
2985 free_pending_move(sctx, pm);
2986 if (ret)
2987 goto out;
2988 pm = get_pending_dir_moves(sctx, parent_ino);
2989 if (pm)
2990 tail_append_pending_moves(pm, &stack);
2991 }
2992 return 0;
2993
2994out:
2995 while (!list_empty(&stack)) {
2996 pm = list_first_entry(&stack, struct pending_dir_move, list);
2997 free_pending_move(sctx, pm);
2998 }
2999 return ret;
3000}
3001
3002static int wait_for_parent_move(struct send_ctx *sctx,
3003 struct recorded_ref *parent_ref)
3004{
3005 int ret;
3006 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen;
3009 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL;
3011 int len1, len2;
3012
3013 if (parent_ref->dir <= sctx->cur_ino)
3014 return 0;
3015
3016 if (is_waiting_for_move(sctx, ino))
3017 return 1;
3018
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT)
3022 return 0;
3023 else if (ret < 0)
3024 return ret;
3025
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0;
3033
3034 path_before = fs_path_alloc();
3035 if (!path_before)
3036 return -ENOMEM;
3037
3038 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3039 NULL, path_before);
3040 if (ret == -ENOENT) {
3041 ret = 0;
3042 goto out;
3043 } else if (ret < 0) {
3044 goto out;
3045 }
3046
3047 path_after = fs_path_alloc();
3048 if (!path_after) {
3049 ret = -ENOMEM;
3050 goto out;
3051 }
3052
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after);
3055 if (ret == -ENOENT) {
3056 ret = 0;
3057 goto out;
3058 } else if (ret < 0) {
3059 goto out;
3060 }
3061
3062 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) {
3066 ret = 1;
3067 goto out;
3068 }
3069 ret = 0;
3070
3071out:
3072 fs_path_free(path_before);
3073 fs_path_free(path_after);
3074
3075 return ret;
3076}
3077
2675/* 3078/*
2676 * This does all the move/link/unlink/rmdir magic. 3079 * This does all the move/link/unlink/rmdir magic.
2677 */ 3080 */
2678static int process_recorded_refs(struct send_ctx *sctx)
3081static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
2679{ 3082{
2680 int ret = 0; 3083 int ret = 0;
2681 struct recorded_ref *cur; 3084 struct recorded_ref *cur;
@@ -2824,11 +3227,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2824 * dirs, we always have one new and one deleted 3227 * dirs, we always have one new and one deleted
2825 * ref. The deleted ref is ignored later. 3228 * ref. The deleted ref is ignored later.
2826 */ 3229 */
2827 ret = send_rename(sctx, valid_path,
2828 cur->full_path);
2829 if (ret < 0)
2830 goto out;
2831 ret = fs_path_copy(valid_path, cur->full_path);
3230 if (wait_for_parent_move(sctx, cur)) {
3231 ret = add_pending_dir_move(sctx,
3232 cur->dir);
3233 *pending_move = 1;
3234 } else {
3235 ret = send_rename(sctx, valid_path,
3236 cur->full_path);
3237 if (!ret)
3238 ret = fs_path_copy(valid_path,
3239 cur->full_path);
3240 }
2832 if (ret < 0) 3241 if (ret < 0)
2833 goto out; 3242 goto out;
2834 } else { 3243 } else {
@@ -3197,6 +3606,7 @@ static int process_all_refs(struct send_ctx *sctx,
3197 struct extent_buffer *eb; 3606 struct extent_buffer *eb;
3198 int slot; 3607 int slot;
3199 iterate_inode_ref_t cb; 3608 iterate_inode_ref_t cb;
3609 int pending_move = 0;
3200 3610
3201 path = alloc_path_for_send(); 3611 path = alloc_path_for_send();
3202 if (!path) 3612 if (!path)
@@ -3240,7 +3650,9 @@ static int process_all_refs(struct send_ctx *sctx,
3240 } 3650 }
3241 btrfs_release_path(path); 3651 btrfs_release_path(path);
3242 3652
3243 ret = process_recorded_refs(sctx);
3653 ret = process_recorded_refs(sctx, &pending_move);
3654 /* Only applicable to an incremental send. */
3655 ASSERT(pending_move == 0);
3244 3656
3245out: 3657out:
3246 btrfs_free_path(path); 3658 btrfs_free_path(path);
@@ -3706,7 +4118,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3706 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4118 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3707 clone_root->root->root_item.uuid); 4119 clone_root->root->root_item.uuid);
3708 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 4120 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3709 clone_root->root->root_item.ctransid);
4121 le64_to_cpu(clone_root->root->root_item.ctransid));
3710 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 4122 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3711 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 4123 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3712 clone_root->offset); 4124 clone_root->offset);
@@ -3752,6 +4164,39 @@ out:
3752 return ret; 4164 return ret;
3753} 4165}
3754 4166
4167static int send_hole(struct send_ctx *sctx, u64 end)
4168{
4169 struct fs_path *p = NULL;
4170 u64 offset = sctx->cur_inode_last_extent;
4171 u64 len;
4172 int ret = 0;
4173
4174 p = fs_path_alloc();
4175 if (!p)
4176 return -ENOMEM;
4177 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
4178 while (offset < end) {
4179 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
4180
4181 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4182 if (ret < 0)
4183 break;
4184 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4185 if (ret < 0)
4186 break;
4187 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4188 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4189 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
4190 ret = send_cmd(sctx);
4191 if (ret < 0)
4192 break;
4193 offset += len;
4194 }
4195tlv_put_failure:
4196 fs_path_free(p);
4197 return ret;
4198}
4199
3755static int send_write_or_clone(struct send_ctx *sctx, 4200static int send_write_or_clone(struct send_ctx *sctx,
3756 struct btrfs_path *path, 4201 struct btrfs_path *path,
3757 struct btrfs_key *key, 4202 struct btrfs_key *key,
@@ -3764,12 +4209,14 @@ static int send_write_or_clone(struct send_ctx *sctx,
3764 u64 len; 4209 u64 len;
3765 u32 l; 4210 u32 l;
3766 u8 type; 4211 u8 type;
4212 u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
3767 4213
3768 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4214 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3769 struct btrfs_file_extent_item); 4215 struct btrfs_file_extent_item);
3770 type = btrfs_file_extent_type(path->nodes[0], ei); 4216 type = btrfs_file_extent_type(path->nodes[0], ei);
3771 if (type == BTRFS_FILE_EXTENT_INLINE) { 4217 if (type == BTRFS_FILE_EXTENT_INLINE) {
3772 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
4218 len = btrfs_file_extent_inline_len(path->nodes[0],
4219 path->slots[0], ei);
3773 /* 4220 /*
3774 * it is possible the inline item won't cover the whole page, 4221 * it is possible the inline item won't cover the whole page,
3775 * but there may be items after this page. Make 4222 * but there may be items after this page. Make
@@ -3787,7 +4234,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
3787 goto out; 4234 goto out;
3788 } 4235 }
3789 4236
3790 if (clone_root) {
4237 if (clone_root && IS_ALIGNED(offset + len, bs)) {
3791 ret = send_clone(sctx, offset, len, clone_root); 4238 ret = send_clone(sctx, offset, len, clone_root);
3792 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { 4239 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
3793 ret = send_update_extent(sctx, offset, len); 4240 ret = send_update_extent(sctx, offset, len);
@@ -3979,6 +4426,101 @@ out:
3979 return ret; 4426 return ret;
3980} 4427}
3981 4428
4429static int get_last_extent(struct send_ctx *sctx, u64 offset)
4430{
4431 struct btrfs_path *path;
4432 struct btrfs_root *root = sctx->send_root;
4433 struct btrfs_file_extent_item *fi;
4434 struct btrfs_key key;
4435 u64 extent_end;
4436 u8 type;
4437 int ret;
4438
4439 path = alloc_path_for_send();
4440 if (!path)
4441 return -ENOMEM;
4442
4443 sctx->cur_inode_last_extent = 0;
4444
4445 key.objectid = sctx->cur_ino;
4446 key.type = BTRFS_EXTENT_DATA_KEY;
4447 key.offset = offset;
4448 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
4449 if (ret < 0)
4450 goto out;
4451 ret = 0;
4452 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4453 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
4454 goto out;
4455
4456 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4457 struct btrfs_file_extent_item);
4458 type = btrfs_file_extent_type(path->nodes[0], fi);
4459 if (type == BTRFS_FILE_EXTENT_INLINE) {
4460 u64 size = btrfs_file_extent_inline_len(path->nodes[0],
4461 path->slots[0], fi);
4462 extent_end = ALIGN(key.offset + size,
4463 sctx->send_root->sectorsize);
4464 } else {
4465 extent_end = key.offset +
4466 btrfs_file_extent_num_bytes(path->nodes[0], fi);
4467 }
4468 sctx->cur_inode_last_extent = extent_end;
4469out:
4470 btrfs_free_path(path);
4471 return ret;
4472}
4473
4474static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
4475 struct btrfs_key *key)
4476{
4477 struct btrfs_file_extent_item *fi;
4478 u64 extent_end;
4479 u8 type;
4480 int ret = 0;
4481
4482 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
4483 return 0;
4484
4485 if (sctx->cur_inode_last_extent == (u64)-1) {
4486 ret = get_last_extent(sctx, key->offset - 1);
4487 if (ret)
4488 return ret;
4489 }
4490
4491 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4492 struct btrfs_file_extent_item);
4493 type = btrfs_file_extent_type(path->nodes[0], fi);
4494 if (type == BTRFS_FILE_EXTENT_INLINE) {
4495 u64 size = btrfs_file_extent_inline_len(path->nodes[0],
4496 path->slots[0], fi);
4497 extent_end = ALIGN(key->offset + size,
4498 sctx->send_root->sectorsize);
4499 } else {
4500 extent_end = key->offset +
4501 btrfs_file_extent_num_bytes(path->nodes[0], fi);
4502 }
4503
4504 if (path->slots[0] == 0 &&
4505 sctx->cur_inode_last_extent < key->offset) {
4506 /*
4507 * We might have skipped entire leaves that contained only
4508 * file extent items for our current inode. These leaves have
4509 * a generation number smaller (older) than the one in the
4510 * current leaf and the leaf our last extent came from, and
4511 * are located between these two leaves.
4512 */
4513 ret = get_last_extent(sctx, key->offset - 1);
4514 if (ret)
4515 return ret;
4516 }
4517
4518 if (sctx->cur_inode_last_extent < key->offset)
4519 ret = send_hole(sctx, key->offset);
4520 sctx->cur_inode_last_extent = extent_end;
4521 return ret;
4522}
4523
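maybe_send_hole() turns the implicit gap between the end of the last seen extent (cur_inode_last_extent) and the offset of the current file extent item into an explicit hole. Reduced to its arithmetic, the detection looks like this (an illustrative sketch with hard-coded extents):

#include <stdio.h>

int main(void)
{
	/* (offset, length) pairs of file extent items, in key order. */
	unsigned long long extents[][2] = {
		{ 0, 4096 }, { 16384, 4096 }, { 20480, 8192 },
	};
	unsigned long long last_extent = 0; /* plays cur_inode_last_extent */
	unsigned int i;

	for (i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		unsigned long long offset = extents[i][0];
		unsigned long long extent_end = extents[i][0] + extents[i][1];

		if (last_extent < offset) /* the test in maybe_send_hole() */
			printf("hole [%llu, %llu)\n", last_extent, offset);
		last_extent = extent_end;
	}
	return 0; /* prints: hole [4096, 16384) */
}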
3982static int process_extent(struct send_ctx *sctx, 4524static int process_extent(struct send_ctx *sctx,
3983 struct btrfs_path *path, 4525 struct btrfs_path *path,
3984 struct btrfs_key *key) 4526 struct btrfs_key *key)
@@ -3995,7 +4537,7 @@ static int process_extent(struct send_ctx *sctx,
3995 goto out; 4537 goto out;
3996 if (ret) { 4538 if (ret) {
3997 ret = 0; 4539 ret = 0;
3998 goto out;
4540 goto out_hole;
3999 } 4541 }
4000 } else { 4542 } else {
4001 struct btrfs_file_extent_item *ei; 4543 struct btrfs_file_extent_item *ei;
@@ -4031,7 +4573,10 @@ static int process_extent(struct send_ctx *sctx,
4031 goto out; 4573 goto out;
4032 4574
4033 ret = send_write_or_clone(sctx, path, key, found_clone); 4575 ret = send_write_or_clone(sctx, path, key, found_clone);
4034
4576 if (ret)
4577 goto out;
4578out_hole:
4579 ret = maybe_send_hole(sctx, path, key);
4035out: 4580out:
4036 return ret; 4581 return ret;
4037} 4582}
@@ -4054,17 +4599,25 @@ static int process_all_extents(struct send_ctx *sctx)
4054 key.objectid = sctx->cmp_key->objectid; 4599 key.objectid = sctx->cmp_key->objectid;
4055 key.type = BTRFS_EXTENT_DATA_KEY; 4600 key.type = BTRFS_EXTENT_DATA_KEY;
4056 key.offset = 0; 4601 key.offset = 0;
4057 while (1) {
4058 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
4059 if (ret < 0)
4060 goto out;
4061 if (ret) {
4062 ret = 0;
4063 goto out;
4064 }
4602 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4603 if (ret < 0)
4604 goto out;
4065 4605
4606 while (1) {
4066 eb = path->nodes[0]; 4607 eb = path->nodes[0];
4067 slot = path->slots[0]; 4608 slot = path->slots[0];
4609
4610 if (slot >= btrfs_header_nritems(eb)) {
4611 ret = btrfs_next_leaf(root, path);
4612 if (ret < 0) {
4613 goto out;
4614 } else if (ret > 0) {
4615 ret = 0;
4616 break;
4617 }
4618 continue;
4619 }
4620
4068 btrfs_item_key_to_cpu(eb, &found_key, slot); 4621 btrfs_item_key_to_cpu(eb, &found_key, slot);
4069 4622
4070 if (found_key.objectid != key.objectid || 4623 if (found_key.objectid != key.objectid ||
@@ -4077,8 +4630,7 @@ static int process_all_extents(struct send_ctx *sctx)
4077 if (ret < 0) 4630 if (ret < 0)
4078 goto out; 4631 goto out;
4079 4632
4080 btrfs_release_path(path);
4081 key.offset = found_key.offset + 1;
4633 path->slots[0]++;
4082 } 4634 }
4083 4635
4084out: 4636out:
@@ -4086,7 +4638,9 @@ out:
4086 return ret; 4638 return ret;
4087} 4639}
4088 4640
4089static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
4641static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
4642 int *pending_move,
4643 int *refs_processed)
4090{ 4644{
4091 int ret = 0; 4645 int ret = 0;
4092 4646
@@ -4098,17 +4652,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
4098 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 4652 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
4099 goto out; 4653 goto out;
4100 4654
4101 ret = process_recorded_refs(sctx);
4655 ret = process_recorded_refs(sctx, pending_move);
4102 if (ret < 0) 4656 if (ret < 0)
4103 goto out; 4657 goto out;
4104 4658
4105 /*
4106 * We have processed the refs and thus need to advance send_progress.
4107 * Now, calls to get_cur_xxx will take the updated refs of the current
4108 * inode into account.
4109 */
4110 sctx->send_progress = sctx->cur_ino + 1;
4111
4659 *refs_processed = 1;
4112out: 4660out:
4113 return ret; 4661 return ret;
4114} 4662}
@@ -4124,11 +4672,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4124 u64 right_gid; 4672 u64 right_gid;
4125 int need_chmod = 0; 4673 int need_chmod = 0;
4126 int need_chown = 0; 4674 int need_chown = 0;
4675 int pending_move = 0;
4676 int refs_processed = 0;
4127 4677
4128 ret = process_recorded_refs_if_needed(sctx, at_end);
4678 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
4679 &refs_processed);
4129 if (ret < 0) 4680 if (ret < 0)
4130 goto out; 4681 goto out;
4131 4682
4683 /*
4684 * We have processed the refs and thus need to advance send_progress.
4685 * Now, calls to get_cur_xxx will take the updated refs of the current
4686 * inode into account.
4687 *
4688 * On the other hand, if our current inode is a directory and couldn't
4689 * be moved/renamed because its parent was renamed/moved too and it has
4690 * a higher inode number, we can only move/rename our current inode
4691 * after we moved/renamed its parent. Therefore in this case operate on
4692 * the old path (pre move/rename) of our current inode, and the
4693 * move/rename will be performed later.
4694 */
4695 if (refs_processed && !pending_move)
4696 sctx->send_progress = sctx->cur_ino + 1;
4697
4132 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) 4698 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
4133 goto out; 4699 goto out;
4134 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) 4700 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
@@ -4157,6 +4723,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4157 } 4723 }
4158 4724
4159 if (S_ISREG(sctx->cur_inode_mode)) { 4725 if (S_ISREG(sctx->cur_inode_mode)) {
4726 if (need_send_hole(sctx)) {
4727 if (sctx->cur_inode_last_extent == (u64)-1) {
4728 ret = get_last_extent(sctx, (u64)-1);
4729 if (ret)
4730 goto out;
4731 }
4732 if (sctx->cur_inode_last_extent <
4733 sctx->cur_inode_size) {
4734 ret = send_hole(sctx, sctx->cur_inode_size);
4735 if (ret)
4736 goto out;
4737 }
4738 }
4160 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, 4739 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4161 sctx->cur_inode_size); 4740 sctx->cur_inode_size);
4162 if (ret < 0) 4741 if (ret < 0)
@@ -4177,9 +4756,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4177 } 4756 }
4178 4757
4179 /* 4758 /*
4180 * Need to send that every time, no matter if it actually changed
4181 * between the two trees as we have done changes to the inode before.
4759 * If other directory inodes depended on our current directory
4760 * inode's move/rename, now do their move/rename operations.
4761 */
4762 if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
4763 ret = apply_children_dir_moves(sctx);
4764 if (ret)
4765 goto out;
4766 }
4767
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4182 */ 4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4183 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4184 if (ret < 0) 4775 if (ret < 0)
4185 goto out; 4776 goto out;
@@ -4200,6 +4791,7 @@ static int changed_inode(struct send_ctx *sctx,
4200 4791
4201 sctx->cur_ino = key->objectid; 4792 sctx->cur_ino = key->objectid;
4202 sctx->cur_inode_new_gen = 0; 4793 sctx->cur_inode_new_gen = 0;
4794 sctx->cur_inode_last_extent = (u64)-1;
4203 4795
4204 /* 4796 /*
4205 * Set send_progress to current inode. This will tell all get_cur_xxx 4797 * Set send_progress to current inode. This will tell all get_cur_xxx
@@ -4480,14 +5072,18 @@ static int changed_cb(struct btrfs_root *left_root,
4480 struct send_ctx *sctx = ctx; 5072 struct send_ctx *sctx = ctx;
4481 5073
4482 if (result == BTRFS_COMPARE_TREE_SAME) { 5074 if (result == BTRFS_COMPARE_TREE_SAME) {
4483 if (key->type != BTRFS_INODE_REF_KEY &&
4484 key->type != BTRFS_INODE_EXTREF_KEY)
4485 return 0;
4486 ret = compare_refs(sctx, left_path, key);
4487 if (!ret)
4488 return 0;
4489 if (ret < 0)
4490 return ret;
5075 if (key->type == BTRFS_INODE_REF_KEY ||
5076 key->type == BTRFS_INODE_EXTREF_KEY) {
5077 ret = compare_refs(sctx, left_path, key);
5078 if (!ret)
5079 return 0;
5080 if (ret < 0)
5081 return ret;
5082 } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
5083 return maybe_send_hole(sctx, left_path, key);
5084 } else {
5085 return 0;
5086 }
4491 result = BTRFS_COMPARE_TREE_CHANGED; 5087 result = BTRFS_COMPARE_TREE_CHANGED;
4492 ret = 0; 5088 ret = 0;
4493 } 5089 }
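The rewritten branch above replaces the old early-return with a three-way dispatch for items the tree comparison reports as identical: refs may still differ in detail and get compared in depth, file extent items still have to feed the hole tracker, and everything else is skipped. The shape of that dispatch, as an illustrative standalone function:

enum same_item_kind { ITEM_INODE_REF, ITEM_INODE_EXTREF,
                      ITEM_EXTENT_DATA, ITEM_OTHER };
enum same_item_action { ACT_SKIP, ACT_COMPARE_REFS, ACT_TRACK_HOLES };

/* Dispatch for "same" items, mirroring the hunk above. */
static enum same_item_action classify_same_item(enum same_item_kind kind)
{
        switch (kind) {
        case ITEM_INODE_REF:
        case ITEM_INODE_EXTREF:
                return ACT_COMPARE_REFS;
        case ITEM_EXTENT_DATA:
                return ACT_TRACK_HOLES;
        default:
                return ACT_SKIP;
        }
}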
@@ -4522,7 +5118,6 @@ out:
4522static int full_send_tree(struct send_ctx *sctx) 5118static int full_send_tree(struct send_ctx *sctx)
4523{ 5119{
4524 int ret; 5120 int ret;
4525 struct btrfs_trans_handle *trans = NULL;
4526 struct btrfs_root *send_root = sctx->send_root; 5121 struct btrfs_root *send_root = sctx->send_root;
4527 struct btrfs_key key; 5122 struct btrfs_key key;
4528 struct btrfs_key found_key; 5123 struct btrfs_key found_key;
@@ -4544,19 +5139,6 @@ static int full_send_tree(struct send_ctx *sctx)
4544 key.type = BTRFS_INODE_ITEM_KEY; 5139 key.type = BTRFS_INODE_ITEM_KEY;
4545 key.offset = 0; 5140 key.offset = 0;
4546 5141
4547join_trans:
4548 /*
4549 * We need to make sure the transaction does not get committed
4550 * while we do anything on commit roots. Join a transaction to prevent
4551 * this.
4552 */
4553 trans = btrfs_join_transaction(send_root);
4554 if (IS_ERR(trans)) {
4555 ret = PTR_ERR(trans);
4556 trans = NULL;
4557 goto out;
4558 }
4559
4560 /* 5142 /*
4561 * Make sure the tree has not changed after re-joining. We detect this 5143 * Make sure the tree has not changed after re-joining. We detect this
4562 * by comparing start_ctransid and ctransid. They should always match. 5144 * by comparing start_ctransid and ctransid. They should always match.
@@ -4566,7 +5148,7 @@ join_trans:
4566 spin_unlock(&send_root->root_item_lock); 5148 spin_unlock(&send_root->root_item_lock);
4567 5149
4568 if (ctransid != start_ctransid) { 5150 if (ctransid != start_ctransid) {
4569 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 5151 WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
4570 "send was modified in between. This is " 5152 "send was modified in between. This is "
4571 "probably a bug.\n"); 5153 "probably a bug.\n");
4572 ret = -EIO; 5154 ret = -EIO;
@@ -4580,19 +5162,6 @@ join_trans:
4580 goto out_finish; 5162 goto out_finish;
4581 5163
4582 while (1) { 5164 while (1) {
4583 /*
4584 * When someone want to commit while we iterate, end the
4585 * joined transaction and rejoin.
4586 */
4587 if (btrfs_should_end_transaction(trans, send_root)) {
4588 ret = btrfs_end_transaction(trans, send_root);
4589 trans = NULL;
4590 if (ret < 0)
4591 goto out;
4592 btrfs_release_path(path);
4593 goto join_trans;
4594 }
4595
4596 eb = path->nodes[0]; 5165 eb = path->nodes[0];
4597 slot = path->slots[0]; 5166 slot = path->slots[0];
4598 btrfs_item_key_to_cpu(eb, &found_key, slot); 5167 btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -4620,12 +5189,6 @@ out_finish:
4620 5189
4621out: 5190out:
4622 btrfs_free_path(path); 5191 btrfs_free_path(path);
4623 if (trans) {
4624 if (!ret)
4625 ret = btrfs_end_transaction(trans, send_root);
4626 else
4627 btrfs_end_transaction(trans, send_root);
4628 }
4629 return ret; 5192 return ret;
4630} 5193}
4631 5194
@@ -4662,6 +5225,21 @@ out:
4662 return ret; 5225 return ret;
4663} 5226}
4664 5227
5228static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
5229{
5230 spin_lock(&root->root_item_lock);
5231 root->send_in_progress--;
5232 /*
5233 * Not much left to do, we don't know why it's unbalanced and
5234 * can't blindly reset it to 0.
5235 */
5236 if (root->send_in_progress < 0)
5237 btrfs_err(root->fs_info,
5238 "send_in_progres unbalanced %d root %llu\n",
5239 root->send_in_progress, root->root_key.objectid);
5240 spin_unlock(&root->root_item_lock);
5241}
5242
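The helper above is the release half of the send_in_progress accounting that replaces the old long-running transaction join: the counter is bumped under root_item_lock before any send work starts and dropped through this one function on every exit path. A sketch of the paired pattern on a hypothetical structure:

#include <linux/bug.h>
#include <linux/spinlock.h>

struct example_root {
        spinlock_t lock;        /* set up with spin_lock_init() */
        int send_in_progress;
};

/* Pin: taken before send starts, so a concurrent attempt to flip the
 * subvolume read-write can be refused while the counter is nonzero. */
static void example_pin_for_send(struct example_root *r)
{
        spin_lock(&r->lock);
        r->send_in_progress++;
        spin_unlock(&r->lock);
}

/* Unpin: every exit path must pass through exactly one decrement; a
 * negative value indicates an unbalanced caller. */
static void example_unpin_for_send(struct example_root *r)
{
        spin_lock(&r->lock);
        WARN_ON(--r->send_in_progress < 0);
        spin_unlock(&r->lock);
}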
4665long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) 5243long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4666{ 5244{
4667 int ret = 0; 5245 int ret = 0;
@@ -4673,6 +5251,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4673 struct send_ctx *sctx = NULL; 5251 struct send_ctx *sctx = NULL;
4674 u32 i; 5252 u32 i;
4675 u64 *clone_sources_tmp = NULL; 5253 u64 *clone_sources_tmp = NULL;
5254 int clone_sources_to_rollback = 0;
5255 int sort_clone_roots = 0;
5256 int index;
4676 5257
4677 if (!capable(CAP_SYS_ADMIN)) 5258 if (!capable(CAP_SYS_ADMIN))
4678 return -EPERM; 5259 return -EPERM;
@@ -4681,38 +5262,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4681 fs_info = send_root->fs_info; 5262 fs_info = send_root->fs_info;
4682 5263
4683 /* 5264 /*
 5265 * The subvolume must remain read-only during send; protect against
5266 * making it RW.
5267 */
5268 spin_lock(&send_root->root_item_lock);
5269 send_root->send_in_progress++;
5270 spin_unlock(&send_root->root_item_lock);
5271
5272 /*
4684 * This is done when we lookup the root, it should already be complete 5273 * This is done when we lookup the root, it should already be complete
4685 * by the time we get here. 5274 * by the time we get here.
4686 */ 5275 */
4687 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); 5276 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4688 5277
4689 /* 5278 /*
4690 * If we just created this root we need to make sure that the orphan 5279 * Userspace tools do the checks and warn the user if it's
4691 * cleanup has been done and committed since we search the commit root, 5280 * not RO.
4692 * so check its commit root transid with our otransid and if they match
4693 * commit the transaction to make sure everything is updated.
4694 */ 5281 */
4695 down_read(&send_root->fs_info->extent_commit_sem); 5282 if (!btrfs_root_readonly(send_root)) {
4696 if (btrfs_header_generation(send_root->commit_root) == 5283 ret = -EPERM;
4697 btrfs_root_otransid(&send_root->root_item)) { 5284 goto out;
4698 struct btrfs_trans_handle *trans;
4699
4700 up_read(&send_root->fs_info->extent_commit_sem);
4701
4702 trans = btrfs_attach_transaction_barrier(send_root);
4703 if (IS_ERR(trans)) {
4704 if (PTR_ERR(trans) != -ENOENT) {
4705 ret = PTR_ERR(trans);
4706 goto out;
4707 }
 4708 /* ENOENT means there's no transaction */
4709 } else {
4710 ret = btrfs_commit_transaction(trans, send_root);
4711 if (ret)
4712 goto out;
4713 }
4714 } else {
4715 up_read(&send_root->fs_info->extent_commit_sem);
4716 } 5285 }
4717 5286
4718 arg = memdup_user(arg_, sizeof(*arg)); 5287 arg = memdup_user(arg_, sizeof(*arg));
@@ -4753,8 +5322,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4753 goto out; 5322 goto out;
4754 } 5323 }
4755 5324
4756 sctx->mnt = mnt_file->f_path.mnt;
4757
4758 sctx->send_root = send_root; 5325 sctx->send_root = send_root;
4759 sctx->clone_roots_cnt = arg->clone_sources_count; 5326 sctx->clone_roots_cnt = arg->clone_sources_count;
4760 5327
@@ -4771,6 +5338,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4771 goto out; 5338 goto out;
4772 } 5339 }
4773 5340
5341 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT;
5343
4774 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
4775 (arg->clone_sources_count + 1)); 5345 (arg->clone_sources_count + 1));
4776 if (!sctx->clone_roots) { 5346 if (!sctx->clone_roots) {
@@ -4798,11 +5368,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4798 key.objectid = clone_sources_tmp[i]; 5368 key.objectid = clone_sources_tmp[i];
4799 key.type = BTRFS_ROOT_ITEM_KEY; 5369 key.type = BTRFS_ROOT_ITEM_KEY;
4800 key.offset = (u64)-1; 5370 key.offset = (u64)-1;
5371
5372 index = srcu_read_lock(&fs_info->subvol_srcu);
5373
4801 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 5374 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4802 if (IS_ERR(clone_root)) { 5375 if (IS_ERR(clone_root)) {
5376 srcu_read_unlock(&fs_info->subvol_srcu, index);
4803 ret = PTR_ERR(clone_root); 5377 ret = PTR_ERR(clone_root);
4804 goto out; 5378 goto out;
4805 } 5379 }
5380 clone_sources_to_rollback = i + 1;
5381 spin_lock(&clone_root->root_item_lock);
5382 clone_root->send_in_progress++;
5383 if (!btrfs_root_readonly(clone_root)) {
5384 spin_unlock(&clone_root->root_item_lock);
5385 srcu_read_unlock(&fs_info->subvol_srcu, index);
5386 ret = -EPERM;
5387 goto out;
5388 }
5389 spin_unlock(&clone_root->root_item_lock);
5390 srcu_read_unlock(&fs_info->subvol_srcu, index);
5391
4806 sctx->clone_roots[i].root = clone_root; 5392 sctx->clone_roots[i].root = clone_root;
4807 } 5393 }
4808 vfree(clone_sources_tmp); 5394 vfree(clone_sources_tmp);
@@ -4813,11 +5399,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4813 key.objectid = arg->parent_root; 5399 key.objectid = arg->parent_root;
4814 key.type = BTRFS_ROOT_ITEM_KEY; 5400 key.type = BTRFS_ROOT_ITEM_KEY;
4815 key.offset = (u64)-1; 5401 key.offset = (u64)-1;
5402
5403 index = srcu_read_lock(&fs_info->subvol_srcu);
5404
4816 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 5405 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4817 if (IS_ERR(sctx->parent_root)) { 5406 if (IS_ERR(sctx->parent_root)) {
5407 srcu_read_unlock(&fs_info->subvol_srcu, index);
4818 ret = PTR_ERR(sctx->parent_root); 5408 ret = PTR_ERR(sctx->parent_root);
4819 goto out; 5409 goto out;
4820 } 5410 }
5411
5412 spin_lock(&sctx->parent_root->root_item_lock);
5413 sctx->parent_root->send_in_progress++;
5414 if (!btrfs_root_readonly(sctx->parent_root)) {
5415 spin_unlock(&sctx->parent_root->root_item_lock);
5416 srcu_read_unlock(&fs_info->subvol_srcu, index);
5417 ret = -EPERM;
5418 goto out;
5419 }
5420 spin_unlock(&sctx->parent_root->root_item_lock);
5421
5422 srcu_read_unlock(&fs_info->subvol_srcu, index);
4821 } 5423 }
4822 5424
4823 /* 5425 /*
@@ -4831,6 +5433,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4831 sort(sctx->clone_roots, sctx->clone_roots_cnt, 5433 sort(sctx->clone_roots, sctx->clone_roots_cnt,
4832 sizeof(*sctx->clone_roots), __clone_root_cmp_sort, 5434 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
4833 NULL); 5435 NULL);
5436 sort_clone_roots = 1;
4834 5437
4835 ret = send_subvol(sctx); 5438 ret = send_subvol(sctx);
4836 if (ret < 0) 5439 if (ret < 0)
@@ -4846,6 +5449,48 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4846 } 5449 }
4847 5450
4848out: 5451out:
5452 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
5453 while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
5454 struct rb_node *n;
5455 struct pending_dir_move *pm;
5456
5457 n = rb_first(&sctx->pending_dir_moves);
5458 pm = rb_entry(n, struct pending_dir_move, node);
5459 while (!list_empty(&pm->list)) {
5460 struct pending_dir_move *pm2;
5461
5462 pm2 = list_first_entry(&pm->list,
5463 struct pending_dir_move, list);
5464 free_pending_move(sctx, pm2);
5465 }
5466 free_pending_move(sctx, pm);
5467 }
5468
5469 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
5470 while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
5471 struct rb_node *n;
5472 struct waiting_dir_move *dm;
5473
5474 n = rb_first(&sctx->waiting_dir_moves);
5475 dm = rb_entry(n, struct waiting_dir_move, node);
5476 rb_erase(&dm->node, &sctx->waiting_dir_moves);
5477 kfree(dm);
5478 }
5479
5480 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress(
5483 sctx->clone_roots[i].root);
5484 } else {
5485 for (i = 0; sctx && i < clone_sources_to_rollback; i++)
5486 btrfs_root_dec_send_in_progress(
5487 sctx->clone_roots[i].root);
5488
5489 btrfs_root_dec_send_in_progress(send_root);
5490 }
5491 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
5492 btrfs_root_dec_send_in_progress(sctx->parent_root);
5493
4849 kfree(arg); 5494 kfree(arg);
4850 vfree(clone_sources_tmp); 5495 vfree(clone_sources_tmp);
4851 5496
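The error path above drains two rbtrees of queued directory moves before releasing the roots. The drain idiom itself recurs: always detach the leftmost node and re-read rb_first() after each erase, so the traversal stays valid while the tree shrinks. A self-contained sketch using the kernel rbtree API:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct queued_item {
        struct rb_node node;
};

/* Drain a tree by repeatedly removing its leftmost node; O(n log n)
 * and safe against the modifications rb_erase() makes. */
static void drain_rbtree(struct rb_root *root)
{
        struct rb_node *n;

        while ((n = rb_first(root))) {
                struct queued_item *item =
                        rb_entry(n, struct queued_item, node);

                rb_erase(n, root);
                kfree(item);
        }
}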
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d71a11d13dfa..d04db817be5c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,8 @@
48#include "transaction.h" 48#include "transaction.h"
49#include "btrfs_inode.h" 49#include "btrfs_inode.h"
50#include "print-tree.h" 50#include "print-tree.h"
51#include "hash.h"
52#include "props.h"
51#include "xattr.h" 53#include "xattr.h"
52#include "volumes.h" 54#include "volumes.h"
53#include "export.h" 55#include "export.h"
@@ -152,11 +154,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
152 vaf.fmt = fmt; 154 vaf.fmt = fmt;
153 vaf.va = &args; 155 vaf.va = &args;
154 156
155 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", 157 printk(KERN_CRIT
158 "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
156 sb->s_id, function, line, errno, errstr, &vaf); 159 sb->s_id, function, line, errno, errstr, &vaf);
157 va_end(args); 160 va_end(args);
158 } else { 161 } else {
159 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", 162 printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
160 sb->s_id, function, line, errno, errstr); 163 sb->s_id, function, line, errno, errstr);
161 } 164 }
162 165
@@ -250,7 +253,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
250 */ 253 */
251 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, 254 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
252 &root->fs_info->fs_state)) { 255 &root->fs_info->fs_state)) {
253 WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", 256 WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
254 errno); 257 errno);
255 } 258 }
256 trans->aborted = errno; 259 trans->aborted = errno;
@@ -294,8 +297,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
294 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 297 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
295 s_id, function, line, &vaf, errno, errstr); 298 s_id, function, line, &vaf, errno, errstr);
296 299
297 printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 300 btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
298 s_id, function, line, &vaf, errno, errstr); 301 function, line, &vaf, errno, errstr);
299 va_end(args); 302 va_end(args);
300 /* Caller calls BUG() */ 303 /* Caller calls BUG() */
301} 304}
@@ -322,7 +325,9 @@ enum {
322 Opt_no_space_cache, Opt_recovery, Opt_skip_balance, 325 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
323 Opt_check_integrity, Opt_check_integrity_including_extent_data, 326 Opt_check_integrity, Opt_check_integrity_including_extent_data,
324 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 327 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
325 Opt_commit_interval, 328 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
329 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
330 Opt_datasum, Opt_treelog, Opt_noinode_cache,
326 Opt_err, 331 Opt_err,
327}; 332};
328 333
@@ -332,8 +337,11 @@ static match_table_t tokens = {
332 {Opt_subvolid, "subvolid=%s"}, 337 {Opt_subvolid, "subvolid=%s"},
333 {Opt_device, "device=%s"}, 338 {Opt_device, "device=%s"},
334 {Opt_nodatasum, "nodatasum"}, 339 {Opt_nodatasum, "nodatasum"},
340 {Opt_datasum, "datasum"},
335 {Opt_nodatacow, "nodatacow"}, 341 {Opt_nodatacow, "nodatacow"},
342 {Opt_datacow, "datacow"},
336 {Opt_nobarrier, "nobarrier"}, 343 {Opt_nobarrier, "nobarrier"},
344 {Opt_barrier, "barrier"},
337 {Opt_max_inline, "max_inline=%s"}, 345 {Opt_max_inline, "max_inline=%s"},
338 {Opt_alloc_start, "alloc_start=%s"}, 346 {Opt_alloc_start, "alloc_start=%s"},
339 {Opt_thread_pool, "thread_pool=%d"}, 347 {Opt_thread_pool, "thread_pool=%d"},
@@ -344,18 +352,25 @@ static match_table_t tokens = {
344 {Opt_ssd, "ssd"}, 352 {Opt_ssd, "ssd"},
345 {Opt_ssd_spread, "ssd_spread"}, 353 {Opt_ssd_spread, "ssd_spread"},
346 {Opt_nossd, "nossd"}, 354 {Opt_nossd, "nossd"},
355 {Opt_acl, "acl"},
347 {Opt_noacl, "noacl"}, 356 {Opt_noacl, "noacl"},
348 {Opt_notreelog, "notreelog"}, 357 {Opt_notreelog, "notreelog"},
358 {Opt_treelog, "treelog"},
349 {Opt_flushoncommit, "flushoncommit"}, 359 {Opt_flushoncommit, "flushoncommit"},
360 {Opt_noflushoncommit, "noflushoncommit"},
350 {Opt_ratio, "metadata_ratio=%d"}, 361 {Opt_ratio, "metadata_ratio=%d"},
351 {Opt_discard, "discard"}, 362 {Opt_discard, "discard"},
363 {Opt_nodiscard, "nodiscard"},
352 {Opt_space_cache, "space_cache"}, 364 {Opt_space_cache, "space_cache"},
353 {Opt_clear_cache, "clear_cache"}, 365 {Opt_clear_cache, "clear_cache"},
354 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 366 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
355 {Opt_enospc_debug, "enospc_debug"}, 367 {Opt_enospc_debug, "enospc_debug"},
368 {Opt_noenospc_debug, "noenospc_debug"},
356 {Opt_subvolrootid, "subvolrootid=%d"}, 369 {Opt_subvolrootid, "subvolrootid=%d"},
357 {Opt_defrag, "autodefrag"}, 370 {Opt_defrag, "autodefrag"},
371 {Opt_nodefrag, "noautodefrag"},
358 {Opt_inode_cache, "inode_cache"}, 372 {Opt_inode_cache, "inode_cache"},
373 {Opt_noinode_cache, "noinode_cache"},
359 {Opt_no_space_cache, "nospace_cache"}, 374 {Opt_no_space_cache, "nospace_cache"},
360 {Opt_recovery, "recovery"}, 375 {Opt_recovery, "recovery"},
361 {Opt_skip_balance, "skip_balance"}, 376 {Opt_skip_balance, "skip_balance"},
@@ -368,6 +383,20 @@ static match_table_t tokens = {
368 {Opt_err, NULL}, 383 {Opt_err, NULL},
369}; 384};
370 385
386#define btrfs_set_and_info(root, opt, fmt, args...) \
387{ \
388 if (!btrfs_test_opt(root, opt)) \
389 btrfs_info(root->fs_info, fmt, ##args); \
390 btrfs_set_opt(root->fs_info->mount_opt, opt); \
391}
392
393#define btrfs_clear_and_info(root, opt, fmt, args...) \
394{ \
395 if (btrfs_test_opt(root, opt)) \
396 btrfs_info(root->fs_info, fmt, ##args); \
397 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
398}
399
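One caveat with the two helpers above: they expand to a bare brace block, which misparses when used as the sole statement of an un-braced if/else. Wrapping the body in do { } while (0) is the usual macro hygiene fix; a sketch of that variant (same body, only the wrapper changes):

#define btrfs_set_and_info(root, opt, fmt, args...)             \
do {                                                            \
        if (!btrfs_test_opt(root, opt))                         \
                btrfs_info(root->fs_info, fmt, ##args);         \
        btrfs_set_opt(root->fs_info->mount_opt, opt);           \
} while (0)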
371/* 400/*
372 * Regular mount options parser. Everything that is needed only when 401 * Regular mount options parser. Everything that is needed only when
373 * reading in a new superblock is parsed here. 402 * reading in a new superblock is parsed here.
@@ -383,6 +412,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
383 int ret = 0; 412 int ret = 0;
384 char *compress_type; 413 char *compress_type;
385 bool compress_force = false; 414 bool compress_force = false;
415 bool compress = false;
386 416
387 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 417 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
388 if (cache_gen) 418 if (cache_gen)
@@ -409,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
409 token = match_token(p, tokens, args); 439 token = match_token(p, tokens, args);
410 switch (token) { 440 switch (token) {
411 case Opt_degraded: 441 case Opt_degraded:
412 printk(KERN_INFO "btrfs: allowing degraded mounts\n"); 442 btrfs_info(root->fs_info, "allowing degraded mounts");
413 btrfs_set_opt(info->mount_opt, DEGRADED); 443 btrfs_set_opt(info->mount_opt, DEGRADED);
414 break; 444 break;
415 case Opt_subvol: 445 case Opt_subvol:
@@ -422,27 +452,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 */ 452 */
423 break; 453 break;
424 case Opt_nodatasum: 454 case Opt_nodatasum:
425 printk(KERN_INFO "btrfs: setting nodatasum\n"); 455 btrfs_set_and_info(root, NODATASUM,
426 btrfs_set_opt(info->mount_opt, NODATASUM); 456 "setting nodatasum");
457 break;
458 case Opt_datasum:
459 if (btrfs_test_opt(root, NODATASUM)) {
460 if (btrfs_test_opt(root, NODATACOW))
461 btrfs_info(root->fs_info, "setting datasum, datacow enabled");
462 else
463 btrfs_info(root->fs_info, "setting datasum");
464 }
465 btrfs_clear_opt(info->mount_opt, NODATACOW);
466 btrfs_clear_opt(info->mount_opt, NODATASUM);
427 break; 467 break;
428 case Opt_nodatacow: 468 case Opt_nodatacow:
429 if (!btrfs_test_opt(root, COMPRESS) || 469 if (!btrfs_test_opt(root, NODATACOW)) {
430 !btrfs_test_opt(root, FORCE_COMPRESS)) { 470 if (!btrfs_test_opt(root, COMPRESS) ||
431 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); 471 !btrfs_test_opt(root, FORCE_COMPRESS)) {
432 } else { 472 btrfs_info(root->fs_info,
433 printk(KERN_INFO "btrfs: setting nodatacow\n"); 473 "setting nodatacow, compression disabled");
474 } else {
475 btrfs_info(root->fs_info, "setting nodatacow");
476 }
434 } 477 }
435 btrfs_clear_opt(info->mount_opt, COMPRESS); 478 btrfs_clear_opt(info->mount_opt, COMPRESS);
436 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); 479 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
437 btrfs_set_opt(info->mount_opt, NODATACOW); 480 btrfs_set_opt(info->mount_opt, NODATACOW);
438 btrfs_set_opt(info->mount_opt, NODATASUM); 481 btrfs_set_opt(info->mount_opt, NODATASUM);
439 break; 482 break;
483 case Opt_datacow:
484 btrfs_clear_and_info(root, NODATACOW,
485 "setting datacow");
486 break;
440 case Opt_compress_force: 487 case Opt_compress_force:
441 case Opt_compress_force_type: 488 case Opt_compress_force_type:
442 compress_force = true; 489 compress_force = true;
443 /* Fallthrough */ 490 /* Fallthrough */
444 case Opt_compress: 491 case Opt_compress:
445 case Opt_compress_type: 492 case Opt_compress_type:
493 compress = true;
446 if (token == Opt_compress || 494 if (token == Opt_compress ||
447 token == Opt_compress_force || 495 token == Opt_compress_force ||
448 strcmp(args[0].from, "zlib") == 0) { 496 strcmp(args[0].from, "zlib") == 0) {
@@ -469,34 +517,36 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
469 } 517 }
470 518
471 if (compress_force) { 519 if (compress_force) {
472 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 520 btrfs_set_and_info(root, FORCE_COMPRESS,
473 pr_info("btrfs: force %s compression\n", 521 "force %s compression",
474 compress_type); 522 compress_type);
475 } else if (btrfs_test_opt(root, COMPRESS)) { 523 } else if (compress) {
476 pr_info("btrfs: use %s compression\n", 524 if (!btrfs_test_opt(root, COMPRESS))
477 compress_type); 525 btrfs_info(root->fs_info,
526 "btrfs: use %s compression\n",
527 compress_type);
478 } 528 }
479 break; 529 break;
480 case Opt_ssd: 530 case Opt_ssd:
481 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 531 btrfs_set_and_info(root, SSD,
482 btrfs_set_opt(info->mount_opt, SSD); 532 "use ssd allocation scheme");
483 break; 533 break;
484 case Opt_ssd_spread: 534 case Opt_ssd_spread:
485 printk(KERN_INFO "btrfs: use spread ssd " 535 btrfs_set_and_info(root, SSD_SPREAD,
486 "allocation scheme\n"); 536 "use spread ssd allocation scheme");
487 btrfs_set_opt(info->mount_opt, SSD);
488 btrfs_set_opt(info->mount_opt, SSD_SPREAD);
489 break; 537 break;
490 case Opt_nossd: 538 case Opt_nossd:
491 printk(KERN_INFO "btrfs: not using ssd allocation " 539 btrfs_clear_and_info(root, NOSSD,
492 "scheme\n"); 540 "not using ssd allocation scheme");
493 btrfs_set_opt(info->mount_opt, NOSSD);
494 btrfs_clear_opt(info->mount_opt, SSD); 541 btrfs_clear_opt(info->mount_opt, SSD);
495 btrfs_clear_opt(info->mount_opt, SSD_SPREAD); 542 break;
543 case Opt_barrier:
544 btrfs_clear_and_info(root, NOBARRIER,
545 "turning on barriers");
496 break; 546 break;
497 case Opt_nobarrier: 547 case Opt_nobarrier:
498 printk(KERN_INFO "btrfs: turning off barriers\n"); 548 btrfs_set_and_info(root, NOBARRIER,
499 btrfs_set_opt(info->mount_opt, NOBARRIER); 549 "turning off barriers");
500 break; 550 break;
501 case Opt_thread_pool: 551 case Opt_thread_pool:
502 ret = match_int(&args[0], &intarg); 552 ret = match_int(&args[0], &intarg);
@@ -516,11 +566,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
516 kfree(num); 566 kfree(num);
517 567
518 if (info->max_inline) { 568 if (info->max_inline) {
519 info->max_inline = max_t(u64, 569 info->max_inline = min_t(u64,
520 info->max_inline, 570 info->max_inline,
521 root->sectorsize); 571 root->sectorsize);
522 } 572 }
523 printk(KERN_INFO "btrfs: max_inline at %llu\n", 573 btrfs_info(root->fs_info, "max_inline at %llu",
524 info->max_inline); 574 info->max_inline);
525 } else { 575 } else {
526 ret = -ENOMEM; 576 ret = -ENOMEM;
@@ -534,24 +584,34 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
534 info->alloc_start = memparse(num, NULL); 584 info->alloc_start = memparse(num, NULL);
535 mutex_unlock(&info->chunk_mutex); 585 mutex_unlock(&info->chunk_mutex);
536 kfree(num); 586 kfree(num);
537 printk(KERN_INFO 587 btrfs_info(root->fs_info, "allocations start at %llu",
538 "btrfs: allocations start at %llu\n",
539 info->alloc_start); 588 info->alloc_start);
540 } else { 589 } else {
541 ret = -ENOMEM; 590 ret = -ENOMEM;
542 goto out; 591 goto out;
543 } 592 }
544 break; 593 break;
594 case Opt_acl:
595 root->fs_info->sb->s_flags |= MS_POSIXACL;
596 break;
545 case Opt_noacl: 597 case Opt_noacl:
546 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 598 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
547 break; 599 break;
548 case Opt_notreelog: 600 case Opt_notreelog:
549 printk(KERN_INFO "btrfs: disabling tree log\n"); 601 btrfs_set_and_info(root, NOTREELOG,
550 btrfs_set_opt(info->mount_opt, NOTREELOG); 602 "disabling tree log");
603 break;
604 case Opt_treelog:
605 btrfs_clear_and_info(root, NOTREELOG,
606 "enabling tree log");
551 break; 607 break;
552 case Opt_flushoncommit: 608 case Opt_flushoncommit:
553 printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); 609 btrfs_set_and_info(root, FLUSHONCOMMIT,
554 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); 610 "turning on flush-on-commit");
611 break;
612 case Opt_noflushoncommit:
613 btrfs_clear_and_info(root, FLUSHONCOMMIT,
614 "turning off flush-on-commit");
555 break; 615 break;
556 case Opt_ratio: 616 case Opt_ratio:
557 ret = match_int(&args[0], &intarg); 617 ret = match_int(&args[0], &intarg);
@@ -559,7 +619,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
559 goto out; 619 goto out;
560 } else if (intarg >= 0) { 620 } else if (intarg >= 0) {
561 info->metadata_ratio = intarg; 621 info->metadata_ratio = intarg;
562 printk(KERN_INFO "btrfs: metadata ratio %d\n", 622 btrfs_info(root->fs_info, "metadata ratio %d",
563 info->metadata_ratio); 623 info->metadata_ratio);
564 } else { 624 } else {
565 ret = -EINVAL; 625 ret = -EINVAL;
@@ -567,25 +627,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
567 } 627 }
568 break; 628 break;
569 case Opt_discard: 629 case Opt_discard:
570 btrfs_set_opt(info->mount_opt, DISCARD); 630 btrfs_set_and_info(root, DISCARD,
631 "turning on discard");
632 break;
633 case Opt_nodiscard:
634 btrfs_clear_and_info(root, DISCARD,
635 "turning off discard");
571 break; 636 break;
572 case Opt_space_cache: 637 case Opt_space_cache:
573 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 638 btrfs_set_and_info(root, SPACE_CACHE,
639 "enabling disk space caching");
574 break; 640 break;
575 case Opt_rescan_uuid_tree: 641 case Opt_rescan_uuid_tree:
576 btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); 642 btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
577 break; 643 break;
578 case Opt_no_space_cache: 644 case Opt_no_space_cache:
579 printk(KERN_INFO "btrfs: disabling disk space caching\n"); 645 btrfs_clear_and_info(root, SPACE_CACHE,
580 btrfs_clear_opt(info->mount_opt, SPACE_CACHE); 646 "disabling disk space caching");
581 break; 647 break;
582 case Opt_inode_cache: 648 case Opt_inode_cache:
583 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 649 btrfs_set_and_info(root, CHANGE_INODE_CACHE,
584 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 650 "enabling inode map caching");
651 break;
652 case Opt_noinode_cache:
653 btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
654 "disabling inode map caching");
585 break; 655 break;
586 case Opt_clear_cache: 656 case Opt_clear_cache:
587 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 657 btrfs_set_and_info(root, CLEAR_CACHE,
588 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 658 "force clearing of disk cache");
589 break; 659 break;
590 case Opt_user_subvol_rm_allowed: 660 case Opt_user_subvol_rm_allowed:
591 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 661 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
@@ -593,12 +663,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
593 case Opt_enospc_debug: 663 case Opt_enospc_debug:
594 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 664 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
595 break; 665 break;
666 case Opt_noenospc_debug:
667 btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
668 break;
596 case Opt_defrag: 669 case Opt_defrag:
597 printk(KERN_INFO "btrfs: enabling auto defrag\n"); 670 btrfs_set_and_info(root, AUTO_DEFRAG,
598 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 671 "enabling auto defrag");
672 break;
673 case Opt_nodefrag:
674 btrfs_clear_and_info(root, AUTO_DEFRAG,
675 "disabling auto defrag");
599 break; 676 break;
600 case Opt_recovery: 677 case Opt_recovery:
601 printk(KERN_INFO "btrfs: enabling auto recovery\n"); 678 btrfs_info(root->fs_info, "enabling auto recovery");
602 btrfs_set_opt(info->mount_opt, RECOVERY); 679 btrfs_set_opt(info->mount_opt, RECOVERY);
603 break; 680 break;
604 case Opt_skip_balance: 681 case Opt_skip_balance:
@@ -606,14 +683,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
606 break; 683 break;
607#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 684#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
608 case Opt_check_integrity_including_extent_data: 685 case Opt_check_integrity_including_extent_data:
609 printk(KERN_INFO "btrfs: enabling check integrity" 686 btrfs_info(root->fs_info,
610 " including extent data\n"); 687 "enabling check integrity including extent data");
611 btrfs_set_opt(info->mount_opt, 688 btrfs_set_opt(info->mount_opt,
612 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); 689 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
613 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); 690 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
614 break; 691 break;
615 case Opt_check_integrity: 692 case Opt_check_integrity:
616 printk(KERN_INFO "btrfs: enabling check integrity\n"); 693 btrfs_info(root->fs_info, "enabling check integrity");
617 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); 694 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
618 break; 695 break;
619 case Opt_check_integrity_print_mask: 696 case Opt_check_integrity_print_mask:
@@ -622,8 +699,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
622 goto out; 699 goto out;
623 } else if (intarg >= 0) { 700 } else if (intarg >= 0) {
624 info->check_integrity_print_mask = intarg; 701 info->check_integrity_print_mask = intarg;
625 printk(KERN_INFO "btrfs:" 702 btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
626 " check_integrity_print_mask 0x%x\n",
627 info->check_integrity_print_mask); 703 info->check_integrity_print_mask);
628 } else { 704 } else {
629 ret = -EINVAL; 705 ret = -EINVAL;
@@ -634,8 +710,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
634 case Opt_check_integrity_including_extent_data: 710 case Opt_check_integrity_including_extent_data:
635 case Opt_check_integrity: 711 case Opt_check_integrity:
636 case Opt_check_integrity_print_mask: 712 case Opt_check_integrity_print_mask:
637 printk(KERN_ERR "btrfs: support for check_integrity*" 713 btrfs_err(root->fs_info,
638 " not compiled in!\n"); 714 "support for check_integrity* not compiled in!");
639 ret = -EINVAL; 715 ret = -EINVAL;
640 goto out; 716 goto out;
641#endif 717#endif
@@ -655,28 +731,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
655 intarg = 0; 731 intarg = 0;
656 ret = match_int(&args[0], &intarg); 732 ret = match_int(&args[0], &intarg);
657 if (ret < 0) { 733 if (ret < 0) {
658 printk(KERN_ERR 734 btrfs_err(root->fs_info, "invalid commit interval");
659 "btrfs: invalid commit interval\n");
660 ret = -EINVAL; 735 ret = -EINVAL;
661 goto out; 736 goto out;
662 } 737 }
663 if (intarg > 0) { 738 if (intarg > 0) {
664 if (intarg > 300) { 739 if (intarg > 300) {
665 printk(KERN_WARNING 740 btrfs_warn(root->fs_info, "excessive commit interval %d",
666 "btrfs: excessive commit interval %d\n",
667 intarg); 741 intarg);
668 } 742 }
669 info->commit_interval = intarg; 743 info->commit_interval = intarg;
670 } else { 744 } else {
671 printk(KERN_INFO 745 btrfs_info(root->fs_info, "using default commit interval %ds",
672 "btrfs: using default commit interval %ds\n",
673 BTRFS_DEFAULT_COMMIT_INTERVAL); 746 BTRFS_DEFAULT_COMMIT_INTERVAL);
674 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 747 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
675 } 748 }
676 break; 749 break;
677 case Opt_err: 750 case Opt_err:
678 printk(KERN_INFO "btrfs: unrecognized mount option " 751 btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
679 "'%s'\n", p);
680 ret = -EINVAL; 752 ret = -EINVAL;
681 goto out; 753 goto out;
682 default: 754 default:
@@ -685,7 +757,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
685 } 757 }
686out: 758out:
687 if (!ret && btrfs_test_opt(root, SPACE_CACHE)) 759 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
688 printk(KERN_INFO "btrfs: disk space caching is enabled\n"); 760 btrfs_info(root->fs_info, "disk space caching is enabled");
689 kfree(orig); 761 kfree(orig);
690 return ret; 762 return ret;
691} 763}
@@ -748,7 +820,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
748 break; 820 break;
749 case Opt_subvolrootid: 821 case Opt_subvolrootid:
750 printk(KERN_WARNING 822 printk(KERN_WARNING
751 "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); 823 "BTRFS: 'subvolrootid' mount option is deprecated and has "
824 "no effect\n");
752 break; 825 break;
753 case Opt_device: 826 case Opt_device:
754 device_name = match_strdup(&args[0]); 827 device_name = match_strdup(&args[0]);
@@ -782,6 +855,7 @@ static struct dentry *get_default_root(struct super_block *sb,
782 struct btrfs_path *path; 855 struct btrfs_path *path;
783 struct btrfs_key location; 856 struct btrfs_key location;
784 struct inode *inode; 857 struct inode *inode;
858 struct dentry *dentry;
785 u64 dir_id; 859 u64 dir_id;
786 int new = 0; 860 int new = 0;
787 861
@@ -852,7 +926,13 @@ setup_root:
852 return dget(sb->s_root); 926 return dget(sb->s_root);
853 } 927 }
854 928
855 return d_obtain_alias(inode); 929 dentry = d_obtain_alias(inode);
930 if (!IS_ERR(dentry)) {
931 spin_lock(&dentry->d_lock);
932 dentry->d_flags &= ~DCACHE_DISCONNECTED;
933 spin_unlock(&dentry->d_lock);
934 }
935 return dentry;
856} 936}
857 937
858static int btrfs_fill_super(struct super_block *sb, 938static int btrfs_fill_super(struct super_block *sb,
@@ -877,7 +957,7 @@ static int btrfs_fill_super(struct super_block *sb,
877 sb->s_flags |= MS_I_VERSION; 957 sb->s_flags |= MS_I_VERSION;
878 err = open_ctree(sb, fs_devices, (char *)data); 958 err = open_ctree(sb, fs_devices, (char *)data);
879 if (err) { 959 if (err) {
880 printk("btrfs: open_ctree failed\n"); 960 printk(KERN_ERR "BTRFS: open_ctree failed\n");
881 return err; 961 return err;
882 } 962 }
883 963
@@ -1115,7 +1195,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1115 dput(root); 1195 dput(root);
1116 root = ERR_PTR(-EINVAL); 1196 root = ERR_PTR(-EINVAL);
1117 deactivate_locked_super(s); 1197 deactivate_locked_super(s);
1118 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 1198 printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
1119 subvol_name); 1199 subvol_name);
1120 } 1200 }
1121 1201
@@ -1240,7 +1320,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1240 1320
1241 fs_info->thread_pool_size = new_pool_size; 1321 fs_info->thread_pool_size = new_pool_size;
1242 1322
1243 printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", 1323 btrfs_info(fs_info, "resize thread pool %d -> %d",
1244 old_pool_size, new_pool_size); 1324 old_pool_size, new_pool_size);
1245 1325
1246 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
@@ -1346,7 +1426,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1346 } else { 1426 } else {
1347 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { 1427 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1348 btrfs_err(fs_info, 1428 btrfs_err(fs_info,
1349 "Remounting read-write after error is not allowed\n"); 1429 "Remounting read-write after error is not allowed");
1350 ret = -EINVAL; 1430 ret = -EINVAL;
1351 goto restore; 1431 goto restore;
1352 } 1432 }
@@ -1358,8 +1438,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1358 if (fs_info->fs_devices->missing_devices > 1438 if (fs_info->fs_devices->missing_devices >
1359 fs_info->num_tolerated_disk_barrier_failures && 1439 fs_info->num_tolerated_disk_barrier_failures &&
1360 !(*flags & MS_RDONLY)) { 1440 !(*flags & MS_RDONLY)) {
1361 printk(KERN_WARNING 1441 btrfs_warn(fs_info,
1362 "Btrfs: too many missing devices, writeable remount is not allowed\n"); 1442 "too many missing devices, writeable remount is not allowed");
1363 ret = -EACCES; 1443 ret = -EACCES;
1364 goto restore; 1444 goto restore;
1365 } 1445 }
@@ -1384,16 +1464,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1384 1464
1385 ret = btrfs_resume_dev_replace_async(fs_info); 1465 ret = btrfs_resume_dev_replace_async(fs_info);
1386 if (ret) { 1466 if (ret) {
1387 pr_warn("btrfs: failed to resume dev_replace\n"); 1467 btrfs_warn(fs_info, "failed to resume dev_replace");
1388 goto restore; 1468 goto restore;
1389 } 1469 }
1390 1470
1391 if (!fs_info->uuid_root) { 1471 if (!fs_info->uuid_root) {
1392 pr_info("btrfs: creating UUID tree\n"); 1472 btrfs_info(fs_info, "creating UUID tree");
1393 ret = btrfs_create_uuid_tree(fs_info); 1473 ret = btrfs_create_uuid_tree(fs_info);
1394 if (ret) { 1474 if (ret) {
1395 pr_warn("btrfs: failed to create the uuid tree" 1475 btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
1396 "%d\n", ret);
1397 goto restore; 1476 goto restore;
1398 } 1477 }
1399 } 1478 }
@@ -1773,7 +1852,7 @@ static int btrfs_interface_init(void)
1773static void btrfs_interface_exit(void) 1852static void btrfs_interface_exit(void)
1774{ 1853{
1775 if (misc_deregister(&btrfs_misc) < 0) 1854 if (misc_deregister(&btrfs_misc) < 0)
1776 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1855 printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
1777} 1856}
1778 1857
1779static void btrfs_print_info(void) 1858static void btrfs_print_info(void)
@@ -1818,10 +1897,16 @@ static int __init init_btrfs_fs(void)
1818{ 1897{
1819 int err; 1898 int err;
1820 1899
1821 err = btrfs_init_sysfs(); 1900 err = btrfs_hash_init();
1822 if (err) 1901 if (err)
1823 return err; 1902 return err;
1824 1903
1904 btrfs_props_init();
1905
1906 err = btrfs_init_sysfs();
1907 if (err)
1908 goto free_hash;
1909
1825 btrfs_init_compress(); 1910 btrfs_init_compress();
1826 1911
1827 err = btrfs_init_cachep(); 1912 err = btrfs_init_cachep();
@@ -1895,6 +1980,8 @@ free_cachep:
1895free_compress: 1980free_compress:
1896 btrfs_exit_compress(); 1981 btrfs_exit_compress();
1897 btrfs_exit_sysfs(); 1982 btrfs_exit_sysfs();
1983free_hash:
1984 btrfs_hash_exit();
1898 return err; 1985 return err;
1899} 1986}
1900 1987
@@ -1913,9 +2000,10 @@ static void __exit exit_btrfs_fs(void)
1913 btrfs_exit_sysfs(); 2000 btrfs_exit_sysfs();
1914 btrfs_cleanup_fs_uuids(); 2001 btrfs_cleanup_fs_uuids();
1915 btrfs_exit_compress(); 2002 btrfs_exit_compress();
2003 btrfs_hash_exit();
1916} 2004}
1917 2005
1918module_init(init_btrfs_fs) 2006late_initcall(init_btrfs_fs);
1919module_exit(exit_btrfs_fs) 2007module_exit(exit_btrfs_fs)
1920 2008
1921MODULE_LICENSE("GPL"); 2009MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5b326cd60a4a..865f4cf9a769 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,24 +22,647 @@
22#include <linux/completion.h> 22#include <linux/completion.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h>
26#include <linux/genhd.h>
25 27
26#include "ctree.h" 28#include "ctree.h"
27#include "disk-io.h" 29#include "disk-io.h"
28#include "transaction.h" 30#include "transaction.h"
31#include "sysfs.h"
32#include "volumes.h"
33
34static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
35
36static u64 get_features(struct btrfs_fs_info *fs_info,
37 enum btrfs_feature_set set)
38{
39 struct btrfs_super_block *disk_super = fs_info->super_copy;
40 if (set == FEAT_COMPAT)
41 return btrfs_super_compat_flags(disk_super);
42 else if (set == FEAT_COMPAT_RO)
43 return btrfs_super_compat_ro_flags(disk_super);
44 else
45 return btrfs_super_incompat_flags(disk_super);
46}
47
48static void set_features(struct btrfs_fs_info *fs_info,
49 enum btrfs_feature_set set, u64 features)
50{
51 struct btrfs_super_block *disk_super = fs_info->super_copy;
52 if (set == FEAT_COMPAT)
53 btrfs_set_super_compat_flags(disk_super, features);
54 else if (set == FEAT_COMPAT_RO)
55 btrfs_set_super_compat_ro_flags(disk_super, features);
56 else
57 btrfs_set_super_incompat_flags(disk_super, features);
58}
59
60static int can_modify_feature(struct btrfs_feature_attr *fa)
61{
62 int val = 0;
63 u64 set, clear;
64 switch (fa->feature_set) {
65 case FEAT_COMPAT:
66 set = BTRFS_FEATURE_COMPAT_SAFE_SET;
67 clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
68 break;
69 case FEAT_COMPAT_RO:
70 set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
71 clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
72 break;
73 case FEAT_INCOMPAT:
74 set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
75 clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
76 break;
77 default:
78 printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
79 fa->feature_set);
80 return 0;
81 }
82
83 if (set & fa->feature_bit)
84 val |= 1;
85 if (clear & fa->feature_bit)
86 val |= 2;
87
88 return val;
89}
90
91static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
92 struct kobj_attribute *a, char *buf)
93{
94 int val = 0;
95 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
96 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
97 if (fs_info) {
98 u64 features = get_features(fs_info, fa->feature_set);
99 if (features & fa->feature_bit)
100 val = 1;
101 } else
102 val = can_modify_feature(fa);
103
104 return snprintf(buf, PAGE_SIZE, "%d\n", val);
105}
106
107static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
108 struct kobj_attribute *a,
109 const char *buf, size_t count)
110{
111 struct btrfs_fs_info *fs_info;
112 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
113 struct btrfs_trans_handle *trans;
114 u64 features, set, clear;
115 unsigned long val;
116 int ret;
117
118 fs_info = to_fs_info(kobj);
119 if (!fs_info)
120 return -EPERM;
121
122 ret = kstrtoul(skip_spaces(buf), 0, &val);
123 if (ret)
124 return ret;
125
126 if (fa->feature_set == FEAT_COMPAT) {
127 set = BTRFS_FEATURE_COMPAT_SAFE_SET;
128 clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
129 } else if (fa->feature_set == FEAT_COMPAT_RO) {
130 set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
131 clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
132 } else {
133 set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
134 clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
135 }
136
137 features = get_features(fs_info, fa->feature_set);
138
139 /* Nothing to do */
140 if ((val && (features & fa->feature_bit)) ||
141 (!val && !(features & fa->feature_bit)))
142 return count;
143
144 if ((val && !(set & fa->feature_bit)) ||
145 (!val && !(clear & fa->feature_bit))) {
146 btrfs_info(fs_info,
147 "%sabling feature %s on mounted fs is not supported.",
148 val ? "En" : "Dis", fa->kobj_attr.attr.name);
149 return -EPERM;
150 }
151
152 btrfs_info(fs_info, "%s %s feature flag",
153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
154
155 trans = btrfs_start_transaction(fs_info->fs_root, 0);
156 if (IS_ERR(trans))
157 return PTR_ERR(trans);
158
159 spin_lock(&fs_info->super_lock);
160 features = get_features(fs_info, fa->feature_set);
161 if (val)
162 features |= fa->feature_bit;
163 else
164 features &= ~fa->feature_bit;
165 set_features(fs_info, fa->feature_set, features);
166 spin_unlock(&fs_info->super_lock);
167
168 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
169 if (ret)
170 return ret;
171
172 return count;
173}
174
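With the store hook above in place, a feature bit can be flipped on a mounted filesystem by writing "0" or "1" to its file under /sys/fs/btrfs/<fsid>/features/, subject to the SAFE_SET/SAFE_CLEAR masks. A userspace sketch of such a write (the fsid and the feature name are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* placeholder path; substitute the filesystem's UUID */
        const char *path =
                "/sys/fs/btrfs/<fsid>/features/extended_iref";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}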
175static umode_t btrfs_feature_visible(struct kobject *kobj,
176 struct attribute *attr, int unused)
177{
178 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
179 umode_t mode = attr->mode;
180
181 if (fs_info) {
182 struct btrfs_feature_attr *fa;
183 u64 features;
184
185 fa = attr_to_btrfs_feature_attr(attr);
186 features = get_features(fs_info, fa->feature_set);
187
188 if (can_modify_feature(fa))
189 mode |= S_IWUSR;
190 else if (!(features & fa->feature_bit))
191 mode = 0;
192 }
193
194 return mode;
195}
196
197BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
198BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
199BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
200BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
201BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
202BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
203BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
204BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
205BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
206
207static struct attribute *btrfs_supported_feature_attrs[] = {
208 BTRFS_FEAT_ATTR_PTR(mixed_backref),
209 BTRFS_FEAT_ATTR_PTR(default_subvol),
210 BTRFS_FEAT_ATTR_PTR(mixed_groups),
211 BTRFS_FEAT_ATTR_PTR(compress_lzo),
212 BTRFS_FEAT_ATTR_PTR(big_metadata),
213 BTRFS_FEAT_ATTR_PTR(extended_iref),
214 BTRFS_FEAT_ATTR_PTR(raid56),
215 BTRFS_FEAT_ATTR_PTR(skinny_metadata),
216 BTRFS_FEAT_ATTR_PTR(no_holes),
217 NULL
218};
219
220static const struct attribute_group btrfs_feature_attr_group = {
221 .name = "features",
222 .is_visible = btrfs_feature_visible,
223 .attrs = btrfs_supported_feature_attrs,
224};
225
226static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
227{
228 u64 val;
229 if (lock)
230 spin_lock(lock);
231 val = *value_ptr;
232 if (lock)
233 spin_unlock(lock);
234 return snprintf(buf, PAGE_SIZE, "%llu\n", val);
235}
236
237static ssize_t global_rsv_size_show(struct kobject *kobj,
238 struct kobj_attribute *ka, char *buf)
239{
240 struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
241 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
242 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
243}
244BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
245
246static ssize_t global_rsv_reserved_show(struct kobject *kobj,
247 struct kobj_attribute *a, char *buf)
248{
249 struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
250 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
251 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
252}
253BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
254
255#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
256
257static ssize_t raid_bytes_show(struct kobject *kobj,
258 struct kobj_attribute *attr, char *buf);
259BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
260BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
261
262static ssize_t raid_bytes_show(struct kobject *kobj,
263 struct kobj_attribute *attr, char *buf)
264
265{
266 struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
267 struct btrfs_block_group_cache *block_group;
268 int index = kobj - sinfo->block_group_kobjs;
269 u64 val = 0;
270
271 down_read(&sinfo->groups_sem);
272 list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
273 if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
274 val += block_group->key.offset;
275 else
276 val += btrfs_block_group_used(&block_group->item);
277 }
278 up_read(&sinfo->groups_sem);
279 return snprintf(buf, PAGE_SIZE, "%llu\n", val);
280}
281
282static struct attribute *raid_attributes[] = {
283 BTRFS_RAID_ATTR_PTR(total_bytes),
284 BTRFS_RAID_ATTR_PTR(used_bytes),
285 NULL
286};
287
288static void release_raid_kobj(struct kobject *kobj)
289{
290 kobject_put(kobj->parent);
291}
292
293struct kobj_type btrfs_raid_ktype = {
294 .sysfs_ops = &kobj_sysfs_ops,
295 .release = release_raid_kobj,
296 .default_attrs = raid_attributes,
297};
298
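raid_bytes_show() above figures out which RAID profile it was called for with nothing but pointer arithmetic: the per-profile kobjects are an array embedded in the space_info, so subtracting the array base from the passed kobject yields the profile index. A standalone illustration with stand-in types:

#include <stdio.h>

struct fake_kobject { int unused; };

struct fake_space_info {
        struct fake_kobject block_group_kobjs[4];
};

/* The difference between an element pointer and the array base is the
 * element index, by C pointer arithmetic. */
static int raid_index(struct fake_space_info *sinfo,
                      struct fake_kobject *kobj)
{
        return (int)(kobj - sinfo->block_group_kobjs);
}

int main(void)
{
        struct fake_space_info s;

        printf("%d\n", raid_index(&s, &s.block_group_kobjs[2])); /* 2 */
        return 0;
}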
299#define SPACE_INFO_ATTR(field) \
300static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
301 struct kobj_attribute *a, \
302 char *buf) \
303{ \
304 struct btrfs_space_info *sinfo = to_space_info(kobj); \
305 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
306} \
307BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
308
309static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
310 struct kobj_attribute *a,
311 char *buf)
312{
313 struct btrfs_space_info *sinfo = to_space_info(kobj);
314 s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
315 return snprintf(buf, PAGE_SIZE, "%lld\n", val);
316}
317
318SPACE_INFO_ATTR(flags);
319SPACE_INFO_ATTR(total_bytes);
320SPACE_INFO_ATTR(bytes_used);
321SPACE_INFO_ATTR(bytes_pinned);
322SPACE_INFO_ATTR(bytes_reserved);
323SPACE_INFO_ATTR(bytes_may_use);
324SPACE_INFO_ATTR(disk_used);
325SPACE_INFO_ATTR(disk_total);
326BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
327
328static struct attribute *space_info_attrs[] = {
329 BTRFS_ATTR_PTR(flags),
330 BTRFS_ATTR_PTR(total_bytes),
331 BTRFS_ATTR_PTR(bytes_used),
332 BTRFS_ATTR_PTR(bytes_pinned),
333 BTRFS_ATTR_PTR(bytes_reserved),
334 BTRFS_ATTR_PTR(bytes_may_use),
335 BTRFS_ATTR_PTR(disk_used),
336 BTRFS_ATTR_PTR(disk_total),
337 BTRFS_ATTR_PTR(total_bytes_pinned),
338 NULL,
339};
340
341static void space_info_release(struct kobject *kobj)
342{
343 struct btrfs_space_info *sinfo = to_space_info(kobj);
344 percpu_counter_destroy(&sinfo->total_bytes_pinned);
345 kfree(sinfo);
346}
347
348struct kobj_type space_info_ktype = {
349 .sysfs_ops = &kobj_sysfs_ops,
350 .release = space_info_release,
351 .default_attrs = space_info_attrs,
352};
353
354static const struct attribute *allocation_attrs[] = {
355 BTRFS_ATTR_PTR(global_rsv_reserved),
356 BTRFS_ATTR_PTR(global_rsv_size),
357 NULL,
358};
359
360static ssize_t btrfs_label_show(struct kobject *kobj,
361 struct kobj_attribute *a, char *buf)
362{
363 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
364 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
365}
366
367static ssize_t btrfs_label_store(struct kobject *kobj,
368 struct kobj_attribute *a,
369 const char *buf, size_t len)
370{
371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
372 struct btrfs_trans_handle *trans;
373 struct btrfs_root *root = fs_info->fs_root;
374 int ret;
375
376 if (len >= BTRFS_LABEL_SIZE) {
377 pr_err("BTRFS: unable to set label with more than %d bytes\n",
378 BTRFS_LABEL_SIZE - 1);
379 return -EINVAL;
380 }
381
382 trans = btrfs_start_transaction(root, 0);
383 if (IS_ERR(trans))
384 return PTR_ERR(trans);
385
386 spin_lock(&root->fs_info->super_lock);
387 strcpy(fs_info->super_copy->label, buf);
388 spin_unlock(&root->fs_info->super_lock);
389 ret = btrfs_commit_transaction(trans, root);
390
391 if (!ret)
392 return len;
393
394 return ret;
395}
396BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
397
398static struct attribute *btrfs_attrs[] = {
399 BTRFS_ATTR_PTR(label),
400 NULL,
401};
402
403static void btrfs_release_super_kobj(struct kobject *kobj)
404{
405 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
406 complete(&fs_info->kobj_unregister);
407}
408
409static struct kobj_type btrfs_ktype = {
410 .sysfs_ops = &kobj_sysfs_ops,
411 .release = btrfs_release_super_kobj,
412 .default_attrs = btrfs_attrs,
413};
414
415static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
416{
417 if (kobj->ktype != &btrfs_ktype)
418 return NULL;
419 return container_of(kobj, struct btrfs_fs_info, super_kobj);
420}
421
422#define NUM_FEATURE_BITS 64
423static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
424static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
425
426static u64 supported_feature_masks[3] = {
427 [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
428 [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
429 [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
430};
431
432static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
433{
434 int set;
435
436 for (set = 0; set < FEAT_MAX; set++) {
437 int i;
438 struct attribute *attrs[2];
439 struct attribute_group agroup = {
440 .name = "features",
441 .attrs = attrs,
442 };
443 u64 features = get_features(fs_info, set);
444 features &= ~supported_feature_masks[set];
445
446 if (!features)
447 continue;
448
449 attrs[1] = NULL;
450 for (i = 0; i < NUM_FEATURE_BITS; i++) {
451 struct btrfs_feature_attr *fa;
452
453 if (!(features & (1ULL << i)))
454 continue;
455
456 fa = &btrfs_feature_attrs[set][i];
457 attrs[0] = &fa->kobj_attr.attr;
458 if (add) {
459 int ret;
460 ret = sysfs_merge_group(&fs_info->super_kobj,
461 &agroup);
462 if (ret)
463 return ret;
464 } else
465 sysfs_unmerge_group(&fs_info->super_kobj,
466 &agroup);
467 }
468
469 }
470 return 0;
471}
472
473static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
474{
475 kobject_del(&fs_info->super_kobj);
476 kobject_put(&fs_info->super_kobj);
477 wait_for_completion(&fs_info->kobj_unregister);
478}
479
480void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
481{
482 if (fs_info->space_info_kobj) {
483 sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
484 kobject_del(fs_info->space_info_kobj);
485 kobject_put(fs_info->space_info_kobj);
486 }
487 kobject_del(fs_info->device_dir_kobj);
488 kobject_put(fs_info->device_dir_kobj);
489 addrm_unknown_feature_attrs(fs_info, false);
490 sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
491 __btrfs_sysfs_remove_one(fs_info);
492}
493
494const char * const btrfs_feature_set_names[3] = {
495 [FEAT_COMPAT] = "compat",
496 [FEAT_COMPAT_RO] = "compat_ro",
497 [FEAT_INCOMPAT] = "incompat",
498};
499
500char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
501{
502 size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
503 int len = 0;
504 int i;
505 char *str;
506
507 str = kmalloc(bufsize, GFP_KERNEL);
508 if (!str)
509 return str;
510
511 for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
512 const char *name;
513
514 if (!(flags & (1ULL << i)))
515 continue;
516
517 name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
518 len += snprintf(str + len, bufsize - len, "%s%s",
519 len ? "," : "", name);
520 }
521
522 return str;
523}
524
525static void init_feature_attrs(void)
526{
527 struct btrfs_feature_attr *fa;
528 int set, i;
529
530 BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
531 ARRAY_SIZE(btrfs_feature_attrs));
532 BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
533 ARRAY_SIZE(btrfs_feature_attrs[0]));
534
535 memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
536 memset(btrfs_unknown_feature_names, 0,
537 sizeof(btrfs_unknown_feature_names));
538
539 for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
540 struct btrfs_feature_attr *sfa;
541 struct attribute *a = btrfs_supported_feature_attrs[i];
542 int bit;
543 sfa = attr_to_btrfs_feature_attr(a);
544 bit = ilog2(sfa->feature_bit);
545 fa = &btrfs_feature_attrs[sfa->feature_set][bit];
546
547 fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
548 }
549
550 for (set = 0; set < FEAT_MAX; set++) {
551 for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
552 char *name = btrfs_unknown_feature_names[set][i];
553 fa = &btrfs_feature_attrs[set][i];
554
555 if (fa->kobj_attr.attr.name)
556 continue;
557
558 snprintf(name, 13, "%s:%u",
559 btrfs_feature_set_names[set], i);
560
561 fa->kobj_attr.attr.name = name;
562 fa->kobj_attr.attr.mode = S_IRUGO;
563 fa->feature_set = set;
564 fa->feature_bit = 1ULL << i;
565 }
566 }
567}
568
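The fallback loop above names every unrecognized bit "<set>:<bit>", e.g. "incompat:63". The 13-byte entries in btrfs_unknown_feature_names cover the worst case: "compat_ro" (9 characters) plus ':', two digits, and the terminating NUL. A quick compile-and-run check of that bound:

#include <stdio.h>

int main(void)
{
        char name[13];
        int n = snprintf(name, sizeof(name), "%s:%u", "compat_ro", 63u);

        /* 9 + 1 + 2 = 12 characters, fitting exactly with the NUL */
        printf("%s (%d chars)\n", name, n); /* compat_ro:63 (12 chars) */
        return 0;
}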
569static int add_device_membership(struct btrfs_fs_info *fs_info)
570{
571 int error = 0;
572 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
573 struct btrfs_device *dev;
574
575 fs_info->device_dir_kobj = kobject_create_and_add("devices",
576 &fs_info->super_kobj);
577 if (!fs_info->device_dir_kobj)
578 return -ENOMEM;
579
580 list_for_each_entry(dev, &fs_devices->devices, dev_list) {
581 struct hd_struct *disk;
582 struct kobject *disk_kobj;
583
584 if (!dev->bdev)
585 continue;
586
587 disk = dev->bdev->bd_part;
588 disk_kobj = &part_to_dev(disk)->kobj;
589
590 error = sysfs_create_link(fs_info->device_dir_kobj,
591 disk_kobj, disk_kobj->name);
592 if (error)
593 break;
594 }
595
596 return error;
597}
29 598
30/* /sys/fs/btrfs/ entry */ 599/* /sys/fs/btrfs/ entry */
31static struct kset *btrfs_kset; 600static struct kset *btrfs_kset;
32 601
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{
604 int error;
605
606 init_completion(&fs_info->kobj_unregister);
607 fs_info->super_kobj.kset = btrfs_kset;
608 error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
609 "%pU", fs_info->fsid);
610 if (error)
611 return error;
612
613 error = sysfs_create_group(&fs_info->super_kobj,
614 &btrfs_feature_attr_group);
615 if (error) {
616 __btrfs_sysfs_remove_one(fs_info);
617 return error;
618 }
619
620 error = addrm_unknown_feature_attrs(fs_info, true);
621 if (error)
622 goto failure;
623
624 error = add_device_membership(fs_info);
625 if (error)
626 goto failure;
627
628 fs_info->space_info_kobj = kobject_create_and_add("allocation",
629 &fs_info->super_kobj);
630 if (!fs_info->space_info_kobj) {
631 error = -ENOMEM;
632 goto failure;
633 }
634
635 error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
636 if (error)
637 goto failure;
638
639 return 0;
640failure:
641 btrfs_sysfs_remove_one(fs_info);
642 return error;
643}
644
33int btrfs_init_sysfs(void) 645int btrfs_init_sysfs(void)
34{ 646{
647 int ret;
35 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
36 if (!btrfs_kset) 649 if (!btrfs_kset)
37 return -ENOMEM; 650 return -ENOMEM;
651
652 init_feature_attrs();
653
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659
38 return 0; 660 return 0;
39} 661}
40 662
41void btrfs_exit_sysfs(void) 663void btrfs_exit_sysfs(void)
42{ 664{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
43 kset_unregister(btrfs_kset); 666 kset_unregister(btrfs_kset);
44} 667}
45 668
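
btrfs_init_sysfs()/btrfs_exit_sysfs() follow the common kset-plus-group pattern: create the kset, add the attribute group, and unwind in reverse order on failure. A self-contained sketch of the same pattern (all names illustrative):

	#include <linux/fs.h>
	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	static struct kset *example_kset;
	static struct attribute *example_attrs[] = { NULL };
	static struct attribute_group example_group = { .attrs = example_attrs };

	static int example_init_sysfs(void)
	{
		int ret;

		example_kset = kset_create_and_add("example", NULL, fs_kobj);
		if (!example_kset)
			return -ENOMEM;

		ret = sysfs_create_group(&example_kset->kobj, &example_group);
		if (ret)
			kset_unregister(example_kset);	/* unwind on failure */
		return ret;
	}

	static void example_exit_sysfs(void)
	{
		sysfs_remove_group(&example_kset->kobj, &example_group);
		kset_unregister(example_kset);
	}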
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 000000000000..f3cea3710d44
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,64 @@
1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_
3
4enum btrfs_feature_set {
5 FEAT_COMPAT,
6 FEAT_COMPAT_RO,
7 FEAT_INCOMPAT,
8 FEAT_MAX
9};
10
11#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
12{ \
13 .attr = { .name = __stringify(_name), .mode = _mode }, \
14 .show = _show, \
15 .store = _store, \
16}
17
18#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
19static struct kobj_attribute btrfs_attr_##_name = \
20 __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
21#define BTRFS_ATTR(_name, _mode, _show) \
22 BTRFS_ATTR_RW(_name, _mode, _show, NULL)
23#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
24
25#define BTRFS_RAID_ATTR(_name, _show) \
26static struct kobj_attribute btrfs_raid_attr_##_name = \
27 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
28#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
29
30
31struct btrfs_feature_attr {
32 struct kobj_attribute kobj_attr;
33 enum btrfs_feature_set feature_set;
34 u64 feature_bit;
35};
36
37#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
38static struct btrfs_feature_attr btrfs_attr_##_name = { \
39 .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
40 btrfs_feature_attr_show, \
41 btrfs_feature_attr_store), \
42 .feature_set = _feature_set, \
43 .feature_bit = _prefix ##_## _feature_bit, \
44}
45#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
46
47#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
48 BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
49#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
50	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
51#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
52 BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
53
54/* convert from attribute */
55#define to_btrfs_feature_attr(a) \
56 container_of(a, struct btrfs_feature_attr, kobj_attr)
57#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
58#define attr_to_btrfs_feature_attr(a) \
59 to_btrfs_feature_attr(attr_to_btrfs_attr(a))
60char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
61extern const char * const btrfs_feature_set_names[3];
62extern struct kobj_type space_info_ktype;
63extern struct kobj_type btrfs_raid_ktype;
64#endif /* _BTRFS_SYSFS_H_ */
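
Declaring and exposing a feature attribute with these macros might look like the sketch below; MY_FEATURE is a hypothetical feature bit, the real users live in sysfs.c:

	/* defines btrfs_attr_my_feature whose .feature_bit expands to
	 * BTRFS_FEATURE_INCOMPAT_MY_FEATURE (hypothetical) */
	BTRFS_FEAT_ATTR_INCOMPAT(my_feature, MY_FEATURE);

	static struct attribute *example_feature_attrs[] = {
		BTRFS_FEAT_ATTR_PTR(my_feature),
		NULL,
	};

	static const struct attribute_group example_feature_group = {
		.attrs = example_feature_attrs,
	};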
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index b353bc806ca0..312560a9123d 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -21,7 +21,7 @@
21 21
22#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 22#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
23 23
24#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) 24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
25 25
26int btrfs_test_free_space_cache(void); 26int btrfs_test_free_space_cache(void);
27int btrfs_test_extent_buffer_operations(void); 27int btrfs_test_extent_buffer_operations(void);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 6fc82010dc15..c8d9ddf84c69 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
101 101
102 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 102 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
103 if (ret) { 103 if (ret) {
104 test_msg("Error removing middle peice %d\n", ret); 104 test_msg("Error removing middle piece %d\n", ret);
105 return ret; 105 return ret;
106 } 106 }
107 107
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
266 } 266 }
267 267
268 if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 268 if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
269 test_msg("Left over peices after removing overlapping\n"); 269 test_msg("Left over pieces after removing overlapping\n");
270 return -1; 270 return -1;
271 } 271 }
272 272
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c6a872a8a468..34cd83184c4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -62,7 +62,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
62 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
63 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
64 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
65 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
66 while (!list_empty(&transaction->pending_chunks)) { 66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em; 67 struct extent_map *em;
68 68
@@ -183,8 +183,8 @@ loop:
183 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
184 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
185 185
186 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.href_root = RB_ROOT;
187 cur_trans->delayed_refs.num_entries = 0; 187 atomic_set(&cur_trans->delayed_refs.num_entries, 0);
188 cur_trans->delayed_refs.num_heads_ready = 0; 188 cur_trans->delayed_refs.num_heads_ready = 0;
189 cur_trans->delayed_refs.num_heads = 0; 189 cur_trans->delayed_refs.num_heads = 0;
190 cur_trans->delayed_refs.flushing = 0; 190 cur_trans->delayed_refs.flushing = 0;
@@ -196,17 +196,14 @@ loop:
196 */ 196 */
197 smp_mb(); 197 smp_mb();
198 if (!list_empty(&fs_info->tree_mod_seq_list)) 198 if (!list_empty(&fs_info->tree_mod_seq_list))
199 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " 199 WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
200 "creating a fresh transaction\n"); 200 "creating a fresh transaction\n");
201 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 201 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
202 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 202 WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
203 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
204 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
205 205
206 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
209 init_waitqueue_head(&cur_trans->delayed_refs.wait);
210 207
211 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 208 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
212 INIT_LIST_HEAD(&cur_trans->ordered_operations); 209 INIT_LIST_HEAD(&cur_trans->ordered_operations);
@@ -472,6 +469,7 @@ again:
472 h->type = type; 469 h->type = type;
473 h->allocating_chunk = false; 470 h->allocating_chunk = false;
474 h->reloc_reserved = false; 471 h->reloc_reserved = false;
472 h->sync = false;
475 INIT_LIST_HEAD(&h->qgroup_ref_list); 473 INIT_LIST_HEAD(&h->qgroup_ref_list);
476 INIT_LIST_HEAD(&h->new_bgs); 474 INIT_LIST_HEAD(&h->new_bgs);
477 475
@@ -647,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
647 struct btrfs_root *root) 645 struct btrfs_root *root)
648{ 646{
649 if (root->fs_info->global_block_rsv.space_info->full && 647 if (root->fs_info->global_block_rsv.space_info->full &&
650 btrfs_should_throttle_delayed_refs(trans, root)) 648 btrfs_check_space_for_delayed_refs(trans, root))
651 return 1; 649 return 1;
652 650
653 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 651 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -711,8 +709,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
711 btrfs_create_pending_block_groups(trans, root); 709 btrfs_create_pending_block_groups(trans, root);
712 710
713 trans->delayed_ref_updates = 0; 711 trans->delayed_ref_updates = 0;
714 if (btrfs_should_throttle_delayed_refs(trans, root)) { 712 if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
715 cur = max_t(unsigned long, cur, 1); 713 cur = max_t(unsigned long, cur, 32);
716 trans->delayed_ref_updates = 0; 714 trans->delayed_ref_updates = 0;
717 btrfs_run_delayed_refs(trans, root, cur); 715 btrfs_run_delayed_refs(trans, root, cur);
718 } 716 }
@@ -788,12 +786,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
788 return __btrfs_end_transaction(trans, root, 1); 786 return __btrfs_end_transaction(trans, root, 1);
789} 787}
790 788
791int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
792 struct btrfs_root *root)
793{
794 return __btrfs_end_transaction(trans, root, 1);
795}
796
797/* 789/*
798 * when btree blocks are allocated, they have some corresponding bits set for 790 * when btree blocks are allocated, they have some corresponding bits set for
799 * them in one of two extent_io trees. This is used to make sure all of 791 * them in one of two extent_io trees. This is used to make sure all of
@@ -1105,7 +1097,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1105 break; 1097 break;
1106 1098
1107 if (btrfs_defrag_cancelled(root->fs_info)) { 1099 if (btrfs_defrag_cancelled(root->fs_info)) {
1108 printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); 1100 pr_debug("BTRFS: defrag_root cancelled\n");
1109 ret = -EAGAIN; 1101 ret = -EAGAIN;
1110 break; 1102 break;
1111 } 1103 }
@@ -1746,6 +1738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1746 goto cleanup_transaction; 1738 goto cleanup_transaction;
1747 1739
1748 btrfs_wait_delalloc_flush(root->fs_info); 1740 btrfs_wait_delalloc_flush(root->fs_info);
1741
1742 btrfs_scrub_pause(root);
1749 /* 1743 /*
1750 * Ok now we need to make sure to block out any other joins while we 1744 * Ok now we need to make sure to block out any other joins while we
1751 * commit the transaction. We could have started a join before setting 1745 * commit the transaction. We could have started a join before setting
@@ -1810,7 +1804,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1810 1804
1811 WARN_ON(cur_trans != trans->transaction); 1805 WARN_ON(cur_trans != trans->transaction);
1812 1806
1813 btrfs_scrub_pause(root);
1814 /* btrfs_commit_tree_roots is responsible for getting the 1807 /* btrfs_commit_tree_roots is responsible for getting the
1815 * various roots consistent with each other. Every pointer 1808 * various roots consistent with each other. Every pointer
1816 * in the tree of tree roots has to point to the most up to date 1809 * in the tree of tree roots has to point to the most up to date
@@ -1833,6 +1826,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1833 goto cleanup_transaction; 1826 goto cleanup_transaction;
1834 } 1827 }
1835 1828
1829 /*
1830 * Since the transaction is done, we should set the inode map cache flag
	 1831 	 * before any other coming transaction.
1832 */
1833 if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
1834 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1835 else
1836 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1837
1836 /* commit_fs_roots gets rid of all the tree log roots, it is now 1838 /* commit_fs_roots gets rid of all the tree log roots, it is now
1837 * safe to free the root of tree log roots 1839 * safe to free the root of tree log roots
1838 */ 1840 */
@@ -1975,10 +1977,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1975 } 1977 }
1976 root = list_first_entry(&fs_info->dead_roots, 1978 root = list_first_entry(&fs_info->dead_roots,
1977 struct btrfs_root, root_list); 1979 struct btrfs_root, root_list);
1980 /*
	 1981 	 * Make sure the root is not involved in a send;
	 1982 	 * if the first root is busy, we return
	 1983 	 * directly rather than continuing.
1984 */
1985 spin_lock(&root->root_item_lock);
1986 if (root->send_in_progress) {
1987 spin_unlock(&fs_info->trans_lock);
1988 spin_unlock(&root->root_item_lock);
1989 return 0;
1990 }
1991 spin_unlock(&root->root_item_lock);
1992
1978 list_del_init(&root->root_list); 1993 list_del_init(&root->root_list);
1979 spin_unlock(&fs_info->trans_lock); 1994 spin_unlock(&fs_info->trans_lock);
1980 1995
1981 pr_debug("btrfs: cleaner removing %llu\n", root->objectid); 1996 pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
1982 1997
1983 btrfs_kill_all_delayed_nodes(root); 1998 btrfs_kill_all_delayed_nodes(root);
1984 1999
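
The new check samples root->send_in_progress under root_item_lock before the cleaner takes a root off dead_roots, so a snapshot still referenced by send is left alone. Condensed, the pattern is (a sketch of the hunk above, with trans_lock already held by the caller's loop):

	spin_lock(&root->root_item_lock);
	if (root->send_in_progress) {
		spin_unlock(&fs_info->trans_lock);
		spin_unlock(&root->root_item_lock);
		return 0;	/* retried on the next cleaner pass */
	}
	spin_unlock(&root->root_item_lock);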
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7657d115067d..6ac037e9f9f0 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -93,6 +93,7 @@ struct btrfs_trans_handle {
93 short adding_csums; 93 short adding_csums;
94 bool allocating_chunk; 94 bool allocating_chunk;
95 bool reloc_reserved; 95 bool reloc_reserved;
96 bool sync;
96 unsigned int type; 97 unsigned int type;
97 /* 98 /*
98 * this root is only needed to validate that the root passed to 99 * this root is only needed to validate that the root passed to
@@ -154,8 +155,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
154 int wait_for_unblock); 155 int wait_for_unblock);
155int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 156int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
156 struct btrfs_root *root); 157 struct btrfs_root *root);
157int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
158 struct btrfs_root *root);
159int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 158int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root); 159 struct btrfs_root *root);
161void btrfs_throttle(struct btrfs_root *root); 160void btrfs_throttle(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9f7fc51ca334..39d83da03e03 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
570 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 570 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
571 nbytes = 0; 571 nbytes = 0;
572 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 572 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
573 size = btrfs_file_extent_inline_len(eb, item); 573 size = btrfs_file_extent_inline_len(eb, slot, item);
574 nbytes = btrfs_file_extent_ram_bytes(eb, item); 574 nbytes = btrfs_file_extent_ram_bytes(eb, item);
575 extent_end = ALIGN(start + size, root->sectorsize); 575 extent_end = ALIGN(start + size, root->sectorsize);
576 } else { 576 } else {
@@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
1238 struct btrfs_root *root, u64 offset) 1238 struct btrfs_root *root, u64 offset)
1239{ 1239{
1240 int ret; 1240 int ret;
1241 ret = btrfs_find_orphan_item(root, offset); 1241 ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
1242 offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
1242 if (ret > 0) 1243 if (ret > 0)
1243 ret = btrfs_insert_orphan_item(trans, root, offset); 1244 ret = btrfs_insert_orphan_item(trans, root, offset);
1244 return ret; 1245 return ret;
@@ -3194,7 +3195,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
3194static noinline int copy_items(struct btrfs_trans_handle *trans, 3195static noinline int copy_items(struct btrfs_trans_handle *trans,
3195 struct inode *inode, 3196 struct inode *inode,
3196 struct btrfs_path *dst_path, 3197 struct btrfs_path *dst_path,
3197 struct extent_buffer *src, 3198 struct btrfs_path *src_path, u64 *last_extent,
3198 int start_slot, int nr, int inode_only) 3199 int start_slot, int nr, int inode_only)
3199{ 3200{
3200 unsigned long src_offset; 3201 unsigned long src_offset;
@@ -3202,6 +3203,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3203 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3203 struct btrfs_file_extent_item *extent; 3204 struct btrfs_file_extent_item *extent;
3204 struct btrfs_inode_item *inode_item; 3205 struct btrfs_inode_item *inode_item;
3206 struct extent_buffer *src = src_path->nodes[0];
3207 struct btrfs_key first_key, last_key, key;
3205 int ret; 3208 int ret;
3206 struct btrfs_key *ins_keys; 3209 struct btrfs_key *ins_keys;
3207 u32 *ins_sizes; 3210 u32 *ins_sizes;
@@ -3209,6 +3212,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3209 int i; 3212 int i;
3210 struct list_head ordered_sums; 3213 struct list_head ordered_sums;
3211 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3214 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3215 bool has_extents = false;
3216 bool need_find_last_extent = (*last_extent == 0);
3217 bool done = false;
3212 3218
3213 INIT_LIST_HEAD(&ordered_sums); 3219 INIT_LIST_HEAD(&ordered_sums);
3214 3220
@@ -3217,6 +3223,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3217 if (!ins_data) 3223 if (!ins_data)
3218 return -ENOMEM; 3224 return -ENOMEM;
3219 3225
3226 first_key.objectid = (u64)-1;
3227
3220 ins_sizes = (u32 *)ins_data; 3228 ins_sizes = (u32 *)ins_data;
3221 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3229 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3222 3230
@@ -3237,6 +3245,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3237 3245
3238 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3246 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3239 3247
	 3248 		if (i == nr - 1)
3249 last_key = ins_keys[i];
3250
3240 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3251 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3241 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3252 inode_item = btrfs_item_ptr(dst_path->nodes[0],
3242 dst_path->slots[0], 3253 dst_path->slots[0],
@@ -3248,6 +3259,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3248 src_offset, ins_sizes[i]); 3259 src_offset, ins_sizes[i]);
3249 } 3260 }
3250 3261
3262 /*
3263 * We set need_find_last_extent here in case we know we were
3264 * processing other items and then walk into the first extent in
3265 * the inode. If we don't hit an extent then nothing changes,
3266 * we'll do the last search the next time around.
3267 */
3268 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3269 has_extents = true;
3270 if (need_find_last_extent &&
3271 first_key.objectid == (u64)-1)
3272 first_key = ins_keys[i];
3273 } else {
3274 need_find_last_extent = false;
3275 }
3276
3251 /* take a reference on file data extents so that truncates 3277 /* take a reference on file data extents so that truncates
3252 * or deletes of this inode don't have to relog the inode 3278 * or deletes of this inode don't have to relog the inode
3253 * again 3279 * again
@@ -3312,6 +3338,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3312 list_del(&sums->list); 3338 list_del(&sums->list);
3313 kfree(sums); 3339 kfree(sums);
3314 } 3340 }
3341
3342 if (!has_extents)
3343 return ret;
3344
3345 /*
3346 * Because we use btrfs_search_forward we could skip leaves that were
3347 * not modified and then assume *last_extent is valid when it really
3348 * isn't. So back up to the previous leaf and read the end of the last
3349 * extent before we go and fill in holes.
3350 */
3351 if (need_find_last_extent) {
3352 u64 len;
3353
3354 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3355 if (ret < 0)
3356 return ret;
3357 if (ret)
3358 goto fill_holes;
3359 if (src_path->slots[0])
3360 src_path->slots[0]--;
3361 src = src_path->nodes[0];
3362 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3363 if (key.objectid != btrfs_ino(inode) ||
3364 key.type != BTRFS_EXTENT_DATA_KEY)
3365 goto fill_holes;
3366 extent = btrfs_item_ptr(src, src_path->slots[0],
3367 struct btrfs_file_extent_item);
3368 if (btrfs_file_extent_type(src, extent) ==
3369 BTRFS_FILE_EXTENT_INLINE) {
3370 len = btrfs_file_extent_inline_len(src,
3371 src_path->slots[0],
3372 extent);
3373 *last_extent = ALIGN(key.offset + len,
3374 log->sectorsize);
3375 } else {
3376 len = btrfs_file_extent_num_bytes(src, extent);
3377 *last_extent = key.offset + len;
3378 }
3379 }
3380fill_holes:
3381 /* So we did prev_leaf, now we need to move to the next leaf, but a few
3382 * things could have happened
3383 *
3384 * 1) A merge could have happened, so we could currently be on a leaf
3385 * that holds what we were copying in the first place.
3386 * 2) A split could have happened, and now not all of the items we want
3387 * are on the same leaf.
3388 *
3389 * So we need to adjust how we search for holes, we need to drop the
3390 * path and re-search for the first extent key we found, and then walk
3391 * forward until we hit the last one we copied.
3392 */
3393 if (need_find_last_extent) {
3394 /* btrfs_prev_leaf could return 1 without releasing the path */
3395 btrfs_release_path(src_path);
3396 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3397 src_path, 0, 0);
3398 if (ret < 0)
3399 return ret;
3400 ASSERT(ret == 0);
3401 src = src_path->nodes[0];
3402 i = src_path->slots[0];
3403 } else {
3404 i = start_slot;
3405 }
3406
3407 /*
	 3408 	 * Now walk the copied range and fill in any holes, making sure
	 3409 	 * holes are punched for those areas in case they previously
	 3410 	 * held extents.
3411 */
3412 while (!done) {
3413 u64 offset, len;
3414 u64 extent_end;
3415
3416 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3417 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3418 if (ret < 0)
3419 return ret;
3420 ASSERT(ret == 0);
3421 src = src_path->nodes[0];
3422 i = 0;
3423 }
3424
3425 btrfs_item_key_to_cpu(src, &key, i);
3426 if (!btrfs_comp_cpu_keys(&key, &last_key))
3427 done = true;
3428 if (key.objectid != btrfs_ino(inode) ||
3429 key.type != BTRFS_EXTENT_DATA_KEY) {
3430 i++;
3431 continue;
3432 }
3433 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3434 if (btrfs_file_extent_type(src, extent) ==
3435 BTRFS_FILE_EXTENT_INLINE) {
3436 len = btrfs_file_extent_inline_len(src, i, extent);
3437 extent_end = ALIGN(key.offset + len, log->sectorsize);
3438 } else {
3439 len = btrfs_file_extent_num_bytes(src, extent);
3440 extent_end = key.offset + len;
3441 }
3442 i++;
3443
3444 if (*last_extent == key.offset) {
3445 *last_extent = extent_end;
3446 continue;
3447 }
3448 offset = *last_extent;
3449 len = key.offset - *last_extent;
3450 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3451 offset, 0, 0, len, 0, len, 0,
3452 0, 0);
3453 if (ret)
3454 break;
3455 *last_extent = offset + len;
3456 }
3457 /*
3458 * Need to let the callers know we dropped the path so they should
3459 * re-search.
3460 */
3461 if (!ret && need_find_last_extent)
3462 ret = 1;
3315 return ret; 3463 return ret;
3316} 3464}
3317 3465
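
The gap-filling walk at the end of copy_items() reduces to: track the end of the last extent seen and punch a hole for every uncovered range before it. A standalone sketch of that logic (struct ext and emit_hole() are illustrative stand-ins, not kernel API; in the kernel the hole is inserted via btrfs_insert_file_extent() with a zero disk_bytenr):

	struct ext { unsigned long long offset, len; };	/* sorted by offset */

	static void emit_hole(unsigned long long off, unsigned long long len)
	{
		/* kernel: btrfs_insert_file_extent(..., off, 0, 0, len, ...) */
	}

	static unsigned long long fill_holes(const struct ext *e, int n,
					     unsigned long long last_extent)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (e[i].offset > last_extent)
				emit_hole(last_extent,
					  e[i].offset - last_extent);
			if (e[i].offset + e[i].len > last_extent)
				last_extent = e[i].offset + e[i].len;
		}
		return last_extent;	/* becomes *last_extent for the caller */
	}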
@@ -3349,21 +3497,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3349 int ret; 3497 int ret;
3350 int index = log->log_transid % 2; 3498 int index = log->log_transid % 2;
3351 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3352 3500 int extent_inserted = 0;
3353 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3354 em->start + em->len, NULL, 0);
3355 if (ret)
3356 return ret;
3357 3501
3358 INIT_LIST_HEAD(&ordered_sums); 3502 INIT_LIST_HEAD(&ordered_sums);
3359 btrfs_init_map_token(&token); 3503 btrfs_init_map_token(&token);
3360 key.objectid = btrfs_ino(inode);
3361 key.type = BTRFS_EXTENT_DATA_KEY;
3362 key.offset = em->start;
3363 3504
3364 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3505 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3506 em->start + em->len, NULL, 0, 1,
3507 sizeof(*fi), &extent_inserted);
3365 if (ret) 3508 if (ret)
3366 return ret; 3509 return ret;
3510
3511 if (!extent_inserted) {
3512 key.objectid = btrfs_ino(inode);
3513 key.type = BTRFS_EXTENT_DATA_KEY;
3514 key.offset = em->start;
3515
3516 ret = btrfs_insert_empty_item(trans, log, path, &key,
3517 sizeof(*fi));
3518 if (ret)
3519 return ret;
3520 }
3367 leaf = path->nodes[0]; 3521 leaf = path->nodes[0];
3368 fi = btrfs_item_ptr(leaf, path->slots[0], 3522 fi = btrfs_item_ptr(leaf, path->slots[0],
3369 struct btrfs_file_extent_item); 3523 struct btrfs_file_extent_item);
@@ -3485,7 +3639,11 @@ again:
3485 * start over after this. 3639 * start over after this.
3486 */ 3640 */
3487 3641
3488 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3642 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0);
3644 wait_event(ordered->wait,
3645 ordered->csum_bytes_left == 0);
3646 }
3489 3647
3490 list_for_each_entry(sum, &ordered->list, list) { 3648 list_for_each_entry(sum, &ordered->list, list) {
3491 ret = btrfs_csum_file_blocks(trans, log, sum); 3649 ret = btrfs_csum_file_blocks(trans, log, sum);
@@ -3630,6 +3788,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3630 struct btrfs_key max_key; 3788 struct btrfs_key max_key;
3631 struct btrfs_root *log = root->log_root; 3789 struct btrfs_root *log = root->log_root;
3632 struct extent_buffer *src = NULL; 3790 struct extent_buffer *src = NULL;
3791 u64 last_extent = 0;
3633 int err = 0; 3792 int err = 0;
3634 int ret; 3793 int ret;
3635 int nritems; 3794 int nritems;
@@ -3745,11 +3904,15 @@ again:
3745 goto next_slot; 3904 goto next_slot;
3746 } 3905 }
3747 3906
3748 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3907 ret = copy_items(trans, inode, dst_path, path, &last_extent,
3749 ins_nr, inode_only); 3908 ins_start_slot, ins_nr, inode_only);
3750 if (ret) { 3909 if (ret < 0) {
3751 err = ret; 3910 err = ret;
3752 goto out_unlock; 3911 goto out_unlock;
 3912 		} else if (ret) {
3913 ins_nr = 0;
3914 btrfs_release_path(path);
3915 continue;
3753 } 3916 }
3754 ins_nr = 1; 3917 ins_nr = 1;
3755 ins_start_slot = path->slots[0]; 3918 ins_start_slot = path->slots[0];
@@ -3763,13 +3926,14 @@ next_slot:
3763 goto again; 3926 goto again;
3764 } 3927 }
3765 if (ins_nr) { 3928 if (ins_nr) {
3766 ret = copy_items(trans, inode, dst_path, src, 3929 ret = copy_items(trans, inode, dst_path, path,
3767 ins_start_slot, 3930 &last_extent, ins_start_slot,
3768 ins_nr, inode_only); 3931 ins_nr, inode_only);
3769 if (ret) { 3932 if (ret < 0) {
3770 err = ret; 3933 err = ret;
3771 goto out_unlock; 3934 goto out_unlock;
3772 } 3935 }
3936 ret = 0;
3773 ins_nr = 0; 3937 ins_nr = 0;
3774 } 3938 }
3775 btrfs_release_path(path); 3939 btrfs_release_path(path);
@@ -3784,12 +3948,13 @@ next_slot:
3784 } 3948 }
3785 } 3949 }
3786 if (ins_nr) { 3950 if (ins_nr) {
3787 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3951 ret = copy_items(trans, inode, dst_path, path, &last_extent,
3788 ins_nr, inode_only); 3952 ins_start_slot, ins_nr, inode_only);
3789 if (ret) { 3953 if (ret < 0) {
3790 err = ret; 3954 err = ret;
3791 goto out_unlock; 3955 goto out_unlock;
3792 } 3956 }
3957 ret = 0;
3793 ins_nr = 0; 3958 ins_nr = 0;
3794 } 3959 }
3795 3960
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index b0a523b2c60e..840a38b2778a 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,8 +5,8 @@
5 */ 5 */
6 6
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/export.h>
9#include "ulist.h" 8#include "ulist.h"
9#include "ctree.h"
10 10
11/* 11/*
12 * ulist is a generic data structure to hold a collection of unique u64 12 * ulist is a generic data structure to hold a collection of unique u64
@@ -14,10 +14,6 @@
14 * enumerating it. 14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key. 15 * It is possible to store an auxiliary value along with the key.
16 * 16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without 17 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this: 18 * visiting a node twice. The pseudo-code could look like this:
23 * 19 *
@@ -50,12 +46,10 @@
50 */ 46 */
51void ulist_init(struct ulist *ulist) 47void ulist_init(struct ulist *ulist)
52{ 48{
53 ulist->nnodes = 0; 49 INIT_LIST_HEAD(&ulist->nodes);
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56 ulist->root = RB_ROOT; 50 ulist->root = RB_ROOT;
51 ulist->nnodes = 0;
57} 52}
58EXPORT_SYMBOL(ulist_init);
59 53
60/** 54/**
61 * ulist_fini - free up additionally allocated memory for the ulist 55 * ulist_fini - free up additionally allocated memory for the ulist
@@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init);
64 * This is useful in cases where the base 'struct ulist' has been statically 58 * This is useful in cases where the base 'struct ulist' has been statically
65 * allocated. 59 * allocated.
66 */ 60 */
67void ulist_fini(struct ulist *ulist) 61static void ulist_fini(struct ulist *ulist)
68{ 62{
69 /* 63 struct ulist_node *node;
70 * The first ULIST_SIZE elements are stored inline in struct ulist. 64 struct ulist_node *next;
71 * Only if more elements are alocated they need to be freed. 65
72 */ 66 list_for_each_entry_safe(node, next, &ulist->nodes, list) {
73 if (ulist->nodes_alloced > ULIST_SIZE) 67 kfree(node);
74 kfree(ulist->nodes); 68 }
75 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
76 ulist->root = RB_ROOT; 69 ulist->root = RB_ROOT;
70 INIT_LIST_HEAD(&ulist->nodes);
77} 71}
78EXPORT_SYMBOL(ulist_fini);
79 72
80/** 73/**
81 * ulist_reinit - prepare a ulist for reuse 74 * ulist_reinit - prepare a ulist for reuse
@@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist)
89 ulist_fini(ulist); 82 ulist_fini(ulist);
90 ulist_init(ulist); 83 ulist_init(ulist);
91} 84}
92EXPORT_SYMBOL(ulist_reinit);
93 85
94/** 86/**
95 * ulist_alloc - dynamically allocate a ulist 87 * ulist_alloc - dynamically allocate a ulist
@@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
108 100
109 return ulist; 101 return ulist;
110} 102}
111EXPORT_SYMBOL(ulist_alloc);
112 103
113/** 104/**
114 * ulist_free - free dynamically allocated ulist 105 * ulist_free - free dynamically allocated ulist
@@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist)
123 ulist_fini(ulist); 114 ulist_fini(ulist);
124 kfree(ulist); 115 kfree(ulist);
125} 116}
126EXPORT_SYMBOL(ulist_free);
127 117
128static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) 118static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
129{ 119{
@@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
192int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 182int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
193 u64 *old_aux, gfp_t gfp_mask) 183 u64 *old_aux, gfp_t gfp_mask)
194{ 184{
195 int ret = 0; 185 int ret;
196 struct ulist_node *node = NULL; 186 struct ulist_node *node;
187
197 node = ulist_rbtree_search(ulist, val); 188 node = ulist_rbtree_search(ulist, val);
198 if (node) { 189 if (node) {
199 if (old_aux) 190 if (old_aux)
200 *old_aux = node->aux; 191 *old_aux = node->aux;
201 return 0; 192 return 0;
202 } 193 }
194 node = kmalloc(sizeof(*node), gfp_mask);
195 if (!node)
196 return -ENOMEM;
203 197
204 if (ulist->nnodes >= ulist->nodes_alloced) { 198 node->val = val;
205 u64 new_alloced = ulist->nodes_alloced + 128; 199 node->aux = aux;
206 struct ulist_node *new_nodes; 200#ifdef CONFIG_BTRFS_DEBUG
207 void *old = NULL; 201 node->seqnum = ulist->nnodes;
208 int i; 202#endif
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
212
213 /*
214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
215 * yet, so pass NULL to krealloc
216 */
217 if (ulist->nodes_alloced > ULIST_SIZE)
218 old = ulist->nodes;
219 203
220 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, 204 ret = ulist_rbtree_insert(ulist, node);
221 gfp_mask); 205 ASSERT(!ret);
222 if (!new_nodes) 206 list_add_tail(&node->list, &ulist->nodes);
223 return -ENOMEM; 207 ulist->nnodes++;
224
225 if (!old)
226 memcpy(new_nodes, ulist->int_nodes,
227 sizeof(ulist->int_nodes));
228
229 ulist->nodes = new_nodes;
230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc actually uses memcpy, which does not copy rb_node
234 * pointers, so we have to do it ourselves. Otherwise we may
235 * be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
242 }
243 ulist->nodes[ulist->nnodes].val = val;
244 ulist->nodes[ulist->nnodes].aux = aux;
245 ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]);
246 BUG_ON(ret);
247 ++ulist->nnodes;
248 208
249 return 1; 209 return 1;
250} 210}
251EXPORT_SYMBOL(ulist_add);
252 211
253/** 212/**
254 * ulist_next - iterate ulist 213 * ulist_next - iterate ulist
@@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add);
268 */ 227 */
269struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) 228struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
270{ 229{
271 if (ulist->nnodes == 0) 230 struct ulist_node *node;
231
232 if (list_empty(&ulist->nodes))
272 return NULL; 233 return NULL;
273 if (uiter->i < 0 || uiter->i >= ulist->nnodes) 234 if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
274 return NULL; 235 return NULL;
275 236 if (uiter->cur_list) {
276 return &ulist->nodes[uiter->i++]; 237 uiter->cur_list = uiter->cur_list->next;
238 } else {
239 uiter->cur_list = ulist->nodes.next;
240#ifdef CONFIG_BTRFS_DEBUG
241 uiter->i = 0;
242#endif
243 }
244 node = list_entry(uiter->cur_list, struct ulist_node, list);
245#ifdef CONFIG_BTRFS_DEBUG
246 ASSERT(node->seqnum == uiter->i);
247 ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
248 uiter->i++;
249#endif
250 return node;
277} 251}
278EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index fb36731074b5..7f78cbf5cf41 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -17,18 +17,12 @@
17 * enumerating it. 17 * enumerating it.
18 * It is possible to store an auxiliary value along with the key. 18 * It is possible to store an auxiliary value along with the key.
19 * 19 *
20 * The implementation is preliminary and can probably be sped up
21 * significantly. A first step would be to store the values in an rbtree
22 * as soon as ULIST_SIZE is exceeded.
23 */ 20 */
24
25/*
26 * number of elements statically allocated inside struct ulist
27 */
28#define ULIST_SIZE 16
29
30struct ulist_iterator { 21struct ulist_iterator {
22#ifdef CONFIG_BTRFS_DEBUG
31 int i; 23 int i;
24#endif
25 struct list_head *cur_list; /* hint to start search */
32}; 26};
33 27
34/* 28/*
@@ -37,6 +31,12 @@ struct ulist_iterator {
37struct ulist_node { 31struct ulist_node {
38 u64 val; /* value to store */ 32 u64 val; /* value to store */
39 u64 aux; /* auxiliary value saved along with the val */ 33 u64 aux; /* auxiliary value saved along with the val */
34
35#ifdef CONFIG_BTRFS_DEBUG
36 int seqnum; /* sequence number this node is added */
37#endif
38
39 struct list_head list; /* used to link node */
40 struct rb_node rb_node; /* used to speed up search */ 40 struct rb_node rb_node; /* used to speed up search */
41}; 41};
42 42
@@ -46,28 +46,11 @@ struct ulist {
46 */ 46 */
47 unsigned long nnodes; 47 unsigned long nnodes;
48 48
49 /* 49 struct list_head nodes;
50 * number of nodes we already have room for
51 */
52 unsigned long nodes_alloced;
53
54 /*
55 * pointer to the array storing the elements. The first ULIST_SIZE
56 * elements are stored inline. In this case the it points to int_nodes.
57 * After exceeding ULIST_SIZE, dynamic memory is allocated.
58 */
59 struct ulist_node *nodes;
60
61 struct rb_root root; 50 struct rb_root root;
62
63 /*
64 * inline storage space for the first ULIST_SIZE entries
65 */
66 struct ulist_node int_nodes[ULIST_SIZE];
67}; 51};
68 52
69void ulist_init(struct ulist *ulist); 53void ulist_init(struct ulist *ulist);
70void ulist_fini(struct ulist *ulist);
71void ulist_reinit(struct ulist *ulist); 54void ulist_reinit(struct ulist *ulist);
72struct ulist *ulist_alloc(gfp_t gfp_mask); 55struct ulist *ulist_alloc(gfp_t gfp_mask);
73void ulist_free(struct ulist *ulist); 56void ulist_free(struct ulist *ulist);
@@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
77struct ulist_node *ulist_next(struct ulist *ulist, 60struct ulist_node *ulist_next(struct ulist *ulist,
78 struct ulist_iterator *uiter); 61 struct ulist_iterator *uiter);
79 62
80#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) 63#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
81 64
82#endif 65#endif
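
Typical use of the reworked ulist: ulist_add() rejects duplicates via the rbtree, and iteration walks the list in insertion order. A sketch (GFP flag and values illustrative):

	struct ulist *ul = ulist_alloc(GFP_NOFS);
	struct ulist_iterator uiter;
	struct ulist_node *node;

	if (!ul)
		return -ENOMEM;
	ulist_add(ul, 42, 0, GFP_NOFS);
	ulist_add(ul, 42, 0, GFP_NOFS);	/* duplicate: still one node */

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ul, &uiter)))
		;	/* use node->val and node->aux here */
	ulist_free(ul);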
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index fbda90004fe9..f6a4c03ee7d8 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
69 ret = -ENOENT; 69 ret = -ENOENT;
70 70
71 if (!IS_ALIGNED(item_size, sizeof(u64))) { 71 if (!IS_ALIGNED(item_size, sizeof(u64))) {
72 pr_warn("btrfs: uuid item with illegal size %lu!\n", 72 btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
73 (unsigned long)item_size); 73 (unsigned long)item_size);
74 goto out; 74 goto out;
75 } 75 }
@@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
137 offset = btrfs_item_ptr_offset(eb, slot); 137 offset = btrfs_item_ptr_offset(eb, slot);
138 offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); 138 offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
139 } else if (ret < 0) { 139 } else if (ret < 0) {
140 pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", 140 btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
141 "(0x%016llx, 0x%016llx) type %u!",
141 ret, (unsigned long long)key.objectid, 142 ret, (unsigned long long)key.objectid,
142 (unsigned long long)key.offset, type); 143 (unsigned long long)key.offset, type);
143 goto out; 144 goto out;
@@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
183 184
184 ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); 185 ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
185 if (ret < 0) { 186 if (ret < 0) {
186 pr_warn("btrfs: error %d while searching for uuid item!\n", 187 btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
187 ret); 188 ret);
188 goto out; 189 goto out;
189 } 190 }
@@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
197 offset = btrfs_item_ptr_offset(eb, slot); 198 offset = btrfs_item_ptr_offset(eb, slot);
198 item_size = btrfs_item_size_nr(eb, slot); 199 item_size = btrfs_item_size_nr(eb, slot);
199 if (!IS_ALIGNED(item_size, sizeof(u64))) { 200 if (!IS_ALIGNED(item_size, sizeof(u64))) {
200 pr_warn("btrfs: uuid item with illegal size %lu!\n", 201 btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
201 (unsigned long)item_size); 202 (unsigned long)item_size);
202 ret = -ENOENT; 203 ret = -ENOENT;
203 goto out; 204 goto out;
@@ -299,7 +300,7 @@ again_search_slot:
299 offset = btrfs_item_ptr_offset(leaf, slot); 300 offset = btrfs_item_ptr_offset(leaf, slot);
300 item_size = btrfs_item_size_nr(leaf, slot); 301 item_size = btrfs_item_size_nr(leaf, slot);
301 if (!IS_ALIGNED(item_size, sizeof(u64))) { 302 if (!IS_ALIGNED(item_size, sizeof(u64))) {
302 pr_warn("btrfs: uuid item with illegal size %lu!\n", 303 btrfs_warn(fs_info, "uuid item with illegal size %lu!",
303 (unsigned long)item_size); 304 (unsigned long)item_size);
304 goto skip; 305 goto skip;
305 } 306 }
@@ -349,6 +350,6 @@ skip:
349out: 350out:
350 btrfs_free_path(path); 351 btrfs_free_path(path);
351 if (ret) 352 if (ret)
352 pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); 353 btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
353 return 0; 354 return 0;
354} 355}
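
The conversions above replace bare printk()/pr_*() calls with the fs_info-aware helpers, which prefix every message with "BTRFS" and the filesystem identity. The ctree.h helpers roughly follow this shape (a sketch, not the verbatim definition):

	#define example_btrfs_warn(fs_info, fmt, args...) \
		btrfs_printk(fs_info, KERN_WARNING fmt, ##args)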
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 92303f42baaa..bab0b84d8f80 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,7 +125,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
125 125
126 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 126 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
127 if (ret) 127 if (ret)
128 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", 128 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
129 action, 129 action,
130 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 130 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
131 &disk_to_dev(bdev->bd_disk)->kobj); 131 &disk_to_dev(bdev->bd_disk)->kobj);
@@ -200,7 +200,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
200 200
201 if (IS_ERR(*bdev)) { 201 if (IS_ERR(*bdev)) {
202 ret = PTR_ERR(*bdev); 202 ret = PTR_ERR(*bdev);
203 printk(KERN_INFO "btrfs: open %s failed\n", device_path); 203 printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
204 goto error; 204 goto error;
205 } 205 }
206 206
@@ -912,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
912 if (disk_super->label[0]) { 912 if (disk_super->label[0]) {
913 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 913 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
914 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 914 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
915 printk(KERN_INFO "btrfs: device label %s ", disk_super->label); 915 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
916 } else { 916 } else {
917 printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); 917 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
918 } 918 }
919 919
920 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 920 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
@@ -1813,7 +1813,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1813 } 1813 }
1814 1814
1815 if (!*device) { 1815 if (!*device) {
1816 pr_err("btrfs: no missing device found\n"); 1816 btrfs_err(root->fs_info, "no missing device found");
1817 return -ENOENT; 1817 return -ENOENT;
1818 } 1818 }
1819 1819
@@ -3052,7 +3052,7 @@ loop:
3052error: 3052error:
3053 btrfs_free_path(path); 3053 btrfs_free_path(path);
3054 if (enospc_errors) { 3054 if (enospc_errors) {
3055 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 3055 btrfs_info(fs_info, "%d enospc errors during balance",
3056 enospc_errors); 3056 enospc_errors);
3057 if (!ret) 3057 if (!ret)
3058 ret = -ENOSPC; 3058 ret = -ENOSPC;
@@ -3138,8 +3138,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3138 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3138 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3139 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3139 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3140 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3140 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3141 printk(KERN_ERR "btrfs: with mixed groups data and " 3141 btrfs_err(fs_info, "with mixed groups data and "
3142 "metadata balance options must be the same\n"); 3142 "metadata balance options must be the same");
3143 ret = -EINVAL; 3143 ret = -EINVAL;
3144 goto out; 3144 goto out;
3145 } 3145 }
@@ -3165,8 +3165,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3165 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3165 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3166 (!alloc_profile_is_valid(bctl->data.target, 1) || 3166 (!alloc_profile_is_valid(bctl->data.target, 1) ||
3167 (bctl->data.target & ~allowed))) { 3167 (bctl->data.target & ~allowed))) {
3168 printk(KERN_ERR "btrfs: unable to start balance with target " 3168 btrfs_err(fs_info, "unable to start balance with target "
3169 "data profile %llu\n", 3169 "data profile %llu",
3170 bctl->data.target); 3170 bctl->data.target);
3171 ret = -EINVAL; 3171 ret = -EINVAL;
3172 goto out; 3172 goto out;
@@ -3174,8 +3174,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3174 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3174 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3175 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3175 (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3176 (bctl->meta.target & ~allowed))) { 3176 (bctl->meta.target & ~allowed))) {
3177 printk(KERN_ERR "btrfs: unable to start balance with target " 3177 btrfs_err(fs_info,
3178 "metadata profile %llu\n", 3178 "unable to start balance with target metadata profile %llu",
3179 bctl->meta.target); 3179 bctl->meta.target);
3180 ret = -EINVAL; 3180 ret = -EINVAL;
3181 goto out; 3181 goto out;
@@ -3183,8 +3183,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3183 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3183 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3184 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3184 (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3185 (bctl->sys.target & ~allowed))) { 3185 (bctl->sys.target & ~allowed))) {
3186 printk(KERN_ERR "btrfs: unable to start balance with target " 3186 btrfs_err(fs_info,
3187 "system profile %llu\n", 3187 "unable to start balance with target system profile %llu",
3188 bctl->sys.target); 3188 bctl->sys.target);
3189 ret = -EINVAL; 3189 ret = -EINVAL;
3190 goto out; 3190 goto out;
@@ -3193,7 +3193,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3193 /* allow dup'ed data chunks only in mixed mode */ 3193 /* allow dup'ed data chunks only in mixed mode */
3194 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3194 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3195 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3195 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3196 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 3196 btrfs_err(fs_info, "dup for data is not allowed");
3197 ret = -EINVAL; 3197 ret = -EINVAL;
3198 goto out; 3198 goto out;
3199 } 3199 }
@@ -3213,11 +3213,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3213 (fs_info->avail_metadata_alloc_bits & allowed) && 3213 (fs_info->avail_metadata_alloc_bits & allowed) &&
3214 !(bctl->meta.target & allowed))) { 3214 !(bctl->meta.target & allowed))) {
3215 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3215 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3216 printk(KERN_INFO "btrfs: force reducing metadata " 3216 btrfs_info(fs_info, "force reducing metadata integrity");
3217 "integrity\n");
3218 } else { 3217 } else {
3219 printk(KERN_ERR "btrfs: balance will reduce metadata " 3218 btrfs_err(fs_info, "balance will reduce metadata "
3220 "integrity, use force if you want this\n"); 3219 "integrity, use force if you want this");
3221 ret = -EINVAL; 3220 ret = -EINVAL;
3222 goto out; 3221 goto out;
3223 } 3222 }
@@ -3303,7 +3302,7 @@ static int balance_kthread(void *data)
3303 mutex_lock(&fs_info->balance_mutex); 3302 mutex_lock(&fs_info->balance_mutex);
3304 3303
3305 if (fs_info->balance_ctl) { 3304 if (fs_info->balance_ctl) {
3306 printk(KERN_INFO "btrfs: continuing balance\n"); 3305 btrfs_info(fs_info, "continuing balance");
3307 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3306 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3308 } 3307 }
3309 3308
@@ -3325,7 +3324,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3325 spin_unlock(&fs_info->balance_lock); 3324 spin_unlock(&fs_info->balance_lock);
3326 3325
3327 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3326 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3328 printk(KERN_INFO "btrfs: force skipping balance\n"); 3327 btrfs_info(fs_info, "force skipping balance");
3329 return 0; 3328 return 0;
3330 } 3329 }
3331 3330
@@ -3543,7 +3542,7 @@ update_tree:
3543 BTRFS_UUID_KEY_SUBVOL, 3542 BTRFS_UUID_KEY_SUBVOL,
3544 key.objectid); 3543 key.objectid);
3545 if (ret < 0) { 3544 if (ret < 0) {
3546 pr_warn("btrfs: uuid_tree_add failed %d\n", 3545 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3547 ret); 3546 ret);
3548 break; 3547 break;
3549 } 3548 }
@@ -3555,7 +3554,7 @@ update_tree:
3555 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3554 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3556 key.objectid); 3555 key.objectid);
3557 if (ret < 0) { 3556 if (ret < 0) {
3558 pr_warn("btrfs: uuid_tree_add failed %d\n", 3557 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3559 ret); 3558 ret);
3560 break; 3559 break;
3561 } 3560 }
@@ -3590,7 +3589,7 @@ out:
3590 if (trans && !IS_ERR(trans)) 3589 if (trans && !IS_ERR(trans))
3591 btrfs_end_transaction(trans, fs_info->uuid_root); 3590 btrfs_end_transaction(trans, fs_info->uuid_root);
3592 if (ret) 3591 if (ret)
3593 pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); 3592 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3594 else 3593 else
3595 fs_info->update_uuid_tree_gen = 1; 3594 fs_info->update_uuid_tree_gen = 1;
3596 up(&fs_info->uuid_tree_rescan_sem); 3595 up(&fs_info->uuid_tree_rescan_sem);
@@ -3654,7 +3653,7 @@ static int btrfs_uuid_rescan_kthread(void *data)
3654 */ 3653 */
3655 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3654 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3656 if (ret < 0) { 3655 if (ret < 0) {
3657 pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); 3656 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3658 up(&fs_info->uuid_tree_rescan_sem); 3657 up(&fs_info->uuid_tree_rescan_sem);
3659 return ret; 3658 return ret;
3660 } 3659 }
@@ -3695,7 +3694,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3695 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3694 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3696 if (IS_ERR(task)) { 3695 if (IS_ERR(task)) {
3697 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3696 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3698 pr_warn("btrfs: failed to start uuid_scan task\n"); 3697 btrfs_warn(fs_info, "failed to start uuid_scan task");
3699 up(&fs_info->uuid_tree_rescan_sem); 3698 up(&fs_info->uuid_tree_rescan_sem);
3700 return PTR_ERR(task); 3699 return PTR_ERR(task);
3701 } 3700 }
@@ -3711,7 +3710,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3711 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3710 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3712 if (IS_ERR(task)) { 3711 if (IS_ERR(task)) {
3713 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3712 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3714 pr_warn("btrfs: failed to start uuid_rescan task\n"); 3713 btrfs_warn(fs_info, "failed to start uuid_rescan task");
3715 up(&fs_info->uuid_tree_rescan_sem); 3714 up(&fs_info->uuid_tree_rescan_sem);
3716 return PTR_ERR(task); 3715 return PTR_ERR(task);
3717 } 3716 }
@@ -4033,7 +4032,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4033 max_stripe_size = 32 * 1024 * 1024; 4032 max_stripe_size = 32 * 1024 * 1024;
4034 max_chunk_size = 2 * max_stripe_size; 4033 max_chunk_size = 2 * max_stripe_size;
4035 } else { 4034 } else {
4036 	printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 4035 		btrfs_err(info, "invalid chunk type 0x%llx requested",
4037 type); 4036 type);
4038 BUG_ON(1); 4037 BUG_ON(1);
4039 } 4038 }
@@ -4065,7 +4064,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4065 4064
4066 if (!device->writeable) { 4065 if (!device->writeable) {
4067 WARN(1, KERN_ERR 4066 WARN(1, KERN_ERR
4068 "btrfs: read-only device in alloc_list\n"); 4067 "BTRFS: read-only device in alloc_list\n");
4069 continue; 4068 continue;
4070 } 4069 }
4071 4070
@@ -5193,13 +5192,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5193 read_unlock(&em_tree->lock); 5192 read_unlock(&em_tree->lock);
5194 5193
5195 if (!em) { 5194 if (!em) {
5196 printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", 5195 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5197 chunk_start); 5196 chunk_start);
5198 return -EIO; 5197 return -EIO;
5199 } 5198 }
5200 5199
5201 if (em->start != chunk_start) { 5200 if (em->start != chunk_start) {
5202 printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", 5201 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5203 em->start, chunk_start); 5202 em->start, chunk_start);
5204 free_extent_map(em); 5203 free_extent_map(em);
5205 return -EIO; 5204 return -EIO;
@@ -5298,6 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err)
5298 bio_put(bio); 5297 bio_put(bio);
5299 bio = bbio->orig_bio; 5298 bio = bbio->orig_bio;
5300 } 5299 }
5300
5301 /*
5302 * We have original bio now. So increment bi_remaining to
5303 * account for it in endio
5304 */
5305 atomic_inc(&bio->bi_remaining);
5306
5301 bio->bi_private = bbio->private; 5307 bio->bi_private = bbio->private;
5302 bio->bi_end_io = bbio->end_io; 5308 bio->bi_end_io = bbio->end_io;
5303 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5309 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5411,7 +5417,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5411 if (!q->merge_bvec_fn) 5417 if (!q->merge_bvec_fn)
5412 return 1; 5418 return 1;
5413 5419
5414 bvm.bi_size = bio->bi_size - prev->bv_len; 5420 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5415 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5421 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5416 return 0; 5422 return 0;
5417 return 1; 5423 return 1;
@@ -5426,7 +5432,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5426 bio->bi_private = bbio; 5432 bio->bi_private = bbio;
5427 btrfs_io_bio(bio)->stripe_index = dev_nr; 5433 btrfs_io_bio(bio)->stripe_index = dev_nr;
5428 bio->bi_end_io = btrfs_end_bio; 5434 bio->bi_end_io = btrfs_end_bio;
5429 bio->bi_sector = physical >> 9; 5435 bio->bi_iter.bi_sector = physical >> 9;
5430#ifdef DEBUG 5436#ifdef DEBUG
5431 { 5437 {
5432 struct rcu_string *name; 5438 struct rcu_string *name;
@@ -5464,7 +5470,7 @@ again:
5464 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5470 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5465 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5471 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5466 bvec->bv_offset) < bvec->bv_len) { 5472 bvec->bv_offset) < bvec->bv_len) {
5467 u64 len = bio->bi_size; 5473 u64 len = bio->bi_iter.bi_size;
5468 5474
5469 atomic_inc(&bbio->stripes_pending); 5475 atomic_inc(&bbio->stripes_pending);
5470 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5476 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
@@ -5486,7 +5492,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5486 bio->bi_private = bbio->private; 5492 bio->bi_private = bbio->private;
5487 bio->bi_end_io = bbio->end_io; 5493 bio->bi_end_io = bbio->end_io;
5488 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5494 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5489 bio->bi_sector = logical >> 9; 5495 bio->bi_iter.bi_sector = logical >> 9;
5490 kfree(bbio); 5496 kfree(bbio);
5491 bio_endio(bio, -EIO); 5497 bio_endio(bio, -EIO);
5492 } 5498 }
@@ -5497,7 +5503,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5497{ 5503{
5498 struct btrfs_device *dev; 5504 struct btrfs_device *dev;
5499 struct bio *first_bio = bio; 5505 struct bio *first_bio = bio;
5500 u64 logical = (u64)bio->bi_sector << 9; 5506 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5501 u64 length = 0; 5507 u64 length = 0;
5502 u64 map_length; 5508 u64 map_length;
5503 u64 *raid_map = NULL; 5509 u64 *raid_map = NULL;
@@ -5506,7 +5512,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5506 int total_devs = 1; 5512 int total_devs = 1;
5507 struct btrfs_bio *bbio = NULL; 5513 struct btrfs_bio *bbio = NULL;
5508 5514
5509 length = bio->bi_size; 5515 length = bio->bi_iter.bi_size;
5510 map_length = length; 5516 map_length = length;
5511 5517
5512 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
@@ -6123,7 +6129,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6123 BUG_ON(!path); 6129 BUG_ON(!path);
6124 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 6130 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6125 if (ret < 0) { 6131 if (ret < 0) {
6126 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", 6132 printk_in_rcu(KERN_WARNING "BTRFS: "
6133 "error %d while searching for dev_stats item for device %s!\n",
6127 ret, rcu_str_deref(device->name)); 6134 ret, rcu_str_deref(device->name));
6128 goto out; 6135 goto out;
6129 } 6136 }
@@ -6133,7 +6140,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6133 /* need to delete old one and insert a new one */ 6140 /* need to delete old one and insert a new one */
6134 ret = btrfs_del_item(trans, dev_root, path); 6141 ret = btrfs_del_item(trans, dev_root, path);
6135 if (ret != 0) { 6142 if (ret != 0) {
6136 printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", 6143 printk_in_rcu(KERN_WARNING "BTRFS: "
6144 "delete too small dev_stats item for device %s failed %d!\n",
6137 rcu_str_deref(device->name), ret); 6145 rcu_str_deref(device->name), ret);
6138 goto out; 6146 goto out;
6139 } 6147 }
@@ -6146,7 +6154,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6146 ret = btrfs_insert_empty_item(trans, dev_root, path, 6154 ret = btrfs_insert_empty_item(trans, dev_root, path,
6147 &key, sizeof(*ptr)); 6155 &key, sizeof(*ptr));
6148 if (ret < 0) { 6156 if (ret < 0) {
6149 printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", 6157 printk_in_rcu(KERN_WARNING "BTRFS: "
6158 "insert dev_stats item for device %s failed %d!\n",
6150 rcu_str_deref(device->name), ret); 6159 rcu_str_deref(device->name), ret);
6151 goto out; 6160 goto out;
6152 } 6161 }
@@ -6199,16 +6208,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6199{ 6208{
6200 if (!dev->dev_stats_valid) 6209 if (!dev->dev_stats_valid)
6201 return; 6210 return;
6202 printk_ratelimited_in_rcu(KERN_ERR 6211 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6203 "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6212 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6204 rcu_str_deref(dev->name), 6213 rcu_str_deref(dev->name),
6205 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6214 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6206 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6215 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6207 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 6216 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6208 btrfs_dev_stat_read(dev, 6217 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6209 BTRFS_DEV_STAT_CORRUPTION_ERRS), 6218 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6210 btrfs_dev_stat_read(dev,
6211 BTRFS_DEV_STAT_GENERATION_ERRS));
6212} 6219}
6213 6220
6214static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 6221static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
@@ -6221,7 +6228,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6221 if (i == BTRFS_DEV_STAT_VALUES_MAX) 6228 if (i == BTRFS_DEV_STAT_VALUES_MAX)
6222 return; /* all values == 0, suppress message */ 6229 return; /* all values == 0, suppress message */
6223 6230
6224 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6231 printk_in_rcu(KERN_INFO "BTRFS: "
6232 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6225 rcu_str_deref(dev->name), 6233 rcu_str_deref(dev->name),
6226 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6234 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6227 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6235 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6242,12 +6250,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
6242 mutex_unlock(&fs_devices->device_list_mutex); 6250 mutex_unlock(&fs_devices->device_list_mutex);
6243 6251
6244 if (!dev) { 6252 if (!dev) {
6245 printk(KERN_WARNING 6253 btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6246 "btrfs: get dev_stats failed, device not found\n");
6247 return -ENODEV; 6254 return -ENODEV;
6248 } else if (!dev->dev_stats_valid) { 6255 } else if (!dev->dev_stats_valid) {
6249 printk(KERN_WARNING 6256 btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6250 "btrfs: get dev_stats failed, not yet valid\n");
6251 return -ENODEV; 6257 return -ENODEV;
6252 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 6258 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6253 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6259 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
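
For reference, the dev_stats messages reworked above report five per-device error counters in one line. A compact userspace model of that reporting, with invented values and DEV_STAT_* names mirroring the kernel's BTRFS_DEV_STAT_* indices:

#include <stdio.h>

enum {
        DEV_STAT_WRITE_ERRS,
        DEV_STAT_READ_ERRS,
        DEV_STAT_FLUSH_ERRS,
        DEV_STAT_CORRUPTION_ERRS,
        DEV_STAT_GENERATION_ERRS,
        DEV_STAT_VALUES_MAX
};

struct device {
        const char *name;
        unsigned int stats[DEV_STAT_VALUES_MAX];
};

static void dev_stat_print(const struct device *dev)
{
        printf("BTRFS: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
               dev->name,
               dev->stats[DEV_STAT_WRITE_ERRS],
               dev->stats[DEV_STAT_READ_ERRS],
               dev->stats[DEV_STAT_FLUSH_ERRS],
               dev->stats[DEV_STAT_CORRUPTION_ERRS],
               dev->stats[DEV_STAT_GENERATION_ERRS]);
}

int main(void)
{
        struct device dev = { .name = "/dev/sdb", .stats = { 1, 0, 0, 2, 0 } };

        dev_stat_print(&dev);
        return 0;
}
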
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 05740b9789e4..ad8328d797ea 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -22,11 +22,13 @@
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h> 24#include <linux/security.h>
25#include <linux/posix_acl_xattr.h>
25#include "ctree.h" 26#include "ctree.h"
26#include "btrfs_inode.h" 27#include "btrfs_inode.h"
27#include "transaction.h" 28#include "transaction.h"
28#include "xattr.h" 29#include "xattr.h"
29#include "disk-io.h" 30#include "disk-io.h"
31#include "props.h"
30 32
31 33
32ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 34ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -313,8 +315,8 @@ err:
313 */ 315 */
314const struct xattr_handler *btrfs_xattr_handlers[] = { 316const struct xattr_handler *btrfs_xattr_handlers[] = {
315#ifdef CONFIG_BTRFS_FS_POSIX_ACL 317#ifdef CONFIG_BTRFS_FS_POSIX_ACL
316 &btrfs_xattr_acl_access_handler, 318 &posix_acl_access_xattr_handler,
317 &btrfs_xattr_acl_default_handler, 319 &posix_acl_default_xattr_handler,
318#endif 320#endif
319 NULL, 321 NULL,
320}; 322};
@@ -331,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name)
331 XATTR_SECURITY_PREFIX_LEN) || 333 XATTR_SECURITY_PREFIX_LEN) ||
332 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 334 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
333 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 335 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
334 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 336 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
337 !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
335} 338}
336 339
337ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, 340ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
@@ -373,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
373 if (!btrfs_is_valid_xattr(name)) 376 if (!btrfs_is_valid_xattr(name))
374 return -EOPNOTSUPP; 377 return -EOPNOTSUPP;
375 378
379 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
380 return btrfs_set_prop(dentry->d_inode, name,
381 value, size, flags);
382
376 if (size == 0) 383 if (size == 0)
377 value = ""; /* empty EA, do not remove */ 384 value = ""; /* empty EA, do not remove */
378 385
@@ -402,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
402 if (!btrfs_is_valid_xattr(name)) 409 if (!btrfs_is_valid_xattr(name))
403 return -EOPNOTSUPP; 410 return -EOPNOTSUPP;
404 411
412 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
413 return btrfs_set_prop(dentry->d_inode, name,
414 NULL, 0, XATTR_REPLACE);
415
405 return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, 416 return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
406 XATTR_REPLACE); 417 XATTR_REPLACE);
407} 418}
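
The xattr hunks wire the new "btrfs." name space into set and remove: names carrying XATTR_BTRFS_PREFIX are diverted to btrfs_set_prop() instead of being stored as ordinary extended attributes, and removal is expressed as a set with a NULL value. A small sketch of that dispatch with stub handlers (the real property layer lives in the new props.c):

#include <stdio.h>
#include <string.h>

#define XATTR_BTRFS_PREFIX     "btrfs."
#define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1)

static int set_prop(const char *name, const char *value)
{
        printf("property: %s = %s\n", name, value ? value : "(removed)");
        return 0;
}

static int set_plain_xattr(const char *name, const char *value)
{
        printf("xattr: %s = %s\n", name, value ? value : "(removed)");
        return 0;
}

static int setxattr_dispatch(const char *name, const char *value)
{
        if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
                return set_prop(name, value);   /* "btrfs." names */
        return set_plain_xattr(name, value);    /* everything else */
}

int main(void)
{
        setxattr_dispatch("btrfs.compression", "lzo"); /* prop layer */
        setxattr_dispatch("user.note", "hello");       /* normal xattr */
        setxattr_dispatch("btrfs.compression", NULL);  /* remove = set NULL */
        return 0;
}
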
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b3cc8039134b..5049608d1388 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,8 +21,6 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern const struct xattr_handler *btrfs_xattr_handlers[]; 24extern const struct xattr_handler *btrfs_xattr_handlers[];
27 25
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 26extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 9acb846c3e7f..8e57191950cb 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "btrfs: deflateInit failed\n"); 100 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", 128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "btrfs: inflateInit failed\n"); 255 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "btrfs: inflateInit failed\n"); 339 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
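
The zlib.c changes only retag the message prefixes, but for completeness, here is the equivalent error path written against ordinary userspace zlib (build with -lz). The kernel uses its own zlib_deflateInit wrapper with a preallocated workspace, which this sketch does not model:

#include <stdio.h>
#include <zlib.h>

int main(void)
{
        z_stream strm = { 0 };  /* zalloc/zfree left NULL: zlib defaults */

        if (deflateInit(&strm, 3) != Z_OK) {
                fprintf(stderr, "BTRFS: deflateInit failed\n");
                return 1;
        }
        printf("deflate stream ready at level 3\n");
        deflateEnd(&strm);
        return 0;
}
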
diff --git a/fs/buffer.c b/fs/buffer.c
index 6024877335ca..27265a8b43c1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
654static void __set_page_dirty(struct page *page, 654static void __set_page_dirty(struct page *page,
655 struct address_space *mapping, int warn) 655 struct address_space *mapping, int warn)
656{ 656{
657 spin_lock_irq(&mapping->tree_lock); 657 unsigned long flags;
658
659 spin_lock_irqsave(&mapping->tree_lock, flags);
658 if (page->mapping) { /* Race with truncate? */ 660 if (page->mapping) { /* Race with truncate? */
659 WARN_ON_ONCE(warn && !PageUptodate(page)); 661 WARN_ON_ONCE(warn && !PageUptodate(page));
660 account_page_dirtied(page, mapping); 662 account_page_dirtied(page, mapping);
661 radix_tree_tag_set(&mapping->page_tree, 663 radix_tree_tag_set(&mapping->page_tree,
662 page_index(page), PAGECACHE_TAG_DIRTY); 664 page_index(page), PAGECACHE_TAG_DIRTY);
663 } 665 }
664 spin_unlock_irq(&mapping->tree_lock); 666 spin_unlock_irqrestore(&mapping->tree_lock, flags);
665 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 667 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
666} 668}
667 669
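
The __set_page_dirty() fix swaps spin_lock_irq() for spin_lock_irqsave() because the plain _irq variant unconditionally re-enables interrupts on unlock, which corrupts state when the caller entered with them already disabled. A toy simulation of the difference (an int stands in for the IRQ state; this is not the kernel locking API):

#include <stdio.h>

static int irqs_enabled;

static void lock_irq(void)               { irqs_enabled = 0; }
static void unlock_irq(void)             { irqs_enabled = 1; } /* always! */
static void lock_irqsave(int *flags)     { *flags = irqs_enabled;
                                           irqs_enabled = 0; }
static void unlock_irqrestore(int flags) { irqs_enabled = flags; }

int main(void)
{
        int flags;

        irqs_enabled = 0;       /* pretend caller disabled interrupts */
        lock_irq();
        unlock_irq();
        printf("irq variant:     irqs_enabled=%d (wrongly re-enabled)\n",
               irqs_enabled);

        irqs_enabled = 0;       /* caller holds them disabled again */
        lock_irqsave(&flags);
        unlock_irqrestore(flags);
        printf("irqsave variant: irqs_enabled=%d (caller state preserved)\n",
               irqs_enabled);
        return 0;
}
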
@@ -1312,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh)
1312 } 1314 }
1313 while (out < BH_LRU_SIZE) 1315 while (out < BH_LRU_SIZE)
1314 bhs[out++] = NULL; 1316 bhs[out++] = NULL;
1315 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); 1317 memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1316 } 1318 }
1317 bh_lru_unlock(); 1319 bh_lru_unlock();
1318 1320
@@ -2982,11 +2984,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2982 * let it through, and the IO layer will turn it into 2984 * let it through, and the IO layer will turn it into
2983 * an EIO. 2985 * an EIO.
2984 */ 2986 */
2985 if (unlikely(bio->bi_sector >= maxsector)) 2987 if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2986 return; 2988 return;
2987 2989
2988 maxsector -= bio->bi_sector; 2990 maxsector -= bio->bi_iter.bi_sector;
2989 bytes = bio->bi_size; 2991 bytes = bio->bi_iter.bi_size;
2990 if (likely((bytes >> 9) <= maxsector)) 2992 if (likely((bytes >> 9) <= maxsector))
2991 return; 2993 return;
2992 2994
@@ -2994,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2994 bytes = maxsector << 9; 2996 bytes = maxsector << 9;
2995 2997
2996 /* Truncate the bio.. */ 2998 /* Truncate the bio.. */
2997 bio->bi_size = bytes; 2999 bio->bi_iter.bi_size = bytes;
2998 bio->bi_io_vec[0].bv_len = bytes; 3000 bio->bi_io_vec[0].bv_len = bytes;
2999 3001
3000 /* ..and clear the end of the buffer for reads */ 3002 /* ..and clear the end of the buffer for reads */
@@ -3029,14 +3031,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3029 */ 3031 */
3030 bio = bio_alloc(GFP_NOIO, 1); 3032 bio = bio_alloc(GFP_NOIO, 1);
3031 3033
3032 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 3034 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3033 bio->bi_bdev = bh->b_bdev; 3035 bio->bi_bdev = bh->b_bdev;
3034 bio->bi_io_vec[0].bv_page = bh->b_page; 3036 bio->bi_io_vec[0].bv_page = bh->b_page;
3035 bio->bi_io_vec[0].bv_len = bh->b_size; 3037 bio->bi_io_vec[0].bv_len = bh->b_size;
3036 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 3038 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3037 3039
3038 bio->bi_vcnt = 1; 3040 bio->bi_vcnt = 1;
3039 bio->bi_size = bh->b_size; 3041 bio->bi_iter.bi_size = bh->b_size;
3040 3042
3041 bio->bi_end_io = end_bio_bh_io_sync; 3043 bio->bi_end_io = end_bio_bh_io_sync;
3042 bio->bi_private = bh; 3044 bio->bi_private = bh;
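
Aside from the bi_iter renames, the _submit_bh() hunk just maps a buffer head onto a one-segment bio; the only arithmetic is scaling the block number into 512-byte sectors. A trivial check of that scaling with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long long b_blocknr = 1000; /* filesystem block number */
        unsigned int b_size = 4096;          /* block size in bytes */

        /* bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); */
        unsigned long long sector = b_blocknr * (b_size >> 9);

        printf("block %llu (%u bytes) starts at sector %llu\n",
               b_blocknr, b_size, sector);
        return 0;
}
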
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index ac9a2ef5bb9b..264e9bf83ff3 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -25,3 +25,16 @@ config CEPH_FSCACHE
25 caching support for Ceph clients using FS-Cache 25 caching support for Ceph clients using FS-Cache
26 26
27endif 27endif
28
29config CEPH_FS_POSIX_ACL
30 bool "Ceph POSIX Access Control Lists"
31 depends on CEPH_FS
32 select FS_POSIX_ACL
33 help
34 POSIX Access Control Lists (ACLs) support permissions for users and
35 groups beyond the owner/group/world scheme.
36
37 To learn more about Access Control Lists, visit the POSIX ACLs for
38 Linux website <http://acl.bestbits.at/>.
39
40 If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 32e30106a2f0..85a4230b9bff 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 debugfs.o 10 debugfs.o
11 11
12ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 12ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
13ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 000000000000..21887d63dad5
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,200 @@
1/*
2 * linux/fs/ceph/acl.c
3 *
4 * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#include <linux/ceph/ceph_debug.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26#include <linux/posix_acl.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29
30#include "super.h"
31
32static inline void ceph_set_cached_acl(struct inode *inode,
33 int type, struct posix_acl *acl)
34{
35 struct ceph_inode_info *ci = ceph_inode(inode);
36
37 spin_lock(&ci->i_ceph_lock);
38 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
39 set_cached_acl(inode, type, acl);
40 spin_unlock(&ci->i_ceph_lock);
41}
42
43static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
44 int type)
45{
46 struct ceph_inode_info *ci = ceph_inode(inode);
47 struct posix_acl *acl = ACL_NOT_CACHED;
48
49 spin_lock(&ci->i_ceph_lock);
50 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
51 acl = get_cached_acl(inode, type);
52 spin_unlock(&ci->i_ceph_lock);
53
54 return acl;
55}
56
57struct posix_acl *ceph_get_acl(struct inode *inode, int type)
58{
59 int size;
60 const char *name;
61 char *value = NULL;
62 struct posix_acl *acl;
63
64 switch (type) {
65 case ACL_TYPE_ACCESS:
66 name = POSIX_ACL_XATTR_ACCESS;
67 break;
68 case ACL_TYPE_DEFAULT:
69 name = POSIX_ACL_XATTR_DEFAULT;
70 break;
71 default:
72 BUG();
73 }
74
75 size = __ceph_getxattr(inode, name, "", 0);
76 if (size > 0) {
77 value = kzalloc(size, GFP_NOFS);
78 if (!value)
79 return ERR_PTR(-ENOMEM);
80 size = __ceph_getxattr(inode, name, value, size);
81 }
82
83 if (size > 0)
84 acl = posix_acl_from_xattr(&init_user_ns, value, size);
85 else if (size == -ERANGE || size == -ENODATA || size == 0)
86 acl = NULL;
87 else
88 acl = ERR_PTR(-EIO);
89
90 kfree(value);
91
92 if (!IS_ERR(acl))
93 ceph_set_cached_acl(inode, type, acl);
94
95 return acl;
96}
97
98int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
99{
100 int ret = 0, size = 0;
101 const char *name = NULL;
102 char *value = NULL;
103 struct iattr newattrs;
104 umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
105 struct dentry *dentry;
106
107 if (acl) {
108 ret = posix_acl_valid(acl);
109 if (ret < 0)
110 goto out;
111 }
112
113 switch (type) {
114 case ACL_TYPE_ACCESS:
115 name = POSIX_ACL_XATTR_ACCESS;
116 if (acl) {
117 ret = posix_acl_equiv_mode(acl, &new_mode);
118 if (ret < 0)
119 goto out;
120 if (ret == 0)
121 acl = NULL;
122 }
123 break;
124 case ACL_TYPE_DEFAULT:
125 if (!S_ISDIR(inode->i_mode)) {
126 ret = acl ? -EINVAL : 0;
127 goto out;
128 }
129 name = POSIX_ACL_XATTR_DEFAULT;
130 break;
131 default:
132 ret = -EINVAL;
133 goto out;
134 }
135
136 if (acl) {
137 size = posix_acl_xattr_size(acl->a_count);
138 value = kmalloc(size, GFP_NOFS);
139 if (!value) {
140 ret = -ENOMEM;
141 goto out;
142 }
143
144 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
145 if (ret < 0)
146 goto out_free;
147 }
148
149 dentry = d_find_alias(inode);
150 if (new_mode != old_mode) {
151 newattrs.ia_mode = new_mode;
152 newattrs.ia_valid = ATTR_MODE;
153 ret = ceph_setattr(dentry, &newattrs);
154 if (ret)
155 goto out_dput;
156 }
157
158 ret = __ceph_setxattr(dentry, name, value, size, 0);
159 if (ret) {
160 if (new_mode != old_mode) {
161 newattrs.ia_mode = old_mode;
162 newattrs.ia_valid = ATTR_MODE;
163 ceph_setattr(dentry, &newattrs);
164 }
165 goto out_dput;
166 }
167
168 ceph_set_cached_acl(inode, type, acl);
169
170out_dput:
171 dput(dentry);
172out_free:
173 kfree(value);
174out:
175 return ret;
176}
177
178int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
179{
180 struct posix_acl *default_acl, *acl;
181 int error;
182
183 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
184 if (error)
185 return error;
186
187 if (!default_acl && !acl)
188 cache_no_acl(inode);
189
190 if (default_acl) {
191 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
192 posix_acl_release(default_acl);
193 }
194 if (acl) {
195 if (!error)
196 error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
197 posix_acl_release(acl);
198 }
199 return error;
200}
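
ceph_get_acl() above uses the classic two-call xattr protocol: probe with a zero-sized buffer to learn the value length, allocate, then read for real. The same pattern from userspace against getxattr(2) (Linux-specific; the path and xattr name here are arbitrary, and the probe will simply fail unless such an ACL is actually set):

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/tmp";
        const char *name = "system.posix_acl_access";
        ssize_t size = getxattr(path, name, NULL, 0); /* size probe */
        char *value;

        if (size <= 0) {
                perror("getxattr");
                return 1;
        }
        value = malloc(size);
        if (!value)
                return 1;
        size = getxattr(path, name, value, size);     /* actual read */
        printf("read %zd bytes of ACL xattr\n", size);
        free(value);
        return 0;
}
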
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ec3ba43b9faa..b53278c9fd97 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
211 SetPageError(page); 211 SetPageError(page);
212 ceph_fscache_readpage_cancel(inode, page);
212 goto out; 213 goto out;
213 } else { 214 } else {
214 if (err < PAGE_CACHE_SIZE) { 215 if (err < PAGE_CACHE_SIZE) {
@@ -256,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
256 for (i = 0; i < num_pages; i++) { 257 for (i = 0; i < num_pages; i++) {
257 struct page *page = osd_data->pages[i]; 258 struct page *page = osd_data->pages[i];
258 259
260 if (rc < 0)
261 goto unlock;
259 if (bytes < (int)PAGE_CACHE_SIZE) { 262 if (bytes < (int)PAGE_CACHE_SIZE) {
260 /* zero (remainder of) page */ 263 /* zero (remainder of) page */
261 int s = bytes < 0 ? 0 : bytes; 264 int s = bytes < 0 ? 0 : bytes;
@@ -266,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
266 flush_dcache_page(page); 269 flush_dcache_page(page);
267 SetPageUptodate(page); 270 SetPageUptodate(page);
268 ceph_readpage_to_fscache(inode, page); 271 ceph_readpage_to_fscache(inode, page);
272unlock:
269 unlock_page(page); 273 unlock_page(page);
270 page_cache_release(page); 274 page_cache_release(page);
271 bytes -= PAGE_CACHE_SIZE; 275 bytes -= PAGE_CACHE_SIZE;
@@ -1207,6 +1211,41 @@ const struct address_space_operations ceph_aops = {
1207/* 1211/*
1208 * vm ops 1212 * vm ops
1209 */ 1213 */
1214static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1215{
1216 struct inode *inode = file_inode(vma->vm_file);
1217 struct ceph_inode_info *ci = ceph_inode(inode);
1218 struct ceph_file_info *fi = vma->vm_file->private_data;
1219 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
1220 int want, got, ret;
1221
1222 dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
1223 inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
1224 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1225 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1226 else
1227 want = CEPH_CAP_FILE_CACHE;
1228 while (1) {
1229 got = 0;
1230 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
1231 if (ret == 0)
1232 break;
1233 if (ret != -ERESTARTSYS) {
1234 WARN_ON(1);
1235 return VM_FAULT_SIGBUS;
1236 }
1237 }
1238 dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
1239 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
1240
1241 ret = filemap_fault(vma, vmf);
1242
1243 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
1244 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
1245 ceph_put_cap_refs(ci, got);
1246
1247 return ret;
1248}
1210 1249
1211/* 1250/*
1212 * Reuse write_begin here for simplicity. 1251 * Reuse write_begin here for simplicity.
@@ -1214,23 +1253,41 @@ const struct address_space_operations ceph_aops = {
1214static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1253static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1215{ 1254{
1216 struct inode *inode = file_inode(vma->vm_file); 1255 struct inode *inode = file_inode(vma->vm_file);
1217 struct page *page = vmf->page; 1256 struct ceph_inode_info *ci = ceph_inode(inode);
1257 struct ceph_file_info *fi = vma->vm_file->private_data;
1218 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1258 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1259 struct page *page = vmf->page;
1219 loff_t off = page_offset(page); 1260 loff_t off = page_offset(page);
1220 loff_t size, len; 1261 loff_t size = i_size_read(inode);
1221 int ret; 1262 size_t len;
1222 1263 int want, got, ret;
1223 /* Update time before taking page lock */
1224 file_update_time(vma->vm_file);
1225 1264
1226 size = i_size_read(inode);
1227 if (off + PAGE_CACHE_SIZE <= size) 1265 if (off + PAGE_CACHE_SIZE <= size)
1228 len = PAGE_CACHE_SIZE; 1266 len = PAGE_CACHE_SIZE;
1229 else 1267 else
1230 len = size & ~PAGE_CACHE_MASK; 1268 len = size & ~PAGE_CACHE_MASK;
1231 1269
1232 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, 1270 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
1233 off, len, page, page->index); 1271 inode, ceph_vinop(inode), off, len, size);
1272 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1273 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1274 else
1275 want = CEPH_CAP_FILE_BUFFER;
1276 while (1) {
1277 got = 0;
1278 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
1279 if (ret == 0)
1280 break;
1281 if (ret != -ERESTARTSYS) {
1282 WARN_ON(1);
1283 return VM_FAULT_SIGBUS;
1284 }
1285 }
1286 dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
1287 inode, off, len, ceph_cap_string(got));
1288
1289 /* Update time before taking page lock */
1290 file_update_time(vma->vm_file);
1234 1291
1235 lock_page(page); 1292 lock_page(page);
1236 1293
@@ -1252,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1252 ret = VM_FAULT_SIGBUS; 1309 ret = VM_FAULT_SIGBUS;
1253 } 1310 }
1254out: 1311out:
1255 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); 1312 if (ret != VM_FAULT_LOCKED) {
1256 if (ret != VM_FAULT_LOCKED)
1257 unlock_page(page); 1313 unlock_page(page);
1314 } else {
1315 int dirty;
1316 spin_lock(&ci->i_ceph_lock);
1317 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1318 spin_unlock(&ci->i_ceph_lock);
1319 if (dirty)
1320 __mark_inode_dirty(inode, dirty);
1321 }
1322
1323 dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
1324 inode, off, len, ceph_cap_string(got), ret);
1325 ceph_put_cap_refs(ci, got);
1326
1258 return ret; 1327 return ret;
1259} 1328}
1260 1329
1261static struct vm_operations_struct ceph_vmops = { 1330static struct vm_operations_struct ceph_vmops = {
1262 .fault = filemap_fault, 1331 .fault = ceph_filemap_fault,
1263 .page_mkwrite = ceph_page_mkwrite, 1332 .page_mkwrite = ceph_page_mkwrite,
1264 .remap_pages = generic_file_remap_pages, 1333 .remap_pages = generic_file_remap_pages,
1265}; 1334};
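
Both new ceph fault handlers follow one shape: acquire capability references from the MDS (retrying on -ERESTARTSYS, as the loops above show), run the generic page-cache handler, then drop the references so the MDS can revoke again. A skeletal, non-kernel sketch of that bracketing with stubbed functions:

#include <stdio.h>

/* Stubs: the real functions talk to the MDS and the page cache. */
static int ceph_get_caps(int want, int *got) { *got = want; return 0; }
static void ceph_put_cap_refs(int got)       { (void)got; }
static int filemap_fault_stub(void)          { return 0; /* VM_FAULT_* */ }

static int ceph_filemap_fault_shape(void)
{
        int want = 1 /* stands in for CEPH_CAP_FILE_CACHE */, got = 0, ret;

        ceph_get_caps(want, &got);  /* real code retries on -ERESTARTSYS */
        ret = filemap_fault_stub(); /* the generic fault path */
        ceph_put_cap_refs(got);     /* allow the MDS to revoke again */
        return ret;
}

int main(void)
{
        printf("fault returned %d\n", ceph_filemap_fault_shape());
        return 0;
}
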
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba949408a336..da95f61b7a09 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
67 return fscache_maybe_release_page(ci->fscache, page, gfp); 67 return fscache_maybe_release_page(ci->fscache, page, gfp);
68} 68}
69 69
70static inline void ceph_fscache_readpage_cancel(struct inode *inode,
71 struct page *page)
72{
73 struct ceph_inode_info *ci = ceph_inode(inode);
74 if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
75 __fscache_uncache_page(ci->fscache, page);
76}
77
70static inline void ceph_fscache_readpages_cancel(struct inode *inode, 78static inline void ceph_fscache_readpages_cancel(struct inode *inode,
71 struct list_head *pages) 79 struct list_head *pages)
72{ 80{
@@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
145 return 1; 153 return 1;
146} 154}
147 155
156static inline void ceph_fscache_readpage_cancel(struct inode *inode,
157 struct page *page)
158{
159}
160
148static inline void ceph_fscache_readpages_cancel(struct inode *inode, 161static inline void ceph_fscache_readpages_cancel(struct inode *inode,
149 struct list_head *pages) 162 struct list_head *pages)
150{ 163{
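
ceph_fscache_readpage_cancel() is added in the usual config-gated pair: a functional inline under CONFIG_CEPH_FSCACHE and an empty stub otherwise, so call sites stay free of #ifdefs. The same idiom in a self-contained file (toggle with -DWITH_CACHE):

#include <stdio.h>

#ifdef WITH_CACHE
static inline void readpage_cancel(const char *page)
{
        printf("uncached page %s\n", page);     /* real work */
}
#else
static inline void readpage_cancel(const char *page)
{
        (void)page;                             /* feature compiled out */
}
#endif

int main(void)
{
        readpage_cancel("page0");       /* call site identical either way */
        return 0;
}
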
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3c0a4bd74996..17543383545c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -555,21 +555,34 @@ retry:
555 cap->ci = ci; 555 cap->ci = ci;
556 __insert_cap_node(ci, cap); 556 __insert_cap_node(ci, cap);
557 557
558 /* clear out old exporting info? (i.e. on cap import) */
559 if (ci->i_cap_exporting_mds == mds) {
560 ci->i_cap_exporting_issued = 0;
561 ci->i_cap_exporting_mseq = 0;
562 ci->i_cap_exporting_mds = -1;
563 }
564
565 /* add to session cap list */ 558 /* add to session cap list */
566 cap->session = session; 559 cap->session = session;
567 spin_lock(&session->s_cap_lock); 560 spin_lock(&session->s_cap_lock);
568 list_add_tail(&cap->session_caps, &session->s_caps); 561 list_add_tail(&cap->session_caps, &session->s_caps);
569 session->s_nr_caps++; 562 session->s_nr_caps++;
570 spin_unlock(&session->s_cap_lock); 563 spin_unlock(&session->s_cap_lock);
571 } else if (new_cap) 564 } else {
572 ceph_put_cap(mdsc, new_cap); 565 if (new_cap)
566 ceph_put_cap(mdsc, new_cap);
567
568 /*
569 * auth mds of the inode changed. we received the cap export
570 * message, but still haven't received the cap import message.
571 * handle_cap_export() updated the new auth MDS' cap.
572 *
573 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
 574 * a message that was sent before the cap import message. So
575 * don't remove caps.
576 */
577 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
578 WARN_ON(cap != ci->i_auth_cap);
579 WARN_ON(cap->cap_id != cap_id);
580 seq = cap->seq;
581 mseq = cap->mseq;
582 issued |= cap->issued;
583 flags |= CEPH_CAP_FLAG_AUTH;
584 }
585 }
573 586
574 if (!ci->i_snap_realm) { 587 if (!ci->i_snap_realm) {
575 /* 588 /*
@@ -611,15 +624,9 @@ retry:
611 if (ci->i_auth_cap == NULL || 624 if (ci->i_auth_cap == NULL ||
612 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 625 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
613 ci->i_auth_cap = cap; 626 ci->i_auth_cap = cap;
614 } else if (ci->i_auth_cap == cap) { 627 ci->i_cap_exporting_issued = 0;
615 ci->i_auth_cap = NULL; 628 } else {
616 spin_lock(&mdsc->cap_dirty_lock); 629 WARN_ON(ci->i_auth_cap == cap);
617 if (!list_empty(&ci->i_dirty_item)) {
618 dout(" moving %p to cap_dirty_migrating\n", inode);
619 list_move(&ci->i_dirty_item,
620 &mdsc->cap_dirty_migrating);
621 }
622 spin_unlock(&mdsc->cap_dirty_lock);
623 } 630 }
624 631
625 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 632 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
@@ -628,7 +635,7 @@ retry:
628 cap->cap_id = cap_id; 635 cap->cap_id = cap_id;
629 cap->issued = issued; 636 cap->issued = issued;
630 cap->implemented |= issued; 637 cap->implemented |= issued;
631 if (mseq > cap->mseq) 638 if (ceph_seq_cmp(mseq, cap->mseq) > 0)
632 cap->mds_wanted = wanted; 639 cap->mds_wanted = wanted;
633 else 640 else
634 cap->mds_wanted |= wanted; 641 cap->mds_wanted |= wanted;
@@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
816 823
817 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 824 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
818 cap = rb_entry(p, struct ceph_cap, ci_node); 825 cap = rb_entry(p, struct ceph_cap, ci_node);
819 if (cap != ocap && __cap_is_valid(cap) && 826 if (cap != ocap &&
820 (cap->implemented & ~cap->issued & mask)) 827 (cap->implemented & ~cap->issued & mask))
821 return 1; 828 return 1;
822 } 829 }
@@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
888 */ 895 */
889static int __ceph_is_any_caps(struct ceph_inode_info *ci) 896static int __ceph_is_any_caps(struct ceph_inode_info *ci)
890{ 897{
891 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 898 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
899}
900
901int ceph_is_any_caps(struct inode *inode)
902{
903 struct ceph_inode_info *ci = ceph_inode(inode);
904 int ret;
905
906 spin_lock(&ci->i_ceph_lock);
907 ret = __ceph_is_any_caps(ci);
908 spin_unlock(&ci->i_ceph_lock);
909
910 return ret;
892} 911}
893 912
894/* 913/*
@@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1383 ci->i_snap_realm->cached_context); 1402 ci->i_snap_realm->cached_context);
1384 dout(" inode %p now dirty snapc %p auth cap %p\n", 1403 dout(" inode %p now dirty snapc %p auth cap %p\n",
1385 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1404 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1405 WARN_ON(!ci->i_auth_cap);
1386 BUG_ON(!list_empty(&ci->i_dirty_item)); 1406 BUG_ON(!list_empty(&ci->i_dirty_item));
1387 spin_lock(&mdsc->cap_dirty_lock); 1407 spin_lock(&mdsc->cap_dirty_lock);
1388 if (ci->i_auth_cap) 1408 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1389 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1390 else
1391 list_add(&ci->i_dirty_item,
1392 &mdsc->cap_dirty_migrating);
1393 spin_unlock(&mdsc->cap_dirty_lock); 1409 spin_unlock(&mdsc->cap_dirty_lock);
1394 if (ci->i_flushing_caps == 0) { 1410 if (ci->i_flushing_caps == 0) {
1395 ihold(inode); 1411 ihold(inode);
@@ -1735,13 +1751,12 @@ ack:
1735/* 1751/*
1736 * Try to flush dirty caps back to the auth mds. 1752 * Try to flush dirty caps back to the auth mds.
1737 */ 1753 */
1738static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1754static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
1739 unsigned *flush_tid)
1740{ 1755{
1741 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1756 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1742 struct ceph_inode_info *ci = ceph_inode(inode); 1757 struct ceph_inode_info *ci = ceph_inode(inode);
1743 int unlock_session = session ? 0 : 1;
1744 int flushing = 0; 1758 int flushing = 0;
1759 struct ceph_mds_session *session = NULL;
1745 1760
1746retry: 1761retry:
1747 spin_lock(&ci->i_ceph_lock); 1762 spin_lock(&ci->i_ceph_lock);
@@ -1755,13 +1770,14 @@ retry:
1755 int want = __ceph_caps_wanted(ci); 1770 int want = __ceph_caps_wanted(ci);
1756 int delayed; 1771 int delayed;
1757 1772
1758 if (!session) { 1773 if (!session || session != cap->session) {
1759 spin_unlock(&ci->i_ceph_lock); 1774 spin_unlock(&ci->i_ceph_lock);
1775 if (session)
1776 mutex_unlock(&session->s_mutex);
1760 session = cap->session; 1777 session = cap->session;
1761 mutex_lock(&session->s_mutex); 1778 mutex_lock(&session->s_mutex);
1762 goto retry; 1779 goto retry;
1763 } 1780 }
1764 BUG_ON(session != cap->session);
1765 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1781 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1766 goto out; 1782 goto out;
1767 1783
@@ -1780,7 +1796,7 @@ retry:
1780out: 1796out:
1781 spin_unlock(&ci->i_ceph_lock); 1797 spin_unlock(&ci->i_ceph_lock);
1782out_unlocked: 1798out_unlocked:
1783 if (session && unlock_session) 1799 if (session)
1784 mutex_unlock(&session->s_mutex); 1800 mutex_unlock(&session->s_mutex);
1785 return flushing; 1801 return flushing;
1786} 1802}
@@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
1865 return ret; 1881 return ret;
1866 mutex_lock(&inode->i_mutex); 1882 mutex_lock(&inode->i_mutex);
1867 1883
1868 dirty = try_flush_caps(inode, NULL, &flush_tid); 1884 dirty = try_flush_caps(inode, &flush_tid);
1869 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1885 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1870 1886
1871 /* 1887 /*
@@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1900 1916
1901 dout("write_inode %p wait=%d\n", inode, wait); 1917 dout("write_inode %p wait=%d\n", inode, wait);
1902 if (wait) { 1918 if (wait) {
1903 dirty = try_flush_caps(inode, NULL, &flush_tid); 1919 dirty = try_flush_caps(inode, &flush_tid);
1904 if (dirty) 1920 if (dirty)
1905 err = wait_event_interruptible(ci->i_cap_wq, 1921 err = wait_event_interruptible(ci->i_cap_wq,
1906 caps_are_flushed(inode, flush_tid)); 1922 caps_are_flushed(inode, flush_tid));
@@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode)
2350 d_prune_aliases(inode); 2366 d_prune_aliases(inode);
2351 /* 2367 /*
2352 * For non-directory inode, d_find_alias() only returns 2368 * For non-directory inode, d_find_alias() only returns
2353 * connected dentry. After calling d_invalidate(), the 2369 * hashed dentry. After calling d_invalidate(), the
2354 * dentry become disconnected. 2370 * dentry becomes unhashed.
2355 * 2371 *
2356 * For directory inode, d_find_alias() can return 2372 * For directory inode, d_find_alias() can return
2357 * disconnected dentry. But directory inode should have 2373 * unhashed dentry. But directory inode should have
2358 * one alias at most. 2374 * one alias at most.
2359 */ 2375 */
2360 while ((dn = d_find_alias(inode))) { 2376 while ((dn = d_find_alias(inode))) {
@@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2408 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2424 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2409 inode->i_size); 2425 inode->i_size);
2410 2426
2427
2428 /*
2429 * auth mds of the inode changed. we received the cap export message,
2430 * but still haven't received the cap import message. handle_cap_export
2431 * updated the new auth MDS' cap.
2432 *
2433 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
2434 * that was sent before the cap import message. So don't remove caps.
2435 */
2436 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
2437 WARN_ON(cap != ci->i_auth_cap);
2438 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
2439 seq = cap->seq;
2440 newcaps |= cap->issued;
2441 }
2442
2411 /* 2443 /*
2412 * If CACHE is being revoked, and we have no dirty buffers, 2444 * If CACHE is being revoked, and we have no dirty buffers,
2413 * try to invalidate (once). (If there are dirty buffers, we 2445 * try to invalidate (once). (If there are dirty buffers, we
@@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2434 issued |= implemented | __ceph_caps_dirty(ci); 2466 issued |= implemented | __ceph_caps_dirty(ci);
2435 2467
2436 cap->cap_gen = session->s_cap_gen; 2468 cap->cap_gen = session->s_cap_gen;
2469 cap->seq = seq;
2437 2470
2438 __check_cap_issue(ci, cap, newcaps); 2471 __check_cap_issue(ci, cap, newcaps);
2439 2472
@@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2464 ceph_buffer_put(ci->i_xattrs.blob); 2497 ceph_buffer_put(ci->i_xattrs.blob);
2465 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2498 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2466 ci->i_xattrs.version = version; 2499 ci->i_xattrs.version = version;
2500 ceph_forget_all_cached_acls(inode);
2467 } 2501 }
2468 } 2502 }
2469 2503
@@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2483 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2517 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2484 &atime); 2518 &atime);
2485 2519
2520
2521 /* file layout may have changed */
2522 ci->i_layout = grant->layout;
2523
2486 /* max size increase? */ 2524 /* max size increase? */
2487 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2525 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2488 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2526 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
@@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2511 check_caps = 1; 2549 check_caps = 1;
2512 } 2550 }
2513 2551
2514 cap->seq = seq;
2515
2516 /* file layout may have changed */
2517 ci->i_layout = grant->layout;
2518
2519 /* revocation, grant, or no-op? */ 2552 /* revocation, grant, or no-op? */
2520 if (cap->issued & ~newcaps) { 2553 if (cap->issued & ~newcaps) {
2521 int revoking = cap->issued & ~newcaps; 2554 int revoking = cap->issued & ~newcaps;
@@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode,
2741 * caller holds s_mutex 2774 * caller holds s_mutex
2742 */ 2775 */
2743static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2776static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2744 struct ceph_mds_session *session, 2777 struct ceph_mds_cap_peer *ph,
2745 int *open_target_sessions) 2778 struct ceph_mds_session *session)
2746{ 2779{
2747 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2780 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2781 struct ceph_mds_session *tsession = NULL;
2782 struct ceph_cap *cap, *tcap;
2748 struct ceph_inode_info *ci = ceph_inode(inode); 2783 struct ceph_inode_info *ci = ceph_inode(inode);
2749 int mds = session->s_mds; 2784 u64 t_cap_id;
2750 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2785 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2751 struct ceph_cap *cap = NULL, *t; 2786 unsigned t_seq, t_mseq;
2752 struct rb_node *p; 2787 int target, issued;
2753 int remember = 1; 2788 int mds = session->s_mds;
2754 2789
2755 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2790 if (ph) {
2756 inode, ci, mds, mseq); 2791 t_cap_id = le64_to_cpu(ph->cap_id);
2792 t_seq = le32_to_cpu(ph->seq);
2793 t_mseq = le32_to_cpu(ph->mseq);
2794 target = le32_to_cpu(ph->mds);
2795 } else {
2796 t_cap_id = t_seq = t_mseq = 0;
2797 target = -1;
2798 }
2757 2799
2800 dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
2801 inode, ci, mds, mseq, target);
2802retry:
2758 spin_lock(&ci->i_ceph_lock); 2803 spin_lock(&ci->i_ceph_lock);
2804 cap = __get_cap_for_mds(ci, mds);
2805 if (!cap)
2806 goto out_unlock;
2759 2807
2760 /* make sure we haven't seen a higher mseq */ 2808 if (target < 0) {
2761 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2809 __ceph_remove_cap(cap, false);
2762 t = rb_entry(p, struct ceph_cap, ci_node); 2810 goto out_unlock;
2763 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2764 dout(" higher mseq on cap from mds%d\n",
2765 t->session->s_mds);
2766 remember = 0;
2767 }
2768 if (t->session->s_mds == mds)
2769 cap = t;
2770 } 2811 }
2771 2812
2772 if (cap) { 2813 /*
2773 if (remember) { 2814 * now we know we haven't received the cap import message yet
2774 /* make note */ 2815 * because the exported cap still exist.
2775 ci->i_cap_exporting_mds = mds; 2816 */
2776 ci->i_cap_exporting_mseq = mseq;
2777 ci->i_cap_exporting_issued = cap->issued;
2778
2779 /*
2780 * make sure we have open sessions with all possible
2781 * export targets, so that we get the matching IMPORT
2782 */
2783 *open_target_sessions = 1;
2784 2817
2785 /* 2818 issued = cap->issued;
2786 * we can't flush dirty caps that we've seen the 2819 WARN_ON(issued != cap->implemented);
2787 * EXPORT but no IMPORT for 2820
2788 */ 2821 tcap = __get_cap_for_mds(ci, target);
2789 spin_lock(&mdsc->cap_dirty_lock); 2822 if (tcap) {
2790 if (!list_empty(&ci->i_dirty_item)) { 2823 /* already have caps from the target */
2791 dout(" moving %p to cap_dirty_migrating\n", 2824 if (tcap->cap_id != t_cap_id ||
2792 inode); 2825 ceph_seq_cmp(tcap->seq, t_seq) < 0) {
2793 list_move(&ci->i_dirty_item, 2826 dout(" updating import cap %p mds%d\n", tcap, target);
2794 &mdsc->cap_dirty_migrating); 2827 tcap->cap_id = t_cap_id;
2828 tcap->seq = t_seq - 1;
2829 tcap->issue_seq = t_seq - 1;
2830 tcap->mseq = t_mseq;
2831 tcap->issued |= issued;
2832 tcap->implemented |= issued;
2833 if (cap == ci->i_auth_cap)
2834 ci->i_auth_cap = tcap;
2835 if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
2836 spin_lock(&mdsc->cap_dirty_lock);
2837 list_move_tail(&ci->i_flushing_item,
2838 &tcap->session->s_cap_flushing);
2839 spin_unlock(&mdsc->cap_dirty_lock);
2795 } 2840 }
2796 spin_unlock(&mdsc->cap_dirty_lock);
2797 } 2841 }
2798 __ceph_remove_cap(cap, false); 2842 __ceph_remove_cap(cap, false);
2843 goto out_unlock;
2799 } 2844 }
2800 /* else, we already released it */
2801 2845
2846 if (tsession) {
2847 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
2848 spin_unlock(&ci->i_ceph_lock);
2849 /* add placeholder for the export target */
2850 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
2851 t_seq - 1, t_mseq, (u64)-1, flag, NULL);
2852 goto retry;
2853 }
2854
2855 spin_unlock(&ci->i_ceph_lock);
2856 mutex_unlock(&session->s_mutex);
2857
2858 /* open target session */
2859 tsession = ceph_mdsc_open_export_target_session(mdsc, target);
2860 if (!IS_ERR(tsession)) {
2861 if (mds > target) {
2862 mutex_lock(&session->s_mutex);
2863 mutex_lock_nested(&tsession->s_mutex,
2864 SINGLE_DEPTH_NESTING);
2865 } else {
2866 mutex_lock(&tsession->s_mutex);
2867 mutex_lock_nested(&session->s_mutex,
2868 SINGLE_DEPTH_NESTING);
2869 }
2870 ceph_add_cap_releases(mdsc, tsession);
2871 } else {
2872 WARN_ON(1);
2873 tsession = NULL;
2874 target = -1;
2875 }
2876 goto retry;
2877
2878out_unlock:
2802 spin_unlock(&ci->i_ceph_lock); 2879 spin_unlock(&ci->i_ceph_lock);
2880 mutex_unlock(&session->s_mutex);
2881 if (tsession) {
2882 mutex_unlock(&tsession->s_mutex);
2883 ceph_put_mds_session(tsession);
2884 }
2803} 2885}
2804 2886
2805/* 2887/*
@@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2810 */ 2892 */
2811static void handle_cap_import(struct ceph_mds_client *mdsc, 2893static void handle_cap_import(struct ceph_mds_client *mdsc,
2812 struct inode *inode, struct ceph_mds_caps *im, 2894 struct inode *inode, struct ceph_mds_caps *im,
2895 struct ceph_mds_cap_peer *ph,
2813 struct ceph_mds_session *session, 2896 struct ceph_mds_session *session,
2814 void *snaptrace, int snaptrace_len) 2897 void *snaptrace, int snaptrace_len)
2815{ 2898{
2816 struct ceph_inode_info *ci = ceph_inode(inode); 2899 struct ceph_inode_info *ci = ceph_inode(inode);
2900 struct ceph_cap *cap;
2817 int mds = session->s_mds; 2901 int mds = session->s_mds;
2818 unsigned issued = le32_to_cpu(im->caps); 2902 unsigned issued = le32_to_cpu(im->caps);
2819 unsigned wanted = le32_to_cpu(im->wanted); 2903 unsigned wanted = le32_to_cpu(im->wanted);
@@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2821 unsigned mseq = le32_to_cpu(im->migrate_seq); 2905 unsigned mseq = le32_to_cpu(im->migrate_seq);
2822 u64 realmino = le64_to_cpu(im->realm); 2906 u64 realmino = le64_to_cpu(im->realm);
2823 u64 cap_id = le64_to_cpu(im->cap_id); 2907 u64 cap_id = le64_to_cpu(im->cap_id);
2908 u64 p_cap_id;
2909 int peer;
2824 2910
2825 if (ci->i_cap_exporting_mds >= 0 && 2911 if (ph) {
2826 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2912 p_cap_id = le64_to_cpu(ph->cap_id);
2827 dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2913 peer = le32_to_cpu(ph->mds);
2828 " - cleared exporting from mds%d\n", 2914 } else {
2829 inode, ci, mds, mseq, 2915 p_cap_id = 0;
2830 ci->i_cap_exporting_mds); 2916 peer = -1;
2831 ci->i_cap_exporting_issued = 0; 2917 }
2832 ci->i_cap_exporting_mseq = 0;
2833 ci->i_cap_exporting_mds = -1;
2834 2918
2835 spin_lock(&mdsc->cap_dirty_lock); 2919 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
2836 if (!list_empty(&ci->i_dirty_item)) { 2920 inode, ci, mds, mseq, peer);
2837 dout(" moving %p back to cap_dirty\n", inode); 2921
2838 list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2922 spin_lock(&ci->i_ceph_lock);
2923 cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
2924 if (cap && cap->cap_id == p_cap_id) {
2925 dout(" remove export cap %p mds%d flags %d\n",
2926 cap, peer, ph->flags);
2927 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
2928 (cap->seq != le32_to_cpu(ph->seq) ||
2929 cap->mseq != le32_to_cpu(ph->mseq))) {
2930 pr_err("handle_cap_import: mismatched seq/mseq: "
2931 "ino (%llx.%llx) mds%d seq %d mseq %d "
2932 "importer mds%d has peer seq %d mseq %d\n",
2933 ceph_vinop(inode), peer, cap->seq,
2934 cap->mseq, mds, le32_to_cpu(ph->seq),
2935 le32_to_cpu(ph->mseq));
2839 } 2936 }
2840 spin_unlock(&mdsc->cap_dirty_lock); 2937 ci->i_cap_exporting_issued = cap->issued;
2841 } else { 2938 __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
2842 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2843 inode, ci, mds, mseq);
2844 } 2939 }
2845 2940
2941 /* make sure we re-request max_size, if necessary */
2942 ci->i_wanted_max_size = 0;
2943 ci->i_requested_max_size = 0;
2944 spin_unlock(&ci->i_ceph_lock);
2945
2846 down_write(&mdsc->snap_rwsem); 2946 down_write(&mdsc->snap_rwsem);
2847 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, 2947 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2848 false); 2948 false);
@@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2853 kick_flushing_inode_caps(mdsc, session, inode); 2953 kick_flushing_inode_caps(mdsc, session, inode);
2854 up_read(&mdsc->snap_rwsem); 2954 up_read(&mdsc->snap_rwsem);
2855 2955
2856 /* make sure we re-request max_size, if necessary */
2857 spin_lock(&ci->i_ceph_lock);
2858 ci->i_wanted_max_size = 0; /* reset */
2859 ci->i_requested_max_size = 0;
2860 spin_unlock(&ci->i_ceph_lock);
2861} 2956}
2862 2957
2863/* 2958/*
@@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2875 struct ceph_inode_info *ci; 2970 struct ceph_inode_info *ci;
2876 struct ceph_cap *cap; 2971 struct ceph_cap *cap;
2877 struct ceph_mds_caps *h; 2972 struct ceph_mds_caps *h;
2973 struct ceph_mds_cap_peer *peer = NULL;
2878 int mds = session->s_mds; 2974 int mds = session->s_mds;
2879 int op; 2975 int op;
2880 u32 seq, mseq; 2976 u32 seq, mseq;
@@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2885 void *snaptrace; 2981 void *snaptrace;
2886 size_t snaptrace_len; 2982 size_t snaptrace_len;
2887 void *flock; 2983 void *flock;
2984 void *end;
2888 u32 flock_len; 2985 u32 flock_len;
2889 int open_target_sessions = 0;
2890 2986
2891 dout("handle_caps from mds%d\n", mds); 2987 dout("handle_caps from mds%d\n", mds);
2892 2988
2893 /* decode */ 2989 /* decode */
2990 end = msg->front.iov_base + msg->front.iov_len;
2894 tid = le64_to_cpu(msg->hdr.tid); 2991 tid = le64_to_cpu(msg->hdr.tid);
2895 if (msg->front.iov_len < sizeof(*h)) 2992 if (msg->front.iov_len < sizeof(*h))
2896 goto bad; 2993 goto bad;
@@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2908 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3005 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2909 3006
2910 if (le16_to_cpu(msg->hdr.version) >= 2) { 3007 if (le16_to_cpu(msg->hdr.version) >= 2) {
2911 void *p, *end; 3008 void *p = snaptrace + snaptrace_len;
2912
2913 p = snaptrace + snaptrace_len;
2914 end = msg->front.iov_base + msg->front.iov_len;
2915 ceph_decode_32_safe(&p, end, flock_len, bad); 3009 ceph_decode_32_safe(&p, end, flock_len, bad);
3010 if (p + flock_len > end)
3011 goto bad;
2916 flock = p; 3012 flock = p;
2917 } else { 3013 } else {
2918 flock = NULL; 3014 flock = NULL;
2919 flock_len = 0; 3015 flock_len = 0;
2920 } 3016 }
2921 3017
3018 if (le16_to_cpu(msg->hdr.version) >= 3) {
3019 if (op == CEPH_CAP_OP_IMPORT) {
3020 void *p = flock + flock_len;
3021 if (p + sizeof(*peer) > end)
3022 goto bad;
3023 peer = p;
3024 } else if (op == CEPH_CAP_OP_EXPORT) {
3025 /* recorded in unused fields */
3026 peer = (void *)&h->size;
3027 }
3028 }
3029
2922 mutex_lock(&session->s_mutex); 3030 mutex_lock(&session->s_mutex);
2923 session->s_seq++; 3031 session->s_seq++;
2924 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 3032 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2951 goto done; 3059 goto done;
2952 3060
2953 case CEPH_CAP_OP_EXPORT: 3061 case CEPH_CAP_OP_EXPORT:
2954 handle_cap_export(inode, h, session, &open_target_sessions); 3062 handle_cap_export(inode, h, peer, session);
2955 goto done; 3063 goto done_unlocked;
2956 3064
2957 case CEPH_CAP_OP_IMPORT: 3065 case CEPH_CAP_OP_IMPORT:
2958 handle_cap_import(mdsc, inode, h, session, 3066 handle_cap_import(mdsc, inode, h, peer, session,
2959 snaptrace, snaptrace_len); 3067 snaptrace, snaptrace_len);
2960 } 3068 }
2961 3069
@@ -3007,8 +3115,6 @@ done:
3007done_unlocked: 3115done_unlocked:
3008 if (inode) 3116 if (inode)
3009 iput(inode); 3117 iput(inode);
3010 if (open_target_sessions)
3011 ceph_mdsc_open_export_target_sessions(mdsc, session);
3012 return; 3118 return;
3013 3119
3014bad: 3120bad:
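
The caps.c message parsing now validates every wire-supplied length against the buffer end (the ceph_decode_32_safe call plus the new "p + flock_len > end" and peer-struct checks) before dereferencing anything. A freestanding version of that defensive decode pattern, assuming a little-endian host for the memcpy read:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Read a u32 only if it fits before 'end'. */
static int decode_u32(const uint8_t **p, const uint8_t *end, uint32_t *v)
{
        if ((size_t)(end - *p) < sizeof(*v))
                return -1;                      /* truncated message */
        memcpy(v, *p, sizeof(*v));
        *p += sizeof(*v);
        return 0;
}

int main(void)
{
        uint8_t msg[8] = { 4, 0, 0, 0, 'd', 'a', 't', 'a' };
        const uint8_t *p = msg, *end = msg + sizeof(msg);
        uint32_t len;

        /* length itself, then the payload it claims, must both fit */
        if (decode_u32(&p, end, &len) || len > (size_t)(end - p)) {
                fprintf(stderr, "bad message\n");
                return 1;
        }
        printf("payload of %u bytes: %.*s\n", len, (int)len, p);
        return 0;
}
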
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2a0bcaeb189a..45eda6d7a40c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p)
100 return p & 0xffffffff; 100 return p & 0xffffffff;
101} 101}
102 102
103static int fpos_cmp(loff_t l, loff_t r)
104{
105 int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
106 if (v)
107 return v;
108 return (int)(fpos_off(l) - fpos_off(r));
109}
110
103/* 111/*
104 * When possible, we try to satisfy a readdir by peeking at the 112 * When possible, we try to satisfy a readdir by peeking at the
105 * dcache. We make this work by carefully ordering dentries on 113 * dcache. We make this work by carefully ordering dentries on
@@ -156,7 +164,7 @@ more:
156 if (!d_unhashed(dentry) && dentry->d_inode && 164 if (!d_unhashed(dentry) && dentry->d_inode &&
157 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 165 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
158 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 166 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
159 ctx->pos <= di->offset) 167 fpos_cmp(ctx->pos, di->offset) <= 0)
160 break; 168 break;
161 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 169 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
162 dentry->d_name.len, dentry->d_name.name, di->offset, 170 dentry->d_name.len, dentry->d_name.name, di->offset,
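
fpos_cmp() introduced above fixes the dcache-readdir ordering test: a readdir position packs a directory fragment in the high 32 bits and an offset in the low 32, and positions must be compared fragment-first rather than as raw 64-bit values. A simplified stand-alone version, where a plain integer compare stands in for ceph_frag_compare():

#include <stdint.h>
#include <stdio.h>

static uint32_t fpos_frag(int64_t p) { return (uint32_t)(p >> 32); }
static uint32_t fpos_off(int64_t p)  { return (uint32_t)(p & 0xffffffff); }

static int fpos_cmp(int64_t l, int64_t r)
{
        if (fpos_frag(l) != fpos_frag(r))       /* fragment first */
                return fpos_frag(l) < fpos_frag(r) ? -1 : 1;
        return (int)(fpos_off(l) - fpos_off(r)); /* then offset */
}

int main(void)
{
        int64_t a = ((int64_t)1 << 32) | 5;     /* frag 1, offset 5 */
        int64_t b = ((int64_t)2 << 32) | 0;     /* frag 2, offset 0 */

        printf("cmp(a, b) = %d\n", fpos_cmp(a, b));
        return 0;
}
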
@@ -693,7 +701,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
693 if (!err && !req->r_reply_info.head->is_dentry) 701 if (!err && !req->r_reply_info.head->is_dentry)
694 err = ceph_handle_notrace_create(dir, dentry); 702 err = ceph_handle_notrace_create(dir, dentry);
695 ceph_mdsc_put_request(req); 703 ceph_mdsc_put_request(req);
696 if (err) 704
705 if (!err)
706 ceph_init_acl(dentry, dentry->d_inode, dir);
707 else
697 d_drop(dentry); 708 d_drop(dentry);
698 return err; 709 return err;
699} 710}
@@ -731,7 +742,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
731 if (!err && !req->r_reply_info.head->is_dentry) 742 if (!err && !req->r_reply_info.head->is_dentry)
732 err = ceph_handle_notrace_create(dir, dentry); 743 err = ceph_handle_notrace_create(dir, dentry);
733 ceph_mdsc_put_request(req); 744 ceph_mdsc_put_request(req);
734 if (err) 745 if (!err)
746 ceph_init_acl(dentry, dentry->d_inode, dir);
747 else
735 d_drop(dentry); 748 d_drop(dentry);
736 return err; 749 return err;
737} 750}
@@ -772,7 +785,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
772 err = ceph_handle_notrace_create(dir, dentry); 785 err = ceph_handle_notrace_create(dir, dentry);
773 ceph_mdsc_put_request(req); 786 ceph_mdsc_put_request(req);
774out: 787out:
775 if (err < 0) 788 if (!err)
789 ceph_init_acl(dentry, dentry->d_inode, dir);
790 else
776 d_drop(dentry); 791 d_drop(dentry);
777 return err; 792 return err;
778} 793}
@@ -1037,14 +1052,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1037 valid = 1; 1052 valid = 1;
1038 } else if (dentry_lease_is_valid(dentry) || 1053 } else if (dentry_lease_is_valid(dentry) ||
1039 dir_lease_is_valid(dir, dentry)) { 1054 dir_lease_is_valid(dir, dentry)) {
1040 valid = 1; 1055 if (dentry->d_inode)
1056 valid = ceph_is_any_caps(dentry->d_inode);
1057 else
1058 valid = 1;
1041 } 1059 }
1042 1060
1043 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1061 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1044 if (valid) 1062 if (valid) {
1045 ceph_dentry_lru_touch(dentry); 1063 ceph_dentry_lru_touch(dentry);
1046 else 1064 } else {
1065 ceph_dir_clear_complete(dir);
1047 d_drop(dentry); 1066 d_drop(dentry);
1067 }
1048 iput(dir); 1068 iput(dir);
1049 return valid; 1069 return valid;
1050} 1070}
@@ -1293,6 +1313,8 @@ const struct inode_operations ceph_dir_iops = {
1293 .getxattr = ceph_getxattr, 1313 .getxattr = ceph_getxattr,
1294 .listxattr = ceph_listxattr, 1314 .listxattr = ceph_listxattr,
1295 .removexattr = ceph_removexattr, 1315 .removexattr = ceph_removexattr,
1316 .get_acl = ceph_get_acl,
1317 .set_acl = ceph_set_acl,
1296 .mknod = ceph_mknod, 1318 .mknod = ceph_mknod,
1297 .symlink = ceph_symlink, 1319 .symlink = ceph_symlink,
1298 .mkdir = ceph_mkdir, 1320 .mkdir = ceph_mkdir,
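
With .get_acl/.set_acl wired into the inode operations and MS_POSIXACL set on the superblock (see the super.c hunks further down), the VFS consults ceph_get_acl() during permission checks. A hedged sketch of its likely shape, assuming ACLs live in the system.posix_acl_* xattrs exposed through __ceph_getxattr() (declared later in this diff); the real code would also use the generic ACL cache:

    struct posix_acl *ceph_get_acl(struct inode *inode, int type)
    {
        const char *name = (type == ACL_TYPE_ACCESS) ?
                POSIX_ACL_XATTR_ACCESS : POSIX_ACL_XATTR_DEFAULT;
        struct posix_acl *acl;
        char *value = NULL;
        ssize_t size;

        size = __ceph_getxattr(inode, name, "", 0);   /* probe for length */
        if (size > 0) {
            value = kzalloc(size, GFP_NOFS);
            if (!value)
                return ERR_PTR(-ENOMEM);
            size = __ceph_getxattr(inode, name, value, size);
        }
        if (size > 0)
            acl = posix_acl_from_xattr(&init_user_ns, value, size);
        else if (size == -ENODATA || size == 0)
            acl = NULL;                               /* no ACL set */
        else
            acl = ERR_PTR(size);
        kfree(value);
        return acl;
    }
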
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3de89829e2a1..09c7afe32e49 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -286,6 +286,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
286 } else { 286 } else {
287 dout("atomic_open finish_open on dn %p\n", dn); 287 dout("atomic_open finish_open on dn %p\n", dn);
288 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 288 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
289 ceph_init_acl(dentry, dentry->d_inode, dir);
289 *opened |= FILE_CREATED; 290 *opened |= FILE_CREATED;
290 } 291 }
291 err = finish_open(file, dentry, ceph_open, opened); 292 err = finish_open(file, dentry, ceph_open, opened);
@@ -408,51 +409,92 @@ more:
408 * 409 *
409 * If the read spans object boundary, just do multiple reads. 410 * If the read spans object boundary, just do multiple reads.
410 */ 411 */
411static ssize_t ceph_sync_read(struct file *file, char __user *data, 412static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
412 unsigned len, loff_t *poff, int *checkeof) 413 int *checkeof)
413{ 414{
415 struct file *file = iocb->ki_filp;
414 struct inode *inode = file_inode(file); 416 struct inode *inode = file_inode(file);
415 struct page **pages; 417 struct page **pages;
416 u64 off = *poff; 418 u64 off = iocb->ki_pos;
417 int num_pages, ret; 419 int num_pages, ret;
420 size_t len = i->count;
418 421
419 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 422 dout("sync_read on file %p %llu~%u %s\n", file, off,
423 (unsigned)len,
420 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 424 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
421
422 if (file->f_flags & O_DIRECT) {
423 num_pages = calc_pages_for((unsigned long)data, len);
424 pages = ceph_get_direct_page_vector(data, num_pages, true);
425 } else {
426 num_pages = calc_pages_for(off, len);
427 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
428 }
429 if (IS_ERR(pages))
430 return PTR_ERR(pages);
431
432 /* 425 /*
433 * flush any page cache pages in this range. this 426 * flush any page cache pages in this range. this
434 * will make concurrent normal and sync io slow, 427 * will make concurrent normal and sync io slow,
435 * but it will at least behave sensibly when they are 428 * but it will at least behave sensibly when they are
436 * in sequence. 429 * in sequence.
437 */ 430 */
438 ret = filemap_write_and_wait(inode->i_mapping); 431 ret = filemap_write_and_wait_range(inode->i_mapping, off,
432 off + len);
439 if (ret < 0) 433 if (ret < 0)
440 goto done; 434 return ret;
441 435
442 ret = striped_read(inode, off, len, pages, num_pages, checkeof, 436 if (file->f_flags & O_DIRECT) {
443 file->f_flags & O_DIRECT, 437 while (iov_iter_count(i)) {
444 (unsigned long)data & ~PAGE_MASK); 438 void __user *data = i->iov[0].iov_base + i->iov_offset;
439 size_t len = i->iov[0].iov_len - i->iov_offset;
440
441 num_pages = calc_pages_for((unsigned long)data, len);
442 pages = ceph_get_direct_page_vector(data,
443 num_pages, true);
444 if (IS_ERR(pages))
445 return PTR_ERR(pages);
446
447 ret = striped_read(inode, off, len,
448 pages, num_pages, checkeof,
449 1, (unsigned long)data & ~PAGE_MASK);
450 ceph_put_page_vector(pages, num_pages, true);
451
452 if (ret <= 0)
453 break;
454 off += ret;
455 iov_iter_advance(i, ret);
456 if (ret < len)
457 break;
458 }
459 } else {
460 num_pages = calc_pages_for(off, len);
461 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
462 if (IS_ERR(pages))
463 return PTR_ERR(pages);
464 ret = striped_read(inode, off, len, pages,
465 num_pages, checkeof, 0, 0);
466 if (ret > 0) {
467 int l, k = 0;
468 size_t left = len = ret;
469
470 while (left) {
471 void __user *data = i->iov[0].iov_base
472 + i->iov_offset;
473 l = min(i->iov[0].iov_len - i->iov_offset,
474 left);
475
476 ret = ceph_copy_page_vector_to_user(&pages[k],
477 data, off,
478 l);
479 if (ret > 0) {
480 iov_iter_advance(i, ret);
481 left -= ret;
482 off += ret;
483 k = calc_pages_for(iocb->ki_pos,
484 len - left + 1) - 1;
485 BUG_ON(k >= num_pages && left);
486 } else
487 break;
488 }
489 }
490 ceph_release_page_vector(pages, num_pages);
491 }
445 492
446 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 493 if (off > iocb->ki_pos) {
447 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 494 ret = off - iocb->ki_pos;
448 if (ret >= 0) 495 iocb->ki_pos = off;
449 *poff = off + ret; 496 }
450 497
451done:
452 if (file->f_flags & O_DIRECT)
453 ceph_put_page_vector(pages, num_pages, true);
454 else
455 ceph_release_page_vector(pages, num_pages);
456 dout("sync_read result %d\n", ret); 498 dout("sync_read result %d\n", ret);
457 return ret; 499 return ret;
458} 500}
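
The rewritten ceph_sync_read() is the first of several conversions in this diff from a single (buf, len, *ppos) user pointer to the kiocb/iov_iter pair, so scatter-gather reads and segment-at-a-time O_DIRECT I/O fall out of one loop shape; the buffered branch additionally recomputes the source page index k from how many bytes have been copied so far. The 3.14-era iterator idiom pokes at the iov/iov_offset fields directly (later kernels hide them behind helpers); do_one_segment below is a hypothetical stand-in:

    struct iov_iter i;

    iov_iter_init(&i, iov, nr_segs, count, 0);  /* 5-arg form of this era */
    while (iov_iter_count(&i) > 0) {
        void __user *data = i.iov->iov_base + i.iov_offset;
        size_t seg = i.iov->iov_len - i.iov_offset;
        ssize_t done = do_one_segment(data, seg);  /* hypothetical helper */

        if (done <= 0)
            break;
        iov_iter_advance(&i, done);  /* may step into the next iovec */
        if (done < seg)
            break;                   /* short I/O: stop, as the read loop does */
    }
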
@@ -489,83 +531,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
489 } 531 }
490} 532}
491 533
534
492/* 535/*
493 * Synchronous write, straight from __user pointer or user pages (if 536 * Synchronous write, straight from __user pointer or user pages.
494 * O_DIRECT).
495 * 537 *
496 * If write spans object boundary, just do multiple writes. (For a 538 * If write spans object boundary, just do multiple writes. (For a
497 * correct atomic write, we should e.g. take write locks on all 539 * correct atomic write, we should e.g. take write locks on all
498 * objects, rollback on failure, etc.) 540 * objects, rollback on failure, etc.)
499 */ 541 */
500static ssize_t ceph_sync_write(struct file *file, const char __user *data, 542static ssize_t
501 size_t left, loff_t pos, loff_t *ppos) 543ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
544 unsigned long nr_segs, size_t count)
502{ 545{
546 struct file *file = iocb->ki_filp;
503 struct inode *inode = file_inode(file); 547 struct inode *inode = file_inode(file);
504 struct ceph_inode_info *ci = ceph_inode(inode); 548 struct ceph_inode_info *ci = ceph_inode(inode);
505 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 549 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
506 struct ceph_snap_context *snapc; 550 struct ceph_snap_context *snapc;
507 struct ceph_vino vino; 551 struct ceph_vino vino;
508 struct ceph_osd_request *req; 552 struct ceph_osd_request *req;
509 int num_ops = 1;
510 struct page **pages; 553 struct page **pages;
511 int num_pages; 554 int num_pages;
512 u64 len;
513 int written = 0; 555 int written = 0;
514 int flags; 556 int flags;
515 int check_caps = 0; 557 int check_caps = 0;
516 int page_align, io_align; 558 int page_align;
517 unsigned long buf_align;
518 int ret; 559 int ret;
519 struct timespec mtime = CURRENT_TIME; 560 struct timespec mtime = CURRENT_TIME;
520 bool own_pages = false; 561 loff_t pos = iocb->ki_pos;
562 struct iov_iter i;
521 563
522 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 564 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
523 return -EROFS; 565 return -EROFS;
524 566
525 dout("sync_write on file %p %lld~%u %s\n", file, pos, 567 dout("sync_direct_write on file %p %lld~%u\n", file, pos,
526 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 568 (unsigned)count);
527 569
528 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 570 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
529 if (ret < 0) 571 if (ret < 0)
530 return ret; 572 return ret;
531 573
532 ret = invalidate_inode_pages2_range(inode->i_mapping, 574 ret = invalidate_inode_pages2_range(inode->i_mapping,
533 pos >> PAGE_CACHE_SHIFT, 575 pos >> PAGE_CACHE_SHIFT,
534 (pos + left) >> PAGE_CACHE_SHIFT); 576 (pos + count) >> PAGE_CACHE_SHIFT);
535 if (ret < 0) 577 if (ret < 0)
536 dout("invalidate_inode_pages2_range returned %d\n", ret); 578 dout("invalidate_inode_pages2_range returned %d\n", ret);
537 579
538 flags = CEPH_OSD_FLAG_ORDERSNAP | 580 flags = CEPH_OSD_FLAG_ORDERSNAP |
539 CEPH_OSD_FLAG_ONDISK | 581 CEPH_OSD_FLAG_ONDISK |
540 CEPH_OSD_FLAG_WRITE; 582 CEPH_OSD_FLAG_WRITE;
541 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
542 flags |= CEPH_OSD_FLAG_ACK;
543 else
544 num_ops++; /* Also include a 'startsync' command. */
545 583
546 /* 584 iov_iter_init(&i, iov, nr_segs, count, 0);
547 * we may need to do multiple writes here if we span an object 585
548 * boundary. this isn't atomic, unfortunately. :( 586 while (iov_iter_count(&i) > 0) {
549 */ 587 void __user *data = i.iov->iov_base + i.iov_offset;
550more: 588 u64 len = i.iov->iov_len - i.iov_offset;
551 io_align = pos & ~PAGE_MASK; 589
552 buf_align = (unsigned long)data & ~PAGE_MASK; 590 page_align = (unsigned long)data & ~PAGE_MASK;
553 len = left; 591
554 592 snapc = ci->i_snap_realm->cached_context;
555 snapc = ci->i_snap_realm->cached_context; 593 vino = ceph_vino(inode);
556 vino = ceph_vino(inode); 594 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
557 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 595 vino, pos, &len,
 558	 vino, pos, &len, num_ops, 595 vino, pos, &len,

559 CEPH_OSD_OP_WRITE, flags, snapc, 597 CEPH_OSD_OP_WRITE, flags, snapc,
560 ci->i_truncate_seq, ci->i_truncate_size, 598 ci->i_truncate_seq,
561 false); 599 ci->i_truncate_size,
562 if (IS_ERR(req)) 600 false);
563 return PTR_ERR(req); 601 if (IS_ERR(req)) {
602 ret = PTR_ERR(req);
603 goto out;
604 }
564 605
565 /* write from beginning of first page, regardless of io alignment */ 606 num_pages = calc_pages_for(page_align, len);
566 page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
567 num_pages = calc_pages_for(page_align, len);
568 if (file->f_flags & O_DIRECT) {
569 pages = ceph_get_direct_page_vector(data, num_pages, false); 607 pages = ceph_get_direct_page_vector(data, num_pages, false);
570 if (IS_ERR(pages)) { 608 if (IS_ERR(pages)) {
571 ret = PTR_ERR(pages); 609 ret = PTR_ERR(pages);
@@ -577,60 +615,175 @@ more:
577 * may block. 615 * may block.
578 */ 616 */
579 truncate_inode_pages_range(inode->i_mapping, pos, 617 truncate_inode_pages_range(inode->i_mapping, pos,
580 (pos+len) | (PAGE_CACHE_SIZE-1)); 618 (pos+len) | (PAGE_CACHE_SIZE-1));
581 } else { 619 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
620 false, false);
621
622 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
623 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
624
625 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
626 if (!ret)
627 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
628
629 ceph_put_page_vector(pages, num_pages, false);
630
631out:
632 ceph_osdc_put_request(req);
633 if (ret == 0) {
634 pos += len;
635 written += len;
636 iov_iter_advance(&i, (size_t)len);
637
638 if (pos > i_size_read(inode)) {
639 check_caps = ceph_inode_set_size(inode, pos);
640 if (check_caps)
641 ceph_check_caps(ceph_inode(inode),
642 CHECK_CAPS_AUTHONLY,
643 NULL);
644 }
645 } else
646 break;
647 }
648
649 if (ret != -EOLDSNAPC && written > 0) {
650 iocb->ki_pos = pos;
651 ret = written;
652 }
653 return ret;
654}
655
656
657/*
658 * Synchronous write, straight from __user pointer or user pages.
659 *
660 * If write spans object boundary, just do multiple writes. (For a
661 * correct atomic write, we should e.g. take write locks on all
662 * objects, rollback on failure, etc.)
663 */
664static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
665 unsigned long nr_segs, size_t count)
666{
667 struct file *file = iocb->ki_filp;
668 struct inode *inode = file_inode(file);
669 struct ceph_inode_info *ci = ceph_inode(inode);
670 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
671 struct ceph_snap_context *snapc;
672 struct ceph_vino vino;
673 struct ceph_osd_request *req;
674 struct page **pages;
675 u64 len;
676 int num_pages;
677 int written = 0;
678 int flags;
679 int check_caps = 0;
680 int ret;
681 struct timespec mtime = CURRENT_TIME;
682 loff_t pos = iocb->ki_pos;
683 struct iov_iter i;
684
685 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
689
690 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
691 if (ret < 0)
692 return ret;
693
694 ret = invalidate_inode_pages2_range(inode->i_mapping,
695 pos >> PAGE_CACHE_SHIFT,
696 (pos + count) >> PAGE_CACHE_SHIFT);
697 if (ret < 0)
698 dout("invalidate_inode_pages2_range returned %d\n", ret);
699
700 flags = CEPH_OSD_FLAG_ORDERSNAP |
701 CEPH_OSD_FLAG_ONDISK |
702 CEPH_OSD_FLAG_WRITE |
703 CEPH_OSD_FLAG_ACK;
704
705 iov_iter_init(&i, iov, nr_segs, count, 0);
706
707 while ((len = iov_iter_count(&i)) > 0) {
708 size_t left;
709 int n;
710
711 snapc = ci->i_snap_realm->cached_context;
712 vino = ceph_vino(inode);
713 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
714 vino, pos, &len, 1,
715 CEPH_OSD_OP_WRITE, flags, snapc,
716 ci->i_truncate_seq,
717 ci->i_truncate_size,
718 false);
719 if (IS_ERR(req)) {
720 ret = PTR_ERR(req);
721 goto out;
722 }
723
724 /*
725 * write from beginning of first page,
726 * regardless of io alignment
727 */
728 num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
729
582 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 730 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
583 if (IS_ERR(pages)) { 731 if (IS_ERR(pages)) {
584 ret = PTR_ERR(pages); 732 ret = PTR_ERR(pages);
585 goto out; 733 goto out;
586 } 734 }
587 ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 735
736 left = len;
737 for (n = 0; n < num_pages; n++) {
738 size_t plen = min_t(size_t, left, PAGE_SIZE);
739 ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
740 if (ret != plen) {
741 ret = -EFAULT;
742 break;
743 }
744 left -= ret;
745 iov_iter_advance(&i, ret);
746 }
747
588 if (ret < 0) { 748 if (ret < 0) {
589 ceph_release_page_vector(pages, num_pages); 749 ceph_release_page_vector(pages, num_pages);
590 goto out; 750 goto out;
591 } 751 }
592 752
593 if ((file->f_flags & O_SYNC) == 0) { 753 /* get a second commit callback */
594 /* get a second commit callback */ 754 req->r_unsafe_callback = ceph_sync_write_unsafe;
595 req->r_unsafe_callback = ceph_sync_write_unsafe; 755 req->r_inode = inode;
596 req->r_inode = inode;
597 own_pages = true;
598 }
599 }
600 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
601 false, own_pages);
602 756
603 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 757 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
604 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 758 false, true);
605 759
606 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 760 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
607 if (!ret) 761 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
608 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
609 762
610 if (file->f_flags & O_DIRECT) 763 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
611 ceph_put_page_vector(pages, num_pages, false); 764 if (!ret)
612 else if (file->f_flags & O_SYNC) 765 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
613 ceph_release_page_vector(pages, num_pages);
614 766
615out: 767out:
616 ceph_osdc_put_request(req); 768 ceph_osdc_put_request(req);
617 if (ret == 0) { 769 if (ret == 0) {
618 pos += len; 770 pos += len;
619 written += len; 771 written += len;
620 left -= len; 772
621 data += len; 773 if (pos > i_size_read(inode)) {
622 if (left) 774 check_caps = ceph_inode_set_size(inode, pos);
623 goto more; 775 if (check_caps)
776 ceph_check_caps(ceph_inode(inode),
777 CHECK_CAPS_AUTHONLY,
778 NULL);
779 }
780 } else
781 break;
782 }
624 783
784 if (ret != -EOLDSNAPC && written > 0) {
625 ret = written; 785 ret = written;
626 *ppos = pos; 786 iocb->ki_pos = pos;
627 if (pos > i_size_read(inode))
628 check_caps = ceph_inode_set_size(inode, pos);
629 if (check_caps)
630 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
631 NULL);
632 } else if (ret != -EOLDSNAPC && written > 0) {
633 ret = written;
634 } 787 }
635 return ret; 788 return ret;
636} 789}
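
After the split, this buffered-sync path always requests CEPH_OSD_FLAG_ACK and always installs ceph_sync_write_unsafe() as r_unsafe_callback (previously both were conditional on O_SYNC). The callback rides the OSD's two-phase reply; a comment-only model of the contract, inferred from the flag names rather than from code in this diff:

    /*
     * With CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK the OSD can reply twice:
     *
     *   1. "unsafe" ack - write applied in memory; r_unsafe_callback(req, true)
     *   2. commit ack   - write durable on disk;   r_unsafe_callback(req, false)
     *
     * The callback's job is bookkeeping across that window, e.g. holding an
     * inode reference so fsync/sync can wait for outstanding unsafe writes.
     */
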
@@ -647,55 +800,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
647{ 800{
648 struct file *filp = iocb->ki_filp; 801 struct file *filp = iocb->ki_filp;
649 struct ceph_file_info *fi = filp->private_data; 802 struct ceph_file_info *fi = filp->private_data;
650 loff_t *ppos = &iocb->ki_pos; 803 size_t len = iocb->ki_nbytes;
651 size_t len = iov->iov_len;
652 struct inode *inode = file_inode(filp); 804 struct inode *inode = file_inode(filp);
653 struct ceph_inode_info *ci = ceph_inode(inode); 805 struct ceph_inode_info *ci = ceph_inode(inode);
654 void __user *base = iov->iov_base;
655 ssize_t ret; 806 ssize_t ret;
656 int want, got = 0; 807 int want, got = 0;
657 int checkeof = 0, read = 0; 808 int checkeof = 0, read = 0;
658 809
659 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
660 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
661again: 810again:
811 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
812 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
813
662 if (fi->fmode & CEPH_FILE_MODE_LAZY) 814 if (fi->fmode & CEPH_FILE_MODE_LAZY)
663 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 815 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
664 else 816 else
665 want = CEPH_CAP_FILE_CACHE; 817 want = CEPH_CAP_FILE_CACHE;
666 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 818 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
667 if (ret < 0) 819 if (ret < 0)
668 goto out; 820 return ret;
669 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
670 inode, ceph_vinop(inode), pos, (unsigned)len,
671 ceph_cap_string(got));
672 821
673 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 822 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
674 (iocb->ki_filp->f_flags & O_DIRECT) || 823 (iocb->ki_filp->f_flags & O_DIRECT) ||
675 (fi->flags & CEPH_F_SYNC)) 824 (fi->flags & CEPH_F_SYNC)) {
825 struct iov_iter i;
826
827 dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
828 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
829 ceph_cap_string(got));
830
831 if (!read) {
832 ret = generic_segment_checks(iov, &nr_segs,
833 &len, VERIFY_WRITE);
834 if (ret)
835 goto out;
836 }
837
838 iov_iter_init(&i, iov, nr_segs, len, read);
839
676 /* hmm, this isn't really async... */ 840 /* hmm, this isn't really async... */
677 ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 841 ret = ceph_sync_read(iocb, &i, &checkeof);
678 else 842 } else {
679 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 843 /*
844 * We can't modify the content of iov,
845 * so we only read from beginning.
846 */
847 if (read) {
848 iocb->ki_pos = pos;
849 len = iocb->ki_nbytes;
850 read = 0;
851 }
852 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
853 inode, ceph_vinop(inode), pos, (unsigned)len,
854 ceph_cap_string(got));
680 855
856 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
857 }
681out: 858out:
682 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 859 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
683 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 860 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
684 ceph_put_cap_refs(ci, got); 861 ceph_put_cap_refs(ci, got);
685 862
686 if (checkeof && ret >= 0) { 863 if (checkeof && ret >= 0) {
687 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 864 int statret = ceph_do_getattr(inode,
865 CEPH_STAT_CAP_SIZE);
688 866
689 /* hit EOF or hole? */ 867 /* hit EOF or hole? */
690 if (statret == 0 && *ppos < inode->i_size) { 868 if (statret == 0 && iocb->ki_pos < inode->i_size &&
691 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 869 ret < len) {
870 dout("sync_read hit hole, ppos %lld < size %lld"
871 ", reading more\n", iocb->ki_pos,
872 inode->i_size);
873
692 read += ret; 874 read += ret;
693 base += ret;
694 len -= ret; 875 len -= ret;
695 checkeof = 0; 876 checkeof = 0;
696 goto again; 877 goto again;
697 } 878 }
698 } 879 }
880
699 if (ret >= 0) 881 if (ret >= 0)
700 ret += read; 882 ret += read;
701 883
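
The again: loop at the top of ceph_aio_read() pairs with the checkeof block above: if a sync read comes up short but a getattr shows i_size beyond the current position (a hole, or a racing writer on another client), the bytes delivered so far accumulate in read and the read retries from the new position. The accounting, condensed into a comment:

    /*
     * Retry accounting, condensed:
     *
     *   read = 0;
     * again:
     *   ret = sync or page-cache read of up to len bytes;
     *   if (short read && getattr says i_size > iocb->ki_pos) {
     *       read += ret;  len -= ret;  goto again;
     *   }
     *   if (ret >= 0)
     *       ret += read;   /* report everything delivered so far */
     */
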
@@ -772,11 +954,13 @@ retry_snap:
772 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 954 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
773 955
774 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 956 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
775 (iocb->ki_filp->f_flags & O_DIRECT) || 957 (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
776 (fi->flags & CEPH_F_SYNC)) {
777 mutex_unlock(&inode->i_mutex); 958 mutex_unlock(&inode->i_mutex);
778 written = ceph_sync_write(file, iov->iov_base, count, 959 if (file->f_flags & O_DIRECT)
779 pos, &iocb->ki_pos); 960 written = ceph_sync_direct_write(iocb, iov,
961 nr_segs, count);
962 else
963 written = ceph_sync_write(iocb, iov, nr_segs, count);
780 if (written == -EOLDSNAPC) { 964 if (written == -EOLDSNAPC) {
781 dout("aio_write %p %llx.%llx %llu~%u" 965 dout("aio_write %p %llx.%llx %llu~%u"
782 "got EOLDSNAPC, retrying\n", 966 "got EOLDSNAPC, retrying\n",
@@ -1018,7 +1202,7 @@ static long ceph_fallocate(struct file *file, int mode,
1018 loff_t offset, loff_t length) 1202 loff_t offset, loff_t length)
1019{ 1203{
1020 struct ceph_file_info *fi = file->private_data; 1204 struct ceph_file_info *fi = file->private_data;
1021 struct inode *inode = file->f_dentry->d_inode; 1205 struct inode *inode = file_inode(file);
1022 struct ceph_inode_info *ci = ceph_inode(inode); 1206 struct ceph_inode_info *ci = ceph_inode(inode);
1023 struct ceph_osd_client *osdc = 1207 struct ceph_osd_client *osdc =
1024 &ceph_inode_to_client(inode)->client->osdc; 1208 &ceph_inode_to_client(inode)->client->osdc;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 278fd2891288..32d519d8a2e2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,6 +9,7 @@
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/posix_acl.h>
12 13
13#include "super.h" 14#include "super.h"
14#include "mds_client.h" 15#include "mds_client.h"
@@ -95,6 +96,8 @@ const struct inode_operations ceph_file_iops = {
95 .getxattr = ceph_getxattr, 96 .getxattr = ceph_getxattr,
96 .listxattr = ceph_listxattr, 97 .listxattr = ceph_listxattr,
97 .removexattr = ceph_removexattr, 98 .removexattr = ceph_removexattr,
99 .get_acl = ceph_get_acl,
100 .set_acl = ceph_set_acl,
98}; 101};
99 102
100 103
@@ -335,12 +338,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
335 ci->i_hold_caps_min = 0; 338 ci->i_hold_caps_min = 0;
336 ci->i_hold_caps_max = 0; 339 ci->i_hold_caps_max = 0;
337 INIT_LIST_HEAD(&ci->i_cap_delay_list); 340 INIT_LIST_HEAD(&ci->i_cap_delay_list);
338 ci->i_cap_exporting_mds = 0;
339 ci->i_cap_exporting_mseq = 0;
340 ci->i_cap_exporting_issued = 0;
341 INIT_LIST_HEAD(&ci->i_cap_snaps); 341 INIT_LIST_HEAD(&ci->i_cap_snaps);
342 ci->i_head_snapc = NULL; 342 ci->i_head_snapc = NULL;
343 ci->i_snap_caps = 0; 343 ci->i_snap_caps = 0;
344 ci->i_cap_exporting_issued = 0;
344 345
345 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 346 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
346 ci->i_nr_by_mode[i] = 0; 347 ci->i_nr_by_mode[i] = 0;
@@ -436,6 +437,16 @@ void ceph_destroy_inode(struct inode *inode)
436 call_rcu(&inode->i_rcu, ceph_i_callback); 437 call_rcu(&inode->i_rcu, ceph_i_callback);
437} 438}
438 439
440int ceph_drop_inode(struct inode *inode)
441{
442 /*
 443	 * Positive dentries and their corresponding inodes always arrive
 444	 * together in an MDS reply, so there is no need to keep an inode
 445	 * in the cache after dropping all its aliases.
446 */
447 return 1;
448}
449
439/* 450/*
440 * Helpers to fill in size, ctime, mtime, and atime. We have to be 451 * Helpers to fill in size, ctime, mtime, and atime. We have to be
441 * careful because either the client or MDS may have more up to date 452 * careful because either the client or MDS may have more up to date
@@ -670,6 +681,7 @@ static int fill_inode(struct inode *inode,
670 memcpy(ci->i_xattrs.blob->vec.iov_base, 681 memcpy(ci->i_xattrs.blob->vec.iov_base,
671 iinfo->xattr_data, iinfo->xattr_len); 682 iinfo->xattr_data, iinfo->xattr_len);
672 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 683 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
684 ceph_forget_all_cached_acls(inode);
673 xattr_blob = NULL; 685 xattr_blob = NULL;
674 } 686 }
675 687
@@ -1454,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1454 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1466 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1455 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1467 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1456 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1468 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1457 /* nevermind! */ 1469 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
1470 check = 1;
1458 spin_unlock(&ci->i_ceph_lock); 1471 spin_unlock(&ci->i_ceph_lock);
1459 mutex_unlock(&ci->i_truncate_mutex); 1472 mutex_unlock(&ci->i_truncate_mutex);
1460 goto out; 1473 goto out;
@@ -1475,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work)
1475 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", 1488 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1476 inode, orig_gen, ci->i_rdcache_gen, 1489 inode, orig_gen, ci->i_rdcache_gen,
1477 ci->i_rdcache_revoking); 1490 ci->i_rdcache_revoking);
1491 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
1492 check = 1;
1478 } 1493 }
1479 spin_unlock(&ci->i_ceph_lock); 1494 spin_unlock(&ci->i_ceph_lock);
1480 mutex_unlock(&ci->i_truncate_mutex); 1495 mutex_unlock(&ci->i_truncate_mutex);
1481 1496out:
1482 if (check) 1497 if (check)
1483 ceph_check_caps(ci, 0, NULL); 1498 ceph_check_caps(ci, 0, NULL);
1484out:
1485 iput(inode); 1499 iput(inode);
1486} 1500}
1487 1501
@@ -1602,6 +1616,8 @@ static const struct inode_operations ceph_symlink_iops = {
1602 .getxattr = ceph_getxattr, 1616 .getxattr = ceph_getxattr,
1603 .listxattr = ceph_listxattr, 1617 .listxattr = ceph_listxattr,
1604 .removexattr = ceph_removexattr, 1618 .removexattr = ceph_removexattr,
1619 .get_acl = ceph_get_acl,
1620 .set_acl = ceph_set_acl,
1605}; 1621};
1606 1622
1607/* 1623/*
@@ -1675,6 +1691,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1675 dirtied |= CEPH_CAP_AUTH_EXCL; 1691 dirtied |= CEPH_CAP_AUTH_EXCL;
1676 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 1692 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1677 attr->ia_mode != inode->i_mode) { 1693 attr->ia_mode != inode->i_mode) {
1694 inode->i_mode = attr->ia_mode;
1678 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 1695 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1679 mask |= CEPH_SETATTR_MODE; 1696 mask |= CEPH_SETATTR_MODE;
1680 release |= CEPH_CAP_AUTH_SHARED; 1697 release |= CEPH_CAP_AUTH_SHARED;
@@ -1790,6 +1807,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1790 if (inode_dirty_flags) 1807 if (inode_dirty_flags)
1791 __mark_inode_dirty(inode, inode_dirty_flags); 1808 __mark_inode_dirty(inode, inode_dirty_flags);
1792 1809
1810 if (ia_valid & ATTR_MODE) {
1811 err = posix_acl_chmod(inode, attr->ia_mode);
1812 if (err)
1813 goto out_put;
1814 }
1815
1793 if (mask) { 1816 if (mask) {
1794 req->r_inode = inode; 1817 req->r_inode = inode;
1795 ihold(inode); 1818 ihold(inode);
@@ -1809,6 +1832,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1809 return err; 1832 return err;
1810out: 1833out:
1811 spin_unlock(&ci->i_ceph_lock); 1834 spin_unlock(&ci->i_ceph_lock);
1835out_put:
1812 ceph_mdsc_put_request(req); 1836 ceph_mdsc_put_request(req);
1813 return err; 1837 return err;
1814} 1838}
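
Two details in the ceph_setattr() hunks work together: inode->i_mode is now updated before the MDS request is built, and a new tail step invokes posix_acl_chmod() (the generic helper of this kernel cycle) when ATTR_MODE is set, so the cached access ACL is recomputed against the mode the inode already carries. As an ordering note:

    /*
     * Ordering inferred from the two setattr hunks above:
     *   1. inode->i_mode = attr->ia_mode          - mode updated eagerly
     *   2. posix_acl_chmod(inode, attr->ia_mode)  - rewrites the cached
     *      access ACL so its mask agrees with the new group bits
     *   3. the MDS request then propagates both to the server
     */
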
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 669622fd1ae3..dc66c9e023e4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
183 struct ceph_inode_info *ci = ceph_inode(inode); 183 struct ceph_inode_info *ci = ceph_inode(inode);
184 struct ceph_osd_client *osdc = 184 struct ceph_osd_client *osdc =
185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 185 &ceph_sb_to_client(inode->i_sb)->client->osdc;
186 struct ceph_object_locator oloc;
187 struct ceph_object_id oid;
186 u64 len = 1, olen; 188 u64 len = 1, olen;
187 u64 tmp; 189 u64 tmp;
188 struct ceph_pg pgid; 190 struct ceph_pg pgid;
@@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
211 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 213 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
212 ceph_ino(inode), dl.object_no); 214 ceph_ino(inode), dl.object_no);
213 215
214 r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, 216 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
215 ceph_file_layout_pg_pool(ci->i_layout)); 217 ceph_oid_set_name(&oid, dl.object_name);
218
219 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
216 if (r < 0) { 220 if (r < 0) {
217 up_read(&osdc->map_sem); 221 up_read(&osdc->map_sem);
218 return r; 222 return r;
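
The GET_DATALOC ioctl switches from ceph_calc_ceph_pg() to the newer object-locator API, which separates what the object is (ceph_object_id) from where it lives (ceph_object_locator, i.e. the pool). The three-step shape in isolation, using the names from this hunk:

    struct ceph_object_locator oloc;
    struct ceph_object_id oid;
    struct ceph_pg pgid;
    int r;

    oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);        /* where: pool   */
    ceph_oid_set_name(&oid, dl.object_name);                   /* what: object  */
    r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); /* -> placement group */
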
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d90861f45210..f4f050a69a48 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops;
63 */ 63 */
64static int parse_reply_info_in(void **p, void *end, 64static int parse_reply_info_in(void **p, void *end,
65 struct ceph_mds_reply_info_in *info, 65 struct ceph_mds_reply_info_in *info,
66 int features) 66 u64 features)
67{ 67{
68 int err = -EIO; 68 int err = -EIO;
69 69
@@ -98,7 +98,7 @@ bad:
98 */ 98 */
99static int parse_reply_info_trace(void **p, void *end, 99static int parse_reply_info_trace(void **p, void *end,
100 struct ceph_mds_reply_info_parsed *info, 100 struct ceph_mds_reply_info_parsed *info,
101 int features) 101 u64 features)
102{ 102{
103 int err; 103 int err;
104 104
@@ -145,7 +145,7 @@ out_bad:
145 */ 145 */
146static int parse_reply_info_dir(void **p, void *end, 146static int parse_reply_info_dir(void **p, void *end,
147 struct ceph_mds_reply_info_parsed *info, 147 struct ceph_mds_reply_info_parsed *info,
148 int features) 148 u64 features)
149{ 149{
150 u32 num, i = 0; 150 u32 num, i = 0;
151 int err; 151 int err;
@@ -217,7 +217,7 @@ out_bad:
217 */ 217 */
218static int parse_reply_info_filelock(void **p, void *end, 218static int parse_reply_info_filelock(void **p, void *end,
219 struct ceph_mds_reply_info_parsed *info, 219 struct ceph_mds_reply_info_parsed *info,
220 int features) 220 u64 features)
221{ 221{
222 if (*p + sizeof(*info->filelock_reply) > end) 222 if (*p + sizeof(*info->filelock_reply) > end)
223 goto bad; 223 goto bad;
@@ -238,7 +238,7 @@ bad:
238 */ 238 */
239static int parse_reply_info_create(void **p, void *end, 239static int parse_reply_info_create(void **p, void *end,
240 struct ceph_mds_reply_info_parsed *info, 240 struct ceph_mds_reply_info_parsed *info,
241 int features) 241 u64 features)
242{ 242{
243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { 243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
244 if (*p == end) { 244 if (*p == end) {
@@ -262,7 +262,7 @@ bad:
262 */ 262 */
263static int parse_reply_info_extra(void **p, void *end, 263static int parse_reply_info_extra(void **p, void *end,
264 struct ceph_mds_reply_info_parsed *info, 264 struct ceph_mds_reply_info_parsed *info,
265 int features) 265 u64 features)
266{ 266{
267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
268 return parse_reply_info_filelock(p, end, info, features); 268 return parse_reply_info_filelock(p, end, info, features);
@@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end,
280 */ 280 */
281static int parse_reply_info(struct ceph_msg *msg, 281static int parse_reply_info(struct ceph_msg *msg,
282 struct ceph_mds_reply_info_parsed *info, 282 struct ceph_mds_reply_info_parsed *info,
283 int features) 283 u64 features)
284{ 284{
285 void *p, *end; 285 void *p, *end;
286 u32 len; 286 u32 len;
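
Every parse_reply_info*() helper widens its features argument from int to u64 (as does create_fs_client() in the super.c hunk below): feature flags are a bit mask that has outgrown 32 bits, and an int silently drops the high bits. A minimal demonstration with a hypothetical high feature bit:

    #include <stdint.h>
    #include <stdio.h>

    #define FEATURE_HIGH_BIT (1ULL << 40)   /* hypothetical feature flag */

    int main(void)
    {
        uint64_t features = FEATURE_HIGH_BIT;
        int truncated = (int)features;      /* high 32 bits are lost: 0 */

        printf("u64 test: %d\n", (features & FEATURE_HIGH_BIT) != 0);        /* 1 */
        printf("int test: %d\n", (truncated & (int)FEATURE_HIGH_BIT) != 0);  /* 0 */
        return 0;
    }
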
@@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
713 struct dentry *dn = get_nonsnap_parent(parent); 713 struct dentry *dn = get_nonsnap_parent(parent);
714 inode = dn->d_inode; 714 inode = dn->d_inode;
715 dout("__choose_mds using nonsnap parent %p\n", inode); 715 dout("__choose_mds using nonsnap parent %p\n", inode);
716 } else if (req->r_dentry->d_inode) { 716 } else {
717 /* dentry target */ 717 /* dentry target */
718 inode = req->r_dentry->d_inode; 718 inode = req->r_dentry->d_inode;
719 } else { 719 if (!inode || mode == USE_AUTH_MDS) {
720 /* dir + name */ 720 /* dir + name */
721 inode = dir; 721 inode = dir;
722 hash = ceph_dentry_hash(dir, req->r_dentry); 722 hash = ceph_dentry_hash(dir, req->r_dentry);
723 is_hash = true; 723 is_hash = true;
724 }
724 } 725 }
725 } 726 }
726 727
@@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
846 * 847 *
847 * called under mdsc->mutex 848 * called under mdsc->mutex
848 */ 849 */
850static struct ceph_mds_session *
851__open_export_target_session(struct ceph_mds_client *mdsc, int target)
852{
853 struct ceph_mds_session *session;
854
855 session = __ceph_lookup_mds_session(mdsc, target);
856 if (!session) {
857 session = register_session(mdsc, target);
858 if (IS_ERR(session))
859 return session;
860 }
861 if (session->s_state == CEPH_MDS_SESSION_NEW ||
862 session->s_state == CEPH_MDS_SESSION_CLOSING)
863 __open_session(mdsc, session);
864
865 return session;
866}
867
868struct ceph_mds_session *
869ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
870{
871 struct ceph_mds_session *session;
872
873 dout("open_export_target_session to mds%d\n", target);
874
875 mutex_lock(&mdsc->mutex);
876 session = __open_export_target_session(mdsc, target);
877 mutex_unlock(&mdsc->mutex);
878
879 return session;
880}
881
849static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 882static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
850 struct ceph_mds_session *session) 883 struct ceph_mds_session *session)
851{ 884{
852 struct ceph_mds_info *mi; 885 struct ceph_mds_info *mi;
853 struct ceph_mds_session *ts; 886 struct ceph_mds_session *ts;
854 int i, mds = session->s_mds; 887 int i, mds = session->s_mds;
855 int target;
856 888
857 if (mds >= mdsc->mdsmap->m_max_mds) 889 if (mds >= mdsc->mdsmap->m_max_mds)
858 return; 890 return;
891
859 mi = &mdsc->mdsmap->m_info[mds]; 892 mi = &mdsc->mdsmap->m_info[mds];
860 dout("open_export_target_sessions for mds%d (%d targets)\n", 893 dout("open_export_target_sessions for mds%d (%d targets)\n",
861 session->s_mds, mi->num_export_targets); 894 session->s_mds, mi->num_export_targets);
862 895
863 for (i = 0; i < mi->num_export_targets; i++) { 896 for (i = 0; i < mi->num_export_targets; i++) {
864 target = mi->export_targets[i]; 897 ts = __open_export_target_session(mdsc, mi->export_targets[i]);
865 ts = __ceph_lookup_mds_session(mdsc, target); 898 if (!IS_ERR(ts))
866 if (!ts) { 899 ceph_put_mds_session(ts);
867 ts = register_session(mdsc, target);
868 if (IS_ERR(ts))
869 return;
870 }
871 if (session->s_state == CEPH_MDS_SESSION_NEW ||
872 session->s_state == CEPH_MDS_SESSION_CLOSING)
873 __open_session(mdsc, session);
874 else
875 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
876 i, ts, session_state_name(ts->s_state));
877 ceph_put_mds_session(ts);
878 } 900 }
879} 901}
880 902
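
__open_export_target_sessions() is refactored around the new __open_export_target_session() helper, which also gains an exported wrapper for the cap-migration code earlier in this diff. The refactor removes a latent bug visible in the left column:

    /*
     * Old loop: looked up ts, registered it if missing, then tested
     * session->s_state and called __open_session(mdsc, session) - i.e. it
     * opened the *source* session, not the target.  New loop: one helper
     * does lookup/register/open on the target and returns a reference:
     *
     *     ts = __open_export_target_session(mdsc, mi->export_targets[i]);
     *     if (!IS_ERR(ts))
     *             ceph_put_mds_session(ts);
     */
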
@@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
1136 return 0; 1158 return 0;
1137} 1159}
1138 1160
1161static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
1162 struct ceph_mds_session *session, u64 seq)
1163{
1164 struct ceph_msg *msg;
1165
1166 dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
1167 session->s_mds, session_state_name(session->s_state), seq);
1168 msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
1169 if (!msg)
1170 return -ENOMEM;
1171 ceph_con_send(&session->s_con, msg);
1172 return 0;
1173}
1174
1175
1139/* 1176/*
1140 * Note new cap ttl, and any transition from stale -> not stale (fresh?). 1177 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
1141 * 1178 *
@@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1214{ 1251{
1215 struct ceph_mds_session *session = arg; 1252 struct ceph_mds_session *session = arg;
1216 struct ceph_inode_info *ci = ceph_inode(inode); 1253 struct ceph_inode_info *ci = ceph_inode(inode);
1217 int used, oissued, mine; 1254 int used, wanted, oissued, mine;
1218 1255
1219 if (session->s_trim_caps <= 0) 1256 if (session->s_trim_caps <= 0)
1220 return -1; 1257 return -1;
@@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1222 spin_lock(&ci->i_ceph_lock); 1259 spin_lock(&ci->i_ceph_lock);
1223 mine = cap->issued | cap->implemented; 1260 mine = cap->issued | cap->implemented;
1224 used = __ceph_caps_used(ci); 1261 used = __ceph_caps_used(ci);
1262 wanted = __ceph_caps_file_wanted(ci);
1225 oissued = __ceph_caps_issued_other(ci, cap); 1263 oissued = __ceph_caps_issued_other(ci, cap);
1226 1264
1227 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", 1265 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
1228 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), 1266 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
1229 ceph_cap_string(used)); 1267 ceph_cap_string(used), ceph_cap_string(wanted));
1230 if (ci->i_dirty_caps) 1268 if (cap == ci->i_auth_cap) {
1231 goto out; /* dirty caps */ 1269 if (ci->i_dirty_caps | ci->i_flushing_caps)
1232 if ((used & ~oissued) & mine) 1270 goto out;
1271 if ((used | wanted) & CEPH_CAP_ANY_WR)
1272 goto out;
1273 }
1274 if ((used | wanted) & ~oissued & mine)
1233 goto out; /* we need these caps */ 1275 goto out; /* we need these caps */
1234 1276
1235 session->s_trim_caps--; 1277 session->s_trim_caps--;
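
trim_caps_cb() previously refused to trim only when dirty caps existed or when this cap covered something in active use; it now also folds in the wanted bits and special-cases the auth cap, which must survive while dirty or flushing state exists or while write caps are used or wanted. The rule, condensed into one predicate (a model, not the kernel code; arguments mirror the locals above):

    static bool cap_is_trimmable(struct ceph_inode_info *ci,
                                 struct ceph_cap *cap,
                                 int used, int wanted, int oissued, int mine)
    {
        if (cap == ci->i_auth_cap) {
            if (ci->i_dirty_caps | ci->i_flushing_caps)
                return false;   /* auth cap tracks dirty/flushing state */
            if ((used | wanted) & CEPH_CAP_ANY_WR)
                return false;   /* writes must go through the auth cap */
        }
        /* trim only if nothing used or wanted is issued solely by us */
        return !((used | wanted) & ~oissued & mine);
    }
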
@@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2156 */ 2198 */
2157 if (result == -ESTALE) { 2199 if (result == -ESTALE) {
2158 dout("got ESTALE on request %llu", req->r_tid); 2200 dout("got ESTALE on request %llu", req->r_tid);
2159 if (!req->r_inode) { 2201 if (req->r_direct_mode != USE_AUTH_MDS) {
2160 /* do nothing; not an authority problem */
2161 } else if (req->r_direct_mode != USE_AUTH_MDS) {
2162 dout("not using auth, setting for that now"); 2202 dout("not using auth, setting for that now");
2163 req->r_direct_mode = USE_AUTH_MDS; 2203 req->r_direct_mode = USE_AUTH_MDS;
2164 __do_request(mdsc, req); 2204 __do_request(mdsc, req);
2165 mutex_unlock(&mdsc->mutex); 2205 mutex_unlock(&mdsc->mutex);
2166 goto out; 2206 goto out;
2167 } else { 2207 } else {
2168 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2208 int mds = __choose_mds(mdsc, req);
2169 struct ceph_cap *cap = NULL; 2209 if (mds >= 0 && mds != req->r_session->s_mds) {
2170 2210 dout("but auth changed, so resending");
2171 if (req->r_session)
2172 cap = ceph_get_cap_for_mds(ci,
2173 req->r_session->s_mds);
2174
2175 dout("already using auth");
2176 if ((!cap || cap != ci->i_auth_cap) ||
2177 (cap->mseq != req->r_sent_on_mseq)) {
2178 dout("but cap changed, so resending");
2179 __do_request(mdsc, req); 2211 __do_request(mdsc, req);
2180 mutex_unlock(&mdsc->mutex); 2212 mutex_unlock(&mdsc->mutex);
2181 goto out; 2213 goto out;
@@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session,
2400 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2432 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2401 break; 2433 break;
2402 2434
2435 case CEPH_SESSION_FLUSHMSG:
2436 send_flushmsg_ack(mdsc, session, seq);
2437 break;
2438
2403 default: 2439 default:
2404 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2440 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2405 WARN_ON(1); 2441 WARN_ON(1);
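
CEPH_SESSION_FLUSHMSG / FLUSHMSG_ACK (named in the strings.c hunk below) form a simple barrier handshake: the MDS asks the client to confirm it has processed all session messages up to a sequence number, and send_flushmsg_ack() echoes that seq back. As a message-flow comment:

    /*
     *   MDS    --- CEPH_SESSION_FLUSHMSG (seq = N) --->  client
     *   client --- CEPH_SESSION_FLUSHMSG_ACK (seq = N) -> MDS
     *
     * The ack is built with create_session_msg(), so it is an ordinary
     * session message carrying just the op and the echoed seq.
     */
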
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4c053d099ae4..68288917c737 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
383extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 383extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
384 struct ceph_msg *msg); 384 struct ceph_msg *msg);
385 385
386extern struct ceph_mds_session *
387ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
386extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 388extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
387 struct ceph_mds_session *session); 389 struct ceph_mds_session *session);
388 390
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89fa4a940a0f..4440f447fd3f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op)
41 case CEPH_SESSION_RENEWCAPS: return "renewcaps"; 41 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
42 case CEPH_SESSION_STALE: return "stale"; 42 case CEPH_SESSION_STALE: return "stale";
43 case CEPH_SESSION_RECALL_STATE: return "recall_state"; 43 case CEPH_SESSION_RECALL_STATE: return "recall_state";
44 case CEPH_SESSION_FLUSHMSG: return "flushmsg";
45 case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
44 } 46 }
45 return "???"; 47 return "???";
46} 48}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6a0951e43044..10a4ccbf38da 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -144,7 +144,11 @@ enum {
144 Opt_ino32, 144 Opt_ino32,
145 Opt_noino32, 145 Opt_noino32,
146 Opt_fscache, 146 Opt_fscache,
147 Opt_nofscache 147 Opt_nofscache,
148#ifdef CONFIG_CEPH_FS_POSIX_ACL
149 Opt_acl,
150#endif
151 Opt_noacl
148}; 152};
149 153
150static match_table_t fsopt_tokens = { 154static match_table_t fsopt_tokens = {
@@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = {
172 {Opt_noino32, "noino32"}, 176 {Opt_noino32, "noino32"},
173 {Opt_fscache, "fsc"}, 177 {Opt_fscache, "fsc"},
174 {Opt_nofscache, "nofsc"}, 178 {Opt_nofscache, "nofsc"},
179#ifdef CONFIG_CEPH_FS_POSIX_ACL
180 {Opt_acl, "acl"},
181#endif
182 {Opt_noacl, "noacl"},
175 {-1, NULL} 183 {-1, NULL}
176}; 184};
177 185
@@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private)
271 case Opt_nofscache: 279 case Opt_nofscache:
272 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; 280 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
273 break; 281 break;
282#ifdef CONFIG_CEPH_FS_POSIX_ACL
283 case Opt_acl:
284 fsopt->sb_flags |= MS_POSIXACL;
285 break;
286#endif
287 case Opt_noacl:
288 fsopt->sb_flags &= ~MS_POSIXACL;
289 break;
274 default: 290 default:
275 BUG_ON(token); 291 BUG_ON(token);
276 } 292 }
@@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
438 else 454 else
439 seq_puts(m, ",nofsc"); 455 seq_puts(m, ",nofsc");
440 456
457#ifdef CONFIG_CEPH_FS_POSIX_ACL
458 if (fsopt->sb_flags & MS_POSIXACL)
459 seq_puts(m, ",acl");
460 else
461 seq_puts(m, ",noacl");
462#endif
463
441 if (fsopt->wsize) 464 if (fsopt->wsize)
442 seq_printf(m, ",wsize=%d", fsopt->wsize); 465 seq_printf(m, ",wsize=%d", fsopt->wsize);
443 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 466 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
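
Taken with the two hunks below, this option plumbing gives CephFS the usual acl/noacl mount options: with CONFIG_CEPH_FS_POSIX_ACL set, ceph_mount() turns MS_POSIXACL on by default and "-o noacl" clears it (e.g. "mount -t ceph mon:6789:/ /mnt -o noacl", with a hypothetical monitor address). Note that Opt_noacl sits outside the #ifdef, so "noacl" parses even on kernels built without ACL support, while "acl" is compiled out of the token table entirely rather than being silently ignored.
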
@@ -490,10 +513,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
490 struct ceph_options *opt) 513 struct ceph_options *opt)
491{ 514{
492 struct ceph_fs_client *fsc; 515 struct ceph_fs_client *fsc;
493 const unsigned supported_features = 516 const u64 supported_features =
494 CEPH_FEATURE_FLOCK | 517 CEPH_FEATURE_FLOCK |
495 CEPH_FEATURE_DIRLAYOUTHASH; 518 CEPH_FEATURE_DIRLAYOUTHASH;
496 const unsigned required_features = 0; 519 const u64 required_features = 0;
497 int page_count; 520 int page_count;
498 size_t size; 521 size_t size;
499 int err = -ENOMEM; 522 int err = -ENOMEM;
@@ -686,6 +709,7 @@ static const struct super_operations ceph_super_ops = {
686 .alloc_inode = ceph_alloc_inode, 709 .alloc_inode = ceph_alloc_inode,
687 .destroy_inode = ceph_destroy_inode, 710 .destroy_inode = ceph_destroy_inode,
688 .write_inode = ceph_write_inode, 711 .write_inode = ceph_write_inode,
712 .drop_inode = ceph_drop_inode,
689 .sync_fs = ceph_sync_fs, 713 .sync_fs = ceph_sync_fs,
690 .put_super = ceph_put_super, 714 .put_super = ceph_put_super,
691 .show_options = ceph_show_options, 715 .show_options = ceph_show_options,
@@ -819,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)
819 s->s_flags = fsc->mount_options->sb_flags; 843 s->s_flags = fsc->mount_options->sb_flags;
820 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 844 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
821 845
846 s->s_xattr = ceph_xattr_handlers;
822 s->s_fs_info = fsc; 847 s->s_fs_info = fsc;
823 fsc->sb = s; 848 fsc->sb = s;
824 849
@@ -906,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
906 struct ceph_options *opt = NULL; 931 struct ceph_options *opt = NULL;
907 932
908 dout("ceph_mount\n"); 933 dout("ceph_mount\n");
934
935#ifdef CONFIG_CEPH_FS_POSIX_ACL
936 flags |= MS_POSIXACL;
937#endif
909 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 938 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
910 if (err < 0) { 939 if (err < 0) {
911 res = ERR_PTR(err); 940 res = ERR_PTR(err);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ef4ac38bb614..d8801a95b685 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -13,6 +13,7 @@
13#include <linux/wait.h> 13#include <linux/wait.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/posix_acl.h>
16 17
17#include <linux/ceph/libceph.h> 18#include <linux/ceph/libceph.h>
18 19
@@ -287,14 +288,12 @@ struct ceph_inode_info {
287 unsigned long i_hold_caps_min; /* jiffies */ 288 unsigned long i_hold_caps_min; /* jiffies */
288 unsigned long i_hold_caps_max; /* jiffies */ 289 unsigned long i_hold_caps_max; /* jiffies */
289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */ 290 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
290 int i_cap_exporting_mds; /* to handle cap migration between */
291 unsigned i_cap_exporting_mseq; /* mds's. */
292 unsigned i_cap_exporting_issued;
293 struct ceph_cap_reservation i_cap_migration_resv; 291 struct ceph_cap_reservation i_cap_migration_resv;
294 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 292 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
295 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 293 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
296 dirty|flushing caps */ 294 dirty|flushing caps */
297 unsigned i_snap_caps; /* cap bits for snapped files */ 295 unsigned i_snap_caps; /* cap bits for snapped files */
296 unsigned i_cap_exporting_issued;
298 297
299 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 298 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
300 299
@@ -335,7 +334,6 @@ struct ceph_inode_info {
335 u32 i_fscache_gen; /* sequence, for delayed fscache validate */ 334 u32 i_fscache_gen; /* sequence, for delayed fscache validate */
336 struct work_struct i_revalidate_work; 335 struct work_struct i_revalidate_work;
337#endif 336#endif
338
339 struct inode vfs_inode; /* at end */ 337 struct inode vfs_inode; /* at end */
340}; 338};
341 339
@@ -529,6 +527,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
529} 527}
530extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); 528extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
531 529
530extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
531 struct ceph_cap *ocap, int mask);
532extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); 532extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
533extern int __ceph_caps_used(struct ceph_inode_info *ci); 533extern int __ceph_caps_used(struct ceph_inode_info *ci);
534 534
@@ -691,6 +691,7 @@ extern const struct inode_operations ceph_file_iops;
691 691
692extern struct inode *ceph_alloc_inode(struct super_block *sb); 692extern struct inode *ceph_alloc_inode(struct super_block *sb);
693extern void ceph_destroy_inode(struct inode *inode); 693extern void ceph_destroy_inode(struct inode *inode);
694extern int ceph_drop_inode(struct inode *inode);
694 695
695extern struct inode *ceph_get_inode(struct super_block *sb, 696extern struct inode *ceph_get_inode(struct super_block *sb,
696 struct ceph_vino vino); 697 struct ceph_vino vino);
@@ -724,6 +725,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
724/* xattr.c */ 725/* xattr.c */
725extern int ceph_setxattr(struct dentry *, const char *, const void *, 726extern int ceph_setxattr(struct dentry *, const char *, const void *,
726 size_t, int); 727 size_t, int);
728int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
729ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
730int __ceph_removexattr(struct dentry *, const char *);
727extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); 731extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
728extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 732extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
729extern int ceph_removexattr(struct dentry *, const char *); 733extern int ceph_removexattr(struct dentry *, const char *);
@@ -732,6 +736,42 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
732extern void __init ceph_xattr_init(void); 736extern void __init ceph_xattr_init(void);
733extern void ceph_xattr_exit(void); 737extern void ceph_xattr_exit(void);
734 738
739/* acl.c */
740extern const struct xattr_handler *ceph_xattr_handlers[];
741
742#ifdef CONFIG_CEPH_FS_POSIX_ACL
743
744struct posix_acl *ceph_get_acl(struct inode *, int);
745int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
746int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
747
748static inline void ceph_forget_all_cached_acls(struct inode *inode)
749{
750 forget_all_cached_acls(inode);
751}
752
753#else
754
755#define ceph_get_acl NULL
756#define ceph_set_acl NULL
757
758static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
759 struct inode *dir)
760{
761 return 0;
762}
763
764static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
765{
766 return 0;
767}
768
769static inline void ceph_forget_all_cached_acls(struct inode *inode)
770{
771}
772
773#endif
774
735/* caps.c */ 775/* caps.c */
736extern const char *ceph_cap_string(int c); 776extern const char *ceph_cap_string(int c);
737extern void ceph_handle_caps(struct ceph_mds_session *session, 777extern void ceph_handle_caps(struct ceph_mds_session *session,
@@ -744,6 +784,7 @@ extern int ceph_add_cap(struct inode *inode,
744extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 784extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
745extern void ceph_put_cap(struct ceph_mds_client *mdsc, 785extern void ceph_put_cap(struct ceph_mds_client *mdsc,
746 struct ceph_cap *cap); 786 struct ceph_cap *cap);
787extern int ceph_is_any_caps(struct inode *inode);
747 788
748extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, 789extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
749 u64 cap_id, u32 migrate_seq, u32 issue_seq); 790 u64 cap_id, u32 migrate_seq, u32 issue_seq);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532a..a55ec37378c6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -6,16 +6,33 @@
6#include <linux/ceph/decode.h> 6#include <linux/ceph/decode.h>
7 7
8#include <linux/xattr.h> 8#include <linux/xattr.h>
9#include <linux/posix_acl_xattr.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#define XATTR_CEPH_PREFIX "ceph." 12#define XATTR_CEPH_PREFIX "ceph."
12#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) 13#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
13 14
15static int __remove_xattr(struct ceph_inode_info *ci,
16 struct ceph_inode_xattr *xattr);
17
18/*
19 * List of handlers for synthetic system.* attributes. Other
20 * attributes are handled directly.
21 */
22const struct xattr_handler *ceph_xattr_handlers[] = {
23#ifdef CONFIG_CEPH_FS_POSIX_ACL
24 &posix_acl_access_xattr_handler,
25 &posix_acl_default_xattr_handler,
26#endif
27 NULL,
28};
29
14static bool ceph_is_valid_xattr(const char *name) 30static bool ceph_is_valid_xattr(const char *name)
15{ 31{
16 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 32 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
17 !strncmp(name, XATTR_SECURITY_PREFIX, 33 !strncmp(name, XATTR_SECURITY_PREFIX,
18 XATTR_SECURITY_PREFIX_LEN) || 34 XATTR_SECURITY_PREFIX_LEN) ||
35 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
19 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 36 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
20 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 37 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
21} 38}
@@ -305,8 +322,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
305static int __set_xattr(struct ceph_inode_info *ci, 322static int __set_xattr(struct ceph_inode_info *ci,
306 const char *name, int name_len, 323 const char *name, int name_len,
307 const char *val, int val_len, 324 const char *val, int val_len,
308 int dirty, 325 int flags, int update_xattr,
309 int should_free_name, int should_free_val,
310 struct ceph_inode_xattr **newxattr) 326 struct ceph_inode_xattr **newxattr)
311{ 327{
312 struct rb_node **p; 328 struct rb_node **p;
@@ -335,12 +351,31 @@ static int __set_xattr(struct ceph_inode_info *ci,
335 xattr = NULL; 351 xattr = NULL;
336 } 352 }
337 353
354 if (update_xattr) {
355 int err = 0;
356 if (xattr && (flags & XATTR_CREATE))
357 err = -EEXIST;
358 else if (!xattr && (flags & XATTR_REPLACE))
359 err = -ENODATA;
360 if (err) {
361 kfree(name);
362 kfree(val);
363 return err;
364 }
365 if (update_xattr < 0) {
366 if (xattr)
367 __remove_xattr(ci, xattr);
368 kfree(name);
369 return 0;
370 }
371 }
372
338 if (!xattr) { 373 if (!xattr) {
339 new = 1; 374 new = 1;
340 xattr = *newxattr; 375 xattr = *newxattr;
341 xattr->name = name; 376 xattr->name = name;
342 xattr->name_len = name_len; 377 xattr->name_len = name_len;
343 xattr->should_free_name = should_free_name; 378 xattr->should_free_name = update_xattr;
344 379
345 ci->i_xattrs.count++; 380 ci->i_xattrs.count++;
346 dout("__set_xattr count=%d\n", ci->i_xattrs.count); 381 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
@@ -350,7 +385,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		if (xattr->should_free_val)
 			kfree((void *)xattr->val);
 
-		if (should_free_name) {
+		if (update_xattr) {
 			kfree((void *)name);
 			name = xattr->name;
 		}
@@ -365,8 +400,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		xattr->val = "";
 
 	xattr->val_len = val_len;
-	xattr->dirty = dirty;
-	xattr->should_free_val = (val && should_free_val);
+	xattr->dirty = update_xattr;
+	xattr->should_free_val = (val && update_xattr);
 
 	if (new) {
 		rb_link_node(&xattr->node, parent, p);
@@ -428,7 +463,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,
 			  struct ceph_inode_xattr *xattr)
 {
 	if (!xattr)
-		return -EOPNOTSUPP;
+		return -ENODATA;
 
 	rb_erase(&xattr->node, &ci->i_xattrs.index);
 
@@ -574,7 +609,7 @@ start:
 		p += len;
 
 		err = __set_xattr(ci, name, namelen, val, len,
-				  0, 0, 0, &xattrs[numattr]);
+				  0, 0, &xattrs[numattr]);
 
 		if (err < 0)
 			goto bad;
@@ -663,10 +698,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 	}
 }
 
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 		      size_t size)
 {
-	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
@@ -675,7 +709,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
-
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
@@ -725,6 +758,15 @@ out:
 	return err;
 }
 
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, value, size);
+
+	return __ceph_getxattr(dentry->d_inode, name, value, size);
+}
+
 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
@@ -829,6 +871,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 
 	dout("setxattr value=%.*s\n", (int)size, value);
 
+	if (!value)
+		flags |= CEPH_XATTR_REMOVE;
+
 	/* do request */
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
 				       USE_AUTH_MDS);
@@ -863,15 +908,15 @@ out:
 	return err;
 }
 
-int ceph_setxattr(struct dentry *dentry, const char *name,
+int __ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int issued;
 	int err;
-	int dirty;
+	int dirty = 0;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
@@ -879,9 +924,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	struct ceph_inode_xattr *xattr = NULL;
 	int required_blob_size;
 
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
@@ -935,12 +977,14 @@ retry:
 		goto retry;
 	}
 
-	err = __set_xattr(ci, newname, name_len, newval,
-			  val_len, 1, 1, 1, &xattr);
+	err = __set_xattr(ci, newname, name_len, newval, val_len,
+			  flags, value ? 1 : -1, &xattr);
 
-	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
-	ci->i_xattrs.dirty = true;
-	inode->i_ctime = CURRENT_TIME;
+	if (!err) {
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		ci->i_xattrs.dirty = true;
+		inode->i_ctime = CURRENT_TIME;
+	}
 
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
@@ -958,6 +1002,18 @@ out:
 	return err;
 }
 
+int ceph_setxattr(struct dentry *dentry, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
+
+	return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
961static int ceph_send_removexattr(struct dentry *dentry, const char *name) 1017static int ceph_send_removexattr(struct dentry *dentry, const char *name)
962{ 1018{
963 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
@@ -984,7 +1040,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 	return err;
 }
 
-int ceph_removexattr(struct dentry *dentry, const char *name)
+int __ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_vxattr *vxattr;
@@ -994,9 +1050,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	int required_blob_size;
 	int dirty;
 
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
@@ -1053,3 +1106,13 @@ out:
 	return err;
 }
 
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	return __ceph_removexattr(dentry, name);
+}
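Note: the ceph change above follows a common dispatch shape. Each entry point now peels off "system." names and hands them to the generic xattr layer (which walks ceph_xattr_handlers, so the POSIX ACL handlers service them), while every other prefix still takes the filesystem's own __ceph_* path. A minimal user-space sketch of that shape, with stub functions standing in for the kernel's generic_setxattr() and the native implementation:

	/* Illustrative sketch, not kernel code: "system." attribute names
	 * are routed to the generic handler table, everything else to the
	 * filesystem's native implementation. */
	#include <stdio.h>
	#include <string.h>

	#define XATTR_SYSTEM_PREFIX	"system."
	#define XATTR_SYSTEM_PREFIX_LEN	(sizeof(XATTR_SYSTEM_PREFIX) - 1)

	static int generic_setxattr_stub(const char *name)
	{
		printf("generic layer handles %s\n", name);
		return 0;
	}

	static int native_setxattr_stub(const char *name)
	{
		printf("native path handles %s\n", name);
		return 0;
	}

	static int setxattr_dispatch(const char *name)
	{
		if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
			return generic_setxattr_stub(name);
		return native_setxattr_stub(name);
	}

	int main(void)
	{
		setxattr_dispatch("system.posix_acl_access");	/* generic */
		setxattr_dispatch("user.comment");		/* native */
		return 0;
	}

Keeping the snapshot (-EROFS) check in the thin ceph_*xattr() wrappers means both the generic and the native paths see it exactly once.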
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 51f5e0ee7237..7ff866dbb89e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -865,8 +865,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	return rc;
 }
 
-static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
-		__u16 fid, u32 *pacllen)
+struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+		const struct cifs_fid *cifsfid, u32 *pacllen)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	unsigned int xid;
@@ -877,7 +877,8 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 		return ERR_CAST(tlink);
 
 	xid = get_xid();
-	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
+				pacllen);
 	free_xid(xid);
 
 	cifs_put_tlink(tlink);
@@ -895,9 +896,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	int oplock = 0;
 	unsigned int xid;
 	int rc, create_options = 0;
-	__u16 fid;
 	struct cifs_tcon *tcon;
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 
 	if (IS_ERR(tlink))
 		return ERR_CAST(tlink);
@@ -908,12 +910,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
-			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = READ_CONTROL;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
 	if (!rc) {
-		rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-		CIFSSMBClose(xid, tcon, fid);
+		rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+		CIFSSMBClose(xid, tcon, fid.netfid);
 	}
 
 	cifs_put_tlink(tlink);
@@ -938,7 +947,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	if (!open_file)
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
-	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen);
+	pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
 	cifsFileInfo_put(open_file);
 	return pntsd;
 }
@@ -950,10 +959,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	int oplock = 0;
 	unsigned int xid;
 	int rc, access_flags, create_options = 0;
-	__u16 fid;
 	struct cifs_tcon *tcon;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
@@ -969,18 +979,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	else
 		access_flags = WRITE_DAC;
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
-			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = access_flags;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
 	if (rc) {
 		cifs_dbg(VFS, "Unable to open file to set ACL\n");
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag);
 	cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
 
-	CIFSSMBClose(xid, tcon, fid);
+	CIFSSMBClose(xid, tcon, fid.netfid);
 out:
 	free_xid(xid);
 	cifs_put_tlink(tlink);
@@ -990,19 +1007,31 @@ out:
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
 int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
-		  struct inode *inode, const char *path, const __u16 *pfid)
+		  struct inode *inode, const char *path,
+		  const struct cifs_fid *pfid)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	u32 acllen = 0;
 	int rc = 0;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
 
 	cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
 
-	if (pfid)
-		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
-	else
-		pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
+	if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
+		pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
+							       &acllen);
+	else if (tcon->ses->server->ops->get_acl)
+		pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+							&acllen);
+	else {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
 	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
@@ -1014,6 +1043,8 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 		cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
 	}
 
+	cifs_put_tlink(tlink);
+
 	return rc;
 }
 
@@ -1027,15 +1058,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	__u32 secdesclen = 0;
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
 
 	/* Get the security descriptor */
-	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
+
+	if (tcon->ses->server->ops->get_acl == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+						&secdesclen);
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
 		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
-		goto out;
+		cifs_put_tlink(tlink);
+		return rc;
 	}
 
 	/*
1041 /* 1087 /*
@@ -1048,6 +1094,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 	if (!pnntsd) {
 		kfree(pntsd);
+		cifs_put_tlink(tlink);
 		return -ENOMEM;
 	}
 
@@ -1056,14 +1103,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 
 	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
 
+	if (tcon->ses->server->ops->set_acl == NULL)
+		rc = -EOPNOTSUPP;
+
 	if (!rc) {
 		/* Set the security descriptor */
-		rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
+		rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+						     path, aclflag);
 		cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
 	}
+	cifs_put_tlink(tlink);
 
 	kfree(pnntsd);
 	kfree(pntsd);
-out:
 	return rc;
 }
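Note: the recurring conversion in this file (and in dir.c and inode.c below, as flagged with "see the sketch here") is mechanical: the nine-argument CIFSSMBOpen() calls become a cifs_open_parms structure the caller fills in once and hands to CIFS_open(), with the opened handle coming back through oparms.fid instead of a bare __u16. A small sketch of the same refactor, with placeholder types standing in for the kernel's structs (the field names mirror the diff; the values are invented for illustration):

	/* Sketch only: simplified stand-ins for cifs_open_parms/CIFS_open. */
	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms {
		int		desired_access;
		int		create_options;
		int		disposition;
		const char	*path;
		unsigned short	*fid;		/* handle returned on success */
		bool		reconnect;
	};

	static int sketch_open(struct open_parms *oparms)
	{
		printf("open %s, access 0x%x, disposition %d\n",
		       oparms->path, oparms->desired_access, oparms->disposition);
		*oparms->fid = 42;	/* pretend the server returned a fid */
		return 0;
	}

	int main(void)
	{
		unsigned short fid;
		struct open_parms oparms = {
			.desired_access	= 0x00020000,	/* READ_CONTROL-like bit */
			.create_options	= 0,
			.disposition	= 1,		/* FILE_OPEN-like value */
			.path		= "/some/file",
			.fid		= &fid,
			.reconnect	= false,
		};
		int rc = sketch_open(&oparms);

		printf("rc=%d fid=%u\n", rc, fid);
		return rc;
	}

The design win is visible at every converted call site: named fields replace a positional argument list, so adding a parameter (like reconnect) no longer touches every caller.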
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f918a998a087..c0f3718b77a8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -323,7 +323,8 @@ struct smb_version_operations {
 	/* async read from the server */
 	int (*async_readv)(struct cifs_readdata *);
 	/* async write to the server */
-	int (*async_writev)(struct cifs_writedata *);
+	int (*async_writev)(struct cifs_writedata *,
+			    void (*release)(struct kref *));
 	/* sync read from the server */
 	int (*sync_read)(const unsigned int, struct cifsFileInfo *,
 			 struct cifs_io_parms *, unsigned int *, char **,
@@ -370,8 +371,12 @@ struct smb_version_operations {
 	void (*new_lease_key)(struct cifs_fid *);
 	int (*generate_signingkey)(struct cifs_ses *);
 	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
-	int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *,
-			struct cifs_sb_info *, unsigned int);
+	int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
+				struct cifs_sb_info *, const unsigned char *,
+				char *, unsigned int *);
+	int (*create_mf_symlink)(unsigned int, struct cifs_tcon *,
+				 struct cifs_sb_info *, const unsigned char *,
+				 char *, unsigned int *);
 	/* if we can do cache read operations */
 	bool (*is_read_op)(__u32);
 	/* set oplock level for the inode */
@@ -385,6 +390,18 @@ struct smb_version_operations {
 			struct cifsFileInfo *target_file, u64 src_off, u64 len,
 			u64 dest_off);
 	int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
+	ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+			const unsigned char *, const unsigned char *, char *,
+			size_t, const struct nls_table *, int);
+	int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+			const char *, const void *, const __u16,
+			const struct nls_table *, int);
+	struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+			const char *, u32 *);
+	struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
+			const struct cifs_fid *, u32 *);
+	int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+			int);
 };
 
 struct smb_version_values {
@@ -496,7 +513,7 @@ struct cifs_mnt_data {
 static inline unsigned int
 get_rfc1002_length(void *buf)
 {
-	return be32_to_cpu(*((__be32 *)buf));
+	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
 }
 
 static inline void
502static inline void 519static inline void
@@ -1054,7 +1071,7 @@ struct cifs_writedata {
 	unsigned int			pagesz;
 	unsigned int			tailsz;
 	unsigned int			nr_pages;
-	struct page			*pages[1];
+	struct page			*pages[];
 };
 
 /*
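Note: two details in the cifsglob.h hunks are easy to miss. get_rfc1002_length() now masks the big-endian word with 0xffffff because the top byte of the 4-byte RFC 1002 session header carries the message type, not length; and cifs_writedata.pages becomes a C99 flexible array member, which pairs with the allocation-size change in cifssmb.c below. A standalone sketch of the masking, with plain byte handling standing in for be32_to_cpu():

	/* Sketch: only the low 24 bits of the session header word are length. */
	#include <stdint.h>
	#include <stdio.h>

	static unsigned int rfc1002_length(const unsigned char *buf)
	{
		uint32_t be = ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
			      ((uint32_t)buf[2] << 8)  |  (uint32_t)buf[3];
		return be & 0xffffff;	/* strip the type byte */
	}

	int main(void)
	{
		/* type byte 0x85 in the header, payload length 0x000102 */
		unsigned char hdr[4] = { 0x85, 0x00, 0x01, 0x02 };

		printf("len = 0x%x\n", rfc1002_length(hdr));	/* 0x102 */
		return 0;
	}

Without the mask, a nonzero type byte would be misread as an enormous length.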
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2c29db6a247e..acc4ee8ed075 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -151,7 +151,7 @@ extern struct inode *cifs_iget(struct super_block *sb,
 
 extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
 			       FILE_ALL_INFO *data, struct super_block *sb,
-			       int xid, const __u16 *fid);
+			       int xid, const struct cifs_fid *fid);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, unsigned int xid);
@@ -162,11 +162,13 @@ extern int cifs_rename_pending_delete(const char *full_path,
 				      const unsigned int xid);
 extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr, struct inode *inode,
-			      const char *path, const __u16 *pfid);
+			      const char *path, const struct cifs_fid *pfid);
 extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
 					kuid_t, kgid_t);
 extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 					const char *, u32 *);
+extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
+					const struct cifs_fid *, u32 *);
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
 				const char *, int);
 
@@ -362,11 +364,8 @@ extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 			const struct nls_table *nls_codepage);
 extern int CIFSSMB_set_compression(const unsigned int xid,
 				   struct cifs_tcon *tcon, __u16 fid);
-extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
-		const char *fileName, const int disposition,
-		const int access_flags, const int omode,
-		__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
-		const struct nls_table *nls_codepage, int remap);
+extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     int *oplock, FILE_ALL_INFO *buf);
 extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
@@ -476,8 +475,8 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
 extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
-extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
+extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
 			      struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr,
 			      const unsigned char *path);
@@ -491,12 +490,18 @@ void cifs_readdata_release(struct kref *refcount);
 int cifs_async_readv(struct cifs_readdata *rdata);
 int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
 
-int cifs_async_writev(struct cifs_writedata *wdata);
+int cifs_async_writev(struct cifs_writedata *wdata,
+		      void (*release)(struct kref *kref));
 void cifs_writev_complete(struct work_struct *work);
 struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
 						work_func_t complete);
 void cifs_writedata_release(struct kref *refcount);
-int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
-			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
-			unsigned int xid);
+int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			  struct cifs_sb_info *cifs_sb,
+			  const unsigned char *path, char *pbuf,
+			  unsigned int *pbytes_read);
+int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			   struct cifs_sb_info *cifs_sb,
+			   const unsigned char *path, char *pbuf,
+			   unsigned int *pbytes_written);
 #endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d707edb6b852..f3264bd7a83d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1273,104 +1273,124 @@ OldOpenRetry:
 }
 
 int
-CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
-	    const char *fileName, const int openDisposition,
-	    const int access_flags, const int create_options, __u16 *netfid,
-	    int *pOplock, FILE_ALL_INFO *pfile_info,
-	    const struct nls_table *nls_codepage, int remap)
+CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
+	  FILE_ALL_INFO *buf)
 {
 	int rc = -EACCES;
-	OPEN_REQ *pSMB = NULL;
-	OPEN_RSP *pSMBr = NULL;
+	OPEN_REQ *req = NULL;
+	OPEN_RSP *rsp = NULL;
 	int bytes_returned;
 	int name_len;
 	__u16 count;
+	struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
+	struct cifs_tcon *tcon = oparms->tcon;
+	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+	const struct nls_table *nls = cifs_sb->local_nls;
+	int create_options = oparms->create_options;
+	int desired_access = oparms->desired_access;
+	int disposition = oparms->disposition;
+	const char *path = oparms->path;
 
 openRetry:
-	rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req,
+		      (void **)&rsp);
 	if (rc)
 		return rc;
 
-	pSMB->AndXCommand = 0xFF;	/* none */
+	/* no commands go after this */
+	req->AndXCommand = 0xFF;
 
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		count = 1;	/* account for one byte pad to word boundary */
-		name_len =
-		    cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
-				       fileName, PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
+	if (req->hdr.Flags2 & SMBFLG2_UNICODE) {
+		/* account for one byte pad to word boundary */
+		count = 1;
+		name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1),
+					      path, PATH_MAX, nls, remap);
+		/* trailing null */
+		name_len++;
 		name_len *= 2;
-		pSMB->NameLength = cpu_to_le16(name_len);
-	} else {		/* BB improve check for buffer overruns BB */
-		count = 0;	/* no pad */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;	/* trailing null */
-		pSMB->NameLength = cpu_to_le16(name_len);
-		strncpy(pSMB->fileName, fileName, name_len);
+		req->NameLength = cpu_to_le16(name_len);
+	} else {
+		/* BB improve check for buffer overruns BB */
+		/* no pad */
+		count = 0;
+		name_len = strnlen(path, PATH_MAX);
+		/* trailing null */
+		name_len++;
+		req->NameLength = cpu_to_le16(name_len);
+		strncpy(req->fileName, path, name_len);
 	}
-	if (*pOplock & REQ_OPLOCK)
-		pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK);
-	else if (*pOplock & REQ_BATCHOPLOCK)
-		pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
-	pSMB->DesiredAccess = cpu_to_le32(access_flags);
-	pSMB->AllocationSize = 0;
-	/* set file as system file if special file such
-	   as fifo and server expecting SFU style and
-	   no Unix extensions */
+
+	if (*oplock & REQ_OPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_OPLOCK);
+	else if (*oplock & REQ_BATCHOPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
+
+	req->DesiredAccess = cpu_to_le32(desired_access);
+	req->AllocationSize = 0;
+
+	/*
+	 * Set file as system file if special file such as fifo and server
+	 * expecting SFU style and no Unix extensions.
+	 */
 	if (create_options & CREATE_OPTION_SPECIAL)
-		pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
+		req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
 	else
-		pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL);
+		req->FileAttributes = cpu_to_le32(ATTR_NORMAL);
 
-	/* XP does not handle ATTR_POSIX_SEMANTICS */
-	/* but it helps speed up case sensitive checks for other
-	   servers such as Samba */
+	/*
+	 * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case
+	 * sensitive checks for other servers such as Samba.
+	 */
 	if (tcon->ses->capabilities & CAP_UNIX)
-		pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
+		req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
 
 	if (create_options & CREATE_OPTION_READONLY)
-		pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+		req->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+
+	req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
+	req->CreateDisposition = cpu_to_le32(disposition);
+	req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
 
-	pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
-	pSMB->CreateDisposition = cpu_to_le32(openDisposition);
-	pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
 	/* BB Expirement with various impersonation levels and verify */
-	pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
-	pSMB->SecurityFlags =
-	    SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
+	req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
+	req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY;
 
 	count += name_len;
-	inc_rfc1001_len(pSMB, count);
+	inc_rfc1001_len(req, count);
 
-	pSMB->ByteCount = cpu_to_le16(count);
-	/* long_op set to 1 to allow for oplock break timeouts */
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
+	req->ByteCount = cpu_to_le16(count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req,
+			 (struct smb_hdr *)rsp, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
 	if (rc) {
 		cifs_dbg(FYI, "Error in Open = %d\n", rc);
-	} else {
-		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
-		*netfid = pSMBr->Fid;	/* cifs fid stays in le */
-		/* Let caller know file was created so we can set the mode. */
-		/* Do we care about the CreateAction in any other cases? */
-		if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
-			*pOplock |= CIFS_CREATE_ACTION;
-		if (pfile_info) {
-			memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime,
-			       36 /* CreationTime to Attributes */);
-			/* the file_info buf is endian converted by caller */
-			pfile_info->AllocationSize = pSMBr->AllocationSize;
-			pfile_info->EndOfFile = pSMBr->EndOfFile;
-			pfile_info->NumberOfLinks = cpu_to_le32(1);
-			pfile_info->DeletePending = 0;
-		}
+		cifs_buf_release(req);
+		if (rc == -EAGAIN)
+			goto openRetry;
+		return rc;
 	}
 
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto openRetry;
+	/* 1 byte no need to le_to_cpu */
+	*oplock = rsp->OplockLevel;
+	/* cifs fid stays in le */
+	oparms->fid->netfid = rsp->Fid;
+
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction)
+		*oplock |= CIFS_CREATE_ACTION;
+
+	if (buf) {
+		/* copy from CreationTime to Attributes */
+		memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+		/* the file_info buf is endian converted by caller */
+		buf->AllocationSize = rsp->AllocationSize;
+		buf->EndOfFile = rsp->EndOfFile;
+		buf->NumberOfLinks = cpu_to_le32(1);
+		buf->DeletePending = 0;
+	}
+
+	cifs_buf_release(req);
 	return rc;
 }
 
@@ -1890,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 
 	do {
 		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
-		rc = server->ops->async_writev(wdata);
+		rc = server->ops->async_writev(wdata, cifs_writedata_release);
 	} while (rc == -EAGAIN);
 
 	for (i = 0; i < wdata->nr_pages; i++) {
@@ -1942,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
 {
 	struct cifs_writedata *wdata;
 
-	/* this would overflow */
-	if (nr_pages == 0) {
-		cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__);
-		return NULL;
-	}
-
 	/* writedata + number of page pointers */
 	wdata = kzalloc(sizeof(*wdata) +
-			sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
+			sizeof(struct page *) * nr_pages, GFP_NOFS);
 	if (wdata != NULL) {
 		kref_init(&wdata->refcount);
 		INIT_LIST_HEAD(&wdata->list);
@@ -2011,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid)
 
 /* cifs_async_writev - send an async write, and set up mid to handle result */
 int
-cifs_async_writev(struct cifs_writedata *wdata)
+cifs_async_writev(struct cifs_writedata *wdata,
+		  void (*release)(struct kref *kref))
 {
 	int rc = -EACCES;
 	WRITE_REQ *smb = NULL;
@@ -2085,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
 	else
-		kref_put(&wdata->refcount, cifs_writedata_release);
+		kref_put(&wdata->refcount, release);
 
 async_writev_out:
 	cifs_small_buf_release(smb);
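Note: the cifs_writedata_alloc() change above works together with the pages[] flexible array member from cifsglob.h. The allocation is now sizeof(*wdata) plus nr_pages pointers, so nr_pages == 0 no longer underflows the old (nr_pages - 1) arithmetic and the special-case guard could be dropped. A sketch of that sizing pattern, with plain calloc standing in for kzalloc:

	/* Sketch: flexible-array sizing, user-space stand-in for kzalloc. */
	#include <stdio.h>
	#include <stdlib.h>

	struct writedata {
		unsigned int	nr_pages;
		void		*pages[];	/* was pages[1] */
	};

	static struct writedata *writedata_alloc(unsigned int nr_pages)
	{
		struct writedata *wdata;

		wdata = calloc(1, sizeof(*wdata) + sizeof(void *) * nr_pages);
		if (wdata)
			wdata->nr_pages = nr_pages;
		return wdata;
	}

	int main(void)
	{
		struct writedata *w = writedata_alloc(0);	/* safe now */

		printf("allocation %s\n", w ? "ok" : "failed");
		free(w);
		return 0;
	}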
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a514e0a65f69..3db0c5fd9a11 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -378,7 +378,7 @@ cifs_create_get_file_info:
 							 xid);
 	else {
 		rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
-					 xid, &fid->netfid);
+					 xid, fid);
 		if (newinode) {
 			if (server->ops->set_lease_key)
 				server->ops->set_lease_key(newinode, fid);
@@ -565,12 +565,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
 	struct cifs_sb_info *cifs_sb;
 	struct tcon_link *tlink;
-	struct cifs_tcon *pTcon;
+	struct cifs_tcon *tcon;
 	struct cifs_io_parms io_parms;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
 	int oplock = 0;
-	u16 fileHandle;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 	FILE_ALL_INFO *buf = NULL;
 	unsigned int bytes_written;
 	struct win_dev *pdev;
@@ -583,7 +584,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
 
-	pTcon = tlink_tcon(tlink);
+	tcon = tlink_tcon(tlink);
 
 	xid = get_xid();
 
@@ -593,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 		goto mknod_out;
 	}
 
-	if (pTcon->unix_ext) {
+	if (tcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
 			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
@@ -608,7 +609,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 			args.uid = INVALID_UID; /* no change */
 			args.gid = INVALID_GID; /* no change */
 		}
-		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+		rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -640,42 +641,44 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;
 
-	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
-			 GENERIC_WRITE, create_options,
-			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, buf);
 	if (rc)
 		goto mknod_out;
 
-	/* BB Do not bother to decode buf since no local inode yet to put
-	 * timestamps in, but we can reuse it safely */
+	/*
+	 * BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely.
+	 */
 
 	pdev = (struct win_dev *)buf;
-	io_parms.netfid = fileHandle;
+	io_parms.netfid = fid.netfid;
 	io_parms.pid = current->tgid;
-	io_parms.tcon = pTcon;
+	io_parms.tcon = tcon;
 	io_parms.offset = 0;
 	io_parms.length = sizeof(struct win_dev);
 	if (S_ISCHR(mode)) {
 		memcpy(pdev->type, "IntxCHR", 8);
-		pdev->major =
-		      cpu_to_le64(MAJOR(device_number));
-		pdev->minor =
-		      cpu_to_le64(MINOR(device_number));
-		rc = CIFSSMBWrite(xid, &io_parms,
-			&bytes_written, (char *)pdev,
-			NULL, 0);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+				  NULL, 0);
 	} else if (S_ISBLK(mode)) {
 		memcpy(pdev->type, "IntxBLK", 8);
-		pdev->major =
-		      cpu_to_le64(MAJOR(device_number));
-		pdev->minor =
-		      cpu_to_le64(MINOR(device_number));
-		rc = CIFSSMBWrite(xid, &io_parms,
-			&bytes_written, (char *)pdev,
-			NULL, 0);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+				  NULL, 0);
 	} /* else if (S_ISFIFO) */
-	CIFSSMBClose(xid, pTcon, fileHandle);
+	CIFSSMBClose(xid, tcon, fid.netfid);
 	d_drop(direntry);
 
 	/* FIXME: add code here to set EAs */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a5a87240fe2..834fce759d80 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -244,7 +244,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 							 xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-					 xid, &fid->netfid);
+					 xid, fid);
 
 out:
 	kfree(buf);
@@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 
 	/*
 	 * Can not refresh inode by passing in file_info buf to be returned by
-	 * CIFSSMBOpen and then calling get_inode_info with returned buf since
+	 * ops->open and then calling get_inode_info with returned buf since
 	 * file might have write behind data that needs to be flushed and server
 	 * version of file size can be stale. If we knew for sure that inode was
 	 * not dirty locally we could do this.
@@ -2043,7 +2043,8 @@ retry:
 			}
 			wdata->pid = wdata->cfile->pid;
 			server = tlink_tcon(wdata->cfile->tlink)->ses->server;
-			rc = server->ops->async_writev(wdata);
+			rc = server->ops->async_writev(wdata,
+						       cifs_writedata_release);
 		} while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
 
 		for (i = 0; i < nr_pages; ++i)
@@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
 }
 
 static void
-cifs_uncached_writev_complete(struct work_struct *work)
+cifs_uncached_writedata_release(struct kref *refcount)
 {
 	int i;
+	struct cifs_writedata *wdata = container_of(refcount,
+					struct cifs_writedata, refcount);
+
+	for (i = 0; i < wdata->nr_pages; i++)
+		put_page(wdata->pages[i]);
+	cifs_writedata_release(refcount);
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
 	struct cifs_writedata *wdata = container_of(work,
 					struct cifs_writedata, work);
 	struct inode *inode = wdata->cfile->dentry->d_inode;
@@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work)
 
 	complete(&wdata->done);
 
-	if (wdata->result != -EAGAIN) {
-		for (i = 0; i < wdata->nr_pages; i++)
-			put_page(wdata->pages[i]);
-	}
-
-	kref_put(&wdata->refcount, cifs_writedata_release);
+	kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 }
 
 /* attempt to send write to server, retry on any -EAGAIN errors */
@@ -2370,7 +2377,8 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
 			if (rc != 0)
 				continue;
 		}
-		rc = server->ops->async_writev(wdata);
+		rc = server->ops->async_writev(wdata,
+					       cifs_uncached_writedata_release);
 	} while (rc == -EAGAIN);
 
 	return rc;
@@ -2381,7 +2389,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
 	unsigned long nr_pages, i;
-	size_t copied, len, cur_len;
+	size_t bytes, copied, len, cur_len;
 	ssize_t total_written = 0;
 	loff_t offset;
 	struct iov_iter it;
@@ -2436,14 +2444,45 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 
 		save_len = cur_len;
 		for (i = 0; i < nr_pages; i++) {
-			copied = min_t(const size_t, cur_len, PAGE_SIZE);
+			bytes = min_t(const size_t, cur_len, PAGE_SIZE);
 			copied = iov_iter_copy_from_user(wdata->pages[i], &it,
-							 0, copied);
+							 0, bytes);
 			cur_len -= copied;
 			iov_iter_advance(&it, copied);
+			/*
+			 * If we didn't copy as much as we expected, then that
+			 * may mean we trod into an unmapped area. Stop copying
+			 * at that point. On the next pass through the big
+			 * loop, we'll likely end up getting a zero-length
+			 * write and bailing out of it.
+			 */
+			if (copied < bytes)
+				break;
 		}
 		cur_len = save_len - cur_len;
 
+		/*
+		 * If we have no data to send, then that probably means that
+		 * the copy above failed altogether. That's most likely because
+		 * the address in the iovec was bogus. Set the rc to -EFAULT,
+		 * free anything we allocated and bail out.
+		 */
+		if (!cur_len) {
+			for (i = 0; i < nr_pages; i++)
+				put_page(wdata->pages[i]);
+			kfree(wdata);
+			rc = -EFAULT;
+			break;
+		}
+
+		/*
+		 * i + 1 now represents the number of pages we actually used in
+		 * the copy phase above. Bring nr_pages down to that, and free
+		 * any pages that we didn't use.
+		 */
+		for ( ; nr_pages > i + 1; nr_pages--)
+			put_page(wdata->pages[nr_pages - 1]);
+
 		wdata->sync_mode = WB_SYNC_ALL;
 		wdata->nr_pages = nr_pages;
 		wdata->offset = (__u64)offset;
@@ -2454,7 +2493,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 		wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
 		rc = cifs_uncached_retry_writev(wdata);
 		if (rc) {
-			kref_put(&wdata->refcount, cifs_writedata_release);
+			kref_put(&wdata->refcount,
+				 cifs_uncached_writedata_release);
 			break;
 		}
 
@@ -2496,7 +2536,7 @@ restart_loop:
 			}
 		}
 		list_del_init(&wdata->list);
-		kref_put(&wdata->refcount, cifs_writedata_release);
+		kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 	}
 
 	if (total_written > 0)
@@ -2539,31 +2579,19 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	ssize_t rc = -EACCES;
+	loff_t lock_pos = pos;
 
-	BUG_ON(iocb->ki_pos != pos);
-
+	if (file->f_flags & O_APPEND)
+		lock_pos = i_size_read(inode);
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
-	if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+	if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
 				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP)) {
-		mutex_lock(&inode->i_mutex);
-		rc = __generic_file_aio_write(iocb, iov, nr_segs,
-					       &iocb->ki_pos);
-		mutex_unlock(&inode->i_mutex);
-	}
-
-	if (rc > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(file, pos, rc);
-		if (err < 0 && rc > 0)
-			rc = err;
-	}
-
+				     CIFS_WRITE_OP))
+		rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	up_read(&cinode->lock_sem);
 	return rc;
 }
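Note: the file.c changes above thread a release callback through async_writev() because uncached writes hold per-page references that cached writeback does not; the uncached release drops those references and then chains to the common one, so pages are put exactly once whether the write succeeds, fails, or is requeued. A toy sketch of choosing the destructor at put time, with a simplified counter standing in for kref_put():

	/* Sketch: per-caller release callback for a refcounted object. */
	#include <stdio.h>

	struct obj {
		int refs;
	};

	static void common_release(struct obj *o)
	{
		(void)o;
		printf("common release\n");
	}

	static void uncached_release(struct obj *o)
	{
		printf("drop per-page references first\n");
		common_release(o);	/* then chain to the shared teardown */
	}

	static void obj_put(struct obj *o, void (*release)(struct obj *))
	{
		if (--o->refs == 0)
			release(o);
	}

	int main(void)
	{
		struct obj a = { .refs = 1 };
		struct obj b = { .refs = 1 };

		obj_put(&a, common_release);	/* cached writeback path */
		obj_put(&b, uncached_release);	/* uncached write path */
		return 0;
	}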
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 49719b8228e5..aadc2b68678b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -383,10 +383,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+		int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
 					       full_path);
 		if (tmprc)
-			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
+			cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
 	}
 
 	if (*pinode == NULL) {
@@ -404,18 +404,20 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 }
 
 static int
-cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
+cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
 	      struct cifs_sb_info *cifs_sb, unsigned int xid)
 {
 	int rc;
 	int oplock = 0;
-	__u16 netfid;
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 	struct cifs_io_parms io_parms;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
+	int buf_type = CIFS_NO_BUFFER;
 
 	pbuf = buf;
 
@@ -436,62 +438,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return PTR_ERR(tlink);
 	tcon = tlink_tcon(tlink);
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
-			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-			 cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc == 0) {
-		int buf_type = CIFS_NO_BUFFER;
-		/* Read header */
-		io_parms.netfid = netfid;
-		io_parms.pid = current->tgid;
-		io_parms.tcon = tcon;
-		io_parms.offset = 0;
-		io_parms.length = 24;
-		rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf,
-				 &buf_type);
-		if ((rc == 0) && (bytes_read >= 8)) {
-			if (memcmp("IntxBLK", pbuf, 8) == 0) {
-				cifs_dbg(FYI, "Block device\n");
-				fattr->cf_mode |= S_IFBLK;
-				fattr->cf_dtype = DT_BLK;
-				if (bytes_read == 24) {
-					/* we have enough to decode dev num */
-					__u64 mjr; /* major */
-					__u64 mnr; /* minor */
-					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
-					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					fattr->cf_rdev = MKDEV(mjr, mnr);
-				}
-			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-				cifs_dbg(FYI, "Char device\n");
-				fattr->cf_mode |= S_IFCHR;
-				fattr->cf_dtype = DT_CHR;
-				if (bytes_read == 24) {
-					/* we have enough to decode dev num */
-					__u64 mjr; /* major */
-					__u64 mnr; /* minor */
-					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
-					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					fattr->cf_rdev = MKDEV(mjr, mnr);
-				}
-			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-				cifs_dbg(FYI, "Symlink\n");
-				fattr->cf_mode |= S_IFLNK;
-				fattr->cf_dtype = DT_LNK;
-			} else {
-				fattr->cf_mode |= S_IFREG; /* file? */
-				fattr->cf_dtype = DT_REG;
-				rc = -EOPNOTSUPP;
-			}
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_READ;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc) {
+		cifs_put_tlink(tlink);
+		return rc;
+	}
+
+	/* Read header */
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = 24;
+
+	rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
+	if ((rc == 0) && (bytes_read >= 8)) {
+		if (memcmp("IntxBLK", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Block device\n");
+			fattr->cf_mode |= S_IFBLK;
+			fattr->cf_dtype = DT_BLK;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Char device\n");
+			fattr->cf_mode |= S_IFCHR;
+			fattr->cf_dtype = DT_CHR;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+			cifs_dbg(FYI, "Symlink\n");
+			fattr->cf_mode |= S_IFLNK;
+			fattr->cf_dtype = DT_LNK;
 		} else {
-			fattr->cf_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_mode |= S_IFREG; /* file? */
 			fattr->cf_dtype = DT_REG;
-			rc = -EOPNOTSUPP; /* or some unknown SFU type */
+			rc = -EOPNOTSUPP;
 		}
-		CIFSSMBClose(xid, tcon, netfid);
+	} else {
+		fattr->cf_mode |= S_IFREG; /* then it is a file */
+		fattr->cf_dtype = DT_REG;
+		rc = -EOPNOTSUPP; /* or some unknown SFU type */
 	}
+
+	CIFSSMBClose(xid, tcon, fid.netfid);
 	cifs_put_tlink(tlink);
 	return rc;
 }
@@ -518,10 +527,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
518 return PTR_ERR(tlink); 527 return PTR_ERR(tlink);
519 tcon = tlink_tcon(tlink); 528 tcon = tlink_tcon(tlink);
520 529
521 rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", 530 if (tcon->ses->server->ops->query_all_EAs == NULL) {
522 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 531 cifs_put_tlink(tlink);
523 cifs_sb->mnt_cifs_flags & 532 return -EOPNOTSUPP;
524 CIFS_MOUNT_MAP_SPECIAL_CHR); 533 }
534
535 rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
536 "SETFILEBITS", ea_value, 4 /* size of buf */,
537 cifs_sb->local_nls,
538 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
525 cifs_put_tlink(tlink); 539 cifs_put_tlink(tlink);
526 if (rc < 0) 540 if (rc < 0)
527 return (int)rc; 541 return (int)rc;
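
The cifs_sfu_mode() hunk (like the xattr.c changes later in this series) stops calling CIFSSMBQAllEAs() directly and dispatches through the per-dialect ops table, failing with -EOPNOTSUPP when a dialect does not wire the op up. The shape of that guard, as a self-contained sketch:

#include <errno.h>
#include <stddef.h>

struct server_ops {
        int (*query_all_EAs)(const char *path, char *buf, size_t len);
};

static int query_eas(const struct server_ops *ops, const char *path,
                     char *buf, size_t len)
{
        if (!ops->query_all_EAs)
                return -EOPNOTSUPP;     /* dialect provides no EA call */
        return ops->query_all_EAs(path, buf, len);
}

Note that the kernel version must also drop the tlink reference it took earlier before bailing out, which is exactly what the added cifs_put_tlink() call does.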
@@ -663,7 +677,7 @@ cgfi_exit:
663int 677int
664cifs_get_inode_info(struct inode **inode, const char *full_path, 678cifs_get_inode_info(struct inode **inode, const char *full_path,
665 FILE_ALL_INFO *data, struct super_block *sb, int xid, 679 FILE_ALL_INFO *data, struct super_block *sb, int xid,
666 const __u16 *fid) 680 const struct cifs_fid *fid)
667{ 681{
668 bool validinum = false; 682 bool validinum = false;
669 __u16 srchflgs; 683 __u16 srchflgs;
@@ -800,10 +814,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
800 814
801 /* check for Minshall+French symlinks */ 815 /* check for Minshall+French symlinks */
802 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { 816 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
803 tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, 817 tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
804 full_path); 818 full_path);
805 if (tmprc) 819 if (tmprc)
806 cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); 820 cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
807 } 821 }
808 822
809 if (!*inode) { 823 if (!*inode) {
@@ -1032,7 +1046,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1032{ 1046{
1033 int oplock = 0; 1047 int oplock = 0;
1034 int rc; 1048 int rc;
1035 __u16 netfid; 1049 struct cifs_fid fid;
1050 struct cifs_open_parms oparms;
1036 struct inode *inode = dentry->d_inode; 1051 struct inode *inode = dentry->d_inode;
1037 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1052 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1038 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1053 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1055,10 +1070,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1055 goto out; 1070 goto out;
1056 } 1071 }
1057 1072
1058 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 1073 oparms.tcon = tcon;
1059 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 1074 oparms.cifs_sb = cifs_sb;
1060 &netfid, &oplock, NULL, cifs_sb->local_nls, 1075 oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
1061 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1076 oparms.create_options = CREATE_NOT_DIR;
1077 oparms.disposition = FILE_OPEN;
1078 oparms.path = full_path;
1079 oparms.fid = &fid;
1080 oparms.reconnect = false;
1081
1082 rc = CIFS_open(xid, &oparms, &oplock, NULL);
1062 if (rc != 0) 1083 if (rc != 0)
1063 goto out; 1084 goto out;
1064 1085
@@ -1079,7 +1100,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1079 goto out_close; 1100 goto out_close;
1080 } 1101 }
1081 info_buf->Attributes = cpu_to_le32(dosattr); 1102 info_buf->Attributes = cpu_to_le32(dosattr);
1082 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, 1103 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
1083 current->tgid); 1104 current->tgid);
1084 /* although we would like to mark the file hidden 1105 /* although we would like to mark the file hidden
1085 if that fails we will still try to rename it */ 1106 if that fails we will still try to rename it */
@@ -1090,7 +1111,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1090 } 1111 }
1091 1112
1092 /* rename the file */ 1113 /* rename the file */
1093 rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, 1114 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
1115 cifs_sb->local_nls,
1094 cifs_sb->mnt_cifs_flags & 1116 cifs_sb->mnt_cifs_flags &
1095 CIFS_MOUNT_MAP_SPECIAL_CHR); 1117 CIFS_MOUNT_MAP_SPECIAL_CHR);
1096 if (rc != 0) { 1118 if (rc != 0) {
@@ -1100,7 +1122,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1100 1122
1101 /* try to set DELETE_ON_CLOSE */ 1123 /* try to set DELETE_ON_CLOSE */
1102 if (!cifsInode->delete_pending) { 1124 if (!cifsInode->delete_pending) {
1103 rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, 1125 rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid,
1104 current->tgid); 1126 current->tgid);
1105 /* 1127 /*
1106 * some samba versions return -ENOENT when we try to set the 1128 * some samba versions return -ENOENT when we try to set the
@@ -1120,7 +1142,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1120 } 1142 }
1121 1143
1122out_close: 1144out_close:
1123 CIFSSMBClose(xid, tcon, netfid); 1145 CIFSSMBClose(xid, tcon, fid.netfid);
1124out: 1146out:
1125 kfree(info_buf); 1147 kfree(info_buf);
1126 cifs_put_tlink(tlink); 1148 cifs_put_tlink(tlink);
@@ -1132,13 +1154,13 @@ out:
1132 * them anyway. 1154 * them anyway.
1133 */ 1155 */
1134undo_rename: 1156undo_rename:
1135 CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, 1157 CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
1136 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1158 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1137 CIFS_MOUNT_MAP_SPECIAL_CHR); 1159 CIFS_MOUNT_MAP_SPECIAL_CHR);
1138undo_setattr: 1160undo_setattr:
1139 if (dosattr != origattr) { 1161 if (dosattr != origattr) {
1140 info_buf->Attributes = cpu_to_le32(origattr); 1162 info_buf->Attributes = cpu_to_le32(origattr);
1141 if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, 1163 if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
1142 current->tgid)) 1164 current->tgid))
1143 cifsInode->cifsAttrs = origattr; 1165 cifsInode->cifsAttrs = origattr;
1144 } 1166 }
@@ -1549,7 +1571,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1549 struct tcon_link *tlink; 1571 struct tcon_link *tlink;
1550 struct cifs_tcon *tcon; 1572 struct cifs_tcon *tcon;
1551 struct TCP_Server_Info *server; 1573 struct TCP_Server_Info *server;
1552 __u16 srcfid; 1574 struct cifs_fid fid;
1575 struct cifs_open_parms oparms;
1553 int oplock, rc; 1576 int oplock, rc;
1554 1577
1555 tlink = cifs_sb_tlink(cifs_sb); 1578 tlink = cifs_sb_tlink(cifs_sb);
@@ -1576,17 +1599,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1576 if (to_dentry->d_parent != from_dentry->d_parent) 1599 if (to_dentry->d_parent != from_dentry->d_parent)
1577 goto do_rename_exit; 1600 goto do_rename_exit;
1578 1601
1602 oparms.tcon = tcon;
1603 oparms.cifs_sb = cifs_sb;
1579 /* open the file to be renamed -- we need DELETE perms */ 1604 /* open the file to be renamed -- we need DELETE perms */
1580 rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, 1605 oparms.desired_access = DELETE;
1581 CREATE_NOT_DIR, &srcfid, &oplock, NULL, 1606 oparms.create_options = CREATE_NOT_DIR;
1582 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1607 oparms.disposition = FILE_OPEN;
1583 CIFS_MOUNT_MAP_SPECIAL_CHR); 1608 oparms.path = from_path;
1609 oparms.fid = &fid;
1610 oparms.reconnect = false;
1611
1612 rc = CIFS_open(xid, &oparms, &oplock, NULL);
1584 if (rc == 0) { 1613 if (rc == 0) {
1585 rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, 1614 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
1586 (const char *) to_dentry->d_name.name, 1615 (const char *) to_dentry->d_name.name,
1587 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1616 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1588 CIFS_MOUNT_MAP_SPECIAL_CHR); 1617 CIFS_MOUNT_MAP_SPECIAL_CHR);
1589 CIFSSMBClose(xid, tcon, srcfid); 1618 CIFSSMBClose(xid, tcon, fid.netfid);
1590 } 1619 }
1591do_rename_exit: 1620do_rename_exit:
1592 cifs_put_tlink(tlink); 1621 cifs_put_tlink(tlink);
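
A recurring pattern in this file: every converted call site replaces CIFSSMBOpen()'s long argument list with a struct cifs_open_parms that is filled field by field and handed to CIFS_open(xid, &oparms, &oplock, buf). The fields, as used in the hunks above (toy declarations, types approximated; the real definition lives in the cifs headers):

struct cifs_tcon;
struct cifs_sb_info;
struct cifs_fid;

struct cifs_open_parms {
        struct cifs_tcon        *tcon;          /* was: tcon argument */
        struct cifs_sb_info     *cifs_sb;       /* carries nls + mount flags */
        int                     desired_access; /* e.g. DELETE | FILE_WRITE_ATTRIBUTES */
        int                     create_options; /* e.g. CREATE_NOT_DIR */
        int                     disposition;    /* e.g. FILE_OPEN */
        const char              *path;
        struct cifs_fid         *fid;           /* out: fid.netfid replaces __u16 netfid */
        unsigned char           reconnect;      /* bool in the real struct */
};

Besides shortening the calls, this lets CIFS_open() derive the nls table and the CIFS_MOUNT_MAP_SPECIAL_CHR flag from cifs_sb instead of having every caller pass them along.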
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 92aee08483a5..264ece71bdb2 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,10 @@
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31 31
32/*
33 * M-F Symlink Functions - Begin
34 */
35
32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 36#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 37#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
34#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) 38#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
@@ -91,10 +95,8 @@ symlink_hash_err:
91} 95}
92 96
93static int 97static int
94CIFSParseMFSymlink(const u8 *buf, 98parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
95 unsigned int buf_len, 99 char **_link_str)
96 unsigned int *_link_len,
97 char **_link_str)
98{ 100{
99 int rc; 101 int rc;
100 unsigned int link_len; 102 unsigned int link_len;
@@ -137,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf,
137} 139}
138 140
139static int 141static int
140CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 142format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
141{ 143{
142 int rc; 144 int rc;
143 unsigned int link_len; 145 unsigned int link_len;
@@ -180,190 +182,94 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
180 return 0; 182 return 0;
181} 183}
182 184
185bool
186couldbe_mf_symlink(const struct cifs_fattr *fattr)
187{
188 if (!S_ISREG(fattr->cf_mode))
189 /* it's not a symlink */
190 return false;
191
192 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
193 /* it's not a symlink */
194 return false;
195
196 return true;
197}
198
183static int 199static int
184CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, 200create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
185 const char *fromName, const char *toName, 201 struct cifs_sb_info *cifs_sb, const char *fromName,
186 struct cifs_sb_info *cifs_sb) 202 const char *toName)
187{ 203{
188 int rc; 204 int rc;
189 int oplock = 0;
190 int remap;
191 int create_options = CREATE_NOT_DIR;
192 __u16 netfid = 0;
193 u8 *buf; 205 u8 *buf;
194 unsigned int bytes_written = 0; 206 unsigned int bytes_written = 0;
195 struct cifs_io_parms io_parms;
196 struct nls_table *nls_codepage;
197
198 nls_codepage = cifs_sb->local_nls;
199 remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
200 207
201 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 208 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
202 if (!buf) 209 if (!buf)
203 return -ENOMEM; 210 return -ENOMEM;
204 211
205 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); 212 rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
206 if (rc != 0) { 213 if (rc)
207 kfree(buf); 214 goto out;
208 return rc;
209 }
210
211 if (backup_cred(cifs_sb))
212 create_options |= CREATE_OPEN_BACKUP_INTENT;
213
214 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
215 create_options, &netfid, &oplock, NULL,
216 nls_codepage, remap);
217 if (rc != 0) {
218 kfree(buf);
219 return rc;
220 }
221
222 io_parms.netfid = netfid;
223 io_parms.pid = current->tgid;
224 io_parms.tcon = tcon;
225 io_parms.offset = 0;
226 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
227 215
228 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); 216 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
229 CIFSSMBClose(xid, tcon, netfid); 217 fromName, buf, &bytes_written);
230 kfree(buf); 218 if (rc)
231 if (rc != 0) 219 goto out;
232 return rc;
233 220
234 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) 221 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
235 return -EIO; 222 rc = -EIO;
236 223out:
237 return 0; 224 kfree(buf);
225 return rc;
238} 226}
239 227
240static int 228static int
241CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, 229query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
242 const unsigned char *searchName, char **symlinkinfo, 230 struct cifs_sb_info *cifs_sb, const unsigned char *path,
243 const struct nls_table *nls_codepage, int remap) 231 char **symlinkinfo)
244{ 232{
245 int rc; 233 int rc;
246 int oplock = 0; 234 u8 *buf = NULL;
247 __u16 netfid = 0;
248 u8 *buf;
249 char *pbuf;
250 unsigned int bytes_read = 0;
251 int buf_type = CIFS_NO_BUFFER;
252 unsigned int link_len = 0; 235 unsigned int link_len = 0;
253 struct cifs_io_parms io_parms; 236 unsigned int bytes_read = 0;
254 FILE_ALL_INFO file_info;
255
256 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
257 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
258 nls_codepage, remap);
259 if (rc != 0)
260 return rc;
261
262 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
263 CIFSSMBClose(xid, tcon, netfid);
264 /* it's not a symlink */
265 return -EINVAL;
266 }
267 237
268 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 238 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
269 if (!buf) 239 if (!buf)
270 return -ENOMEM; 240 return -ENOMEM;
271 pbuf = buf;
272 io_parms.netfid = netfid;
273 io_parms.pid = current->tgid;
274 io_parms.tcon = tcon;
275 io_parms.offset = 0;
276 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
277
278 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
279 CIFSSMBClose(xid, tcon, netfid);
280 if (rc != 0) {
281 kfree(buf);
282 return rc;
283 }
284
285 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
286 kfree(buf);
287 if (rc != 0)
288 return rc;
289
290 return 0;
291}
292
293bool
294CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
295{
296 if (!(fattr->cf_mode & S_IFREG))
297 /* it's not a symlink */
298 return false;
299 241
300 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) 242 if (tcon->ses->server->ops->query_mf_symlink)
301 /* it's not a symlink */ 243 rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
302 return false; 244 cifs_sb, path, buf, &bytes_read);
303 245 else
304 return true; 246 rc = -ENOSYS;
305}
306
307int
308open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
309 unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
310 unsigned int xid)
311{
312 int rc;
313 int oplock = 0;
314 __u16 netfid = 0;
315 struct tcon_link *tlink;
316 struct cifs_tcon *ptcon;
317 struct cifs_io_parms io_parms;
318 int buf_type = CIFS_NO_BUFFER;
319 FILE_ALL_INFO file_info;
320
321 tlink = cifs_sb_tlink(cifs_sb);
322 if (IS_ERR(tlink))
323 return PTR_ERR(tlink);
324 ptcon = tlink_tcon(tlink);
325 247
326 rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, 248 if (rc)
327 CREATE_NOT_DIR, &netfid, &oplock, &file_info, 249 goto out;
328 cifs_sb->local_nls,
329 cifs_sb->mnt_cifs_flags &
330 CIFS_MOUNT_MAP_SPECIAL_CHR);
331 if (rc != 0) {
332 cifs_put_tlink(tlink);
333 return rc;
334 }
335 250
336 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { 251 if (bytes_read == 0) { /* not a symlink */
337 CIFSSMBClose(xid, ptcon, netfid); 252 rc = -EINVAL;
338 cifs_put_tlink(tlink); 253 goto out;
339 /* it's not a symlink */
340 return rc;
341 } 254 }
342 255
343 io_parms.netfid = netfid; 256 rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
344 io_parms.pid = current->tgid; 257out:
345 io_parms.tcon = ptcon; 258 kfree(buf);
346 io_parms.offset = 0;
347 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
348
349 rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
350 CIFSSMBClose(xid, ptcon, netfid);
351 cifs_put_tlink(tlink);
352 return rc; 259 return rc;
353} 260}
354 261
355
356int 262int
357CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, 263check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
358 struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 264 struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
359 const unsigned char *path) 265 const unsigned char *path)
360{ 266{
361 int rc; 267 int rc;
362 u8 *buf = NULL; 268 u8 *buf = NULL;
363 unsigned int link_len = 0; 269 unsigned int link_len = 0;
364 unsigned int bytes_read = 0; 270 unsigned int bytes_read = 0;
365 271
366 if (!CIFSCouldBeMFSymlink(fattr)) 272 if (!couldbe_mf_symlink(fattr))
367 /* it's not a symlink */ 273 /* it's not a symlink */
368 return 0; 274 return 0;
369 275
@@ -372,8 +278,8 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
372 return -ENOMEM; 278 return -ENOMEM;
373 279
374 if (tcon->ses->server->ops->query_mf_symlink) 280 if (tcon->ses->server->ops->query_mf_symlink)
375 rc = tcon->ses->server->ops->query_mf_symlink(path, buf, 281 rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
376 &bytes_read, cifs_sb, xid); 282 cifs_sb, path, buf, &bytes_read);
377 else 283 else
378 rc = -ENOSYS; 284 rc = -ENOSYS;
379 285
@@ -383,7 +289,7 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
383 if (bytes_read == 0) /* not a symlink */ 289 if (bytes_read == 0) /* not a symlink */
384 goto out; 290 goto out;
385 291
386 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); 292 rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
387 if (rc == -EINVAL) { 293 if (rc == -EINVAL) {
388 /* it's not a symlink */ 294 /* it's not a symlink */
389 rc = 0; 295 rc = 0;
@@ -403,6 +309,95 @@ out:
403 return rc; 309 return rc;
404} 310}
405 311
312/*
313 * SMB 1.0 Protocol specific functions
314 */
315
316int
317cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
318 struct cifs_sb_info *cifs_sb, const unsigned char *path,
319 char *pbuf, unsigned int *pbytes_read)
320{
321 int rc;
322 int oplock = 0;
323 struct cifs_fid fid;
324 struct cifs_open_parms oparms;
325 struct cifs_io_parms io_parms;
326 int buf_type = CIFS_NO_BUFFER;
327 FILE_ALL_INFO file_info;
328
329 oparms.tcon = tcon;
330 oparms.cifs_sb = cifs_sb;
331 oparms.desired_access = GENERIC_READ;
332 oparms.create_options = CREATE_NOT_DIR;
333 oparms.disposition = FILE_OPEN;
334 oparms.path = path;
335 oparms.fid = &fid;
336 oparms.reconnect = false;
337
338 rc = CIFS_open(xid, &oparms, &oplock, &file_info);
339 if (rc)
340 return rc;
341
342 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
343 /* it's not a symlink */
344 goto out;
345
346 io_parms.netfid = fid.netfid;
347 io_parms.pid = current->tgid;
348 io_parms.tcon = tcon;
349 io_parms.offset = 0;
350 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
351
352 rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
353out:
354 CIFSSMBClose(xid, tcon, fid.netfid);
355 return rc;
356}
357
358int
359cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
360 struct cifs_sb_info *cifs_sb, const unsigned char *path,
361 char *pbuf, unsigned int *pbytes_written)
362{
363 int rc;
364 int oplock = 0;
365 struct cifs_fid fid;
366 struct cifs_open_parms oparms;
367 struct cifs_io_parms io_parms;
368 int create_options = CREATE_NOT_DIR;
369
370 if (backup_cred(cifs_sb))
371 create_options |= CREATE_OPEN_BACKUP_INTENT;
372
373 oparms.tcon = tcon;
374 oparms.cifs_sb = cifs_sb;
375 oparms.desired_access = GENERIC_WRITE;
376 oparms.create_options = create_options;
377 oparms.disposition = FILE_OPEN;
378 oparms.path = path;
379 oparms.fid = &fid;
380 oparms.reconnect = false;
381
382 rc = CIFS_open(xid, &oparms, &oplock, NULL);
383 if (rc)
384 return rc;
385
386 io_parms.netfid = fid.netfid;
387 io_parms.pid = current->tgid;
388 io_parms.tcon = tcon;
389 io_parms.offset = 0;
390 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
391
392 rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
393 CIFSSMBClose(xid, tcon, fid.netfid);
394 return rc;
395}
396
397/*
398 * M-F Symlink Functions - End
399 */
400
406int 401int
407cifs_hardlink(struct dentry *old_file, struct inode *inode, 402cifs_hardlink(struct dentry *old_file, struct inode *inode,
408 struct dentry *direntry) 403 struct dentry *direntry)
@@ -438,8 +433,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
438 CIFS_MOUNT_MAP_SPECIAL_CHR); 433 CIFS_MOUNT_MAP_SPECIAL_CHR);
439 else { 434 else {
440 server = tcon->ses->server; 435 server = tcon->ses->server;
441 if (!server->ops->create_hardlink) 436 if (!server->ops->create_hardlink) {
442 return -ENOSYS; 437 rc = -ENOSYS;
438 goto cifs_hl_exit;
439 }
443 rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, 440 rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
444 cifs_sb); 441 cifs_sb);
445 if ((rc == -EIO) || (rc == -EINVAL)) 442 if ((rc == -EIO) || (rc == -EINVAL))
@@ -530,15 +527,10 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
530 * and fallback to UNIX Extensions Symlinks. 527 * and fallback to UNIX Extensions Symlinks.
531 */ 528 */
532 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 529 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
533 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, 530 rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
534 cifs_sb->local_nls, 531 &target_path);
535 cifs_sb->mnt_cifs_flags &
536 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 532
538 if ((rc != 0) && cap_unix(tcon->ses)) 533 if (rc != 0 && server->ops->query_symlink)
539 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
540 cifs_sb->local_nls);
541 else if (rc != 0 && server->ops->query_symlink)
542 rc = server->ops->query_symlink(xid, tcon, full_path, 534 rc = server->ops->query_symlink(xid, tcon, full_path,
543 &target_path, cifs_sb); 535 &target_path, cifs_sb);
544 536
@@ -587,8 +579,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
587 579
588 /* BB what if DFS and this volume is on different share? BB */ 580 /* BB what if DFS and this volume is on different share? BB */
589 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 581 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
590 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, 582 rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
591 cifs_sb);
592 else if (pTcon->unix_ext) 583 else if (pTcon->unix_ext)
593 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 584 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
594 cifs_sb->local_nls); 585 cifs_sb->local_nls);
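
One subtle fix hiding among the renames in this file: the old CIFSCouldBeMFSymlink() tested fattr->cf_mode & S_IFREG, but the S_IFMT file-type values share bits, so that bit-test is also true for symlinks and sockets (S_IFREG is 0100000, S_IFLNK is 0120000, S_IFSOCK is 0140000). The new couldbe_mf_symlink() uses S_ISREG(), which compares the whole type field. A user-space demonstration:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        mode_t lnk = S_IFLNK | 0777;

        /* bit-test: wrongly "true", since S_IFLNK contains the S_IFREG bit */
        printf("mode & S_IFREG: %d\n", !!(lnk & S_IFREG));      /* prints 1 */

        /* field compare: correct */
        printf("S_ISREG(mode) : %d\n", !!S_ISREG(lnk));         /* prints 0 */
        return 0;
}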
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5940ecabbe6a..b15862e0f68c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -749,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
749 } 749 }
750 750
751 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && 751 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
752 CIFSCouldBeMFSymlink(&fattr)) 752 couldbe_mf_symlink(&fattr))
753 /* 753 /*
754 * trying to get the type and mode can be slow, 754 * trying to get the type and mode can be slow,
755 * so just call those regular files for now, and mark 755 * so just call those regular files for now, and mark
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 5f5ba0dc2ee1..526fb89f9230 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -560,17 +560,24 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
560 if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { 560 if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
561 int tmprc; 561 int tmprc;
562 int oplock = 0; 562 int oplock = 0;
563 __u16 netfid; 563 struct cifs_fid fid;
564 struct cifs_open_parms oparms;
565
566 oparms.tcon = tcon;
567 oparms.cifs_sb = cifs_sb;
568 oparms.desired_access = FILE_READ_ATTRIBUTES;
569 oparms.create_options = 0;
570 oparms.disposition = FILE_OPEN;
571 oparms.path = full_path;
572 oparms.fid = &fid;
573 oparms.reconnect = false;
564 574
565 /* Need to check if this is a symbolic link or not */ 575 /* Need to check if this is a symbolic link or not */
566 tmprc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 576 tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
567 FILE_READ_ATTRIBUTES, 0, &netfid, &oplock,
568 NULL, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (tmprc == -EOPNOTSUPP) 577 if (tmprc == -EOPNOTSUPP)
571 *symlink = true; 578 *symlink = true;
572 else 579 else
573 CIFSSMBClose(xid, tcon, netfid); 580 CIFSSMBClose(xid, tcon, fid.netfid);
574 } 581 }
575 582
576 return rc; 583 return rc;
@@ -705,12 +712,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
705 oparms->cifs_sb->local_nls, 712 oparms->cifs_sb->local_nls,
706 oparms->cifs_sb->mnt_cifs_flags 713 oparms->cifs_sb->mnt_cifs_flags
707 & CIFS_MOUNT_MAP_SPECIAL_CHR); 714 & CIFS_MOUNT_MAP_SPECIAL_CHR);
708 return CIFSSMBOpen(xid, oparms->tcon, oparms->path, 715 return CIFS_open(xid, oparms, oplock, buf);
709 oparms->disposition, oparms->desired_access,
710 oparms->create_options, &oparms->fid->netfid, oplock,
711 buf, oparms->cifs_sb->local_nls,
712 oparms->cifs_sb->mnt_cifs_flags &
713 CIFS_MOUNT_MAP_SPECIAL_CHR);
714} 716}
715 717
716static void 718static void
@@ -761,8 +763,9 @@ smb_set_file_info(struct inode *inode, const char *full_path,
761{ 763{
762 int oplock = 0; 764 int oplock = 0;
763 int rc; 765 int rc;
764 __u16 netfid;
765 __u32 netpid; 766 __u32 netpid;
767 struct cifs_fid fid;
768 struct cifs_open_parms oparms;
766 struct cifsFileInfo *open_file; 769 struct cifsFileInfo *open_file;
767 struct cifsInodeInfo *cinode = CIFS_I(inode); 770 struct cifsInodeInfo *cinode = CIFS_I(inode);
768 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 771 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -772,7 +775,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
772 /* if the file is already open for write, just use that fileid */ 775 /* if the file is already open for write, just use that fileid */
773 open_file = find_writable_file(cinode, true); 776 open_file = find_writable_file(cinode, true);
774 if (open_file) { 777 if (open_file) {
775 netfid = open_file->fid.netfid; 778 fid.netfid = open_file->fid.netfid;
776 netpid = open_file->pid; 779 netpid = open_file->pid;
777 tcon = tlink_tcon(open_file->tlink); 780 tcon = tlink_tcon(open_file->tlink);
778 goto set_via_filehandle; 781 goto set_via_filehandle;
@@ -796,12 +799,17 @@ smb_set_file_info(struct inode *inode, const char *full_path,
796 goto out; 799 goto out;
797 } 800 }
798 801
799 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); 802 oparms.tcon = tcon;
800 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 803 oparms.cifs_sb = cifs_sb;
801 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 804 oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
802 &netfid, &oplock, NULL, cifs_sb->local_nls, 805 oparms.create_options = CREATE_NOT_DIR;
803 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 806 oparms.disposition = FILE_OPEN;
807 oparms.path = full_path;
808 oparms.fid = &fid;
809 oparms.reconnect = false;
804 810
811 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
812 rc = CIFS_open(xid, &oparms, &oplock, NULL);
805 if (rc != 0) { 813 if (rc != 0) {
806 if (rc == -EIO) 814 if (rc == -EIO)
807 rc = -EINVAL; 815 rc = -EINVAL;
@@ -811,12 +819,12 @@ smb_set_file_info(struct inode *inode, const char *full_path,
811 netpid = current->tgid; 819 netpid = current->tgid;
812 820
813set_via_filehandle: 821set_via_filehandle:
814 rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); 822 rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
815 if (!rc) 823 if (!rc)
816 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 824 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
817 825
818 if (open_file == NULL) 826 if (open_file == NULL)
819 CIFSSMBClose(xid, tcon, netfid); 827 CIFSSMBClose(xid, tcon, fid.netfid);
820 else 828 else
821 cifsFileInfo_put(open_file); 829 cifsFileInfo_put(open_file);
822out: 830out:
@@ -908,33 +916,80 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
908} 916}
909 917
910static int 918static int
919cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
920 const unsigned char *searchName, char **symlinkinfo,
921 const struct nls_table *nls_codepage)
922{
923#ifdef CONFIG_CIFS_DFS_UPCALL
924 int rc;
925 unsigned int num_referrals = 0;
926 struct dfs_info3_param *referrals = NULL;
927
928 rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
929 &num_referrals, &referrals, 0);
930
931 if (!rc && num_referrals > 0) {
932 *symlinkinfo = kstrndup(referrals->node_name,
933 strlen(referrals->node_name),
934 GFP_KERNEL);
935 if (!*symlinkinfo)
936 rc = -ENOMEM;
937 free_dfs_info_array(referrals, num_referrals);
938 }
939 return rc;
940#else /* No DFS support */
941 return -EREMOTE;
942#endif
943}
944
945static int
911cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, 946cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
912 const char *full_path, char **target_path, 947 const char *full_path, char **target_path,
913 struct cifs_sb_info *cifs_sb) 948 struct cifs_sb_info *cifs_sb)
914{ 949{
915 int rc; 950 int rc;
916 int oplock = 0; 951 int oplock = 0;
917 __u16 netfid; 952 struct cifs_fid fid;
953 struct cifs_open_parms oparms;
918 954
919 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); 955 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
920 956
921 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 957 /* Check for unix extensions */
922 FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, 958 if (cap_unix(tcon->ses)) {
923 &oplock, NULL, cifs_sb->local_nls, 959 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
924 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 960 cifs_sb->local_nls);
961 if (rc == -EREMOTE)
962 rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
963 target_path,
964 cifs_sb->local_nls);
965
966 goto out;
967 }
968
969 oparms.tcon = tcon;
970 oparms.cifs_sb = cifs_sb;
971 oparms.desired_access = FILE_READ_ATTRIBUTES;
972 oparms.create_options = OPEN_REPARSE_POINT;
973 oparms.disposition = FILE_OPEN;
974 oparms.path = full_path;
975 oparms.fid = &fid;
976 oparms.reconnect = false;
977
978 rc = CIFS_open(xid, &oparms, &oplock, NULL);
925 if (rc) 979 if (rc)
926 return rc; 980 goto out;
927 981
928 rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, 982 rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
929 cifs_sb->local_nls); 983 cifs_sb->local_nls);
930 if (rc) { 984 if (rc)
931 CIFSSMBClose(xid, tcon, netfid); 985 goto out_close;
932 return rc;
933 }
934 986
935 convert_delimiter(*target_path, '/'); 987 convert_delimiter(*target_path, '/');
936 CIFSSMBClose(xid, tcon, netfid); 988out_close:
937 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); 989 CIFSSMBClose(xid, tcon, fid.netfid);
990out:
991 if (!rc)
992 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
938 return rc; 993 return rc;
939} 994}
940 995
@@ -1009,8 +1064,18 @@ struct smb_version_operations smb1_operations = {
1009 .mand_lock = cifs_mand_lock, 1064 .mand_lock = cifs_mand_lock,
1010 .mand_unlock_range = cifs_unlock_range, 1065 .mand_unlock_range = cifs_unlock_range,
1011 .push_mand_locks = cifs_push_mandatory_locks, 1066 .push_mand_locks = cifs_push_mandatory_locks,
1012 .query_mf_symlink = open_query_close_cifs_symlink, 1067 .query_mf_symlink = cifs_query_mf_symlink,
1068 .create_mf_symlink = cifs_create_mf_symlink,
1013 .is_read_op = cifs_is_read_op, 1069 .is_read_op = cifs_is_read_op,
1070#ifdef CONFIG_CIFS_XATTR
1071 .query_all_EAs = CIFSSMBQAllEAs,
1072 .set_EA = CIFSSMBSetEA,
1073#endif /* CIFS_XATTR */
1074#ifdef CONFIG_CIFS_ACL
1075 .get_acl = get_cifs_acl,
1076 .get_acl_by_fid = get_cifs_acl_by_fid,
1077 .set_acl = set_cifs_acl,
1078#endif /* CIFS_ACL */
1014}; 1079};
1015 1080
1016struct smb_version_values smb1_values = { 1081struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index c38350851b08..bc0bb9c34f72 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -57,4 +57,7 @@
57#define SMB2_CMACAES_SIZE (16) 57#define SMB2_CMACAES_SIZE (16)
58#define SMB3_SIGNKEY_SIZE (16) 58#define SMB3_SIGNKEY_SIZE (16)
59 59
60/* Maximum buffer size value we can send with 1 credit */
61#define SMB2_MAX_BUFFER_SIZE 65536
62
60#endif /* _SMB2_GLOB_H */ 63#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 757da3e54d3d..192f51a12cf1 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -182,11 +182,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
182 /* start with specified wsize, or default */ 182 /* start with specified wsize, or default */
183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; 183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
184 wsize = min_t(unsigned int, wsize, server->max_write); 184 wsize = min_t(unsigned int, wsize, server->max_write);
185 /* 185 /* set it to the maximum buffer size value we can send with 1 credit */
186 * limit write size to 2 ** 16, because we don't support multicredit 186 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
187 * requests now.
188 */
189 wsize = min_t(unsigned int, wsize, 2 << 15);
190 187
191 return wsize; 188 return wsize;
192} 189}
@@ -200,11 +197,8 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
200 /* start with specified rsize, or default */ 197 /* start with specified rsize, or default */
201 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; 198 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
202 rsize = min_t(unsigned int, rsize, server->max_read); 199 rsize = min_t(unsigned int, rsize, server->max_read);
203 /* 200 /* set it to the maximum buffer size value we can send with 1 credit */
204 * limit write size to 2 ** 16, because we don't support multicredit 201 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
205 * requests now.
206 */
207 rsize = min_t(unsigned int, rsize, 2 << 15);
208 202
209 return rsize; 203 return rsize;
210} 204}
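
The old magic number and the new macro are numerically identical, so these two hunks only name the limit; the behavioral change comes in smb2pdu.c below, where server->maxBuf is now clamped to the same single-credit bound. Quick check of the arithmetic:

#include <assert.h>

#define SMB2_MAX_BUFFER_SIZE 65536      /* from smb2glob.h above */

int main(void)
{
        assert((2 << 15) == SMB2_MAX_BUFFER_SIZE);      /* 2 * 32768 */
        return 0;
}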
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2013234b73ad..860344701067 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -413,7 +413,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
413 413
414 /* SMB2 only has an extended negflavor */ 414 /* SMB2 only has an extended negflavor */
415 server->negflavor = CIFS_NEGFLAVOR_EXTENDED; 415 server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
416 server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); 416 /* set it to the maximum buffer size value we can send with 1 credit */
417 server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
418 SMB2_MAX_BUFFER_SIZE);
417 server->max_read = le32_to_cpu(rsp->MaxReadSize); 419 server->max_read = le32_to_cpu(rsp->MaxReadSize);
418 server->max_write = le32_to_cpu(rsp->MaxWriteSize); 420 server->max_write = le32_to_cpu(rsp->MaxWriteSize);
419 /* BB Do we need to validate the SecurityMode? */ 421 /* BB Do we need to validate the SecurityMode? */
@@ -1890,7 +1892,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
1890 1892
1891/* smb2_async_writev - send an async write, and set up mid to handle result */ 1893/* smb2_async_writev - send an async write, and set up mid to handle result */
1892int 1894int
1893smb2_async_writev(struct cifs_writedata *wdata) 1895smb2_async_writev(struct cifs_writedata *wdata,
1896 void (*release)(struct kref *kref))
1894{ 1897{
1895 int rc = -EACCES; 1898 int rc = -EACCES;
1896 struct smb2_write_req *req = NULL; 1899 struct smb2_write_req *req = NULL;
@@ -1938,7 +1941,7 @@ smb2_async_writev(struct cifs_writedata *wdata)
1938 smb2_writev_callback, wdata, 0); 1941 smb2_writev_callback, wdata, 0);
1939 1942
1940 if (rc) { 1943 if (rc) {
1941 kref_put(&wdata->refcount, cifs_writedata_release); 1944 kref_put(&wdata->refcount, release);
1942 cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); 1945 cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
1943 } 1946 }
1944 1947
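
The extra function pointer lets smb2_async_writev() stop hardcoding cifs_writedata_release: on a send failure it must drop the reference it was handed using the same destructor the caller's own paths use. A toy model of the pattern (struct ref and put_ref() stand in for the kernel's struct kref and kref_put()):

#include <stdio.h>

struct ref { int count; };

static void put_ref(struct ref *r, void (*release)(struct ref *))
{
        if (--r->count == 0)
                release(r);
}

static void writedata_release(struct ref *r)
{
        (void)r;
        printf("freed via the caller-supplied destructor\n");
}

/* the sender's error path releases with whatever dtor the caller uses */
static int async_writev(struct ref *wdata, void (*release)(struct ref *))
{
        int rc = -1;                    /* pretend the send failed */

        if (rc)
                put_ref(wdata, release);
        return rc;
}

int main(void)
{
        struct ref wdata = { 1 };
        return async_writev(&wdata, writedata_release) ? 1 : 0;
}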
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 93adc64666f3..0ce48db20a65 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
123extern int smb2_async_readv(struct cifs_readdata *rdata); 123extern int smb2_async_readv(struct cifs_readdata *rdata);
124extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, 124extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
125 unsigned int *nbytes, char **buf, int *buf_type); 125 unsigned int *nbytes, char **buf, int *buf_type);
126extern int smb2_async_writev(struct cifs_writedata *wdata); 126extern int smb2_async_writev(struct cifs_writedata *wdata,
127 void (*release)(struct kref *kref));
127extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, 128extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
128 unsigned int *nbytes, struct kvec *iov, int n_vec); 129 unsigned int *nbytes, struct kvec *iov, int n_vec);
129extern int SMB2_echo(struct TCP_Server_Info *server); 130extern int SMB2_echo(struct TCP_Server_Info *server);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b37570952846..18cd5650a5fc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -270,6 +270,26 @@ cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
270 iov->iov_len = rqst->rq_pagesz; 270 iov->iov_len = rqst->rq_pagesz;
271} 271}
272 272
273static unsigned long
274rqst_len(struct smb_rqst *rqst)
275{
276 unsigned int i;
277 struct kvec *iov = rqst->rq_iov;
278 unsigned long buflen = 0;
279
280 /* total up iov array first */
281 for (i = 0; i < rqst->rq_nvec; i++)
282 buflen += iov[i].iov_len;
283
284 /* add in the page array if there is one */
285 if (rqst->rq_npages) {
286 buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
287 buflen += rqst->rq_tailsz;
288 }
289
290 return buflen;
291}
292
273static int 293static int
274smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) 294smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
275{ 295{
@@ -277,6 +297,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
277 struct kvec *iov = rqst->rq_iov; 297 struct kvec *iov = rqst->rq_iov;
278 int n_vec = rqst->rq_nvec; 298 int n_vec = rqst->rq_nvec;
279 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); 299 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
300 unsigned long send_length;
280 unsigned int i; 301 unsigned int i;
281 size_t total_len = 0, sent; 302 size_t total_len = 0, sent;
282 struct socket *ssocket = server->ssocket; 303 struct socket *ssocket = server->ssocket;
@@ -285,6 +306,14 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
285 if (ssocket == NULL) 306 if (ssocket == NULL)
286 return -ENOTSOCK; 307 return -ENOTSOCK;
287 308
309 /* sanity check send length */
310 send_length = rqst_len(rqst);
311 if (send_length != smb_buf_length + 4) {
312 WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n",
313 send_length, smb_buf_length);
314 return -EIO;
315 }
316
288 cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); 317 cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
289 dump_smb(iov[0].iov_base, iov[0].iov_len); 318 dump_smb(iov[0].iov_base, iov[0].iov_len);
290 319
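
The new guard computes the marshalled length the same way the send loop will consume it — every iovec, plus full pages for all but the last page, plus the tail — and insists it match the RFC1002 length field plus 4, since the 4-byte NetBIOS session header sits in iov[0] but is excluded from the length it announces. A worked example of the arithmetic (sizes invented for illustration):

#include <stdio.h>

int main(void)
{
        unsigned long iov_len = 4 + 68; /* RFC1002 header + SMB header/parms */
        unsigned long pagesz = 4096, npages = 2, tailsz = 100;

        /* mirrors rqst_len(): full pages except the last, plus the tail */
        unsigned long send_length = iov_len + pagesz * (npages - 1) + tailsz;

        /* the RFC1002 length field excludes its own 4-byte header */
        printf("send_length=%lu rfc1002_len=%lu\n",
               send_length, send_length - 4);   /* 4268 and 4264 */
        return 0;
}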
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 09afda4cc58e..5ac836a86b18 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
82 goto remove_ea_exit; 82 goto remove_ea_exit;
83 83
84 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 84 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
85 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, 85 if (pTcon->ses->server->ops->set_EA)
86 (__u16)0, cifs_sb->local_nls, 86 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
87 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 87 full_path, ea_name, NULL, (__u16)0,
88 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
89 CIFS_MOUNT_MAP_SPECIAL_CHR);
88 } 90 }
89remove_ea_exit: 91remove_ea_exit:
90 kfree(full_path); 92 kfree(full_path);
@@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
149 cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); 151 cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
150 152
151 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 153 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
152 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 154 if (pTcon->ses->server->ops->set_EA)
153 (__u16)value_size, cifs_sb->local_nls, 155 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
154 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 156 full_path, ea_name, ea_value, (__u16)value_size,
157 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
158 CIFS_MOUNT_MAP_SPECIAL_CHR);
155 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) 159 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
156 == 0) { 160 == 0) {
157 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 161 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
158 goto set_ea_exit; 162 goto set_ea_exit;
159 163
160 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ 164 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
161 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 165 if (pTcon->ses->server->ops->set_EA)
162 (__u16)value_size, cifs_sb->local_nls, 166 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
163 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 167 full_path, ea_name, ea_value, (__u16)value_size,
168 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
169 CIFS_MOUNT_MAP_SPECIAL_CHR);
164 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, 170 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
165 strlen(CIFS_XATTR_CIFS_ACL)) == 0) { 171 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
166#ifdef CONFIG_CIFS_ACL 172#ifdef CONFIG_CIFS_ACL
@@ -170,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
170 rc = -ENOMEM; 176 rc = -ENOMEM;
171 } else { 177 } else {
172 memcpy(pacl, ea_value, value_size); 178 memcpy(pacl, ea_value, value_size);
173 rc = set_cifs_acl(pacl, value_size, 179 if (pTcon->ses->server->ops->set_acl)
174 direntry->d_inode, full_path, CIFS_ACL_DACL); 180 rc = pTcon->ses->server->ops->set_acl(pacl,
181 value_size, direntry->d_inode,
182 full_path, CIFS_ACL_DACL);
183 else
184 rc = -EOPNOTSUPP;
175 if (rc == 0) /* force revalidate of the inode */ 185 if (rc == 0) /* force revalidate of the inode */
176 CIFS_I(direntry->d_inode)->time = 0; 186 CIFS_I(direntry->d_inode)->time = 0;
177 kfree(pacl); 187 kfree(pacl);
@@ -272,17 +282,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
272 /* revalidate/getattr then populate from inode */ 282 /* revalidate/getattr then populate from inode */
273 } /* BB add else when above is implemented */ 283 } /* BB add else when above is implemented */
274 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 284 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
275 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, 285 if (pTcon->ses->server->ops->query_all_EAs)
276 buf_size, cifs_sb->local_nls, 286 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
277 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 287 full_path, ea_name, ea_value, buf_size,
288 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
289 CIFS_MOUNT_MAP_SPECIAL_CHR);
278 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 290 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
279 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 291 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
280 goto get_ea_exit; 292 goto get_ea_exit;
281 293
282 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ 294 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
283 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, 295 if (pTcon->ses->server->ops->query_all_EAs)
284 buf_size, cifs_sb->local_nls, 296 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
285 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 297 full_path, ea_name, ea_value, buf_size,
298 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
299 CIFS_MOUNT_MAP_SPECIAL_CHR);
286 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 300 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
287 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { 301 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
288#ifdef CONFIG_CIFS_POSIX 302#ifdef CONFIG_CIFS_POSIX
@@ -313,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
313 u32 acllen; 327 u32 acllen;
314 struct cifs_ntsd *pacl; 328 struct cifs_ntsd *pacl;
315 329
316 pacl = get_cifs_acl(cifs_sb, direntry->d_inode, 330 if (pTcon->ses->server->ops->get_acl == NULL)
317 full_path, &acllen); 331 goto get_ea_exit; /* rc already EOPNOTSUPP */
332
333 pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
334 direntry->d_inode, full_path, &acllen);
318 if (IS_ERR(pacl)) { 335 if (IS_ERR(pacl)) {
319 rc = PTR_ERR(pacl); 336 rc = PTR_ERR(pacl);
320 cifs_dbg(VFS, "%s: error %zd getting sec desc\n", 337 cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
@@ -400,11 +417,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
400 /* if proc/fs/cifs/streamstoxattr is set then 417 /* if proc/fs/cifs/streamstoxattr is set then
401 search server for EAs or streams to 418 search server for EAs or streams to
402 returns as xattrs */ 419 returns as xattrs */
403 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
404 buf_size, cifs_sb->local_nls,
405 cifs_sb->mnt_cifs_flags &
406 CIFS_MOUNT_MAP_SPECIAL_CHR);
407 420
421 if (pTcon->ses->server->ops->query_all_EAs)
422 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
423 full_path, NULL, data, buf_size,
424 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
425 CIFS_MOUNT_MAP_SPECIAL_CHR);
408list_ea_exit: 426list_ea_exit:
409 kfree(full_path); 427 kfree(full_path);
410 free_xid(xid); 428 free_xid(xid);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
680 struct i2c_msg __user *tmsgs; 680 struct i2c_msg __user *tmsgs;
681 struct i2c_msg32 __user *umsgs; 681 struct i2c_msg32 __user *umsgs;
682 compat_caddr_t datap; 682 compat_caddr_t datap;
683 int nmsgs, i; 683 u32 nmsgs;
684 int i;
684 685
685 if (get_user(nmsgs, &udata->nmsgs)) 686 if (get_user(nmsgs, &udata->nmsgs))
686 return -EFAULT; 687 return -EFAULT;
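
Reading nmsgs into a signed int was a signedness hazard: the field is a u32 supplied by user space, and a negative value would sail past a later "nmsgs > I2C_RDRW_IOCTL_MAX_MSGS"-style bounds check (the constant 42 below is the limit from i2c-dev, quoted from memory). A demonstration of the bypass:

#include <stdio.h>
#include <stdint.h>

#define I2C_RDRW_IOCTL_MAX_MSGS 42

int main(void)
{
        uint32_t raw = 0x80000001u;     /* hostile count from user space */
        int as_int = (int)raw;          /* -2147483647 on two's complement */
        uint32_t as_u32 = raw;

        printf("int rejected: %d\n", as_int > I2C_RDRW_IOCTL_MAX_MSGS); /* 0 (!) */
        printf("u32 rejected: %d\n", as_u32 > I2C_RDRW_IOCTL_MAX_MSGS); /* 1 */
        return 0;
}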
diff --git a/fs/coredump.c b/fs/coredump.c
index bc3fbcd32558..e3ad709a4232 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -40,7 +40,6 @@
40 40
41#include <trace/events/task.h> 41#include <trace/events/task.h>
42#include "internal.h" 42#include "internal.h"
43#include "coredump.h"
44 43
45#include <trace/events/sched.h> 44#include <trace/events/sched.h>
46 45
diff --git a/fs/coredump.h b/fs/coredump.h
deleted file mode 100644
index e39ff072110d..000000000000
--- a/fs/coredump.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef _FS_COREDUMP_H
2#define _FS_COREDUMP_H
3
4extern int __get_dumpable(unsigned long mm_flags);
5
6#endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49ff..06610cf94d57 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,14 +17,30 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/cramfs_fs.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22#include <linux/cramfs_fs_sb.h>
23#include <linux/vfs.h> 21#include <linux/vfs.h>
24#include <linux/mutex.h> 22#include <linux/mutex.h>
25 23#include <uapi/linux/cramfs_fs.h>
26#include <asm/uaccess.h> 24#include <asm/uaccess.h>
27 25
26#include "internal.h"
27
28/*
29 * cramfs super-block data in memory
30 */
31struct cramfs_sb_info {
32 unsigned long magic;
33 unsigned long size;
34 unsigned long blocks;
35 unsigned long files;
36 unsigned long flags;
37};
38
39static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
40{
41 return sb->s_fs_info;
42}
43
28static const struct super_operations cramfs_ops; 44static const struct super_operations cramfs_ops;
29static const struct inode_operations cramfs_dir_inode_operations; 45static const struct inode_operations cramfs_dir_inode_operations;
30static const struct file_operations cramfs_directory_operations; 46static const struct file_operations cramfs_directory_operations;
@@ -219,10 +235,11 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
219 return read_buffers[buffer] + offset; 235 return read_buffers[buffer] + offset;
220} 236}
221 237
222static void cramfs_put_super(struct super_block *sb) 238static void cramfs_kill_sb(struct super_block *sb)
223{ 239{
224 kfree(sb->s_fs_info); 240 struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
225 sb->s_fs_info = NULL; 241 kill_block_super(sb);
242 kfree(sbi);
226} 243}
227 244
228static int cramfs_remount(struct super_block *sb, int *flags, char *data) 245static int cramfs_remount(struct super_block *sb, int *flags, char *data)
@@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
261 if (super.magic == CRAMFS_MAGIC_WEND) { 278 if (super.magic == CRAMFS_MAGIC_WEND) {
262 if (!silent) 279 if (!silent)
263 printk(KERN_ERR "cramfs: wrong endianness\n"); 280 printk(KERN_ERR "cramfs: wrong endianness\n");
264 goto out; 281 return -EINVAL;
265 } 282 }
266 283
267 /* check at 512 byte offset */ 284 /* check at 512 byte offset */
@@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
273 printk(KERN_ERR "cramfs: wrong endianness\n"); 290 printk(KERN_ERR "cramfs: wrong endianness\n");
274 else if (!silent) 291 else if (!silent)
275 printk(KERN_ERR "cramfs: wrong magic\n"); 292 printk(KERN_ERR "cramfs: wrong magic\n");
276 goto out; 293 return -EINVAL;
277 } 294 }
278 } 295 }
279 296
280 /* get feature flags first */ 297 /* get feature flags first */
281 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { 298 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
282 printk(KERN_ERR "cramfs: unsupported filesystem features\n"); 299 printk(KERN_ERR "cramfs: unsupported filesystem features\n");
283 goto out; 300 return -EINVAL;
284 } 301 }
285 302
286 /* Check that the root inode is in a sane state */ 303 /* Check that the root inode is in a sane state */
287 if (!S_ISDIR(super.root.mode)) { 304 if (!S_ISDIR(super.root.mode)) {
288 printk(KERN_ERR "cramfs: root is not a directory\n"); 305 printk(KERN_ERR "cramfs: root is not a directory\n");
289 goto out; 306 return -EINVAL;
290 } 307 }
291 /* correct strange, hard-coded permissions of mkcramfs */ 308 /* correct strange, hard-coded permissions of mkcramfs */
292 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); 309 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
@@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
310 (root_offset != 512 + sizeof(struct cramfs_super)))) 327 (root_offset != 512 + sizeof(struct cramfs_super))))
311 { 328 {
312 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); 329 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset);
313 goto out; 330 return -EINVAL;
314 } 331 }
315 332
316 /* Set it all up.. */ 333 /* Set it all up.. */
317 sb->s_op = &cramfs_ops; 334 sb->s_op = &cramfs_ops;
318 root = get_cramfs_inode(sb, &super.root, 0); 335 root = get_cramfs_inode(sb, &super.root, 0);
319 if (IS_ERR(root)) 336 if (IS_ERR(root))
320 goto out; 337 return PTR_ERR(root);
321 sb->s_root = d_make_root(root); 338 sb->s_root = d_make_root(root);
322 if (!sb->s_root) 339 if (!sb->s_root)
323 goto out; 340 return -ENOMEM;
324 return 0; 341 return 0;
325out:
326 kfree(sbi);
327 sb->s_fs_info = NULL;
328 return -EINVAL;
329} 342}
330 343
331static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 344static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = {
550}; 563};
551 564
552static const struct super_operations cramfs_ops = { 565static const struct super_operations cramfs_ops = {
553 .put_super = cramfs_put_super,
554 .remount_fs = cramfs_remount, 566 .remount_fs = cramfs_remount,
555 .statfs = cramfs_statfs, 567 .statfs = cramfs_statfs,
556}; 568};
@@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = {
565 .owner = THIS_MODULE, 577 .owner = THIS_MODULE,
566 .name = "cramfs", 578 .name = "cramfs",
567 .mount = cramfs_mount, 579 .mount = cramfs_mount,
568 .kill_sb = kill_block_super, 580 .kill_sb = cramfs_kill_sb,
569 .fs_flags = FS_REQUIRES_DEV, 581 .fs_flags = FS_REQUIRES_DEV,
570}; 582};
571MODULE_ALIAS_FS("cramfs"); 583MODULE_ALIAS_FS("cramfs");
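
Moving the kfree() from .put_super to a cramfs-specific .kill_sb fixes two problems at once: the generic teardown in kill_block_super() may still look at sb->s_fs_info, so the private data must outlive it, and the fill_super error paths no longer need to free sbi themselves (kill_sb runs on those paths too, so the early "return -EINVAL"s above free it exactly once). The resulting pattern, restated with comments:

static void cramfs_kill_sb(struct super_block *sb)
{
        struct cramfs_sb_info *sbi = CRAMFS_SB(sb);

        kill_block_super(sb);   /* generic teardown; may still use s_fs_info */
        kfree(sbi);             /* safe now: nothing can reach the sb */
}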
diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h
new file mode 100644
index 000000000000..349d71272157
--- /dev/null
+++ b/fs/cramfs/internal.h
@@ -0,0 +1,4 @@
1/* Uncompression interfaces to the underlying zlib */
2int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen);
3int cramfs_uncompress_init(void);
4void cramfs_uncompress_exit(void);
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 023329800d2e..1760c1b84d97 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -19,7 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/zlib.h> 21#include <linux/zlib.h>
22#include <linux/cramfs_fs.h> 22#include "internal.h"
23 23
24static z_stream stream; 24static z_stream stream;
25static int initialized; 25static int initialized;
diff --git a/fs/dcache.c b/fs/dcache.c
index 6055d61811d3..ca02c13a84aa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2833,9 +2833,9 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2833 u32 dlen = ACCESS_ONCE(name->len); 2833 u32 dlen = ACCESS_ONCE(name->len);
2834 char *p; 2834 char *p;
2835 2835
2836 if (*buflen < dlen + 1)
2837 return -ENAMETOOLONG;
2838 *buflen -= dlen + 1; 2836 *buflen -= dlen + 1;
2837 if (*buflen < 0)
2838 return -ENAMETOOLONG;
2839 p = *buffer -= dlen + 1; 2839 p = *buffer -= dlen + 1;
2840 *p++ = '/'; 2840 *p++ = '/';
2841 while (dlen--) { 2841 while (dlen--) {
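
The reordered check in prepend_name() is about mixed signed/unsigned comparison: dlen is u32, so in "*buflen < dlen + 1" a negative *buflen is converted to a huge unsigned value and the overflow is never reported — and with the restart logic in this file the function can genuinely be entered with a negative count. Subtracting first and then testing "*buflen < 0" keeps the comparison purely signed. The failure mode in miniature (two's complement assumed, as the kernel does):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int buflen = -5;        /* an earlier prepend already overflowed */
        uint32_t dlen = 3;

        /* old check: buflen is converted to unsigned (4294967291 < 4) */
        printf("old check fires: %d\n", buflen < dlen + 1);     /* 0 (!) */

        /* new order: subtract, then a purely signed test */
        buflen -= dlen + 1;
        printf("new check fires: %d\n", buflen < 0);            /* 1 */
        return 0;
}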
@@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
3061 * thus don't need to be hashed. They also don't need a name until a 3061 * thus don't need to be hashed. They also don't need a name until a
3062 * user wants to identify the object in /proc/pid/fd/. The little hack 3062 * user wants to identify the object in /proc/pid/fd/. The little hack
3063 * below allows us to generate a name for these objects on demand: 3063 * below allows us to generate a name for these objects on demand:
3064 *
3065 * Some pseudo inodes are mountable. When they are mounted
3066 * path->dentry == path->mnt->mnt_root. In that case don't call d_dname
3067 * and instead have d_path return the mounted path.
3064 */ 3068 */
3065 if (path->dentry->d_op && path->dentry->d_op->d_dname) 3069 if (path->dentry->d_op && path->dentry->d_op->d_dname &&
3070 (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
3066 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 3071 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
3067 3072
3068 rcu_read_lock(); 3073 rcu_read_lock();
@@ -3111,26 +3116,28 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
3111/* 3116/*
3112 * Write full pathname from the root of the filesystem into the buffer. 3117 * Write full pathname from the root of the filesystem into the buffer.
3113 */ 3118 */
3114static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 3119static char *__dentry_path(struct dentry *d, char *buf, int buflen)
3115{ 3120{
3121 struct dentry *dentry;
3116 char *end, *retval; 3122 char *end, *retval;
3117 int len, seq = 0; 3123 int len, seq = 0;
3118 int error = 0; 3124 int error = 0;
3119 3125
3126 if (buflen < 2)
3127 goto Elong;
3128
3120 rcu_read_lock(); 3129 rcu_read_lock();
3121restart: 3130restart:
3131 dentry = d;
3122 end = buf + buflen; 3132 end = buf + buflen;
3123 len = buflen; 3133 len = buflen;
3124 prepend(&end, &len, "\0", 1); 3134 prepend(&end, &len, "\0", 1);
3125 if (buflen < 1)
3126 goto Elong;
3127 /* Get '/' right */ 3135 /* Get '/' right */
3128 retval = end-1; 3136 retval = end-1;
3129 *retval = '/'; 3137 *retval = '/';
3130 read_seqbegin_or_lock(&rename_lock, &seq); 3138 read_seqbegin_or_lock(&rename_lock, &seq);
3131 while (!IS_ROOT(dentry)) { 3139 while (!IS_ROOT(dentry)) {
3132 struct dentry *parent = dentry->d_parent; 3140 struct dentry *parent = dentry->d_parent;
3133 int error;
3134 3141
3135 prefetch(parent); 3142 prefetch(parent);
3136 error = prepend_name(&end, &len, &dentry->d_name); 3143 error = prepend_name(&end, &len, &dentry->d_name);
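
The prepend_name() reordering matters because *buflen is a signed running budget that earlier prepend() calls may already have driven negative; the old test compared that signed value against the unsigned dlen + 1, which promotes the negative side to a huge unsigned number and misses the overflow. Subtracting first and then testing the sign keeps the arithmetic signed throughout. The companion __dentry_path() hunk adds an up-front buflen < 2 guard so a buffer too small for even "/" plus the NUL is rejected before any work is done. A runnable userspace demonstration of the promotion bug (all names here are ours):

#include <stdio.h>

int main(void)
{
	int buflen = -4;	/* budget already overdrawn by a caller */
	unsigned int dlen = 3;

	/* Old form: the signed buflen is converted to unsigned for the
	 * comparison, -4 becomes 0xfffffffc and the test is false. */
	if (buflen < dlen + 1)
		puts("overflow caught (not what happens)");
	else
		puts("overflow missed (the bug)");

	/* Fixed form: stay in signed arithmetic, then test the sign. */
	buflen -= dlen + 1;
	if (buflen < 0)
		puts("overflow caught by subtract-then-test");
	return 0;
}
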
diff --git a/fs/dcookies.c b/fs/dcookies.c
index ab5954b50267..ac44a69fbea9 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -204,7 +204,7 @@ out:
204} 204}
205 205
206#ifdef CONFIG_COMPAT 206#ifdef CONFIG_COMPAT
207COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) 207COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
208{ 208{
209#ifdef __BIG_ENDIAN 209#ifdef __BIG_ENDIAN
210 return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); 210 return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
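
Two separate fixes hide in this one-liner: len is now typed compat_size_t, so the wrapper only trusts the 32 bits a compat caller can actually supply, and the 64-bit cookie arrives split across two 32-bit register words that must be rejoined (in the __BIG_ENDIAN branch shown, w0 carries the high half; the little-endian branch swaps the roles). A runnable sketch of the reassembly:

#include <stdio.h>
#include <stdint.h>

/* Rebuild a u64 from the two u32 words of a compat syscall. */
static uint64_t join_words(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	printf("cookie = 0x%llx\n",	/* 0x123456789abcdef0 */
	       (unsigned long long)join_words(0x12345678u, 0x9abcdef0u));
	return 0;
}
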
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0e04142d5962..160a5489a939 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -375,7 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
375 bio = bio_alloc(GFP_KERNEL, nr_vecs); 375 bio = bio_alloc(GFP_KERNEL, nr_vecs);
376 376
377 bio->bi_bdev = bdev; 377 bio->bi_bdev = bdev;
378 bio->bi_sector = first_sector; 378 bio->bi_iter.bi_sector = first_sector;
379 if (dio->is_async) 379 if (dio->is_async)
380 bio->bi_end_io = dio_bio_end_aio; 380 bio->bi_end_io = dio_bio_end_aio;
381 else 381 else
@@ -719,7 +719,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
719 if (sdio->bio) { 719 if (sdio->bio) {
720 loff_t cur_offset = sdio->cur_page_fs_offset; 720 loff_t cur_offset = sdio->cur_page_fs_offset;
721 loff_t bio_next_offset = sdio->logical_offset_in_bio + 721 loff_t bio_next_offset = sdio->logical_offset_in_bio +
722 sdio->bio->bi_size; 722 sdio->bio->bi_iter.bi_size;
723 723
724 /* 724 /*
725 * See whether this new request is contiguous with the old. 725 * See whether this new request is contiguous with the old.
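
Both direct-io.c hunks are mechanical conversions for the immutable-biovec work merged in 3.14: the fields that describe a bio's current position moved into the embedded struct bvec_iter at bio->bi_iter, so bi_sector and bi_size become bi_iter.bi_sector and bi_iter.bi_size. A before/after sketch of the rename, nothing more:

bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio->bi_bdev = bdev;

/* <= 3.13: bio->bi_sector = first_sector;	*/
bio->bi_iter.bi_sector = first_sector;

/* <= 3.13: bytes = bio->bi_size;		*/
bytes = bio->bi_iter.bi_size;
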
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..3190ca973dd6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
649 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
650{ 650{
651 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
652 struct linger linger;
652 653
653 switch (sn->sn_header.sn_type) { 654 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED: 655 case SCTP_SEND_FAILED:
@@ -713,11 +714,11 @@ static void process_sctp_notification(struct connection *con,
713 return; 714 return;
714 715
715 /* Peel off a new sock */ 716 /* Peel off a new sock */
716 sctp_lock_sock(con->sock->sk); 717 lock_sock(con->sock->sk);
717 ret = sctp_do_peeloff(con->sock->sk, 718 ret = sctp_do_peeloff(con->sock->sk,
718 sn->sn_assoc_change.sac_assoc_id, 719 sn->sn_assoc_change.sac_assoc_id,
719 &new_con->sock); 720 &new_con->sock);
720 sctp_release_sock(con->sock->sk); 721 release_sock(con->sock->sk);
721 if (ret < 0) { 722 if (ret < 0) {
722 log_print("Can't peel off a socket for " 723 log_print("Can't peel off a socket for "
723 "connection %d to node %d: err=%d", 724 "connection %d to node %d: err=%d",
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
727 } 728 }
728 add_sock(new_con->sock, new_con); 729 add_sock(new_con->sock, new_con);
729 730
731 linger.l_onoff = 1;
732 linger.l_linger = 0;
733 ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
734 (char *)&linger, sizeof(linger));
735 if (ret < 0)
736 log_print("set socket option SO_LINGER failed");
737
730 log_print("connecting to %d sctp association %d", 738 log_print("connecting to %d sctp association %d",
731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 739 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
732 740
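
Two independent cleanups share this hunk. The lock_sock()/release_sock() switch drops SCTP's private locking wrappers in favour of the generic socket ones. The new SO_LINGER block arms a zero-second linger on the freshly peeled-off socket, so a later close() aborts the association immediately instead of waiting out a graceful shutdown. The idiom on an in-kernel socket (error handling trimmed; kernel_setsockopt() is the kernel-side analogue of setsockopt(2)):

struct linger linger = {
	.l_onoff  = 1,	/* linger enabled ...				*/
	.l_linger = 0,	/* ... for zero seconds: close() aborts at once	*/
};
int ret;

ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
			(char *)&linger, sizeof(linger));
if (ret < 0)
	log_print("set socket option SO_LINGER failed");
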
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c36c44824471..b167ca48b8ee 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -659,19 +659,17 @@ out_lock:
659 return rc; 659 return rc;
660} 660}
661 661
662static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, 662static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
663 size_t *bufsiz)
664{ 663{
665 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 664 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
666 char *lower_buf; 665 char *lower_buf;
666 char *buf;
667 mm_segment_t old_fs; 667 mm_segment_t old_fs;
668 int rc; 668 int rc;
669 669
670 lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); 670 lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
671 if (!lower_buf) { 671 if (!lower_buf)
672 rc = -ENOMEM; 672 return ERR_PTR(-ENOMEM);
673 goto out;
674 }
675 old_fs = get_fs(); 673 old_fs = get_fs();
676 set_fs(get_ds()); 674 set_fs(get_ds());
677 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 675 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
@@ -680,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
680 set_fs(old_fs); 678 set_fs(old_fs);
681 if (rc < 0) 679 if (rc < 0)
682 goto out; 680 goto out;
683 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, 681 rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb,
684 lower_buf, rc); 682 lower_buf, rc);
685out: 683out:
686 kfree(lower_buf); 684 kfree(lower_buf);
687 return rc; 685 return rc ? ERR_PTR(rc) : buf;
688} 686}
689 687
690static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) 688static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
691{ 689{
692 char *buf; 690 size_t len;
693 size_t len = PATH_MAX; 691 char *buf = ecryptfs_readlink_lower(dentry, &len);
694 int rc; 692 if (IS_ERR(buf))
695
696 rc = ecryptfs_readlink_lower(dentry, &buf, &len);
697 if (rc)
698 goto out; 693 goto out;
699 fsstack_copy_attr_atime(dentry->d_inode, 694 fsstack_copy_attr_atime(dentry->d_inode,
700 ecryptfs_dentry_to_lower(dentry)->d_inode); 695 ecryptfs_dentry_to_lower(dentry)->d_inode);
@@ -1003,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1003 char *target; 998 char *target;
1004 size_t targetsiz; 999 size_t targetsiz;
1005 1000
1006 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); 1001 target = ecryptfs_readlink_lower(dentry, &targetsiz);
1007 if (!rc) { 1002 if (!IS_ERR(target)) {
1008 kfree(target); 1003 kfree(target);
1009 stat->size = targetsiz; 1004 stat->size = targetsiz;
1005 } else {
1006 rc = PTR_ERR(target);
1010 } 1007 }
1011 } 1008 }
1012 return rc; 1009 return rc;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c6f57a74a559..50215bbd6463 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type,
26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static void efs_kill_sb(struct super_block *s)
30{
31 struct efs_sb_info *sbi = SUPER_INFO(s);
32 kill_block_super(s);
33 kfree(sbi);
34}
35
29static struct file_system_type efs_fs_type = { 36static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 37 .owner = THIS_MODULE,
31 .name = "efs", 38 .name = "efs",
32 .mount = efs_mount, 39 .mount = efs_mount,
33 .kill_sb = kill_block_super, 40 .kill_sb = efs_kill_sb,
34 .fs_flags = FS_REQUIRES_DEV, 41 .fs_flags = FS_REQUIRES_DEV,
35}; 42};
36MODULE_ALIAS_FS("efs"); 43MODULE_ALIAS_FS("efs");
@@ -105,12 +112,6 @@ static void destroy_inodecache(void)
105 kmem_cache_destroy(efs_inode_cachep); 112 kmem_cache_destroy(efs_inode_cachep);
106} 113}
107 114
108static void efs_put_super(struct super_block *s)
109{
110 kfree(s->s_fs_info);
111 s->s_fs_info = NULL;
112}
113
114static int efs_remount(struct super_block *sb, int *flags, char *data) 115static int efs_remount(struct super_block *sb, int *flags, char *data)
115{ 116{
116 *flags |= MS_RDONLY; 117 *flags |= MS_RDONLY;
@@ -120,7 +121,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data)
120static const struct super_operations efs_superblock_operations = { 121static const struct super_operations efs_superblock_operations = {
121 .alloc_inode = efs_alloc_inode, 122 .alloc_inode = efs_alloc_inode,
122 .destroy_inode = efs_destroy_inode, 123 .destroy_inode = efs_destroy_inode,
123 .put_super = efs_put_super,
124 .statfs = efs_statfs, 124 .statfs = efs_statfs,
125 .remount_fs = efs_remount, 125 .remount_fs = efs_remount,
126}; 126};
@@ -259,7 +259,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
259 struct efs_sb_info *sb; 259 struct efs_sb_info *sb;
260 struct buffer_head *bh; 260 struct buffer_head *bh;
261 struct inode *root; 261 struct inode *root;
262 int ret = -EINVAL;
263 262
264 sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); 263 sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
265 if (!sb) 264 if (!sb)
@@ -270,7 +269,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
270 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { 269 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
271 printk(KERN_ERR "EFS: device does not support %d byte blocks\n", 270 printk(KERN_ERR "EFS: device does not support %d byte blocks\n",
272 EFS_BLOCKSIZE); 271 EFS_BLOCKSIZE);
273 goto out_no_fs_ul; 272 return -EINVAL;
274 } 273 }
275 274
276 /* read the vh (volume header) block */ 275 /* read the vh (volume header) block */
@@ -278,7 +277,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
278 277
279 if (!bh) { 278 if (!bh) {
280 printk(KERN_ERR "EFS: cannot read volume header\n"); 279 printk(KERN_ERR "EFS: cannot read volume header\n");
281 goto out_no_fs_ul; 280 return -EINVAL;
282 } 281 }
283 282
284 /* 283 /*
@@ -290,13 +289,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
290 brelse(bh); 289 brelse(bh);
291 290
292 if (sb->fs_start == -1) { 291 if (sb->fs_start == -1) {
293 goto out_no_fs_ul; 292 return -EINVAL;
294 } 293 }
295 294
296 bh = sb_bread(s, sb->fs_start + EFS_SUPER); 295 bh = sb_bread(s, sb->fs_start + EFS_SUPER);
297 if (!bh) { 296 if (!bh) {
298 printk(KERN_ERR "EFS: cannot read superblock\n"); 297 printk(KERN_ERR "EFS: cannot read superblock\n");
299 goto out_no_fs_ul; 298 return -EINVAL;
300 } 299 }
301 300
302 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { 301 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
@@ -304,7 +303,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
304 printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); 303 printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER);
305#endif 304#endif
306 brelse(bh); 305 brelse(bh);
307 goto out_no_fs_ul; 306 return -EINVAL;
308 } 307 }
309 brelse(bh); 308 brelse(bh);
310 309
@@ -319,24 +318,16 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
319 root = efs_iget(s, EFS_ROOTINODE); 318 root = efs_iget(s, EFS_ROOTINODE);
320 if (IS_ERR(root)) { 319 if (IS_ERR(root)) {
321 printk(KERN_ERR "EFS: get root inode failed\n"); 320 printk(KERN_ERR "EFS: get root inode failed\n");
322 ret = PTR_ERR(root); 321 return PTR_ERR(root);
323 goto out_no_fs;
324 } 322 }
325 323
326 s->s_root = d_make_root(root); 324 s->s_root = d_make_root(root);
327 if (!(s->s_root)) { 325 if (!(s->s_root)) {
328 printk(KERN_ERR "EFS: get root dentry failed\n"); 326 printk(KERN_ERR "EFS: get root dentry failed\n");
329 ret = -ENOMEM; 327 return -ENOMEM;
330 goto out_no_fs;
331 } 328 }
332 329
333 return 0; 330 return 0;
334
335out_no_fs_ul:
336out_no_fs:
337 s->s_fs_info = NULL;
338 kfree(sb);
339 return ret;
340} 331}
341 332
342static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 333static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
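
Freeing s_fs_info from ->put_super plus hand-rolled fill_super error labels left the cleanup split across paths that do not all run: put_super is only called once a root dentry exists, and the private info has to stay alive for everything kill_block_super() tears down. Moving the kfree() into a ->kill_sb callback, after kill_block_super() returns, covers every path, since the VFS invokes kill_sb even for a superblock whose fill_super failed partway; that is what lets the out_no_fs* labels above collapse into plain returns. The shape of the pattern, with a hypothetical filesystem name:

static void foofs_kill_sb(struct super_block *s)
{
	struct foofs_sb_info *sbi = s->s_fs_info;

	kill_block_super(s);	/* finish all VFS/bdev teardown first */
	kfree(sbi);		/* now nothing can reach the private info */
}

static struct file_system_type foofs_fs_type = {
	.owner    = THIS_MODULE,
	.name     = "foofs",
	.mount    = foofs_mount,
	.kill_sb  = foofs_kill_sb,
	.fs_flags = FS_REQUIRES_DEV,
};
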
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9b96e6..d6a88e7812f3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
349 */ 349 */
350struct eventfd_ctx *eventfd_ctx_fdget(int fd) 350struct eventfd_ctx *eventfd_ctx_fdget(int fd)
351{ 351{
352 struct file *file;
353 struct eventfd_ctx *ctx; 352 struct eventfd_ctx *ctx;
354 353 struct fd f = fdget(fd);
355 file = eventfd_fget(fd); 354 if (!f.file)
356 if (IS_ERR(file)) 355 return ERR_PTR(-EBADF);
357 return (struct eventfd_ctx *) file; 356 ctx = eventfd_ctx_fileget(f.file);
358 ctx = eventfd_ctx_get(file->private_data); 357 fdput(f);
359 fput(file);
360
361 return ctx; 358 return ctx;
362} 359}
363EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); 360EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
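
fdget() is the light-weight successor to the fget() family: when the descriptor table is not shared it returns a borrowed reference without touching the file's refcount, and fdput() releases exactly what fdget() took (a flag inside struct fd records which case applied). Routing through eventfd_ctx_fileget() also means a descriptor that is not an eventfd now yields an error instead of blindly reinterpreting file->private_data. The resulting pattern:

struct fd f = fdget(fd);

if (!f.file)
	return ERR_PTR(-EBADF);		/* no such descriptor */
ctx = eventfd_ctx_fileget(f.file);	/* validates f_op, takes a ctx ref */
fdput(f);				/* drop the borrowed file reference */
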
diff --git a/fs/exec.c b/fs/exec.c
index 7ea097f6b341..3d78fccdd723 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -62,7 +62,6 @@
62 62
63#include <trace/events/task.h> 63#include <trace/events/task.h>
64#include "internal.h" 64#include "internal.h"
65#include "coredump.h"
66 65
67#include <trace/events/sched.h> 66#include <trace/events/sched.h>
68 67
@@ -749,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages);
749 748
750#endif /* CONFIG_MMU */ 749#endif /* CONFIG_MMU */
751 750
752struct file *open_exec(const char *name) 751static struct file *do_open_exec(struct filename *name)
753{ 752{
754 struct file *file; 753 struct file *file;
755 int err; 754 int err;
756 struct filename tmp = { .name = name };
757 static const struct open_flags open_exec_flags = { 755 static const struct open_flags open_exec_flags = {
758 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
759 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
@@ -761,7 +759,7 @@ struct file *open_exec(const char *name)
761 .lookup_flags = LOOKUP_FOLLOW, 759 .lookup_flags = LOOKUP_FOLLOW,
762 }; 760 };
763 761
764 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); 762 file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
765 if (IS_ERR(file)) 763 if (IS_ERR(file))
766 goto out; 764 goto out;
767 765
@@ -785,6 +783,12 @@ exit:
785 fput(file); 783 fput(file);
786 return ERR_PTR(err); 784 return ERR_PTR(err);
787} 785}
786
787struct file *open_exec(const char *name)
788{
789 struct filename tmp = { .name = name };
790 return do_open_exec(&tmp);
791}
788EXPORT_SYMBOL(open_exec); 792EXPORT_SYMBOL(open_exec);
789 793
790int kernel_read(struct file *file, loff_t offset, 794int kernel_read(struct file *file, loff_t offset,
@@ -843,7 +847,6 @@ static int exec_mmap(struct mm_struct *mm)
843 tsk->active_mm = mm; 847 tsk->active_mm = mm;
844 activate_mm(active_mm, mm); 848 activate_mm(active_mm, mm);
845 task_unlock(tsk); 849 task_unlock(tsk);
846 arch_pick_mmap_layout(mm);
847 if (old_mm) { 850 if (old_mm) {
848 up_read(&old_mm->mmap_sem); 851 up_read(&old_mm->mmap_sem);
849 BUG_ON(active_mm != old_mm); 852 BUG_ON(active_mm != old_mm);
@@ -1088,8 +1091,8 @@ int flush_old_exec(struct linux_binprm * bprm)
1088 bprm->mm = NULL; /* We're using it now */ 1091 bprm->mm = NULL; /* We're using it now */
1089 1092
1090 set_fs(USER_DS); 1093 set_fs(USER_DS);
1091 current->flags &= 1094 current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1092 ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); 1095 PF_NOFREEZE | PF_NO_SETAFFINITY);
1093 flush_thread(); 1096 flush_thread();
1094 current->personality &= ~bprm->per_clear; 1097 current->personality &= ~bprm->per_clear;
1095 1098
@@ -1139,9 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1139 1142
1140 /* An exec changes our domain. We are no longer part of the thread 1143 /* An exec changes our domain. We are no longer part of the thread
1141 group */ 1144 group */
1142
1143 current->self_exec_id++; 1145 current->self_exec_id++;
1144
1145 flush_signal_handlers(current, 0); 1146 flush_signal_handlers(current, 0);
1146 do_close_on_exec(current->files); 1147 do_close_on_exec(current->files);
1147} 1148}
@@ -1166,13 +1167,17 @@ int prepare_bprm_creds(struct linux_binprm *bprm)
1166 return -ENOMEM; 1167 return -ENOMEM;
1167} 1168}
1168 1169
1169void free_bprm(struct linux_binprm *bprm) 1170static void free_bprm(struct linux_binprm *bprm)
1170{ 1171{
1171 free_arg_pages(bprm); 1172 free_arg_pages(bprm);
1172 if (bprm->cred) { 1173 if (bprm->cred) {
1173 mutex_unlock(&current->signal->cred_guard_mutex); 1174 mutex_unlock(&current->signal->cred_guard_mutex);
1174 abort_creds(bprm->cred); 1175 abort_creds(bprm->cred);
1175 } 1176 }
1177 if (bprm->file) {
1178 allow_write_access(bprm->file);
1179 fput(bprm->file);
1180 }
1176 /* If a binfmt changed the interp, free it. */ 1181 /* If a binfmt changed the interp, free it. */
1177 if (bprm->interp != bprm->filename) 1182 if (bprm->interp != bprm->filename)
1178 kfree(bprm->interp); 1183 kfree(bprm->interp);
@@ -1224,11 +1229,10 @@ EXPORT_SYMBOL(install_exec_creds);
1224 * - the caller must hold ->cred_guard_mutex to protect against 1229 * - the caller must hold ->cred_guard_mutex to protect against
1225 * PTRACE_ATTACH 1230 * PTRACE_ATTACH
1226 */ 1231 */
1227static int check_unsafe_exec(struct linux_binprm *bprm) 1232static void check_unsafe_exec(struct linux_binprm *bprm)
1228{ 1233{
1229 struct task_struct *p = current, *t; 1234 struct task_struct *p = current, *t;
1230 unsigned n_fs; 1235 unsigned n_fs;
1231 int res = 0;
1232 1236
1233 if (p->ptrace) { 1237 if (p->ptrace) {
1234 if (p->ptrace & PT_PTRACE_CAP) 1238 if (p->ptrace & PT_PTRACE_CAP)
@@ -1244,31 +1248,25 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
1244 if (current->no_new_privs) 1248 if (current->no_new_privs)
1245 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; 1249 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1246 1250
1251 t = p;
1247 n_fs = 1; 1252 n_fs = 1;
1248 spin_lock(&p->fs->lock); 1253 spin_lock(&p->fs->lock);
1249 rcu_read_lock(); 1254 rcu_read_lock();
1250 for (t = next_thread(p); t != p; t = next_thread(t)) { 1255 while_each_thread(p, t) {
1251 if (t->fs == p->fs) 1256 if (t->fs == p->fs)
1252 n_fs++; 1257 n_fs++;
1253 } 1258 }
1254 rcu_read_unlock(); 1259 rcu_read_unlock();
1255 1260
1256 if (p->fs->users > n_fs) { 1261 if (p->fs->users > n_fs)
1257 bprm->unsafe |= LSM_UNSAFE_SHARE; 1262 bprm->unsafe |= LSM_UNSAFE_SHARE;
1258 } else { 1263 else
1259 res = -EAGAIN; 1264 p->fs->in_exec = 1;
1260 if (!p->fs->in_exec) {
1261 p->fs->in_exec = 1;
1262 res = 1;
1263 }
1264 }
1265 spin_unlock(&p->fs->lock); 1265 spin_unlock(&p->fs->lock);
1266
1267 return res;
1268} 1266}
1269 1267
1270/* 1268/*
1271 * Fill the binprm structure from the inode. 1269 * Fill the binprm structure from the inode.
1272 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 1270 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1273 * 1271 *
1274 * This may be called multiple times for binary chains (scripts for example). 1272 * This may be called multiple times for binary chains (scripts for example).
@@ -1430,14 +1428,7 @@ static int exec_binprm(struct linux_binprm *bprm)
1430 audit_bprm(bprm); 1428 audit_bprm(bprm);
1431 trace_sched_process_exec(current, old_pid, bprm); 1429 trace_sched_process_exec(current, old_pid, bprm);
1432 ptrace_event(PTRACE_EVENT_EXEC, old_vpid); 1430 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1433 current->did_exec = 1;
1434 proc_exec_connector(current); 1431 proc_exec_connector(current);
1435
1436 if (bprm->file) {
1437 allow_write_access(bprm->file);
1438 fput(bprm->file);
1439 bprm->file = NULL; /* to catch use-after-free */
1440 }
1441 } 1432 }
1442 1433
1443 return ret; 1434 return ret;
@@ -1446,16 +1437,18 @@ static int exec_binprm(struct linux_binprm *bprm)
1446/* 1437/*
1447 * sys_execve() executes a new program. 1438 * sys_execve() executes a new program.
1448 */ 1439 */
1449static int do_execve_common(const char *filename, 1440static int do_execve_common(struct filename *filename,
1450 struct user_arg_ptr argv, 1441 struct user_arg_ptr argv,
1451 struct user_arg_ptr envp) 1442 struct user_arg_ptr envp)
1452{ 1443{
1453 struct linux_binprm *bprm; 1444 struct linux_binprm *bprm;
1454 struct file *file; 1445 struct file *file;
1455 struct files_struct *displaced; 1446 struct files_struct *displaced;
1456 bool clear_in_exec;
1457 int retval; 1447 int retval;
1458 1448
1449 if (IS_ERR(filename))
1450 return PTR_ERR(filename);
1451
1459 /* 1452 /*
1460 * We move the actual failure in case of RLIMIT_NPROC excess from 1453 * We move the actual failure in case of RLIMIT_NPROC excess from
1461 * set*uid() to execve() because too many poorly written programs 1454 * set*uid() to execve() because too many poorly written programs
@@ -1485,13 +1478,10 @@ static int do_execve_common(const char *filename,
1485 if (retval) 1478 if (retval)
1486 goto out_free; 1479 goto out_free;
1487 1480
1488 retval = check_unsafe_exec(bprm); 1481 check_unsafe_exec(bprm);
1489 if (retval < 0)
1490 goto out_free;
1491 clear_in_exec = retval;
1492 current->in_execve = 1; 1482 current->in_execve = 1;
1493 1483
1494 file = open_exec(filename); 1484 file = do_open_exec(filename);
1495 retval = PTR_ERR(file); 1485 retval = PTR_ERR(file);
1496 if (IS_ERR(file)) 1486 if (IS_ERR(file))
1497 goto out_unmark; 1487 goto out_unmark;
@@ -1499,12 +1489,11 @@ static int do_execve_common(const char *filename,
1499 sched_exec(); 1489 sched_exec();
1500 1490
1501 bprm->file = file; 1491 bprm->file = file;
1502 bprm->filename = filename; 1492 bprm->filename = bprm->interp = filename->name;
1503 bprm->interp = filename;
1504 1493
1505 retval = bprm_mm_init(bprm); 1494 retval = bprm_mm_init(bprm);
1506 if (retval) 1495 if (retval)
1507 goto out_file; 1496 goto out_unmark;
1508 1497
1509 bprm->argc = count(argv, MAX_ARG_STRINGS); 1498 bprm->argc = count(argv, MAX_ARG_STRINGS);
1510 if ((retval = bprm->argc) < 0) 1499 if ((retval = bprm->argc) < 0)
@@ -1541,6 +1530,7 @@ static int do_execve_common(const char *filename,
1541 acct_update_integrals(current); 1530 acct_update_integrals(current);
1542 task_numa_free(current); 1531 task_numa_free(current);
1543 free_bprm(bprm); 1532 free_bprm(bprm);
1533 putname(filename);
1544 if (displaced) 1534 if (displaced)
1545 put_files_struct(displaced); 1535 put_files_struct(displaced);
1546 return retval; 1536 return retval;
@@ -1551,15 +1541,8 @@ out:
1551 mmput(bprm->mm); 1541 mmput(bprm->mm);
1552 } 1542 }
1553 1543
1554out_file:
1555 if (bprm->file) {
1556 allow_write_access(bprm->file);
1557 fput(bprm->file);
1558 }
1559
1560out_unmark: 1544out_unmark:
1561 if (clear_in_exec) 1545 current->fs->in_exec = 0;
1562 current->fs->in_exec = 0;
1563 current->in_execve = 0; 1546 current->in_execve = 0;
1564 1547
1565out_free: 1548out_free:
@@ -1569,10 +1552,11 @@ out_files:
1569 if (displaced) 1552 if (displaced)
1570 reset_files_struct(displaced); 1553 reset_files_struct(displaced);
1571out_ret: 1554out_ret:
1555 putname(filename);
1572 return retval; 1556 return retval;
1573} 1557}
1574 1558
1575int do_execve(const char *filename, 1559int do_execve(struct filename *filename,
1576 const char __user *const __user *__argv, 1560 const char __user *const __user *__argv,
1577 const char __user *const __user *__envp) 1561 const char __user *const __user *__envp)
1578{ 1562{
@@ -1582,7 +1566,7 @@ int do_execve(const char *filename,
1582} 1566}
1583 1567
1584#ifdef CONFIG_COMPAT 1568#ifdef CONFIG_COMPAT
1585static int compat_do_execve(const char *filename, 1569static int compat_do_execve(struct filename *filename,
1586 const compat_uptr_t __user *__argv, 1570 const compat_uptr_t __user *__argv,
1587 const compat_uptr_t __user *__envp) 1571 const compat_uptr_t __user *__envp)
1588{ 1572{
@@ -1609,67 +1593,22 @@ void set_binfmt(struct linux_binfmt *new)
1609 if (new) 1593 if (new)
1610 __module_get(new->module); 1594 __module_get(new->module);
1611} 1595}
1612
1613EXPORT_SYMBOL(set_binfmt); 1596EXPORT_SYMBOL(set_binfmt);
1614 1597
1615/* 1598/*
1616 * set_dumpable converts traditional three-value dumpable to two flags and 1599 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1617 * stores them into mm->flags. It modifies lower two bits of mm->flags, but
1618 * these bits are not changed atomically. So get_dumpable can observe the
1619 * intermediate state. To avoid unexpected behavior, get_dumpable
1620 * returns either the old dumpable or the new one by paying attention to the order of
1621 * modifying the bits.
1622 *
1623 * dumpable | mm->flags (binary)
1624 * old new | initial interim final
1625 * ---------+-----------------------
1626 * 0 1 | 00 01 01
1627 * 0 2 | 00 10(*) 11
1628 * 1 0 | 01 00 00
1629 * 1 2 | 01 11 11
1630 * 2 0 | 11 10(*) 00
1631 * 2 1 | 11 11 01
1632 *
1633 * (*) get_dumpable regards interim value of 10 as 11.
1634 */ 1600 */
1635void set_dumpable(struct mm_struct *mm, int value) 1601void set_dumpable(struct mm_struct *mm, int value)
1636{ 1602{
1637 switch (value) { 1603 unsigned long old, new;
1638 case SUID_DUMP_DISABLE:
1639 clear_bit(MMF_DUMPABLE, &mm->flags);
1640 smp_wmb();
1641 clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1642 break;
1643 case SUID_DUMP_USER:
1644 set_bit(MMF_DUMPABLE, &mm->flags);
1645 smp_wmb();
1646 clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1647 break;
1648 case SUID_DUMP_ROOT:
1649 set_bit(MMF_DUMP_SECURELY, &mm->flags);
1650 smp_wmb();
1651 set_bit(MMF_DUMPABLE, &mm->flags);
1652 break;
1653 }
1654}
1655 1604
1656int __get_dumpable(unsigned long mm_flags) 1605 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1657{ 1606 return;
1658 int ret;
1659
1660 ret = mm_flags & MMF_DUMPABLE_MASK;
1661 return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
1662}
1663 1607
1664/* 1608 do {
1665 * This returns the actual value of the suid_dumpable flag. For things 1609 old = ACCESS_ONCE(mm->flags);
1666 * that are using this for checking for privilege transitions, it must 1610 new = (old & ~MMF_DUMPABLE_MASK) | value;
1667 * test against SUID_DUMP_USER rather than treating it as a boolean 1611 } while (cmpxchg(&mm->flags, old, new) != old);
1668 * value.
1669 */
1670int get_dumpable(struct mm_struct *mm)
1671{
1672 return __get_dumpable(mm->flags);
1673} 1612}
1674 1613
1675SYSCALL_DEFINE3(execve, 1614SYSCALL_DEFINE3(execve,
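
The rewritten set_dumpable() replaces the carefully ordered set_bit/clear_bit sequence, and the table of intermediate states it had to document, with a cmpxchg() retry loop: both dumpable bits are swapped in a single atomic step, so a concurrent get_dumpable() can only ever observe the old value or the new one, and the helpers that papered over torn reads go away. A runnable userspace model of the loop using GCC's builtin atomics (names and mask are ours):

#include <stdio.h>

#define DUMPABLE_MASK 3UL	/* stands in for MMF_DUMPABLE_MASK */

static unsigned long flags;

static void set_dumpable_model(unsigned int value)
{
	unsigned long old, new;

	do {
		old = __atomic_load_n(&flags, __ATOMIC_RELAXED);
		new = (old & ~DUMPABLE_MASK) | value;
	} while (!__atomic_compare_exchange_n(&flags, &old, new, 0,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
}

int main(void)
{
	flags = 0xf0;
	set_dumpable_model(2);
	printf("flags = 0x%lx\n", flags);	/* flags = 0xf2 */
	return 0;
}
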
@@ -1677,25 +1616,13 @@ SYSCALL_DEFINE3(execve,
1677 const char __user *const __user *, argv, 1616 const char __user *const __user *, argv,
1678 const char __user *const __user *, envp) 1617 const char __user *const __user *, envp)
1679{ 1618{
1680 struct filename *path = getname(filename); 1619 return do_execve(getname(filename), argv, envp);
1681 int error = PTR_ERR(path);
1682 if (!IS_ERR(path)) {
1683 error = do_execve(path->name, argv, envp);
1684 putname(path);
1685 }
1686 return error;
1687} 1620}
1688#ifdef CONFIG_COMPAT 1621#ifdef CONFIG_COMPAT
1689asmlinkage long compat_sys_execve(const char __user * filename, 1622asmlinkage long compat_sys_execve(const char __user * filename,
1690 const compat_uptr_t __user * argv, 1623 const compat_uptr_t __user * argv,
1691 const compat_uptr_t __user * envp) 1624 const compat_uptr_t __user * envp)
1692{ 1625{
1693 struct filename *path = getname(filename); 1626 return compat_do_execve(getname(filename), argv, envp);
1694 int error = PTR_ERR(path);
1695 if (!IS_ERR(path)) {
1696 error = compat_do_execve(path->name, argv, envp);
1697 putname(path);
1698 }
1699 return error;
1700} 1627}
1701#endif 1628#endif
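
The syscall wrappers shrink to one line because struct filename ownership now lives in a single place: getname() may hand back an ERR_PTR, do_execve_common() checks for that first, and putname() sits on both the success path and the common out_ret exit, so every outcome releases the name exactly once. The contract, trimmed to a skeleton (not compilable as shown):

static int do_execve_common(struct filename *filename, ...)
{
	if (IS_ERR(filename))		/* a failed getname() lands here */
		return PTR_ERR(filename);

	/* ... all of the actual exec work ... */

	putname(filename);		/* success and error paths alike
					 * drop the name exactly once */
	return retval;
}
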
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a52a5d23c30b..ee4317faccb1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
577 577
578 if (offset >= i_size) { 578 if (offset >= i_size) {
579 *uptodate = true; 579 *uptodate = true;
580 EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); 580 EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
581 return ZERO_PAGE(0); 581 return ZERO_PAGE(0);
582 } 582 }
583 583
@@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
596 *uptodate = true; 596 *uptodate = true;
597 else 597 else
598 *uptodate = PageUptodate(page); 598 *uptodate = PageUptodate(page);
599 EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); 599 EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
600 return page; 600 return page;
601 } else { 601 } else {
602 EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", 602 EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
603 pcol->that_locked_page->index); 603 pcol->that_locked_page->index);
604 *uptodate = true; 604 *uptodate = true;
605 return pcol->that_locked_page; 605 return pcol->that_locked_page;
@@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page)
611 struct page_collect *pcol = priv; 611 struct page_collect *pcol = priv;
612 612
613 if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { 613 if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
614 EXOFS_DBGMSG("index=0x%lx\n", page->index); 614 EXOFS_DBGMSG2("index=0x%lx\n", page->index);
615 page_cache_release(page); 615 page_cache_release(page);
616 return; 616 return;
617 } 617 }
618 EXOFS_DBGMSG("that_locked_page index=0x%lx\n", 618 EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
619 ZERO_PAGE(0) == page ? -1 : page->index); 619 ZERO_PAGE(0) == page ? -1 : page->index);
620} 620}
621 621
@@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
961 WARN_ON(1); 961 WARN_ON(1);
962} 962}
963 963
964
965 /* TODO: Should be easy enough to do properly */
966static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
967 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
968{
969 return 0;
970}
971
964const struct address_space_operations exofs_aops = { 972const struct address_space_operations exofs_aops = {
965 .readpage = exofs_readpage, 973 .readpage = exofs_readpage,
966 .readpages = exofs_readpages, 974 .readpages = exofs_readpages,
@@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = {
974 982
975 /* Not implemented Yet */ 983 /* Not implemented Yet */
976 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ 984 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
977 .direct_IO = NULL, /* TODO: Should be trivial to do */ 985 .direct_IO = exofs_direct_IO,
978 986
979 /* With these NULL has special meaning or default is not exported */ 987 /* With these NULL has special meaning or default is not exported */
980 .get_xip_mem = NULL, 988 .get_xip_mem = NULL,
@@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
1010 if (likely(!ret)) 1018 if (likely(!ret))
1011 truncate_setsize(inode, newsize); 1019 truncate_setsize(inode, newsize);
1012 1020
1013 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", 1021 EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
1014 inode->i_ino, newsize, ret); 1022 inode->i_ino, newsize, ret);
1015 return ret; 1023 return ret;
1016} 1024}
@@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1094 /* If object is lost on target we might as well enable its 1102 /* If object is lost on target we might as well enable its
1095 * delete. 1103 * delete.
1096 */ 1104 */
1097 if ((ret == -ENOENT) || (ret == -EINVAL)) 1105 ret = 0;
1098 ret = 0;
1099 goto out; 1106 goto out;
1100 } 1107 }
1101 1108
1102 ret = extract_attr_from_ios(ios, &attrs[0]); 1109 ret = extract_attr_from_ios(ios, &attrs[0]);
1103 if (ret) { 1110 if (ret) {
1104 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1111 EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
1105 goto out; 1112 goto out;
1106 } 1113 }
1107 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); 1114 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1109 1116
1110 ret = extract_attr_from_ios(ios, &attrs[1]); 1117 ret = extract_attr_from_ios(ios, &attrs[1]);
1111 if (ret) { 1118 if (ret) {
1112 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1119 EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
1113 goto out; 1120 goto out;
1114 } 1121 }
1115 if (attrs[1].len) { 1122 if (attrs[1].len) {
@@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1124 1131
1125 ret = extract_attr_from_ios(ios, &attrs[2]); 1132 ret = extract_attr_from_ios(ios, &attrs[2]);
1126 if (ret) { 1133 if (ret) {
1127 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1134 EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
1128 goto out; 1135 goto out;
1129 } 1136 }
1130 if (attrs[2].len) { 1137 if (attrs[2].len) {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index b74422888604..dae884694bd9 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
103 103
104 layout->max_io_length = 104 layout->max_io_length =
105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * 105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
106 layout->group_width; 106 (layout->group_width - layout->parity);
107 if (layout->parity) { 107 if (layout->parity) {
108 unsigned stripe_length = 108 unsigned stripe_length =
109 (layout->group_width - layout->parity) * 109 (layout->group_width - layout->parity) *
@@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
286 if (length) { 286 if (length) {
287 ore_calc_stripe_info(layout, offset, length, &ios->si); 287 ore_calc_stripe_info(layout, offset, length, &ios->si);
288 ios->length = ios->si.length; 288 ios->length = ios->si.length;
289 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 289 ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
290 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
290 if (layout->parity) 291 if (layout->parity)
291 _ore_post_alloc_raid_stuff(ios); 292 _ore_post_alloc_raid_stuff(ios);
292 } 293 }
@@ -430,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
430 if (likely(!ret)) 431 if (likely(!ret))
431 continue; 432 continue;
432 433
433 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 434 if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
434 /* start read offset past end of file */ 435 per_dev->bio) {
436 /* start read offset past end of file.
437 * Note: if we do not have a bio it means a read-attributes request.
438 * In this case we should return the error to the caller.
439 */
435 _clear_bio(per_dev->bio); 440 _clear_bio(per_dev->bio);
436 ORE_DBGMSG("start read offset passed end of file " 441 ORE_DBGMSG("start read offset passed end of file "
437 "offset=0x%llx, length=0x%llx\n", 442 "offset=0x%llx, length=0x%llx\n",
@@ -536,6 +541,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
536 u64 H = LmodS - G * T; 541 u64 H = LmodS - G * T;
537 542
538 u32 N = div_u64(H, U); 543 u32 N = div_u64(H, U);
544 u32 Nlast;
539 545
540 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 546 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
541 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; 547 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
@@ -568,6 +574,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
568 si->length = T - H; 574 si->length = T - H;
569 if (si->length > length) 575 if (si->length > length)
570 si->length = length; 576 si->length = length;
577
578 Nlast = div_u64(H + si->length + U - 1, U);
579 si->maxdevUnits = Nlast - N;
580
571 si->M = M; 581 si->M = M;
572} 582}
573EXPORT_SYMBOL(ore_calc_stripe_info); 583EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +593,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
583 int ret; 593 int ret;
584 594
585 if (per_dev->bio == NULL) { 595 if (per_dev->bio == NULL) {
586 unsigned pages_in_stripe = ios->layout->group_width * 596 unsigned bio_size;
587 (ios->layout->stripe_unit / PAGE_SIZE); 597
588 unsigned nr_pages = ios->nr_pages * ios->layout->group_width / 598 if (!ios->reading) {
589 (ios->layout->group_width - 599 bio_size = ios->si.maxdevUnits;
590 ios->layout->parity); 600 } else {
591 unsigned bio_size = (nr_pages + pages_in_stripe) / 601 bio_size = (ios->si.maxdevUnits + 1) *
592 ios->layout->group_width; 602 (ios->layout->group_width - ios->layout->parity) /
603 ios->layout->group_width;
604 }
605 bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
593 606
594 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 607 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
595 if (unlikely(!per_dev->bio)) { 608 if (unlikely(!per_dev->bio)) {
@@ -609,8 +622,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
609 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], 622 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
610 pglen, pgbase); 623 pglen, pgbase);
611 if (unlikely(pglen != added_len)) { 624 if (unlikely(pglen != added_len)) {
612 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", 625 /* If bi_vcnt == bi_max then this is a SW BUG */
613 per_dev->bio->bi_vcnt); 626 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
627 "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
628 per_dev->bio->bi_vcnt,
629 per_dev->bio->bi_max_vecs,
630 BIO_MAX_PAGES_KMALLOC, cur_len);
614 ret = -ENOMEM; 631 ret = -ENOMEM;
615 goto out; 632 goto out;
616 } 633 }
@@ -1098,7 +1115,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
1098 size_attr->attr = g_attr_logical_length; 1115 size_attr->attr = g_attr_logical_length;
1099 size_attr->attr.val_ptr = &size_attr->newsize; 1116 size_attr->attr.val_ptr = &size_attr->newsize;
1100 1117
1101 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", 1118 ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
1102 _LLU(oc->comps->obj.id), _LLU(obj_size), i); 1119 _LLU(oc->comps->obj.id), _LLU(obj_size), i);
1103 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 1120 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
1104 &size_attr->attr); 1121 &size_attr->attr);
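
The new Nlast/maxdevUnits computation is the standard integer ceiling idiom: div_u64(H + length + U - 1, U) is ceil((H + length) / U), and subtracting the first unit index N yields how many stripe units of size U the request touches on its busiest device; the later _ore_add_stripe_unit() hunk sizes the per-device bio from that instead of the old group-wide over-estimate. The arithmetic, runnable in isolation:

#include <stdio.h>
#include <stdint.h>

/* ceil(x / u) built from integer operations, as div_u64 is used above. */
static uint32_t div_round_up(uint64_t x, uint32_t u)
{
	return (uint32_t)((x + u - 1) / u);
}

int main(void)
{
	uint64_t H = 4096, length = 10000;	/* start offset, byte count */
	uint32_t U = 8192;			/* stripe unit size */
	uint32_t N     = (uint32_t)(H / U);		/* first unit hit */
	uint32_t Nlast = div_round_up(H + length, U);	/* one past last */

	printf("units spanned = %u\n", Nlast - N);	/* 2 */
	return 0;
}
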
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 110b6b371a4e..1b8001bbe947 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -148,13 +148,6 @@ ext2_get_acl(struct inode *inode, int type)
148 struct posix_acl *acl; 148 struct posix_acl *acl;
149 int retval; 149 int retval;
150 150
151 if (!test_opt(inode->i_sb, POSIX_ACL))
152 return NULL;
153
154 acl = get_cached_acl(inode, type);
155 if (acl != ACL_NOT_CACHED)
156 return acl;
157
158 switch (type) { 151 switch (type) {
159 case ACL_TYPE_ACCESS: 152 case ACL_TYPE_ACCESS:
160 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 153 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -189,19 +182,14 @@ ext2_get_acl(struct inode *inode, int type)
189/* 182/*
190 * inode->i_mutex: down 183 * inode->i_mutex: down
191 */ 184 */
192static int 185int
193ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) 186ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
194{ 187{
195 int name_index; 188 int name_index;
196 void *value = NULL; 189 void *value = NULL;
197 size_t size = 0; 190 size_t size = 0;
198 int error; 191 int error;
199 192
200 if (S_ISLNK(inode->i_mode))
201 return -EOPNOTSUPP;
202 if (!test_opt(inode->i_sb, POSIX_ACL))
203 return 0;
204
205 switch(type) { 193 switch(type) {
206 case ACL_TYPE_ACCESS: 194 case ACL_TYPE_ACCESS:
207 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -250,169 +238,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
250int 238int
251ext2_init_acl(struct inode *inode, struct inode *dir) 239ext2_init_acl(struct inode *inode, struct inode *dir)
252{ 240{
253 struct posix_acl *acl = NULL; 241 struct posix_acl *default_acl, *acl;
254 int error = 0; 242 int error;
255
256 if (!S_ISLNK(inode->i_mode)) {
257 if (test_opt(dir->i_sb, POSIX_ACL)) {
258 acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT);
259 if (IS_ERR(acl))
260 return PTR_ERR(acl);
261 }
262 if (!acl)
263 inode->i_mode &= ~current_umask();
264 }
265 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
266 if (S_ISDIR(inode->i_mode)) {
267 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
268 if (error)
269 goto cleanup;
270 }
271 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
272 if (error < 0)
273 return error;
274 if (error > 0) {
275 /* This is an extended ACL */
276 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
277 }
278 }
279cleanup:
280 posix_acl_release(acl);
281 return error;
282}
283
284/*
285 * Does chmod for an inode that may have an Access Control List. The
286 * inode->i_mode field must be updated to the desired value by the caller
287 * before calling this function.
288 * Returns 0 on success, or a negative error number.
289 *
290 * We change the ACL rather than storing some ACL entries in the file
291 * mode permission bits (which would be more efficient), because that
292 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
293 * for directories) are added. There are no more bits available in the
294 * file mode.
295 *
296 * inode->i_mutex: down
297 */
298int
299ext2_acl_chmod(struct inode *inode)
300{
301 struct posix_acl *acl;
302 int error;
303 243
304 if (!test_opt(inode->i_sb, POSIX_ACL)) 244 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
305 return 0;
306 if (S_ISLNK(inode->i_mode))
307 return -EOPNOTSUPP;
308 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
309 if (IS_ERR(acl) || !acl)
310 return PTR_ERR(acl);
311 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
312 if (error) 245 if (error)
313 return error; 246 return error;
314 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
315 posix_acl_release(acl);
316 return error;
317}
318 247
319/* 248 if (default_acl) {
320 * Extended attribute handlers 249 error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
321 */ 250 posix_acl_release(default_acl);
322static size_t 251 }
323ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, 252 if (acl) {
324 const char *name, size_t name_len, int type) 253 if (!error)
325{ 254 error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
326 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 255 posix_acl_release(acl);
327 256 }
328 if (!test_opt(dentry->d_sb, POSIX_ACL))
329 return 0;
330 if (list && size <= list_size)
331 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
332 return size;
333}
334
335static size_t
336ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
337 const char *name, size_t name_len, int type)
338{
339 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
340
341 if (!test_opt(dentry->d_sb, POSIX_ACL))
342 return 0;
343 if (list && size <= list_size)
344 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
345 return size;
346}
347
348static int
349ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
350 size_t size, int type)
351{
352 struct posix_acl *acl;
353 int error;
354
355 if (strcmp(name, "") != 0)
356 return -EINVAL;
357 if (!test_opt(dentry->d_sb, POSIX_ACL))
358 return -EOPNOTSUPP;
359
360 acl = ext2_get_acl(dentry->d_inode, type);
361 if (IS_ERR(acl))
362 return PTR_ERR(acl);
363 if (acl == NULL)
364 return -ENODATA;
365 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
366 posix_acl_release(acl);
367
368 return error;
369}
370
371static int
372ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
373 size_t size, int flags, int type)
374{
375 struct posix_acl *acl;
376 int error;
377
378 if (strcmp(name, "") != 0)
379 return -EINVAL;
380 if (!test_opt(dentry->d_sb, POSIX_ACL))
381 return -EOPNOTSUPP;
382 if (!inode_owner_or_capable(dentry->d_inode))
383 return -EPERM;
384
385 if (value) {
386 acl = posix_acl_from_xattr(&init_user_ns, value, size);
387 if (IS_ERR(acl))
388 return PTR_ERR(acl);
389 else if (acl) {
390 error = posix_acl_valid(acl);
391 if (error)
392 goto release_and_out;
393 }
394 } else
395 acl = NULL;
396
397 error = ext2_set_acl(dentry->d_inode, type, acl);
398
399release_and_out:
400 posix_acl_release(acl);
401 return error; 257 return error;
402} 258}
403
404const struct xattr_handler ext2_xattr_acl_access_handler = {
405 .prefix = POSIX_ACL_XATTR_ACCESS,
406 .flags = ACL_TYPE_ACCESS,
407 .list = ext2_xattr_list_acl_access,
408 .get = ext2_xattr_get_acl,
409 .set = ext2_xattr_set_acl,
410};
411
412const struct xattr_handler ext2_xattr_acl_default_handler = {
413 .prefix = POSIX_ACL_XATTR_DEFAULT,
414 .flags = ACL_TYPE_DEFAULT,
415 .list = ext2_xattr_list_acl_default,
416 .get = ext2_xattr_get_acl,
417 .set = ext2_xattr_set_acl,
418};
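
Most of the deleted ext2 code moved into the VFS for 3.14: the generic posix_acl_access_xattr_handler/posix_acl_default_xattr_handler pair replaces every filesystem's private list/get/set trio, the new ->set_acl inode operation lets posix_acl_chmod() and the xattr layer drive the filesystem, and posix_acl_create() returns both ACLs a new inode should start with while applying the umask when the parent has no default ACL. The resulting init pattern, which the ext3 diff below repeats almost verbatim (my_set_acl stands in for the filesystem's ->set_acl helper):

struct posix_acl *default_acl, *acl;
int error;

error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (error)
	return error;

if (default_acl) {
	error = my_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	posix_acl_release(default_acl);
}
if (acl) {
	if (!error)
		error = my_set_acl(inode, acl, ACL_TYPE_ACCESS);
	posix_acl_release(acl);
}
return error;
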
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 503bfb0ed79b..44937f9fcf32 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size)
55 55
56/* acl.c */ 56/* acl.c */
57extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); 57extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
@@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *);
63#define ext2_get_acl NULL 63#define ext2_get_acl NULL
64#define ext2_set_acl NULL 64#define ext2_set_acl NULL
65 65
66static inline int
67ext2_acl_chmod (struct inode *inode)
68{
69 return 0;
70}
71
72static inline int ext2_init_acl (struct inode *inode, struct inode *dir) 66static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
73{ 67{
74 return 0; 68 return 0;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a5b3a5db3120..44c36e590765 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = {
103#endif 103#endif
104 .setattr = ext2_setattr, 104 .setattr = ext2_setattr,
105 .get_acl = ext2_get_acl, 105 .get_acl = ext2_get_acl,
106 .set_acl = ext2_set_acl,
106 .fiemap = ext2_fiemap, 107 .fiemap = ext2_fiemap,
107}; 108};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a337640a46a..94ed36849b71 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1566,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1566 } 1566 }
1567 setattr_copy(inode, iattr); 1567 setattr_copy(inode, iattr);
1568 if (iattr->ia_valid & ATTR_MODE) 1568 if (iattr->ia_valid & ATTR_MODE)
1569 error = ext2_acl_chmod(inode); 1569 error = posix_acl_chmod(inode, inode->i_mode);
1570 mark_inode_dirty(inode); 1570 mark_inode_dirty(inode);
1571 1571
1572 return error; 1572 return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 256dd5f4c1c4..c268d0af1db9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
421#endif 421#endif
422 .setattr = ext2_setattr, 422 .setattr = ext2_setattr,
423 .get_acl = ext2_get_acl, 423 .get_acl = ext2_get_acl,
424 .set_acl = ext2_set_acl,
424 .tmpfile = ext2_tmpfile, 425 .tmpfile = ext2_tmpfile,
425}; 426};
426 427
@@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = {
433#endif 434#endif
434 .setattr = ext2_setattr, 435 .setattr = ext2_setattr,
435 .get_acl = ext2_get_acl, 436 .get_acl = ext2_get_acl,
437 .set_acl = ext2_set_acl,
436}; 438};
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 2d7557db3ae8..91426141c33a 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache;
103static const struct xattr_handler *ext2_xattr_handler_map[] = { 103static const struct xattr_handler *ext2_xattr_handler_map[] = {
104 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 104 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
105#ifdef CONFIG_EXT2_FS_POSIX_ACL 105#ifdef CONFIG_EXT2_FS_POSIX_ACL
106 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 106 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
107 [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
108#endif 108#endif
109 [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, 109 [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler,
110#ifdef CONFIG_EXT2_FS_SECURITY 110#ifdef CONFIG_EXT2_FS_SECURITY
@@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = {
116 &ext2_xattr_user_handler, 116 &ext2_xattr_user_handler,
117 &ext2_xattr_trusted_handler, 117 &ext2_xattr_trusted_handler,
118#ifdef CONFIG_EXT2_FS_POSIX_ACL 118#ifdef CONFIG_EXT2_FS_POSIX_ACL
119 &ext2_xattr_acl_access_handler, 119 &posix_acl_access_xattr_handler,
120 &ext2_xattr_acl_default_handler, 120 &posix_acl_default_xattr_handler,
121#endif 121#endif
122#ifdef CONFIG_EXT2_FS_SECURITY 122#ifdef CONFIG_EXT2_FS_SECURITY
123 &ext2_xattr_security_handler, 123 &ext2_xattr_security_handler,
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5e41cccff762..60edf298644e 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -57,8 +57,6 @@ struct ext2_xattr_entry {
57 57
58extern const struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern const struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern const struct xattr_handler ext2_xattr_security_handler; 60extern const struct xattr_handler ext2_xattr_security_handler;
63 61
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 62extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index dbb5ad59a7fc..8bbaf5bcf982 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type)
145 struct posix_acl *acl; 145 struct posix_acl *acl;
146 int retval; 146 int retval;
147 147
148 if (!test_opt(inode->i_sb, POSIX_ACL))
149 return NULL;
150
151 acl = get_cached_acl(inode, type);
152 if (acl != ACL_NOT_CACHED)
153 return acl;
154
155 switch (type) { 148 switch (type) {
156 case ACL_TYPE_ACCESS: 149 case ACL_TYPE_ACCESS:
157 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 150 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type)
190 * inode->i_mutex: down unless called from ext3_new_inode 183 * inode->i_mutex: down unless called from ext3_new_inode
191 */ 184 */
192static int 185static int
193ext3_set_acl(handle_t *handle, struct inode *inode, int type, 186__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
194 struct posix_acl *acl) 187 struct posix_acl *acl)
195{ 188{
196 int name_index; 189 int name_index;
@@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
198 size_t size = 0; 191 size_t size = 0;
199 int error; 192 int error;
200 193
201 if (S_ISLNK(inode->i_mode))
202 return -EOPNOTSUPP;
203
204 switch(type) { 194 switch(type) {
205 case ACL_TYPE_ACCESS: 195 case ACL_TYPE_ACCESS:
206 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 196 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
243 return error; 233 return error;
244} 234}
245 235
246/*
247 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
248 *
249 * dir->i_mutex: down
250 * inode->i_mutex: up (access to inode is still exclusive)
251 */
252int 236int
253ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 237ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
254{ 238{
255 struct posix_acl *acl = NULL;
256 int error = 0;
257
258 if (!S_ISLNK(inode->i_mode)) {
259 if (test_opt(dir->i_sb, POSIX_ACL)) {
260 acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
261 if (IS_ERR(acl))
262 return PTR_ERR(acl);
263 }
264 if (!acl)
265 inode->i_mode &= ~current_umask();
266 }
267 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
268 if (S_ISDIR(inode->i_mode)) {
269 error = ext3_set_acl(handle, inode,
270 ACL_TYPE_DEFAULT, acl);
271 if (error)
272 goto cleanup;
273 }
274 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
275 if (error < 0)
276 return error;
277
278 if (error > 0) {
279 /* This is an extended ACL */
280 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
281 }
282 }
283cleanup:
284 posix_acl_release(acl);
285 return error;
286}
287
288/*
289 * Does chmod for an inode that may have an Access Control List. The
290 * inode->i_mode field must be updated to the desired value by the caller
291 * before calling this function.
292 * Returns 0 on success, or a negative error number.
293 *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext3_acl_chmod(struct inode *inode)
-{
-	struct posix_acl *acl;
 	handle_t *handle;
-	int retries = 0;
-	int error;
+	int error, retries = 0;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return 0;
-	acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (error)
-		return error;
 retry:
-	handle = ext3_journal_start(inode,
-			EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		ext3_std_error(inode->i_sb, error);
-		goto out;
-	}
-	error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	error = __ext3_set_acl(handle, inode, type, acl);
 	ext3_journal_stop(handle);
-	if (error == -ENOSPC &&
-	    ext3_should_retry_alloc(inode->i_sb, &retries))
+	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
-	posix_acl_release(acl);
 	return error;
 }
 
 /*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext3_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
-static size_t
-ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-			   const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t
-ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int
-ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
-		   size_t size, int type)
+int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
-	struct posix_acl *acl;
+	struct posix_acl *default_acl, *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ext3_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int
-ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	handle_t *handle;
-	struct posix_acl *acl;
-	int error, retries = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else
-		acl = NULL;
-
-retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	error = ext3_set_acl(handle, inode, type, acl);
-	ext3_journal_stop(handle);
-	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
 
-release_and_out:
-	posix_acl_release(acl);
+	if (default_acl) {
+		error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
+	}
 	return error;
 }
-
-const struct xattr_handler ext3_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ext3_xattr_list_acl_access,
-	.get	= ext3_xattr_get_acl,
-	.set	= ext3_xattr_set_acl,
-};
-
-const struct xattr_handler ext3_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ext3_xattr_list_acl_default,
-	.get	= ext3_xattr_get_acl,
-	.set	= ext3_xattr_set_acl,
-};
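
A note on the ACL rework above: after this change the filesystem no longer carries its own "system.posix_acl_*" xattr handlers or chmod propagation; it only supplies ->get_acl and ->set_acl, and the shared posix_acl_access_xattr_handler / posix_acl_default_xattr_handler plus posix_acl_chmod() do the rest. A minimal sketch of the wiring for a hypothetical filesystem (the myfs_* names are illustrative, not from this commit):

	#include <linux/fs.h>
	#include <linux/posix_acl.h>
	#include <linux/posix_acl_xattr.h>
	#include <linux/xattr.h>

	static struct posix_acl *myfs_get_acl(struct inode *inode, int type);
	static int myfs_set_acl(struct inode *inode, struct posix_acl *acl,
				int type);

	/* The generic xattr handlers translate get/setxattr on
	 * "system.posix_acl_access"/"system.posix_acl_default" into the
	 * two methods below, handling caching and validation centrally. */
	const struct xattr_handler *myfs_xattr_handlers[] = {
		&posix_acl_access_xattr_handler,
		&posix_acl_default_xattr_handler,
		NULL,
	};

	const struct inode_operations myfs_file_inode_operations = {
		.get_acl = myfs_get_acl,	/* read ACL from backing store */
		.set_acl = myfs_set_acl,	/* persist ACL, journal as needed */
	};

chmod() support then comes from the setattr path calling posix_acl_chmod(inode, inode->i_mode), which rewrites the access ACL through ->set_acl; this is what replaces the per-filesystem *_acl_chmod() helpers deleted here.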
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index dbc921e458c5..ea1c69edab9e 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size)
 
 /* acl.c */
 extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
-extern int ext3_acl_chmod (struct inode *);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT3_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext3_get_acl NULL
-
-static inline int
-ext3_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
+#define ext3_set_acl NULL
 
 static inline int
 ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index bafdd48eefde..e66e4808719f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -309,43 +309,17 @@ struct fname {
  */
 static void free_rb_tree_fname(struct rb_root *root)
 {
-	struct rb_node	*n = root->rb_node;
-	struct rb_node	*parent;
-	struct fname	*fname;
-
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		fname = rb_entry(n, struct fname, rb_hash);
-		while (fname) {
-			struct fname * old = fname;
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+		do {
+			struct fname *old = fname;
 			fname = fname->next;
-			kfree (old);
-		}
-		if (!parent)
-			*root = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
-}
+			kfree(old);
+		} while (fname);
 
+	*root = RB_ROOT;
+}
 
 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
 							   loff_t pos)
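
The conversion above (repeated for ext4 further down) replaces some thirty lines of hand-rolled destructive traversal with the rbtree_postorder_for_each_entry_safe() helper. A minimal kernel-style sketch of the idiom, assuming a simple entry type (struct item is illustrative):

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct item {
		struct rb_node node;
		/* payload ... */
	};

	static void free_all(struct rb_root *root)
	{
		struct item *it, *next;

		/* Children are visited before their parent, and a node is
		 * never touched again after its entry is handed to the loop
		 * body, so kfree() here is safe. */
		rbtree_postorder_for_each_entry_safe(it, next, root, node)
			kfree(it);

		/* The iterator neither rebalances nor empties the tree. */
		*root = RB_ROOT;
	}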
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 25cb413277e9..aad05311392a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2bd85486b879..384b6ebb655f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3365,7 +3365,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 	mark_inode_dirty(inode);
 
 	if (ia_valid & ATTR_MODE)
-		rc = ext3_acl_chmod(inode);
+		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
 	ext3_std_error(inode->i_sb, error);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index f8cde46de9cd..f197736dccfa 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2569,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
@@ -2580,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 };
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index b1fc96383e08..c6874be6d58b 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache;
 static const struct xattr_handler *ext3_xattr_handler_map[] = {
 	[EXT3_XATTR_INDEX_USER]		     = &ext3_xattr_user_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext3_xattr_acl_access_handler,
-	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
 #endif
 	[EXT3_XATTR_INDEX_TRUSTED]	     = &ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_SECURITY
@@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = {
 	&ext3_xattr_user_handler,
 	&ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-	&ext3_xattr_acl_access_handler,
-	&ext3_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 #ifdef CONFIG_EXT3_FS_SECURITY
 	&ext3_xattr_security_handler,
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2be4f69bfa64..32e93ebf8031 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -60,8 +60,6 @@ struct ext3_xattr_entry {
 
 extern const struct xattr_handler ext3_xattr_user_handler;
 extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_acl_access_handler;
-extern const struct xattr_handler ext3_xattr_acl_default_handler;
 extern const struct xattr_handler ext3_xattr_security_handler;
 
 extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 39a54a0e9fe4..d40c8dbbb0d6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl;
 	int retval;
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return NULL;
-
-	acl = get_cached_acl(inode, type);
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
  * inode->i_mutex: down unless called from ext4_new_inode
  */
 static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
 	int name_index;
@@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	size_t size = 0;
 	int error;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-/*
- * Initialize the ACLs of a new inode. Called from ext4_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
 int
-ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	struct posix_acl *acl = NULL;
-	int error = 0;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (test_opt(dir->i_sb, POSIX_ACL)) {
-			acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
-	}
-	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
-		if (S_ISDIR(inode->i_mode)) {
-			error = ext4_set_acl(handle, inode,
-					     ACL_TYPE_DEFAULT, acl);
-			if (error)
-				goto cleanup;
-		}
-		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
-		if (error < 0)
-			return error;
-
-		if (error > 0) {
-			/* This is an extended ACL */
-			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
-		}
-	}
-cleanup:
-	posix_acl_release(acl);
-	return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
-	struct posix_acl *acl;
 	handle_t *handle;
-	int retries = 0;
-	int error;
-
+	int error, retries = 0;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return 0;
-	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (error)
-		return error;
 retry:
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
 				    ext4_jbd2_credits_xattr(inode));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		ext4_std_error(inode->i_sb, error);
-		goto out;
-	}
-	error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	error = __ext4_set_acl(handle, inode, type, acl);
 	ext4_journal_stop(handle);
-	if (error == -ENOSPC &&
-	    ext4_should_retry_alloc(inode->i_sb, &retries))
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
-	posix_acl_release(acl);
 	return error;
 }
 
 /*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-			   const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
-		   size_t size, int type)
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
-	struct posix_acl *acl;
+	struct posix_acl *default_acl, *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ext4_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	handle_t *handle;
-	struct posix_acl *acl;
-	int error, retries = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else
-		acl = NULL;
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
 
-retry:
-	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
-				    ext4_jbd2_credits_xattr(inode));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		goto release_and_out;
+	if (default_acl) {
+		error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
 	}
-	error = ext4_set_acl(handle, inode, type, acl);
-	ext4_journal_stop(handle);
-	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-
-release_and_out:
-	posix_acl_release(acl);
 	return error;
 }
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ext4_xattr_list_acl_access,
-	.get	= ext4_xattr_get_acl,
-	.set	= ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ext4_xattr_list_acl_default,
-	.get	= ext4_xattr_get_acl,
-	.set	= ext4_xattr_set_acl,
-};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 18cb39ed7c7b..da2c79577d72 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)
 
 /* acl.c */
 struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-extern int ext4_acl_chmod(struct inode *);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_get_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
+#define ext4_set_acl NULL
 
 static inline int
 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3f11656bd72e..41eb9dcfac7e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
 /* Called when the filesystem is unmounted */
 void ext4_release_system_zone(struct super_block *sb)
 {
-	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node;
-	struct rb_node	*parent;
-	struct ext4_system_zone	*entry;
+	struct ext4_system_zone	*entry, *n;
 
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		entry = rb_entry(n, struct ext4_system_zone, node);
+	rbtree_postorder_for_each_entry_safe(entry, n,
+			&EXT4_SB(sb)->system_blks, node)
 		kmem_cache_free(ext4_system_zone_cachep, entry);
-		if (!parent)
-			EXT4_SB(sb)->system_blks = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
+
 	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 680bb3388919..d638c57e996e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -353,41 +353,16 @@ struct fname {
  */
 static void free_rb_tree_fname(struct rb_root *root)
 {
-	struct rb_node	*n = root->rb_node;
-	struct rb_node	*parent;
-	struct fname	*fname;
-
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		fname = rb_entry(n, struct fname, rb_hash);
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
 		while (fname) {
 			struct fname *old = fname;
 			fname = fname->next;
 			kfree(old);
 		}
-		if (!parent)
-			*root = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
-}
 
+	*root = RB_ROOT;
+}
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ece55565b9cd..d3a534fdc5ff 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -771,6 +771,8 @@ do {								\
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))	\
 		(einode)->xtime.tv_sec = 			\
 			(signed)le32_to_cpu((raw_inode)->xtime); \
+	else							\
+		(einode)->xtime.tv_sec = 0;			\
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
 		ext4_decode_extra_time(&(einode)->xtime,	\
 				       raw_inode->xtime ## _extra); \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4410cc3d6ee2..74bc2d549c58 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3477,7 +3477,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	WARN_ON(map->m_lblk < ee_block);
 	/*
 	 * It is safe to convert extent to initialized via explicit
-	 * zeroout only if extent is fully insde i_size or new_size.
+	 * zeroout only if extent is fully inside i_size or new_size.
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
@@ -3906,6 +3906,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	} else
 		err = ret;
 	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
 	if (allocated > map->m_len)
 		allocated = map->m_len;
 	map->m_len = allocated;
@@ -4218,7 +4219,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
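
The last hunk fixes a thinko: cluster_offset needs the block's offset inside its bigalloc cluster, but EXT4_LBLK_CMASK yields the first block of the cluster instead. A stand-alone illustration, with the macro bodies paraphrased from fs/ext4/ext4.h (where the ratio is sbi->s_cluster_ratio, blocks per cluster):

	#include <stdio.h>

	#define LBLK_CMASK(ratio, lblk) ((lblk) & ~((unsigned)(ratio) - 1)) /* cluster start */
	#define LBLK_COFF(ratio, lblk)  ((lblk) &  ((unsigned)(ratio) - 1)) /* offset in cluster */

	int main(void)
	{
		unsigned ratio = 16, lblk = 35;	/* bigalloc: 16 blocks per cluster */

		printf("cluster start = %u\n", LBLK_CMASK(ratio, lblk)); /* 32 */
		printf("offset        = %u\n", LBLK_COFF(ratio, lblk));  /* 3  */
		return 0;
	}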
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3da21945ff1f..1a5073959f32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, pos, ret);
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
@@ -617,6 +617,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index bae987549dc3..82edf5b93352 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -849,15 +849,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	handle_t *handle;
 	struct page *page;
 	struct ext4_iloc iloc;
+	int retries;
 
 	ret = ext4_get_inode_loc(inode, &iloc);
 	if (ret)
 		return ret;
 
+retry_journal:
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		handle = NULL;
 		goto out;
 	}
 
@@ -867,7 +868,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	if (inline_size >= pos + len) {
 		ret = ext4_prepare_inline_data(handle, inode, pos + len);
 		if (ret && ret != -ENOSPC)
-			goto out;
+			goto out_journal;
 	}
 
 	if (ret == -ENOSPC) {
@@ -875,6 +876,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 						 inode,
 						 flags,
 						 fsdata);
+		ext4_journal_stop(handle);
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
 		goto out;
 	}
 
@@ -887,7 +892,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	page = grab_cache_page_write_begin(mapping, 0, flags);
 	if (!page) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_journal;
 	}
 
 	down_read(&EXT4_I(inode)->xattr_sem);
@@ -904,16 +909,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 
 	up_read(&EXT4_I(inode)->xattr_sem);
 	*pagep = page;
-	handle = NULL;
 	brelse(iloc.bh);
 	return 1;
 out_release_page:
 	up_read(&EXT4_I(inode)->xattr_sem);
 	unlock_page(page);
 	page_cache_release(page);
+out_journal:
+	ext4_journal_stop(handle);
 out:
-	if (handle)
-		ext4_journal_stop(handle);
 	brelse(iloc.bh);
 	return ret;
 }
@@ -1837,7 +1841,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
 {
 	int error;
 	struct ext4_xattr_entry *entry;
-	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc;
 
@@ -1846,7 +1849,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
 		return error;
 
 	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
 	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
 					    EXT4_I(inode)->i_inline_off);
 	if (EXT4_XATTR_LEN(entry->e_name_len) +
@@ -1924,9 +1926,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	}
 
 	/* Clear the content within i_blocks. */
-	if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
-		memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
-				EXT4_MIN_INLINE_DATA_SIZE - i_size);
+	if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+		void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+		memset(p + i_size, 0,
+		       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+	}
 
 	EXT4_I(inode)->i_inline_size = i_size <
 			EXT4_MIN_INLINE_DATA_SIZE ?
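
The ext4_da_write_inline_data_begin() changes above restructure the error paths so the journal handle is stopped exactly once on every exit, and add the standard ENOSPC retry. A kernel-style fragment of the idiom, not compilable on its own (do_prepare() is a hypothetical placeholder for the work done under the handle):

	int retries = 0;
	handle_t *handle;
	int ret;

retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = do_prepare(handle, inode);	/* hypothetical */
	if (ret) {
		ext4_journal_stop(handle);	/* never leak the handle */
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;	/* let the journal commit, retry */
		return ret;
	}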
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61d49ff22c81..24bfd7ff3049 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/aio.h>
+#include <linux/bitops.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -144,8 +145,8 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
  */
 static int ext4_inode_is_fast_symlink(struct inode *inode)
 {
 	int ea_blocks = EXT4_I(inode)->i_file_acl ?
-		(inode->i_sb->s_blocksize >> 9) : 0;
+		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
 
 	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
 }
@@ -1772,7 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,
 		ret = err;
 
 	if (!ext4_has_inline_data(inode))
-		ext4_walk_page_buffers(handle, page_bufs, 0, len,
+		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
 				       NULL, bput_one);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
@@ -3501,11 +3502,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (EXT4_SB(sb)->s_cluster_ratio > 1) {
-		/* TODO: Add support for bigalloc file systems */
-		return -EOPNOTSUPP;
-	}
-
 	trace_ext4_punch_hole(inode, offset, length);
 
 	/*
@@ -3926,18 +3922,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 void ext4_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT4_I(inode)->i_flags;
+	unsigned int new_fl = 0;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 	if (flags & EXT4_SYNC_FL)
-		inode->i_flags |= S_SYNC;
+		new_fl |= S_SYNC;
 	if (flags & EXT4_APPEND_FL)
-		inode->i_flags |= S_APPEND;
+		new_fl |= S_APPEND;
 	if (flags & EXT4_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
+		new_fl |= S_IMMUTABLE;
 	if (flags & EXT4_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
+		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
-		inode->i_flags |= S_DIRSYNC;
+		new_fl |= S_DIRSYNC;
+	set_mask_bits(&inode->i_flags,
+		      S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4586,6 +4584,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_size > sbi->s_bitmap_maxbytes)
 			return -EFBIG;
 	}
+
+	if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+		inode_inc_iversion(inode);
+
 	if (S_ISREG(inode->i_mode) &&
 	    (attr->ia_size < inode->i_size)) {
 		if (ext4_should_order_data(inode)) {
@@ -4663,7 +4665,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
-		rc = ext4_acl_chmod(inode);
+		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
 	ext4_std_error(inode->i_sb, error);
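
ext4_set_inode_flags() above is rebuilt around set_mask_bits() (hence the new <linux/bitops.h> include): the old clear-then-set sequence could race with a concurrent i_flags update and drop bits. A user-space model of the helper's semantics, assuming the kernel's cmpxchg-loop behavior (replace the masked bits atomically, return the old value):

	#include <stdatomic.h>
	#include <stdio.h>

	static unsigned long set_mask_bits(_Atomic unsigned long *p,
					   unsigned long mask, unsigned long bits)
	{
		unsigned long old = atomic_load(p), new;

		do {
			/* keep bits outside the mask, replace the rest */
			new = (old & ~mask) | bits;
		} while (!atomic_compare_exchange_weak(p, &old, new));
		return old;
	}

	int main(void)
	{
		_Atomic unsigned long flags = 0xf0;

		set_mask_bits(&flags, 0x0f, 0x05);	/* only the low nibble changes */
		printf("%#lx\n", atomic_load(&flags));	/* prints 0xf5 */
		return 0;
	}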
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 60589b60e9b0..a2a837f00407 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -101,9 +101,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle_t *handle;
 	int err;
 	struct inode *inode_bl;
-	struct ext4_inode_info *ei;
 	struct ext4_inode_info *ei_bl;
-	struct ext4_sb_info *sbi;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
 		err = -EINVAL;
@@ -115,9 +114,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		goto swap_boot_out;
 	}
 
-	sbi = EXT4_SB(sb);
-	ei = EXT4_I(inode);
-
 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
 	if (IS_ERR(inode_bl)) {
 		err = PTR_ERR(inode_bl);
@@ -144,7 +140,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
 	if (IS_ERR(handle)) {
 		err = -EINVAL;
-		goto swap_boot_out;
+		goto journal_err_out;
 	}
 
 	/* Protect extent tree against block allocations via delalloc */
@@ -202,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
+journal_err_out:
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5a0408d7b114..d050e043e884 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1425,9 +1425,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		return ERR_PTR(-EIO);
 	}
 	if (unlikely(ino == dir->i_ino)) {
-		EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
-				 dentry->d_name.len,
-				 dentry->d_name.name);
+		EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+				 dentry);
 		return ERR_PTR(-EIO);
 	}
 	inode = ext4_iget(dir->i_sb, ino);
@@ -3225,6 +3224,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 	.fiemap		= ext4_fiemap,
 };
 
@@ -3235,4 +3235,5 @@ const struct inode_operations ext4_special_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 };
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d488f80ee32d..ab95508e3d40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio)
 {
 	int i;
 	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec;
 
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct bio_vec *bvec = &bio->bi_io_vec[i];
+	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 		struct buffer_head *bh, *head;
 		unsigned bio_start = bvec->bv_offset;
@@ -298,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
 static void ext4_end_bio(struct bio *bio, int error)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
-	sector_t bi_sector = bio->bi_sector;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	BUG_ON(!io_end);
 	bio->bi_end_io = NULL;
@@ -366,7 +366,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
 	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
 	if (!bio)
 		return -ENOMEM;
-	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_end_io = ext4_end_bio;
 	bio->bi_private = ext4_get_io_end(io->io_end);
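
These page-io.c hunks are ext4's share of the 3.14 immutable-biovec conversion: completion paths stop indexing bi_io_vec by hand and use bio_for_each_segment_all(), and the current sector moves from bio->bi_sector into the iterator at bio->bi_iter.bi_sector. A kernel-style fragment of the post-conversion idiom (not a compilable unit on its own):

	#include <linux/bio.h>

	static void count_bio_bytes(struct bio *bio)
	{
		struct bio_vec *bvec;
		unsigned bytes = 0;
		int i;

		/* walks every segment of the bio, valid after completion */
		bio_for_each_segment_all(bvec, bio, i)
			bytes += bvec->bv_len;

		pr_debug("bio at sector %llu carried %u bytes\n",
			 (unsigned long long)bio->bi_iter.bi_sector, bytes);
	}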
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c5adbb318a90..f3b84cd9de56 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
 	ext4_group_t group;
 	ext4_group_t last_group;
 	unsigned overhead;
+	__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
 
 	BUG_ON(flex_gd->count == 0 || group_data == NULL);
 
@@ -266,7 +267,7 @@ next_group:
 	src_group++;
 	for (; src_group <= last_group; src_group++) {
 		overhead = ext4_group_overhead_blocks(sb, src_group);
-		if (overhead != 0)
+		if (overhead == 0)
 			last_blk += group_data[src_group - group].blocks_count;
 		else
 			break;
@@ -280,8 +281,7 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode bitmaps */
@@ -292,22 +292,30 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode tables */
 	for (; it_index < flex_gd->count; it_index++) {
-		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+		unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+		ext4_fsblk_t next_group_start;
+
+		if (start_blk + itb > last_blk)
 			goto next_group;
 		group_data[it_index].inode_table = start_blk;
-		group = ext4_get_group_number(sb, start_blk - 1);
+		group = ext4_get_group_number(sb, start_blk);
+		next_group_start = ext4_group_first_block_no(sb, group + 1);
 		group -= group_data[0].group;
-		group_data[group].free_blocks_count -=
-					EXT4_SB(sb)->s_itb_per_group;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
 
+		if (start_blk + itb > next_group_start) {
+			flex_gd->bg_flags[group + 1] &= uninit_mask;
+			overhead = start_blk + itb - next_group_start;
+			group_data[group + 1].free_blocks_count -= overhead;
+			itb -= overhead;
+		}
+
+		group_data[group].free_blocks_count -= itb;
+		flex_gd->bg_flags[group] &= uninit_mask;
 		start_blk += EXT4_SB(sb)->s_itb_per_group;
 	}
 
@@ -401,7 +409,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 		start = ext4_group_first_block_no(sb, group);
 		group -= flex_gd->groups[0].group;
 
-		count2 = sb->s_blocksize * 8 - (block - start);
+		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
 		if (count2 > count)
 			count2 = count;
 
@@ -620,7 +628,7 @@ handle_ib:
 			if (err)
 				goto out;
 			count = group_table_count[j];
-			start = group_data[i].block_bitmap;
+			start = (&group_data[i].block_bitmap)[j];
 			block = start;
 		}
 
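
A side note on the uninit_mask introduced above: computing (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0 once turns every conditional flag-clear into an unconditional AND that is a no-op when there are no flex groups. A stand-alone illustration:

	#include <assert.h>

	#define BLOCK_UNINIT 0x1

	int main(void)
	{
		unsigned short flags1 = 0x3, flags2 = 0x3;
		unsigned short keep  = ~0;		/* flexbg_size <= 1: no-op  */
		unsigned short clear = ~BLOCK_UNINIT;	/* flexbg_size  > 1: clears */

		flags1 &= keep;
		flags2 &= clear;
		assert(flags1 == 0x3 && flags2 == 0x2);
		return 0;
	}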
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1f7784de05b6..710fed2377d4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3695,16 +3695,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
-	i = le32_to_cpu(es->s_flags);
-	if (i & EXT2_FLAGS_UNSIGNED_HASH)
-		sbi->s_hash_unsigned = 3;
-	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+		i = le32_to_cpu(es->s_flags);
+		if (i & EXT2_FLAGS_UNSIGNED_HASH)
+			sbi->s_hash_unsigned = 3;
+		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
 #ifdef __CHAR_UNSIGNED__
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
-		sbi->s_hash_unsigned = 3;
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+			sbi->s_hash_unsigned = 3;
 #else
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
+		}
 	}
 
 	/* Handle clustersize */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1423c4816a47..e175e94116ac 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -95,8 +95,8 @@ static struct mb_cache *ext4_xattr_cache;
 static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
-	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SECURITY
@@ -108,8 +108,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	&ext4_xattr_acl_access_handler,
-	&ext4_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 #ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index c767dbdd7fc4..819d6398833f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find {
 
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
-extern const struct xattr_handler ext4_xattr_acl_access_handler;
-extern const struct xattr_handler ext4_xattr_acl_default_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
 
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 27a0820340b9..2e35da12d292 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_F2FS_FS) += f2fs.o
 
-f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index d0fc287efeff..fa8da4cb8c4b 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -17,9 +17,6 @@
 #include "xattr.h"
 #include "acl.h"
 
-#define get_inode_mode(i)	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
-					(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-
 static inline size_t f2fs_acl_size(int count)
 {
 	if (count <= 4) {
@@ -167,19 +164,11 @@ fail:
 
 struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
 	void *value = NULL;
 	struct posix_acl *acl;
 	int retval;
 
-	if (!test_opt(sbi, POSIX_ACL))
-		return NULL;
-
-	acl = get_cached_acl(inode, type);
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	if (type == ACL_TYPE_ACCESS)
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 
@@ -205,21 +194,15 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct inode *inode, int type,
 			struct posix_acl *acl, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
 	int error;
 
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -261,154 +244,31 @@ static int f2fs_set_acl(struct inode *inode, int type,
 	return error;
 }
 
-int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
+int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-	struct posix_acl *acl = NULL;
-	int error = 0;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (test_opt(sbi, POSIX_ACL)) {
-			acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
-	}
-
-	if (!test_opt(sbi, POSIX_ACL) || !acl)
-		goto cleanup;
-
-	if (S_ISDIR(inode->i_mode)) {
-		error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage);
-		if (error)
-			goto cleanup;
-	}
-	error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
-	if (error < 0)
-		return error;
-	if (error > 0)
-		error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage);
-cleanup:
-	posix_acl_release(acl);
-	return error;
+	return __f2fs_set_acl(inode, type, acl, NULL);
 }
 
-int f2fs_acl_chmod(struct inode *inode)
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int error;
-	umode_t mode = get_inode_mode(inode);
-
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-	if (S_ISLNK(mode))
-		return -EOPNOTSUPP;
-
-	acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
+	struct posix_acl *default_acl, *acl;
+	int error = 0;
 
-	error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
 	if (error)
 		return error;
 
-	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL);
-	posix_acl_release(acl);
-	return error;
-}
-
-static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	const char *xname = POSIX_ACL_XATTR_DEFAULT;
-	size_t size;
-
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-
-	if (type == ACL_TYPE_ACCESS)
-		xname = POSIX_ACL_XATTR_ACCESS;
-
-	size = strlen(xname) + 1;
-	if (list && size <= list_size)
-		memcpy(list, xname, size);
-	return size;
-}
-
-static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	struct posix_acl *acl;
-	int error;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(sbi, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = f2fs_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (!acl)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	struct inode *inode = dentry->d_inode;
-	struct posix_acl *acl = NULL;
-	int error;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(sbi, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else {
-		acl = NULL;
+	if (default_acl) {
+		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+				       ipage);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (error)
+			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+				       ipage);
+		posix_acl_release(acl);
 	}
 
-	error = f2fs_set_acl(inode, type, acl, NULL);
-
-release_and_out:
-	posix_acl_release(acl);
 	return error;
 }
-
-const struct xattr_handler f2fs_xattr_acl_default_handler = {
-	.prefix = POSIX_ACL_XATTR_DEFAULT,
-	.flags = ACL_TYPE_DEFAULT,
-	.list = f2fs_xattr_list_acl,
-	.get = f2fs_xattr_get_acl,
-	.set = f2fs_xattr_set_acl,
-};
-
-const struct xattr_handler f2fs_xattr_acl_access_handler = {
-	.prefix = POSIX_ACL_XATTR_ACCESS,
-	.flags = ACL_TYPE_ACCESS,
-	.list = f2fs_xattr_list_acl,
-	.get = f2fs_xattr_get_acl,
-	.set = f2fs_xattr_set_acl,
-};
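
Worth flagging in the f2fs conversion: the access-ACL branch in the new f2fs_init_acl() reads "if (error)" where the equivalent ext3 and ext4 code above uses "if (!error)", so as merged the condition looks inverted relative to its siblings. The shared contract all three filesystems now rely on is posix_acl_create(): it applies the umask or the parent directory's default ACL to i_mode and hands back both ACLs for the caller to store and release. A minimal sketch of that contract (myfs_* and __myfs_set_acl are hypothetical, mirroring the ext3/ext4 form):

	#include <linux/posix_acl.h>

	static int __myfs_set_acl(struct inode *inode, int type,
				  struct posix_acl *acl);

	int myfs_init_acl(struct inode *inode, struct inode *dir)
	{
		struct posix_acl *default_acl, *acl;
		int error;

		/* adjusts inode->i_mode and returns 0, 1 or 2 ACL objects */
		error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
		if (error)
			return error;

		if (default_acl) {	/* inherited by new children */
			error = __myfs_set_acl(inode, ACL_TYPE_DEFAULT,
					       default_acl);
			posix_acl_release(default_acl);
		}
		if (acl) {
			if (!error)	/* ext3/ext4 use this polarity */
				error = __myfs_set_acl(inode, ACL_TYPE_ACCESS,
						       acl);
			posix_acl_release(acl);
		}
		return error;
	}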
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 49633131e038..e0864651cdc1 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -37,18 +37,13 @@ struct f2fs_acl_header {
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 
 extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_acl_chmod(struct inode *);
+extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
 #else
 #define f2fs_check_acl	NULL
 #define f2fs_get_acl	NULL
 #define f2fs_set_acl	NULL
 
-static inline int f2fs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
 static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
 						struct page *page)
 {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5716e5eb4e8e..293d0486a40f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab;
30 */ 30 */
31struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) 31struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
32{ 32{
33 struct address_space *mapping = sbi->meta_inode->i_mapping; 33 struct address_space *mapping = META_MAPPING(sbi);
34 struct page *page = NULL; 34 struct page *page = NULL;
35repeat: 35repeat:
36 page = grab_cache_page(mapping, index); 36 page = grab_cache_page(mapping, index);
@@ -50,7 +50,7 @@ repeat:
50 */ 50 */
51struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) 51struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
52{ 52{
53 struct address_space *mapping = sbi->meta_inode->i_mapping; 53 struct address_space *mapping = META_MAPPING(sbi);
54 struct page *page; 54 struct page *page;
55repeat: 55repeat:
56 page = grab_cache_page(mapping, index); 56 page = grab_cache_page(mapping, index);
@@ -61,11 +61,12 @@ repeat:
61 if (PageUptodate(page)) 61 if (PageUptodate(page))
62 goto out; 62 goto out;
63 63
64 if (f2fs_readpage(sbi, page, index, READ_SYNC)) 64 if (f2fs_submit_page_bio(sbi, page, index,
65 READ_SYNC | REQ_META | REQ_PRIO))
65 goto repeat; 66 goto repeat;
66 67
67 lock_page(page); 68 lock_page(page);
68 if (page->mapping != mapping) { 69 if (unlikely(page->mapping != mapping)) {
69 f2fs_put_page(page, 1); 70 f2fs_put_page(page, 1);
70 goto repeat; 71 goto repeat;
71 } 72 }
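The lock_page()/page->mapping recheck in this hunk is the standard page-cache revalidation idiom: after sleeping, the page may have been truncated out of the mapping. A distilled sketch using only generic calls plus f2fs_put_page() (sketch_read_page is a made-up name):

	static struct page *sketch_read_page(struct address_space *mapping,
						pgoff_t index)
	{
		struct page *page;
	repeat:
		page = grab_cache_page(mapping, index);	/* returned locked */
		if (!page)
			return ERR_PTR(-ENOMEM);
		if (PageUptodate(page))
			return page;
		/* submit a read bio here; the end_io handler unlocks the page */
		lock_page(page);		/* wait for the read to finish */
		if (unlikely(page->mapping != mapping)) {
			/* truncated while we slept: drop it and start over */
			f2fs_put_page(page, 1);
			goto repeat;
		}
		return page;
	}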
@@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page,
81 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 82 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
82 83
 83 /* Should not write any meta pages if any IO error occurred */ 84 /* Should not write any meta pages if any IO error occurred */
84 if (wbc->for_reclaim || sbi->por_doing || 85 if (unlikely(sbi->por_doing ||
85 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { 86 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
86 dec_page_count(sbi, F2FS_DIRTY_META); 87 goto redirty_out;
87 wbc->pages_skipped++; 88
88 set_page_dirty(page); 89 if (wbc->for_reclaim)
89 return AOP_WRITEPAGE_ACTIVATE; 90 goto redirty_out;
90 }
91 91
92 wait_on_page_writeback(page); 92 wait_on_page_writeback(page);
93 93
@@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page,
95 dec_page_count(sbi, F2FS_DIRTY_META); 95 dec_page_count(sbi, F2FS_DIRTY_META);
96 unlock_page(page); 96 unlock_page(page);
97 return 0; 97 return 0;
98
99redirty_out:
100 dec_page_count(sbi, F2FS_DIRTY_META);
101 wbc->pages_skipped++;
102 set_page_dirty(page);
103 return AOP_WRITEPAGE_ACTIVATE;
98} 104}
99 105
100static int f2fs_write_meta_pages(struct address_space *mapping, 106static int f2fs_write_meta_pages(struct address_space *mapping,
101 struct writeback_control *wbc) 107 struct writeback_control *wbc)
102{ 108{
103 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 109 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
104 struct block_device *bdev = sbi->sb->s_bdev; 110 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
105 long written; 111 long written;
106 112
107 if (wbc->for_kupdate) 113 if (wbc->for_kupdate)
108 return 0; 114 return 0;
109 115
 110 if (get_pages(sbi, F2FS_DIRTY_META) == 0) 116 /* collect a number of dirty meta pages and write them together */
117 if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
111 return 0; 118 return 0;
112 119
 113 /* if mounting failed, skip writing node pages */ 120 /* if mounting failed, skip writing node pages */
114 mutex_lock(&sbi->cp_mutex); 121 mutex_lock(&sbi->cp_mutex);
115 written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); 122 written = sync_meta_pages(sbi, META, nrpages);
116 mutex_unlock(&sbi->cp_mutex); 123 mutex_unlock(&sbi->cp_mutex);
117 wbc->nr_to_write -= written; 124 wbc->nr_to_write -= written;
118 return 0; 125 return 0;
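The rewrite above also changes when meta writeback kicks in: rather than writing as soon as anything is dirty, it waits until at least one full bio's worth of pages has accumulated (MAX_BIO_BLOCKS and max_hw_blocks are existing f2fs helpers). The policy in isolation:

	/* defer writeback until one bio can be filled, so meta pages
	 * reach disk in large merged bios instead of singletons */
	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));

	if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages)
		return 0;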
@@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
121long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, 128long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
122 long nr_to_write) 129 long nr_to_write)
123{ 130{
124 struct address_space *mapping = sbi->meta_inode->i_mapping; 131 struct address_space *mapping = META_MAPPING(sbi);
125 pgoff_t index = 0, end = LONG_MAX; 132 pgoff_t index = 0, end = LONG_MAX;
126 struct pagevec pvec; 133 struct pagevec pvec;
127 long nwritten = 0; 134 long nwritten = 0;
@@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
136 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 143 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
137 PAGECACHE_TAG_DIRTY, 144 PAGECACHE_TAG_DIRTY,
138 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 145 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
139 if (nr_pages == 0) 146 if (unlikely(nr_pages == 0))
140 break; 147 break;
141 148
142 for (i = 0; i < nr_pages; i++) { 149 for (i = 0; i < nr_pages; i++) {
@@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
149 unlock_page(page); 156 unlock_page(page);
150 break; 157 break;
151 } 158 }
152 if (nwritten++ >= nr_to_write) 159 nwritten++;
160 if (unlikely(nwritten >= nr_to_write))
153 break; 161 break;
154 } 162 }
155 pagevec_release(&pvec); 163 pagevec_release(&pvec);
@@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
157 } 165 }
158 166
159 if (nwritten) 167 if (nwritten)
160 f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); 168 f2fs_submit_merged_bio(sbi, type, WRITE);
161 169
162 return nwritten; 170 return nwritten;
163} 171}
@@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = {
186 194
187int acquire_orphan_inode(struct f2fs_sb_info *sbi) 195int acquire_orphan_inode(struct f2fs_sb_info *sbi)
188{ 196{
189 unsigned int max_orphans;
190 int err = 0; 197 int err = 0;
191 198
192 /* 199 spin_lock(&sbi->orphan_inode_lock);
193 * considering 512 blocks in a segment 5 blocks are needed for cp 200 if (unlikely(sbi->n_orphans >= sbi->max_orphans))
194 * and log segment summaries. Remaining blocks are used to keep
195 * orphan entries with the limitation one reserved segment
196 * for cp pack we can have max 1020*507 orphan entries
197 */
198 max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
199 mutex_lock(&sbi->orphan_inode_mutex);
200 if (sbi->n_orphans >= max_orphans)
201 err = -ENOSPC; 201 err = -ENOSPC;
202 else 202 else
203 sbi->n_orphans++; 203 sbi->n_orphans++;
204 mutex_unlock(&sbi->orphan_inode_mutex); 204 spin_unlock(&sbi->orphan_inode_lock);
205
205 return err; 206 return err;
206} 207}
207 208
208void release_orphan_inode(struct f2fs_sb_info *sbi) 209void release_orphan_inode(struct f2fs_sb_info *sbi)
209{ 210{
210 mutex_lock(&sbi->orphan_inode_mutex); 211 spin_lock(&sbi->orphan_inode_lock);
211 f2fs_bug_on(sbi->n_orphans == 0); 212 f2fs_bug_on(sbi->n_orphans == 0);
212 sbi->n_orphans--; 213 sbi->n_orphans--;
213 mutex_unlock(&sbi->orphan_inode_mutex); 214 spin_unlock(&sbi->orphan_inode_lock);
214} 215}
215 216
216void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 217void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
218 struct list_head *head, *this; 219 struct list_head *head, *this;
219 struct orphan_inode_entry *new = NULL, *orphan = NULL; 220 struct orphan_inode_entry *new = NULL, *orphan = NULL;
220 221
221 mutex_lock(&sbi->orphan_inode_mutex); 222 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
223 new->ino = ino;
224
225 spin_lock(&sbi->orphan_inode_lock);
222 head = &sbi->orphan_inode_list; 226 head = &sbi->orphan_inode_list;
223 list_for_each(this, head) { 227 list_for_each(this, head) {
224 orphan = list_entry(this, struct orphan_inode_entry, list); 228 orphan = list_entry(this, struct orphan_inode_entry, list);
225 if (orphan->ino == ino) 229 if (orphan->ino == ino) {
226 goto out; 230 spin_unlock(&sbi->orphan_inode_lock);
231 kmem_cache_free(orphan_entry_slab, new);
232 return;
233 }
234
227 if (orphan->ino > ino) 235 if (orphan->ino > ino)
228 break; 236 break;
229 orphan = NULL; 237 orphan = NULL;
230 } 238 }
231 239
232 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
233 new->ino = ino;
234
235 /* add new_oentry into list which is sorted by inode number */ 240 /* add new_oentry into list which is sorted by inode number */
236 if (orphan) 241 if (orphan)
237 list_add(&new->list, this->prev); 242 list_add(&new->list, this->prev);
238 else 243 else
239 list_add_tail(&new->list, head); 244 list_add_tail(&new->list, head);
240out: 245 spin_unlock(&sbi->orphan_inode_lock);
241 mutex_unlock(&sbi->orphan_inode_mutex);
242} 246}
243 247
244void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 248void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
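With orphan_inode_mutex replaced by a spinlock, add_orphan_inode() now allocates its entry before taking the lock: GFP_ATOMIC itself would not sleep, but doing the allocation outside keeps the critical section short, and a duplicate found during the scan is simply freed again. The shape of the pattern, using the names from the hunk above:

	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
	new->ino = ino;

	spin_lock(&sbi->orphan_inode_lock);
	list_for_each_entry(orphan, &sbi->orphan_inode_list, list) {
		if (orphan->ino == ino) {
			/* already recorded: unlock, then discard ours */
			spin_unlock(&sbi->orphan_inode_lock);
			kmem_cache_free(orphan_entry_slab, new);
			return;
		}
	}
	/* otherwise insert 'new' at its sorted position, as above */
	spin_unlock(&sbi->orphan_inode_lock);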
@@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
246 struct list_head *head; 250 struct list_head *head;
247 struct orphan_inode_entry *orphan; 251 struct orphan_inode_entry *orphan;
248 252
249 mutex_lock(&sbi->orphan_inode_mutex); 253 spin_lock(&sbi->orphan_inode_lock);
250 head = &sbi->orphan_inode_list; 254 head = &sbi->orphan_inode_list;
251 list_for_each_entry(orphan, head, list) { 255 list_for_each_entry(orphan, head, list) {
252 if (orphan->ino == ino) { 256 if (orphan->ino == ino) {
@@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
257 break; 261 break;
258 } 262 }
259 } 263 }
260 mutex_unlock(&sbi->orphan_inode_mutex); 264 spin_unlock(&sbi->orphan_inode_lock);
261} 265}
262 266
263static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 267static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
270 iput(inode); 274 iput(inode);
271} 275}
272 276
273int recover_orphan_inodes(struct f2fs_sb_info *sbi) 277void recover_orphan_inodes(struct f2fs_sb_info *sbi)
274{ 278{
275 block_t start_blk, orphan_blkaddr, i, j; 279 block_t start_blk, orphan_blkaddr, i, j;
276 280
277 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) 281 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
278 return 0; 282 return;
279 283
280 sbi->por_doing = true; 284 sbi->por_doing = true;
281 start_blk = __start_cp_addr(sbi) + 1; 285 start_blk = __start_cp_addr(sbi) + 1;
@@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
295 /* clear Orphan Flag */ 299 /* clear Orphan Flag */
296 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); 300 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
297 sbi->por_doing = false; 301 sbi->por_doing = false;
298 return 0; 302 return;
299} 303}
300 304
301static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) 305static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
302{ 306{
303 struct list_head *head, *this, *next; 307 struct list_head *head;
304 struct f2fs_orphan_block *orphan_blk = NULL; 308 struct f2fs_orphan_block *orphan_blk = NULL;
305 struct page *page = NULL;
306 unsigned int nentries = 0; 309 unsigned int nentries = 0;
307 unsigned short index = 1; 310 unsigned short index;
308 unsigned short orphan_blocks; 311 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
309
310 orphan_blocks = (unsigned short)((sbi->n_orphans +
311 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 312 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
313 struct page *page = NULL;
314 struct orphan_inode_entry *orphan = NULL;
315
316 for (index = 0; index < orphan_blocks; index++)
317 grab_meta_page(sbi, start_blk + index);
312 318
313 mutex_lock(&sbi->orphan_inode_mutex); 319 index = 1;
320 spin_lock(&sbi->orphan_inode_lock);
314 head = &sbi->orphan_inode_list; 321 head = &sbi->orphan_inode_list;
315 322
 316 /* loop over each orphan inode entry and write them into the journal block */ 323 /* loop over each orphan inode entry and write them into the journal block */
317 list_for_each_safe(this, next, head) { 324 list_for_each_entry(orphan, head, list) {
318 struct orphan_inode_entry *orphan; 325 if (!page) {
326 page = find_get_page(META_MAPPING(sbi), start_blk++);
327 f2fs_bug_on(!page);
328 orphan_blk =
329 (struct f2fs_orphan_block *)page_address(page);
330 memset(orphan_blk, 0, sizeof(*orphan_blk));
331 f2fs_put_page(page, 0);
332 }
319 333
320 orphan = list_entry(this, struct orphan_inode_entry, list); 334 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
321 335
322 if (nentries == F2FS_ORPHANS_PER_BLOCK) { 336 if (nentries == F2FS_ORPHANS_PER_BLOCK) {
323 /* 337 /*
@@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
331 set_page_dirty(page); 345 set_page_dirty(page);
332 f2fs_put_page(page, 1); 346 f2fs_put_page(page, 1);
333 index++; 347 index++;
334 start_blk++;
335 nentries = 0; 348 nentries = 0;
336 page = NULL; 349 page = NULL;
337 } 350 }
338 if (page) 351 }
339 goto page_exist;
340 352
341 page = grab_meta_page(sbi, start_blk); 353 if (page) {
342 orphan_blk = (struct f2fs_orphan_block *)page_address(page); 354 orphan_blk->blk_addr = cpu_to_le16(index);
343 memset(orphan_blk, 0, sizeof(*orphan_blk)); 355 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
344page_exist: 356 orphan_blk->entry_count = cpu_to_le32(nentries);
345 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); 357 set_page_dirty(page);
358 f2fs_put_page(page, 1);
346 } 359 }
347 if (!page)
348 goto end;
349 360
350 orphan_blk->blk_addr = cpu_to_le16(index); 361 spin_unlock(&sbi->orphan_inode_lock);
351 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
352 orphan_blk->entry_count = cpu_to_le32(nentries);
353 set_page_dirty(page);
354 f2fs_put_page(page, 1);
355end:
356 mutex_unlock(&sbi->orphan_inode_mutex);
357} 362}
358 363
359static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, 364static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
428 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); 433 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
429 434
430 /* The second checkpoint pack should start at the next segment */ 435 /* The second checkpoint pack should start at the next segment */
431 cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); 436 cp_start_blk_no += ((unsigned long long)1) <<
437 le32_to_cpu(fsb->log_blocks_per_seg);
432 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); 438 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
433 439
434 if (cp1 && cp2) { 440 if (cp1 && cp2) {
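The cast added in this hunk matters because the constant 1 has type int, so the shift would otherwise be performed in 32-bit arithmetic and only widened to block_t afterwards. A worked note:

	/*
	 * cp_start_blk_no += 1 << n;
	 *	32-bit shift; undefined behaviour once n >= 31, and the
	 *	result is truncated before the 64-bit addition.
	 *
	 * cp_start_blk_no += ((unsigned long long)1) << n;
	 *	the shift itself happens in 64 bits, safe for any
	 *	on-disk log_blocks_per_seg value.
	 *
	 * For the common log_blocks_per_seg == 9 (2MB segments of 4KB
	 * blocks) both forms give 512, so the bug only bites on hostile
	 * or corrupted superblock values.
	 */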
@@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
465 list_for_each(this, head) { 471 list_for_each(this, head) {
466 struct dir_inode_entry *entry; 472 struct dir_inode_entry *entry;
467 entry = list_entry(this, struct dir_inode_entry, list); 473 entry = list_entry(this, struct dir_inode_entry, list);
468 if (entry->inode == inode) 474 if (unlikely(entry->inode == inode))
469 return -EEXIST; 475 return -EEXIST;
470 } 476 }
471 list_add_tail(&new->list, head); 477 list_add_tail(&new->list, head);
@@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode)
513void remove_dirty_dir_inode(struct inode *inode) 519void remove_dirty_dir_inode(struct inode *inode)
514{ 520{
515 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 521 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
516 struct list_head *head = &sbi->dir_inode_list; 522
517 struct list_head *this; 523 struct list_head *this, *head;
518 524
519 if (!S_ISDIR(inode->i_mode)) 525 if (!S_ISDIR(inode->i_mode))
520 return; 526 return;
@@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode)
525 return; 531 return;
526 } 532 }
527 533
534 head = &sbi->dir_inode_list;
528 list_for_each(this, head) { 535 list_for_each(this, head) {
529 struct dir_inode_entry *entry; 536 struct dir_inode_entry *entry;
530 entry = list_entry(this, struct dir_inode_entry, list); 537 entry = list_entry(this, struct dir_inode_entry, list);
@@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode)
546 553
547struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) 554struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
548{ 555{
549 struct list_head *head = &sbi->dir_inode_list; 556
550 struct list_head *this; 557 struct list_head *this, *head;
551 struct inode *inode = NULL; 558 struct inode *inode = NULL;
552 559
553 spin_lock(&sbi->dir_inode_lock); 560 spin_lock(&sbi->dir_inode_lock);
561
562 head = &sbi->dir_inode_list;
554 list_for_each(this, head) { 563 list_for_each(this, head) {
555 struct dir_inode_entry *entry; 564 struct dir_inode_entry *entry;
556 entry = list_entry(this, struct dir_inode_entry, list); 565 entry = list_entry(this, struct dir_inode_entry, list);
@@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
565 574
566void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 575void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
567{ 576{
568 struct list_head *head = &sbi->dir_inode_list; 577 struct list_head *head;
569 struct dir_inode_entry *entry; 578 struct dir_inode_entry *entry;
570 struct inode *inode; 579 struct inode *inode;
571retry: 580retry:
572 spin_lock(&sbi->dir_inode_lock); 581 spin_lock(&sbi->dir_inode_lock);
582
583 head = &sbi->dir_inode_list;
573 if (list_empty(head)) { 584 if (list_empty(head)) {
574 spin_unlock(&sbi->dir_inode_lock); 585 spin_unlock(&sbi->dir_inode_lock);
575 return; 586 return;
@@ -585,7 +596,7 @@ retry:
 585 * We should submit the bio, since several dentry pages 596 * We should submit the bio, since several dentry pages
 586 * under writeback exist in the inode being freed. 597 * under writeback exist in the inode being freed.
587 */ 598 */
588 f2fs_submit_bio(sbi, DATA, true); 599 f2fs_submit_merged_bio(sbi, DATA, WRITE);
589 } 600 }
590 goto retry; 601 goto retry;
591} 602}
@@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
760 /* wait for previous submitted node/meta pages writeback */ 771 /* wait for previous submitted node/meta pages writeback */
761 wait_on_all_pages_writeback(sbi); 772 wait_on_all_pages_writeback(sbi);
762 773
763 filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); 774 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
764 filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); 775 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
765 776
766 /* update user_block_counts */ 777 /* update user_block_counts */
767 sbi->last_valid_block_count = sbi->total_valid_block_count; 778 sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
770 /* Here, we only have one bio having CP pack */ 781 /* Here, we only have one bio having CP pack */
771 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 782 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
772 783
773 if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 784 if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
774 clear_prefree_segments(sbi); 785 clear_prefree_segments(sbi);
775 F2FS_RESET_SB_DIRT(sbi); 786 F2FS_RESET_SB_DIRT(sbi);
776 } 787 }
@@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
791 802
792 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); 803 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
793 804
794 f2fs_submit_bio(sbi, DATA, true); 805 f2fs_submit_merged_bio(sbi, DATA, WRITE);
795 f2fs_submit_bio(sbi, NODE, true); 806 f2fs_submit_merged_bio(sbi, NODE, WRITE);
796 f2fs_submit_bio(sbi, META, true); 807 f2fs_submit_merged_bio(sbi, META, WRITE);
797 808
798 /* 809 /*
799 * update checkpoint pack index 810 * update checkpoint pack index
@@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
818 829
819void init_orphan_info(struct f2fs_sb_info *sbi) 830void init_orphan_info(struct f2fs_sb_info *sbi)
820{ 831{
821 mutex_init(&sbi->orphan_inode_mutex); 832 spin_lock_init(&sbi->orphan_inode_lock);
822 INIT_LIST_HEAD(&sbi->orphan_inode_list); 833 INIT_LIST_HEAD(&sbi->orphan_inode_list);
823 sbi->n_orphans = 0; 834 sbi->n_orphans = 0;
835 /*
 836 * Considering 512 blocks in a segment, 8 blocks are needed for cp
 837 * and log segment summaries. The remaining blocks are used to keep
 838 * orphan entries. With the limitation of one reserved segment
 839 * for the cp pack, we can have at most 1020 * 504 orphan entries.
840 */
841 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
842 * F2FS_ORPHANS_PER_BLOCK;
824} 843}
825 844
826int __init create_checkpoint_caches(void) 845int __init create_checkpoint_caches(void)
827{ 846{
828 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 847 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
829 sizeof(struct orphan_inode_entry), NULL); 848 sizeof(struct orphan_inode_entry), NULL);
830 if (unlikely(!orphan_entry_slab)) 849 if (!orphan_entry_slab)
831 return -ENOMEM; 850 return -ENOMEM;
832 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 851 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
833 sizeof(struct dir_inode_entry), NULL); 852 sizeof(struct dir_inode_entry), NULL);
834 if (unlikely(!inode_entry_slab)) { 853 if (!inode_entry_slab) {
835 kmem_cache_destroy(orphan_entry_slab); 854 kmem_cache_destroy(orphan_entry_slab);
836 return -ENOMEM; 855 return -ENOMEM;
837 } 856 }
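For concreteness, the max_orphans bound set in init_orphan_info() works out as follows, assuming 4KB blocks (F2FS_ORPHANS_PER_BLOCK == 1020), 512 blocks per segment, and NR_CURSEG_TYPE == 6 (three data plus three node logs):

	/*
	 * orphan blocks = blocks_per_seg - 2 (cp pack blocks)
	 *				 - NR_CURSEG_TYPE (summary blocks)
	 *		 = 512 - 2 - 6 = 504
	 * max_orphans	 = 504 * 1020 = 514080 entries
	 */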
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3438c571fa..2261ccdd0b5f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,188 @@
24#include "segment.h" 24#include "segment.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static void f2fs_read_end_io(struct bio *bio, int err)
28{
29 struct bio_vec *bvec;
30 int i;
31
32 bio_for_each_segment_all(bvec, bio, i) {
33 struct page *page = bvec->bv_page;
34
35 if (!err) {
36 SetPageUptodate(page);
37 } else {
38 ClearPageUptodate(page);
39 SetPageError(page);
40 }
41 unlock_page(page);
42 }
43 bio_put(bio);
44}
45
46static void f2fs_write_end_io(struct bio *bio, int err)
47{
48 struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
49 struct bio_vec *bvec;
50 int i;
51
52 bio_for_each_segment_all(bvec, bio, i) {
53 struct page *page = bvec->bv_page;
54
55 if (unlikely(err)) {
56 SetPageError(page);
57 set_bit(AS_EIO, &page->mapping->flags);
58 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
59 sbi->sb->s_flags |= MS_RDONLY;
60 }
61 end_page_writeback(page);
62 dec_page_count(sbi, F2FS_WRITEBACK);
63 }
64
65 if (bio->bi_private)
66 complete(bio->bi_private);
67
68 if (!get_pages(sbi, F2FS_WRITEBACK) &&
69 !list_empty(&sbi->cp_wait.task_list))
70 wake_up(&sbi->cp_wait);
71
72 bio_put(bio);
73}
74
75/*
76 * Low-level block read/write IO operations.
77 */
78static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
79 int npages, bool is_read)
80{
81 struct bio *bio;
82
83 /* No failure on bio allocation */
84 bio = bio_alloc(GFP_NOIO, npages);
85
86 bio->bi_bdev = sbi->sb->s_bdev;
87 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
88 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
89
90 return bio;
91}
92
93static void __submit_merged_bio(struct f2fs_bio_info *io)
94{
95 struct f2fs_io_info *fio = &io->fio;
96 int rw;
97
98 if (!io->bio)
99 return;
100
101 rw = fio->rw;
102
103 if (is_read_io(rw)) {
104 trace_f2fs_submit_read_bio(io->sbi->sb, rw,
105 fio->type, io->bio);
106 submit_bio(rw, io->bio);
107 } else {
108 trace_f2fs_submit_write_bio(io->sbi->sb, rw,
109 fio->type, io->bio);
110 /*
111 * META_FLUSH is only from the checkpoint procedure, and we
 112 * should wait for this metadata bio to complete for FS consistency.
113 */
114 if (fio->type == META_FLUSH) {
115 DECLARE_COMPLETION_ONSTACK(wait);
116 io->bio->bi_private = &wait;
117 submit_bio(rw, io->bio);
118 wait_for_completion(&wait);
119 } else {
120 submit_bio(rw, io->bio);
121 }
122 }
123
124 io->bio = NULL;
125}
126
127void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
128 enum page_type type, int rw)
129{
130 enum page_type btype = PAGE_TYPE_OF_BIO(type);
131 struct f2fs_bio_info *io;
132
133 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
134
135 mutex_lock(&io->io_mutex);
136
137 /* change META to META_FLUSH in the checkpoint procedure */
138 if (type >= META_FLUSH) {
139 io->fio.type = META_FLUSH;
140 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
141 }
142 __submit_merged_bio(io);
143 mutex_unlock(&io->io_mutex);
144}
145
146/*
147 * Fill the locked page with data located in the block address.
148 * Return unlocked page.
149 */
150int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
151 block_t blk_addr, int rw)
152{
153 struct bio *bio;
154
155 trace_f2fs_submit_page_bio(page, blk_addr, rw);
156
157 /* Allocate a new bio */
158 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
159
160 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
161 bio_put(bio);
162 f2fs_put_page(page, 1);
163 return -EFAULT;
164 }
165
166 submit_bio(rw, bio);
167 return 0;
168}
169
170void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
171 block_t blk_addr, struct f2fs_io_info *fio)
172{
173 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
174 struct f2fs_bio_info *io;
175 bool is_read = is_read_io(fio->rw);
176
177 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
178
179 verify_block_addr(sbi, blk_addr);
180
181 mutex_lock(&io->io_mutex);
182
183 if (!is_read)
184 inc_page_count(sbi, F2FS_WRITEBACK);
185
186 if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
187 io->fio.rw != fio->rw))
188 __submit_merged_bio(io);
189alloc_new:
190 if (io->bio == NULL) {
191 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
192
193 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
194 io->fio = *fio;
195 }
196
197 if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
198 PAGE_CACHE_SIZE) {
199 __submit_merged_bio(io);
200 goto alloc_new;
201 }
202
203 io->last_block_in_bio = blk_addr;
204
205 mutex_unlock(&io->io_mutex);
206 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
207}
208
27/* 209/*
28 * Lock ordering for the change of data block address: 210 * Lock ordering for the change of data block address:
29 * ->data_page 211 * ->data_page
@@ -37,7 +219,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
37 struct page *node_page = dn->node_page; 219 struct page *node_page = dn->node_page;
38 unsigned int ofs_in_node = dn->ofs_in_node; 220 unsigned int ofs_in_node = dn->ofs_in_node;
39 221
40 f2fs_wait_on_page_writeback(node_page, NODE, false); 222 f2fs_wait_on_page_writeback(node_page, NODE);
41 223
42 rn = F2FS_NODE(node_page); 224 rn = F2FS_NODE(node_page);
43 225
@@ -51,19 +233,39 @@ int reserve_new_block(struct dnode_of_data *dn)
51{ 233{
52 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 234 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
53 235
54 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 236 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
55 return -EPERM; 237 return -EPERM;
56 if (!inc_valid_block_count(sbi, dn->inode, 1)) 238 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
57 return -ENOSPC; 239 return -ENOSPC;
58 240
59 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 241 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
60 242
61 __set_data_blkaddr(dn, NEW_ADDR); 243 __set_data_blkaddr(dn, NEW_ADDR);
62 dn->data_blkaddr = NEW_ADDR; 244 dn->data_blkaddr = NEW_ADDR;
245 mark_inode_dirty(dn->inode);
63 sync_inode_page(dn); 246 sync_inode_page(dn);
64 return 0; 247 return 0;
65} 248}
66 249
250int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
251{
252 bool need_put = dn->inode_page ? false : true;
253 int err;
254
255 /* if inode_page exists, index should be zero */
256 f2fs_bug_on(!need_put && index);
257
258 err = get_dnode_of_data(dn, index, ALLOC_NODE);
259 if (err)
260 return err;
261
262 if (dn->data_blkaddr == NULL_ADDR)
263 err = reserve_new_block(dn);
264 if (err || need_put)
265 f2fs_put_dnode(dn);
266 return err;
267}
268
67static int check_extent_cache(struct inode *inode, pgoff_t pgofs, 269static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 270 struct buffer_head *bh_result)
69{ 271{
@@ -71,6 +273,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
71 pgoff_t start_fofs, end_fofs; 273 pgoff_t start_fofs, end_fofs;
72 block_t start_blkaddr; 274 block_t start_blkaddr;
73 275
276 if (is_inode_flag_set(fi, FI_NO_EXTENT))
277 return 0;
278
74 read_lock(&fi->ext.ext_lock); 279 read_lock(&fi->ext.ext_lock);
75 if (fi->ext.len == 0) { 280 if (fi->ext.len == 0) {
76 read_unlock(&fi->ext.ext_lock); 281 read_unlock(&fi->ext.ext_lock);
@@ -109,6 +314,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
109 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 314 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
110 pgoff_t fofs, start_fofs, end_fofs; 315 pgoff_t fofs, start_fofs, end_fofs;
111 block_t start_blkaddr, end_blkaddr; 316 block_t start_blkaddr, end_blkaddr;
317 int need_update = true;
112 318
113 f2fs_bug_on(blk_addr == NEW_ADDR); 319 f2fs_bug_on(blk_addr == NEW_ADDR);
114 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 320 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -117,6 +323,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
117 /* Update the page address in the parent node */ 323 /* Update the page address in the parent node */
118 __set_data_blkaddr(dn, blk_addr); 324 __set_data_blkaddr(dn, blk_addr);
119 325
326 if (is_inode_flag_set(fi, FI_NO_EXTENT))
327 return;
328
120 write_lock(&fi->ext.ext_lock); 329 write_lock(&fi->ext.ext_lock);
121 330
122 start_fofs = fi->ext.fofs; 331 start_fofs = fi->ext.fofs;
@@ -163,14 +372,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
163 fofs - start_fofs + 1; 372 fofs - start_fofs + 1;
164 fi->ext.len -= fofs - start_fofs + 1; 373 fi->ext.len -= fofs - start_fofs + 1;
165 } 374 }
166 goto end_update; 375 } else {
376 need_update = false;
167 } 377 }
168 write_unlock(&fi->ext.ext_lock);
169 return;
170 378
379 /* Finally, if the extent is very fragmented, let's drop the cache. */
380 if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
381 fi->ext.len = 0;
382 set_inode_flag(fi, FI_NO_EXTENT);
383 need_update = true;
384 }
171end_update: 385end_update:
172 write_unlock(&fi->ext.ext_lock); 386 write_unlock(&fi->ext.ext_lock);
173 sync_inode_page(dn); 387 if (need_update)
388 sync_inode_page(dn);
389 return;
174} 390}
175 391
176struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 392struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -196,7 +412,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
196 return ERR_PTR(-ENOENT); 412 return ERR_PTR(-ENOENT);
197 413
198 /* By fallocate(), there is no cached page, but with NEW_ADDR */ 414 /* By fallocate(), there is no cached page, but with NEW_ADDR */
199 if (dn.data_blkaddr == NEW_ADDR) 415 if (unlikely(dn.data_blkaddr == NEW_ADDR))
200 return ERR_PTR(-EINVAL); 416 return ERR_PTR(-EINVAL);
201 417
202 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); 418 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -208,11 +424,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
208 return page; 424 return page;
209 } 425 }
210 426
211 err = f2fs_readpage(sbi, page, dn.data_blkaddr, 427 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
212 sync ? READ_SYNC : READA); 428 sync ? READ_SYNC : READA);
429 if (err)
430 return ERR_PTR(err);
431
213 if (sync) { 432 if (sync) {
214 wait_on_page_locked(page); 433 wait_on_page_locked(page);
215 if (!PageUptodate(page)) { 434 if (unlikely(!PageUptodate(page))) {
216 f2fs_put_page(page, 0); 435 f2fs_put_page(page, 0);
217 return ERR_PTR(-EIO); 436 return ERR_PTR(-EIO);
218 } 437 }
@@ -246,7 +465,7 @@ repeat:
246 } 465 }
247 f2fs_put_dnode(&dn); 466 f2fs_put_dnode(&dn);
248 467
249 if (dn.data_blkaddr == NULL_ADDR) { 468 if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
250 f2fs_put_page(page, 1); 469 f2fs_put_page(page, 1);
251 return ERR_PTR(-ENOENT); 470 return ERR_PTR(-ENOENT);
252 } 471 }
@@ -266,16 +485,16 @@ repeat:
266 return page; 485 return page;
267 } 486 }
268 487
269 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 488 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
270 if (err) 489 if (err)
271 return ERR_PTR(err); 490 return ERR_PTR(err);
272 491
273 lock_page(page); 492 lock_page(page);
274 if (!PageUptodate(page)) { 493 if (unlikely(!PageUptodate(page))) {
275 f2fs_put_page(page, 1); 494 f2fs_put_page(page, 1);
276 return ERR_PTR(-EIO); 495 return ERR_PTR(-EIO);
277 } 496 }
278 if (page->mapping != mapping) { 497 if (unlikely(page->mapping != mapping)) {
279 f2fs_put_page(page, 1); 498 f2fs_put_page(page, 1);
280 goto repeat; 499 goto repeat;
281 } 500 }
@@ -286,12 +505,12 @@ repeat:
286 * Caller ensures that this data page is never allocated. 505 * Caller ensures that this data page is never allocated.
287 * A new zero-filled data page is allocated in the page cache. 506 * A new zero-filled data page is allocated in the page cache.
288 * 507 *
 289 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 508 * Also, caller should grab and release an rwsem by calling f2fs_lock_op() and
 290 * mutex_unlock_op(). 509 * f2fs_unlock_op().
 291 * Note that, npage is set only by make_empty_dir. 510 * Note that ipage is set only by make_empty_dir.
292 */ 511 */
293struct page *get_new_data_page(struct inode *inode, 512struct page *get_new_data_page(struct inode *inode,
294 struct page *npage, pgoff_t index, bool new_i_size) 513 struct page *ipage, pgoff_t index, bool new_i_size)
295{ 514{
296 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 515 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
297 struct address_space *mapping = inode->i_mapping; 516 struct address_space *mapping = inode->i_mapping;
@@ -299,24 +518,16 @@ struct page *get_new_data_page(struct inode *inode,
299 struct dnode_of_data dn; 518 struct dnode_of_data dn;
300 int err; 519 int err;
301 520
302 set_new_dnode(&dn, inode, npage, npage, 0); 521 set_new_dnode(&dn, inode, ipage, NULL, 0);
303 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 522 err = f2fs_reserve_block(&dn, index);
304 if (err) 523 if (err)
305 return ERR_PTR(err); 524 return ERR_PTR(err);
306
307 if (dn.data_blkaddr == NULL_ADDR) {
308 if (reserve_new_block(&dn)) {
309 if (!npage)
310 f2fs_put_dnode(&dn);
311 return ERR_PTR(-ENOSPC);
312 }
313 }
314 if (!npage)
315 f2fs_put_dnode(&dn);
316repeat: 525repeat:
317 page = grab_cache_page(mapping, index); 526 page = grab_cache_page(mapping, index);
318 if (!page) 527 if (!page) {
319 return ERR_PTR(-ENOMEM); 528 err = -ENOMEM;
529 goto put_err;
530 }
320 531
321 if (PageUptodate(page)) 532 if (PageUptodate(page))
322 return page; 533 return page;
@@ -325,15 +536,18 @@ repeat:
325 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 536 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
326 SetPageUptodate(page); 537 SetPageUptodate(page);
327 } else { 538 } else {
328 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 539 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
540 READ_SYNC);
329 if (err) 541 if (err)
330 return ERR_PTR(err); 542 goto put_err;
543
331 lock_page(page); 544 lock_page(page);
332 if (!PageUptodate(page)) { 545 if (unlikely(!PageUptodate(page))) {
333 f2fs_put_page(page, 1); 546 f2fs_put_page(page, 1);
334 return ERR_PTR(-EIO); 547 err = -EIO;
548 goto put_err;
335 } 549 }
336 if (page->mapping != mapping) { 550 if (unlikely(page->mapping != mapping)) {
337 f2fs_put_page(page, 1); 551 f2fs_put_page(page, 1);
338 goto repeat; 552 goto repeat;
339 } 553 }
@@ -344,140 +558,187 @@ repeat:
344 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 558 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
345 /* Only the directory inode sets new_i_size */ 559 /* Only the directory inode sets new_i_size */
346 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); 560 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
347 mark_inode_dirty_sync(inode);
348 } 561 }
349 return page; 562 return page;
350}
351 563
352static void read_end_io(struct bio *bio, int err) 564put_err:
353{ 565 f2fs_put_dnode(&dn);
354 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 566 return ERR_PTR(err);
355 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
356
357 do {
358 struct page *page = bvec->bv_page;
359
360 if (--bvec >= bio->bi_io_vec)
361 prefetchw(&bvec->bv_page->flags);
362
363 if (uptodate) {
364 SetPageUptodate(page);
365 } else {
366 ClearPageUptodate(page);
367 SetPageError(page);
368 }
369 unlock_page(page);
370 } while (bvec >= bio->bi_io_vec);
371 bio_put(bio);
372} 567}
373 568
374/* 569static int __allocate_data_block(struct dnode_of_data *dn)
375 * Fill the locked page with data located in the block address.
376 * Return unlocked page.
377 */
378int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
379 block_t blk_addr, int type)
380{ 570{
381 struct block_device *bdev = sbi->sb->s_bdev; 571 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
382 struct bio *bio; 572 struct f2fs_summary sum;
573 block_t new_blkaddr;
574 struct node_info ni;
575 int type;
383 576
384 trace_f2fs_readpage(page, blk_addr, type); 577 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
578 return -EPERM;
579 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
580 return -ENOSPC;
385 581
386 down_read(&sbi->bio_sem); 582 __set_data_blkaddr(dn, NEW_ADDR);
583 dn->data_blkaddr = NEW_ADDR;
387 584
388 /* Allocate a new bio */ 585 get_node_info(sbi, dn->nid, &ni);
389 bio = f2fs_bio_alloc(bdev, 1); 586 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
390 587
391 /* Initialize the bio */ 588 type = CURSEG_WARM_DATA;
392 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
393 bio->bi_end_io = read_end_io;
394 589
395 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 590 allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
396 bio_put(bio); 591
397 up_read(&sbi->bio_sem); 592 /* direct IO doesn't use extent cache to maximize the performance */
398 f2fs_put_page(page, 1); 593 set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
399 return -EFAULT; 594 update_extent_cache(new_blkaddr, dn);
400 } 595 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
401 596
402 submit_bio(type, bio); 597 dn->data_blkaddr = new_blkaddr;
403 up_read(&sbi->bio_sem);
404 return 0; 598 return 0;
405} 599}
406 600
407/* 601/*
 408 * This function should be used by the data read flow only where it 602 * get_data_block() now supports readahead/bmap/rw direct_IO with mapped bh.
409 * does not check the "create" flag that indicates block allocation. 603 * If original data blocks are allocated, then give them to blockdev.
410 * The reason for this special functionality is to exploit VFS readahead 604 * Otherwise,
411 * mechanism. 605 * a. preallocate requested block addresses
606 * b. do not use extent cache for better performance
607 * c. give the block addresses to blockdev
412 */ 608 */
413static int get_data_block_ro(struct inode *inode, sector_t iblock, 609static int get_data_block(struct inode *inode, sector_t iblock,
414 struct buffer_head *bh_result, int create) 610 struct buffer_head *bh_result, int create)
415{ 611{
612 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
416 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 613 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
417 unsigned maxblocks = bh_result->b_size >> blkbits; 614 unsigned maxblocks = bh_result->b_size >> blkbits;
418 struct dnode_of_data dn; 615 struct dnode_of_data dn;
419 pgoff_t pgofs; 616 int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
420 int err; 617 pgoff_t pgofs, end_offset;
618 int err = 0, ofs = 1;
619 bool allocated = false;
421 620
422 /* Get the page offset from the block offset(iblock) */ 621 /* Get the page offset from the block offset(iblock) */
423 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); 622 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
424 623
425 if (check_extent_cache(inode, pgofs, bh_result)) { 624 if (check_extent_cache(inode, pgofs, bh_result))
426 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 625 goto out;
427 return 0; 626
428 } 627 if (create)
628 f2fs_lock_op(sbi);
429 629
430 /* When reading holes, we need its node page */ 630 /* When reading holes, we need its node page */
431 set_new_dnode(&dn, inode, NULL, NULL, 0); 631 set_new_dnode(&dn, inode, NULL, NULL, 0);
432 err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); 632 err = get_dnode_of_data(&dn, pgofs, mode);
433 if (err) { 633 if (err) {
434 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 634 if (err == -ENOENT)
435 return (err == -ENOENT) ? 0 : err; 635 err = 0;
636 goto unlock_out;
637 }
638 if (dn.data_blkaddr == NEW_ADDR)
639 goto put_out;
640
641 if (dn.data_blkaddr != NULL_ADDR) {
642 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
643 } else if (create) {
644 err = __allocate_data_block(&dn);
645 if (err)
646 goto put_out;
647 allocated = true;
648 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
649 } else {
650 goto put_out;
436 } 651 }
437 652
438 /* It does not support data allocation */ 653 end_offset = IS_INODE(dn.node_page) ?
439 f2fs_bug_on(create); 654 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
655 bh_result->b_size = (((size_t)1) << blkbits);
656 dn.ofs_in_node++;
657 pgofs++;
658
659get_next:
660 if (dn.ofs_in_node >= end_offset) {
661 if (allocated)
662 sync_inode_page(&dn);
663 allocated = false;
664 f2fs_put_dnode(&dn);
440 665
441 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { 666 set_new_dnode(&dn, inode, NULL, NULL, 0);
442 int i; 667 err = get_dnode_of_data(&dn, pgofs, mode);
443 unsigned int end_offset; 668 if (err) {
669 if (err == -ENOENT)
670 err = 0;
671 goto unlock_out;
672 }
673 if (dn.data_blkaddr == NEW_ADDR)
674 goto put_out;
444 675
445 end_offset = IS_INODE(dn.node_page) ? 676 end_offset = IS_INODE(dn.node_page) ?
446 ADDRS_PER_INODE(F2FS_I(inode)) : 677 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
447 ADDRS_PER_BLOCK; 678 }
448
449 clear_buffer_new(bh_result);
450 679
680 if (maxblocks > (bh_result->b_size >> blkbits)) {
681 block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
682 if (blkaddr == NULL_ADDR && create) {
683 err = __allocate_data_block(&dn);
684 if (err)
685 goto sync_out;
686 allocated = true;
687 blkaddr = dn.data_blkaddr;
688 }
451 /* Give more consecutive addresses for the read ahead */ 689 /* Give more consecutive addresses for the read ahead */
452 for (i = 0; i < end_offset - dn.ofs_in_node; i++) 690 if (blkaddr == (bh_result->b_blocknr + ofs)) {
453 if (((datablock_addr(dn.node_page, 691 ofs++;
454 dn.ofs_in_node + i)) 692 dn.ofs_in_node++;
455 != (dn.data_blkaddr + i)) || maxblocks == i) 693 pgofs++;
456 break; 694 bh_result->b_size += (((size_t)1) << blkbits);
457 map_bh(bh_result, inode->i_sb, dn.data_blkaddr); 695 goto get_next;
458 bh_result->b_size = (i << blkbits); 696 }
459 } 697 }
698sync_out:
699 if (allocated)
700 sync_inode_page(&dn);
701put_out:
460 f2fs_put_dnode(&dn); 702 f2fs_put_dnode(&dn);
461 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 703unlock_out:
462 return 0; 704 if (create)
705 f2fs_unlock_op(sbi);
706out:
707 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
708 return err;
463} 709}
464 710
465static int f2fs_read_data_page(struct file *file, struct page *page) 711static int f2fs_read_data_page(struct file *file, struct page *page)
466{ 712{
467 return mpage_readpage(page, get_data_block_ro); 713 struct inode *inode = page->mapping->host;
714 int ret;
715
 716 /* If the file has inline data, try to read it directly */
717 if (f2fs_has_inline_data(inode))
718 ret = f2fs_read_inline_data(inode, page);
719 else
720 ret = mpage_readpage(page, get_data_block);
721
722 return ret;
468} 723}
469 724
470static int f2fs_read_data_pages(struct file *file, 725static int f2fs_read_data_pages(struct file *file,
471 struct address_space *mapping, 726 struct address_space *mapping,
472 struct list_head *pages, unsigned nr_pages) 727 struct list_head *pages, unsigned nr_pages)
473{ 728{
474 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); 729 struct inode *inode = file->f_mapping->host;
730
731 /* If the file has inline data, skip readpages */
732 if (f2fs_has_inline_data(inode))
733 return 0;
734
735 return mpage_readpages(mapping, pages, nr_pages, get_data_block);
475} 736}
476 737
477int do_write_data_page(struct page *page) 738int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
478{ 739{
479 struct inode *inode = page->mapping->host; 740 struct inode *inode = page->mapping->host;
480 block_t old_blk_addr, new_blk_addr; 741 block_t old_blkaddr, new_blkaddr;
481 struct dnode_of_data dn; 742 struct dnode_of_data dn;
482 int err = 0; 743 int err = 0;
483 744
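get_data_block() now follows the usual get_block_t contract that mpage_readpages() and blockdev_direct_IO() depend on: bh_result->b_size carries the maximum span the caller wants mapped and comes back as the span actually mapped, so one contiguous extent turns into one bio. A caller's-eye sketch (the block numbers are made up):

	/*
	 * Map up to 16 file blocks starting at block 100, where blocks
	 * 100-103 happen to be contiguous on disk:
	 *
	 *	bh.b_size = 16 << blkbits;
	 *	get_data_block(inode, 100, &bh, 0);
	 *
	 * On return: buffer_mapped(&bh) is true, bh.b_blocknr is the
	 * starting device block, and bh.b_size == 4 << blkbits.  The
	 * caller issues one bio for those four blocks, then calls
	 * again at block 104 for the remainder.
	 */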
@@ -486,10 +747,10 @@ int do_write_data_page(struct page *page)
486 if (err) 747 if (err)
487 return err; 748 return err;
488 749
489 old_blk_addr = dn.data_blkaddr; 750 old_blkaddr = dn.data_blkaddr;
490 751
491 /* This page is already truncated */ 752 /* This page is already truncated */
492 if (old_blk_addr == NULL_ADDR) 753 if (old_blkaddr == NULL_ADDR)
493 goto out_writepage; 754 goto out_writepage;
494 755
495 set_page_writeback(page); 756 set_page_writeback(page);
@@ -498,15 +759,13 @@ int do_write_data_page(struct page *page)
498 * If current allocation needs SSR, 759 * If current allocation needs SSR,
 499 * it is better to do in-place writes for updated data. 760 * it is better to do in-place writes for updated data.
500 */ 761 */
501 if (unlikely(old_blk_addr != NEW_ADDR && 762 if (unlikely(old_blkaddr != NEW_ADDR &&
502 !is_cold_data(page) && 763 !is_cold_data(page) &&
503 need_inplace_update(inode))) { 764 need_inplace_update(inode))) {
504 rewrite_data_page(F2FS_SB(inode->i_sb), page, 765 rewrite_data_page(page, old_blkaddr, fio);
505 old_blk_addr);
506 } else { 766 } else {
507 write_data_page(inode, page, &dn, 767 write_data_page(page, &dn, &new_blkaddr, fio);
508 old_blk_addr, &new_blk_addr); 768 update_extent_cache(new_blkaddr, &dn);
509 update_extent_cache(new_blk_addr, &dn);
510 } 769 }
511out_writepage: 770out_writepage:
512 f2fs_put_dnode(&dn); 771 f2fs_put_dnode(&dn);
@@ -521,9 +780,13 @@ static int f2fs_write_data_page(struct page *page,
521 loff_t i_size = i_size_read(inode); 780 loff_t i_size = i_size_read(inode);
522 const pgoff_t end_index = ((unsigned long long) i_size) 781 const pgoff_t end_index = ((unsigned long long) i_size)
523 >> PAGE_CACHE_SHIFT; 782 >> PAGE_CACHE_SHIFT;
524 unsigned offset; 783 unsigned offset = 0;
525 bool need_balance_fs = false; 784 bool need_balance_fs = false;
526 int err = 0; 785 int err = 0;
786 struct f2fs_io_info fio = {
787 .type = DATA,
788 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
789 };
527 790
528 if (page->index < end_index) 791 if (page->index < end_index)
529 goto write; 792 goto write;
@@ -543,7 +806,7 @@ static int f2fs_write_data_page(struct page *page,
543 806
544 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 807 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
545write: 808write:
546 if (sbi->por_doing) { 809 if (unlikely(sbi->por_doing)) {
547 err = AOP_WRITEPAGE_ACTIVATE; 810 err = AOP_WRITEPAGE_ACTIVATE;
548 goto redirty_out; 811 goto redirty_out;
549 } 812 }
@@ -552,10 +815,18 @@ write:
552 if (S_ISDIR(inode->i_mode)) { 815 if (S_ISDIR(inode->i_mode)) {
553 dec_page_count(sbi, F2FS_DIRTY_DENTS); 816 dec_page_count(sbi, F2FS_DIRTY_DENTS);
554 inode_dec_dirty_dents(inode); 817 inode_dec_dirty_dents(inode);
555 err = do_write_data_page(page); 818 err = do_write_data_page(page, &fio);
556 } else { 819 } else {
557 f2fs_lock_op(sbi); 820 f2fs_lock_op(sbi);
558 err = do_write_data_page(page); 821
822 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
823 err = f2fs_write_inline_data(inode, page, offset);
824 f2fs_unlock_op(sbi);
825 goto out;
826 } else {
827 err = do_write_data_page(page, &fio);
828 }
829
559 f2fs_unlock_op(sbi); 830 f2fs_unlock_op(sbi);
560 need_balance_fs = true; 831 need_balance_fs = true;
561 } 832 }
@@ -564,8 +835,10 @@ write:
564 else if (err) 835 else if (err)
565 goto redirty_out; 836 goto redirty_out;
566 837
567 if (wbc->for_reclaim) 838 if (wbc->for_reclaim) {
568 f2fs_submit_bio(sbi, DATA, true); 839 f2fs_submit_merged_bio(sbi, DATA, WRITE);
840 need_balance_fs = false;
841 }
569 842
570 clear_cold_data(page); 843 clear_cold_data(page);
571out: 844out:
@@ -617,7 +890,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
617 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 890 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
618 if (locked) 891 if (locked)
619 mutex_unlock(&sbi->writepages); 892 mutex_unlock(&sbi->writepages);
620 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); 893
894 f2fs_submit_merged_bio(sbi, DATA, WRITE);
621 895
622 remove_dirty_dir_inode(inode); 896 remove_dirty_dir_inode(inode);
623 897
@@ -638,27 +912,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
638 912
639 f2fs_balance_fs(sbi); 913 f2fs_balance_fs(sbi);
640repeat: 914repeat:
915 err = f2fs_convert_inline_data(inode, pos + len);
916 if (err)
917 return err;
918
641 page = grab_cache_page_write_begin(mapping, index, flags); 919 page = grab_cache_page_write_begin(mapping, index, flags);
642 if (!page) 920 if (!page)
643 return -ENOMEM; 921 return -ENOMEM;
644 *pagep = page; 922 *pagep = page;
645 923
646 f2fs_lock_op(sbi); 924 if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
925 goto inline_data;
647 926
927 f2fs_lock_op(sbi);
648 set_new_dnode(&dn, inode, NULL, NULL, 0); 928 set_new_dnode(&dn, inode, NULL, NULL, 0);
649 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 929 err = f2fs_reserve_block(&dn, index);
650 if (err)
651 goto err;
652
653 if (dn.data_blkaddr == NULL_ADDR)
654 err = reserve_new_block(&dn);
655
656 f2fs_put_dnode(&dn);
657 if (err)
658 goto err;
659
660 f2fs_unlock_op(sbi); 930 f2fs_unlock_op(sbi);
661 931
932 if (err) {
933 f2fs_put_page(page, 1);
934 return err;
935 }
936inline_data:
662 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) 937 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
663 return 0; 938 return 0;
664 939
@@ -674,15 +949,19 @@ repeat:
674 if (dn.data_blkaddr == NEW_ADDR) { 949 if (dn.data_blkaddr == NEW_ADDR) {
675 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 950 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
676 } else { 951 } else {
677 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 952 if (f2fs_has_inline_data(inode))
953 err = f2fs_read_inline_data(inode, page);
954 else
955 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
956 READ_SYNC);
678 if (err) 957 if (err)
679 return err; 958 return err;
680 lock_page(page); 959 lock_page(page);
681 if (!PageUptodate(page)) { 960 if (unlikely(!PageUptodate(page))) {
682 f2fs_put_page(page, 1); 961 f2fs_put_page(page, 1);
683 return -EIO; 962 return -EIO;
684 } 963 }
685 if (page->mapping != mapping) { 964 if (unlikely(page->mapping != mapping)) {
686 f2fs_put_page(page, 1); 965 f2fs_put_page(page, 1);
687 goto repeat; 966 goto repeat;
688 } 967 }
@@ -691,11 +970,6 @@ out:
691 SetPageUptodate(page); 970 SetPageUptodate(page);
692 clear_cold_data(page); 971 clear_cold_data(page);
693 return 0; 972 return 0;
694
695err:
696 f2fs_unlock_op(sbi);
697 f2fs_put_page(page, 1);
698 return err;
699} 973}
700 974
701static int f2fs_write_end(struct file *file, 975static int f2fs_write_end(struct file *file,
@@ -714,23 +988,43 @@ static int f2fs_write_end(struct file *file,
714 update_inode_page(inode); 988 update_inode_page(inode);
715 } 989 }
716 990
717 unlock_page(page); 991 f2fs_put_page(page, 1);
718 page_cache_release(page);
719 return copied; 992 return copied;
720} 993}
721 994
995static int check_direct_IO(struct inode *inode, int rw,
996 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
997{
998 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
999 int i;
1000
1001 if (rw == READ)
1002 return 0;
1003
1004 if (offset & blocksize_mask)
1005 return -EINVAL;
1006
1007 for (i = 0; i < nr_segs; i++)
1008 if (iov[i].iov_len & blocksize_mask)
1009 return -EINVAL;
1010 return 0;
1011}
1012
722static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 1013static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
723 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1014 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
724{ 1015{
725 struct file *file = iocb->ki_filp; 1016 struct file *file = iocb->ki_filp;
726 struct inode *inode = file->f_mapping->host; 1017 struct inode *inode = file->f_mapping->host;
727 1018
728 if (rw == WRITE) 1019 /* Let buffer I/O handle the inline data case. */
1020 if (f2fs_has_inline_data(inode))
1021 return 0;
1022
1023 if (check_direct_IO(inode, rw, iov, offset, nr_segs))
729 return 0; 1024 return 0;
730 1025
731 /* Needs synchronization with the cleaner */
732 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1026 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
733 get_data_block_ro); 1027 get_data_block);
734} 1028}
735 1029
736static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1030static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
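check_direct_IO() enforces alignment only for writes; reads always pass, and a rejected write makes f2fs_direct_IO() return 0 so the request quietly falls back to buffered IO. A worked example assuming 4KB blocks (blocksize_mask == 0xfff):

	/*
	 * write, offset 8192, iov_len 4096 -> both masked values are 0,
	 *					real direct IO proceeds
	 * write, offset 8200, iov_len 4096 -> 8200 & 0xfff == 8, the check
	 *					fails and f2fs_direct_IO()
	 *					returns 0 (buffered fallback)
	 * any read			    -> allowed through unchanged
	 */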
@@ -759,6 +1053,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
759 trace_f2fs_set_page_dirty(page, DATA); 1053 trace_f2fs_set_page_dirty(page, DATA);
760 1054
761 SetPageUptodate(page); 1055 SetPageUptodate(page);
1056 mark_inode_dirty(inode);
1057
762 if (!PageDirty(page)) { 1058 if (!PageDirty(page)) {
763 __set_page_dirty_nobuffers(page); 1059 __set_page_dirty_nobuffers(page);
764 set_dirty_dir_page(inode, page); 1060 set_dirty_dir_page(inode, page);
@@ -769,7 +1065,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
769 1065
770static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) 1066static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
771{ 1067{
772 return generic_block_bmap(mapping, block, get_data_block_ro); 1068 return generic_block_bmap(mapping, block, get_data_block);
773} 1069}
774 1070
775const struct address_space_operations f2fs_dblock_aops = { 1071const struct address_space_operations f2fs_dblock_aops = {
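Putting the new data.c primitives together: a writer fills an f2fs_io_info describing the IO class, hands pages to f2fs_submit_page_mbio() (which keeps appending to the per-type bio while block addresses stay consecutive), and flushes whatever is still pending with f2fs_submit_merged_bio(). A usage sketch with the names from this diff:

	struct f2fs_io_info fio = {
		.type = DATA,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
	};

	/* queued, and merged with the previous page when contiguous */
	f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);

	/* later, e.g. at the end of ->writepages: flush the partial bio */
	f2fs_submit_merged_bio(sbi, DATA, WRITE);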
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a84b0a8e6854..3de9d20d0c14 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -24,7 +24,7 @@
24#include "gc.h" 24#include "gc.h"
25 25
26static LIST_HEAD(f2fs_stat_list); 26static LIST_HEAD(f2fs_stat_list);
27static struct dentry *debugfs_root; 27static struct dentry *f2fs_debugfs_root;
28static DEFINE_MUTEX(f2fs_stat_mutex); 28static DEFINE_MUTEX(f2fs_stat_mutex);
29 29
30static void update_general_status(struct f2fs_sb_info *sbi) 30static void update_general_status(struct f2fs_sb_info *sbi)
@@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
45 si->valid_count = valid_user_blocks(sbi); 45 si->valid_count = valid_user_blocks(sbi);
46 si->valid_node_count = valid_node_count(sbi); 46 si->valid_node_count = valid_node_count(sbi);
47 si->valid_inode_count = valid_inode_count(sbi); 47 si->valid_inode_count = valid_inode_count(sbi);
48 si->inline_inode = sbi->inline_inode;
48 si->utilization = utilization(sbi); 49 si->utilization = utilization(sbi);
49 50
50 si->free_segs = free_segments(sbi); 51 si->free_segs = free_segments(sbi);
51 si->free_secs = free_sections(sbi); 52 si->free_secs = free_sections(sbi);
52 si->prefree_count = prefree_segments(sbi); 53 si->prefree_count = prefree_segments(sbi);
53 si->dirty_count = dirty_segments(sbi); 54 si->dirty_count = dirty_segments(sbi);
54 si->node_pages = sbi->node_inode->i_mapping->nrpages; 55 si->node_pages = NODE_MAPPING(sbi)->nrpages;
55 si->meta_pages = sbi->meta_inode->i_mapping->nrpages; 56 si->meta_pages = META_MAPPING(sbi)->nrpages;
56 si->nats = NM_I(sbi)->nat_cnt; 57 si->nats = NM_I(sbi)->nat_cnt;
57 si->sits = SIT_I(sbi)->dirty_sentries; 58 si->sits = SIT_I(sbi)->dirty_sentries;
58 si->fnids = NM_I(sbi)->fcnt; 59 si->fnids = NM_I(sbi)->fcnt;
@@ -165,9 +166,9 @@ get_cache:
165 /* free nids */ 166 /* free nids */
166 si->cache_mem = NM_I(sbi)->fcnt; 167 si->cache_mem = NM_I(sbi)->fcnt;
167 si->cache_mem += NM_I(sbi)->nat_cnt; 168 si->cache_mem += NM_I(sbi)->nat_cnt;
168 npages = sbi->node_inode->i_mapping->nrpages; 169 npages = NODE_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 170 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 npages = sbi->meta_inode->i_mapping->nrpages; 171 npages = META_MAPPING(sbi)->nrpages;
171 si->cache_mem += npages << PAGE_CACHE_SHIFT; 172 si->cache_mem += npages << PAGE_CACHE_SHIFT;
172 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 173 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
173 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 174 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
@@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v)
200 seq_printf(s, "Other: %u)\n - Data: %u\n", 201 seq_printf(s, "Other: %u)\n - Data: %u\n",
201 si->valid_node_count - si->valid_inode_count, 202 si->valid_node_count - si->valid_inode_count,
202 si->valid_count - si->valid_node_count); 203 si->valid_count - si->valid_node_count);
204 seq_printf(s, " - Inline_data Inode: %u\n",
205 si->inline_inode);
203 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", 206 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
204 si->main_area_segs, si->main_area_sections, 207 si->main_area_segs, si->main_area_sections,
205 si->main_area_zones); 208 si->main_area_zones);
@@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v)
242 seq_printf(s, " - node blocks : %d\n", si->node_blks); 245 seq_printf(s, " - node blocks : %d\n", si->node_blks);
243 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", 246 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
244 si->hit_ext, si->total_ext); 247 si->hit_ext, si->total_ext);
245 seq_printf(s, "\nBalancing F2FS Async:\n"); 248 seq_puts(s, "\nBalancing F2FS Async:\n");
246 seq_printf(s, " - nodes %4d in %4d\n", 249 seq_printf(s, " - nodes: %4d in %4d\n",
247 si->ndirty_node, si->node_pages); 250 si->ndirty_node, si->node_pages);
248 seq_printf(s, " - dents %4d in dirs:%4d\n", 251 seq_printf(s, " - dents: %4d in dirs:%4d\n",
249 si->ndirty_dent, si->ndirty_dirs); 252 si->ndirty_dent, si->ndirty_dirs);
250 seq_printf(s, " - meta %4d in %4d\n", 253 seq_printf(s, " - meta: %4d in %4d\n",
251 si->ndirty_meta, si->meta_pages); 254 si->ndirty_meta, si->meta_pages);
252 seq_printf(s, " - NATs %5d > %lu\n", 255 seq_printf(s, " - NATs: %5d > %lu\n",
253 si->nats, NM_WOUT_THRESHOLD); 256 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 257 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids); 258 si->sits, si->fnids);
@@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
340 343
341void __init f2fs_create_root_stats(void) 344void __init f2fs_create_root_stats(void)
342{ 345{
343 debugfs_root = debugfs_create_dir("f2fs", NULL); 346 struct dentry *file;
344 if (debugfs_root) 347
345 debugfs_create_file("status", S_IRUGO, debugfs_root, 348 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
346 NULL, &stat_fops); 349 if (!f2fs_debugfs_root)
350 goto bail;
351
352 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
353 NULL, &stat_fops);
354 if (!file)
355 goto free_debugfs_dir;
356
357 return;
358
359free_debugfs_dir:
360 debugfs_remove(f2fs_debugfs_root);
361
362bail:
363 f2fs_debugfs_root = NULL;
364 return;
347} 365}
348 366
349void f2fs_destroy_root_stats(void) 367void f2fs_destroy_root_stats(void)
350{ 368{
351 debugfs_remove_recursive(debugfs_root); 369 if (!f2fs_debugfs_root)
352 debugfs_root = NULL; 370 return;
371
372 debugfs_remove_recursive(f2fs_debugfs_root);
373 f2fs_debugfs_root = NULL;
353} 374}
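Taken together, the debug.c hunks above rename the root dentry to the less collision-prone f2fs_debugfs_root, unwind when the status file cannot be created, and make teardown a no-op when setup never completed. The same create/unwind/teardown pattern, consolidated into one kernel-context sketch (the myfs_* names and stat_fops are placeholders, not f2fs symbols):

        static struct dentry *myfs_debugfs_root;

        void __init myfs_create_root_stats(void)
        {
                struct dentry *file;

                myfs_debugfs_root = debugfs_create_dir("myfs", NULL);
                if (!myfs_debugfs_root)
                        return;

                file = debugfs_create_file("status", S_IRUGO, myfs_debugfs_root,
                                NULL, &stat_fops);
                if (!file) {
                        /* partial failure: remove what was created */
                        debugfs_remove(myfs_debugfs_root);
                        myfs_debugfs_root = NULL;
                }
        }

        void myfs_destroy_root_stats(void)
        {
                /* a NULL root means setup failed or never ran */
                if (!myfs_debugfs_root)
                        return;
                debugfs_remove_recursive(myfs_debugfs_root);
                myfs_debugfs_root = NULL;
        }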
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 594fc1bb64ef..2b7c255bcbdf 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
190 unsigned int max_depth; 190 unsigned int max_depth;
191 unsigned int level; 191 unsigned int level;
192 192
193 if (namelen > F2FS_NAME_LEN)
194 return NULL;
195
196 if (npages == 0) 193 if (npages == 0)
197 return NULL; 194 return NULL;
198 195
@@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
259 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 256 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
260 mark_inode_dirty(dir); 257 mark_inode_dirty(dir);
261 258
262 /* update parent inode number before releasing dentry page */
263 F2FS_I(inode)->i_pino = dir->i_ino;
264
265 f2fs_put_page(page, 1); 259 f2fs_put_page(page, 1);
266} 260}
267 261
268static void init_dent_inode(const struct qstr *name, struct page *ipage) 262static void init_dent_inode(const struct qstr *name, struct page *ipage)
269{ 263{
270 struct f2fs_node *rn; 264 struct f2fs_inode *ri;
271 265
272 /* copy name info. to this inode page */ 266 /* copy name info. to this inode page */
273 rn = F2FS_NODE(ipage); 267 ri = F2FS_INODE(ipage);
274 rn->i.i_namelen = cpu_to_le32(name->len); 268 ri->i_namelen = cpu_to_le32(name->len);
275 memcpy(rn->i.i_name, name->name, name->len); 269 memcpy(ri->i_name, name->name, name->len);
276 set_page_dirty(ipage); 270 set_page_dirty(ipage);
277} 271}
278 272
@@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode,
348 342
349 err = f2fs_init_acl(inode, dir, page); 343 err = f2fs_init_acl(inode, dir, page);
350 if (err) 344 if (err)
351 goto error; 345 goto put_error;
352 346
353 err = f2fs_init_security(inode, dir, name, page); 347 err = f2fs_init_security(inode, dir, name, page);
354 if (err) 348 if (err)
355 goto error; 349 goto put_error;
356 350
357 wait_on_page_writeback(page); 351 wait_on_page_writeback(page);
358 } else { 352 } else {
@@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode,
376 } 370 }
377 return page; 371 return page;
378 372
379error: 373put_error:
380 f2fs_put_page(page, 1); 374 f2fs_put_page(page, 1);
375error:
381 remove_inode_page(inode); 376 remove_inode_page(inode);
382 return ERR_PTR(err); 377 return ERR_PTR(err);
383} 378}
@@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
393 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 388 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
394 } 389 }
395 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 390 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
391 mark_inode_dirty(dir);
392
396 if (F2FS_I(dir)->i_current_depth != current_depth) { 393 if (F2FS_I(dir)->i_current_depth != current_depth) {
397 F2FS_I(dir)->i_current_depth = current_depth; 394 F2FS_I(dir)->i_current_depth = current_depth;
398 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 395 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
400 397
401 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) 398 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
402 update_inode_page(dir); 399 update_inode_page(dir);
403 else
404 mark_inode_dirty(dir);
405 400
406 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) 401 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
407 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 402 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
@@ -432,10 +427,11 @@ next:
432} 427}
433 428
434/* 429/*
435 * Caller should grab and release a mutex by calling mutex_lock_op() and 430 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
436 * mutex_unlock_op(). 431 * f2fs_unlock_op().
437 */ 432 */
438int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) 433int __f2fs_add_link(struct inode *dir, const struct qstr *name,
434 struct inode *inode)
439{ 435{
440 unsigned int bit_pos; 436 unsigned int bit_pos;
441 unsigned int level; 437 unsigned int level;
@@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
461 } 457 }
462 458
463start: 459start:
464 if (current_depth == MAX_DIR_HASH_DEPTH) 460 if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
465 return -ENOSPC; 461 return -ENOSPC;
466 462
467 /* Increase the depth, if required */ 463 /* Increase the depth, if required */
@@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
554 550
555 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
556 552
557 if (inode && S_ISDIR(inode->i_mode)) {
558 drop_nlink(dir);
559 update_inode_page(dir);
560 } else {
561 mark_inode_dirty(dir);
562 }
563
564 if (inode) { 553 if (inode) {
554 if (S_ISDIR(inode->i_mode)) {
555 drop_nlink(dir);
556 update_inode_page(dir);
557 }
565 inode->i_ctime = CURRENT_TIME; 558 inode->i_ctime = CURRENT_TIME;
566 drop_nlink(inode); 559 drop_nlink(inode);
567 if (S_ISDIR(inode->i_mode)) { 560 if (S_ISDIR(inode->i_mode)) {
@@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
636 629
637 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 630 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
638 631
639 for ( ; n < npages; n++) { 632 for (; n < npages; n++) {
640 dentry_page = get_lock_data_page(inode, n); 633 dentry_page = get_lock_data_page(inode, n);
641 if (IS_ERR(dentry_page)) 634 if (IS_ERR(dentry_page))
642 continue; 635 continue;
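In init_inode_metadata above, the single error label becomes a put_error/error pair: f2fs_put_page() runs only on paths that actually hold the page, then control falls through to the common remove_inode_page() rollback. A runnable userspace distillation of that layered-unwind idiom (every helper here is a stand-in, not f2fs code):

        #include <stdio.h>
        #include <stdlib.h>

        static void *page;

        static int acquire_page(void)  { page = malloc(64); return page ? 0 : -1; }
        static int init_acl(void)      { return -1; /* simulate a failure */ }
        static void release_page(void) { free(page); page = NULL; }
        static void rollback(void)     { puts("common rollback ran"); }

        static int setup_object(void)
        {
                int err;

                err = acquire_page();
                if (err)
                        goto error;        /* page not held: skip the put */

                err = init_acl();
                if (err)
                        goto put_error;    /* page held: release it first */

                return 0;

        put_error:
                release_page();
                /* fall through to the shared rollback, as in the patch */
        error:
                rollback();
                return err;
        }

        int main(void)
        {
                return setup_object() ? EXIT_FAILURE : EXIT_SUCCESS;
        }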
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 89dc7508faf2..fc3c558cb4f3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,8 +22,10 @@
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
25#else 26#else
26#define f2fs_bug_on(condition) 27#define f2fs_bug_on(condition)
28#define f2fs_down_write(x, y) down_write(x)
27#endif 29#endif
28 30
29/* 31/*
@@ -37,6 +39,7 @@
37#define F2FS_MOUNT_POSIX_ACL 0x00000020 39#define F2FS_MOUNT_POSIX_ACL 0x00000020
38#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 40#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
39#define F2FS_MOUNT_INLINE_XATTR 0x00000080 41#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100
40 43
41#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 44#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
42#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 45#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -97,6 +100,13 @@ struct dir_inode_entry {
97 struct inode *inode; /* vfs inode pointer */ 100 struct inode *inode; /* vfs inode pointer */
98}; 101};
99 102
103/* for the list of block addresses to be discarded */

104struct discard_entry {
105 struct list_head list; /* list head */
106 block_t blkaddr; /* block address to be discarded */
107 int len; /* # of consecutive blocks of the discard */
108};
109
100/* for the list of fsync inodes, used only during recovery */ 110/* for the list of fsync inodes, used only during recovery */
101struct fsync_inode_entry { 111struct fsync_inode_entry {
102 struct list_head list; /* list head */ 112 struct list_head list; /* list head */
@@ -155,13 +165,15 @@ enum {
155 LOOKUP_NODE, /* look up a node without readahead */ 165 LOOKUP_NODE, /* look up a node without readahead */
156 LOOKUP_NODE_RA, /* 166 LOOKUP_NODE_RA, /*
157 * look up a node with readahead called 167 * look up a node with readahead called
158 * by get_datablock_ro. 168 * by get_data_block.
159 */ 169 */
160}; 170};
161 171
162#define F2FS_LINK_MAX 32000 /* maximum link count per file */ 172#define F2FS_LINK_MAX 32000 /* maximum link count per file */
163 173
164/* for in-memory extent cache entry */ 174/* for in-memory extent cache entry */
175#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
176
165struct extent_info { 177struct extent_info {
166 rwlock_t ext_lock; /* rwlock for consistency */ 178 rwlock_t ext_lock; /* rwlock for consistency */
167 unsigned int fofs; /* start offset in a file */ 179 unsigned int fofs; /* start offset in a file */
@@ -308,6 +320,14 @@ struct f2fs_sm_info {
308 320
309 /* a threshold to reclaim prefree segments */ 321 /* a threshold to reclaim prefree segments */
310 unsigned int rec_prefree_segments; 322 unsigned int rec_prefree_segments;
323
324 /* for small discard management */
325 struct list_head discard_list; /* 4KB discard list */
326 int nr_discards; /* # of discards in the list */
327 int max_discards; /* max. discards to be issued */
328
329 unsigned int ipu_policy; /* in-place-update policy */
330 unsigned int min_ipu_util; /* in-place-update threshold */
311}; 331};
312 332
313/* 333/*
@@ -338,6 +358,7 @@ enum count_type {
338 * with waiting the bio's completion 358 * with waiting the bio's completion
339 * ... Only can be used with META. 359 * ... Only can be used with META.
340 */ 360 */
361#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
341enum page_type { 362enum page_type {
342 DATA, 363 DATA,
343 NODE, 364 NODE,
@@ -346,6 +367,20 @@ enum page_type {
346 META_FLUSH, 367 META_FLUSH,
347}; 368};
348 369
370struct f2fs_io_info {
371 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
372 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
373};
374
375#define is_read_io(rw) (((rw) & 1) == READ)
376struct f2fs_bio_info {
377 struct f2fs_sb_info *sbi; /* f2fs superblock */
378 struct bio *bio; /* bios to merge */
379 sector_t last_block_in_bio; /* last block number */
380 struct f2fs_io_info fio; /* store buffered io info. */
381 struct mutex io_mutex; /* mutex for bio */
382};
383
349struct f2fs_sb_info { 384struct f2fs_sb_info {
350 struct super_block *sb; /* pointer to VFS super block */ 385 struct super_block *sb; /* pointer to VFS super block */
351 struct proc_dir_entry *s_proc; /* proc entry */ 386 struct proc_dir_entry *s_proc; /* proc entry */
@@ -359,9 +394,10 @@ struct f2fs_sb_info {
359 394
360 /* for segment-related operations */ 395 /* for segment-related operations */
361 struct f2fs_sm_info *sm_info; /* segment manager */ 396 struct f2fs_sm_info *sm_info; /* segment manager */
362 struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ 397
363 sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ 398 /* for bio operations */
364 struct rw_semaphore bio_sem; /* IO semaphore */ 399 struct f2fs_bio_info read_io; /* for read bios */
400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
365 401
366 /* for checkpoint */ 402 /* for checkpoint */
367 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 403 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
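Replacing the raw bio array, last_block_in_bio, and the single bio_sem with per-type f2fs_bio_info bundles each merge buffer with its own io_mutex, so DATA, NODE, and META writes batch independently instead of contending on one semaphore. The coalescing behaviour those fields imply, reduced to a runnable userspace analogue in which plain block numbers stand in for bios and sectors:

        #include <stdio.h>

        /* Consecutive block numbers are batched into one "bio";
         * a discontiguity forces a flush, as last_block_in_bio does. */
        struct merge_buf {
                long start;     /* first block of the pending batch, -1 if empty */
                long next;      /* block number expected to extend the batch */
        };

        static void flush(struct merge_buf *b)
        {
                if (b->start < 0)
                        return;
                printf("submit blocks [%ld, %ld)\n", b->start, b->next);
                b->start = -1;
        }

        static void submit_block(struct merge_buf *b, long blk)
        {
                if (b->start >= 0 && blk != b->next)
                        flush(b);          /* not contiguous: submit the batch */
                if (b->start < 0)
                        b->start = blk;    /* open a new batch */
                b->next = blk + 1;
        }

        int main(void)
        {
                struct merge_buf b = { .start = -1 };
                long blks[] = { 10, 11, 12, 40, 41, 7 };

                for (unsigned int i = 0; i < sizeof(blks) / sizeof(blks[0]); i++)
                        submit_block(&b, blks[i]);
                flush(&b);                 /* drain the final batch */
                return 0;
        }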
@@ -376,8 +412,9 @@ struct f2fs_sb_info {
376 412
377 /* for orphan inode management */ 413 /* for orphan inode management */
378 struct list_head orphan_inode_list; /* orphan inode list */ 414 struct list_head orphan_inode_list; /* orphan inode list */
379 struct mutex orphan_inode_mutex; /* for orphan inode list */ 415 spinlock_t orphan_inode_lock; /* for orphan inode list */
380 unsigned int n_orphans; /* # of orphan inodes */ 416 unsigned int n_orphans; /* # of orphan inodes */
417 unsigned int max_orphans; /* max orphan inodes */
381 418
382 /* for directory inode management */ 419 /* for directory inode management */
383 struct list_head dir_inode_list; /* dir inode list */ 420 struct list_head dir_inode_list; /* dir inode list */
@@ -414,6 +451,9 @@ struct f2fs_sb_info {
414 struct f2fs_gc_kthread *gc_thread; /* GC thread */ 451 struct f2fs_gc_kthread *gc_thread; /* GC thread */
415 unsigned int cur_victim_sec; /* current victim section num */ 452 unsigned int cur_victim_sec; /* current victim section num */
416 453
454 /* maximum # of trials to find a victim segment for SSR and GC */
455 unsigned int max_victim_search;
456
417 /* 457 /*
418 * for stat information. 458 * for stat information.
419 * one is for the LFS mode, and the other is for the SSR mode. 459 * one is for the LFS mode, and the other is for the SSR mode.
@@ -423,6 +463,7 @@ struct f2fs_sb_info {
423 unsigned int segment_count[2]; /* # of allocated segments */ 463 unsigned int segment_count[2]; /* # of allocated segments */
424 unsigned int block_count[2]; /* # of allocated blocks */ 464 unsigned int block_count[2]; /* # of allocated blocks */
425 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 465 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
466 int inline_inode; /* # of inline_data inodes */
426 int bg_gc; /* background gc calls */ 467 int bg_gc; /* background gc calls */
427 unsigned int n_dirty_dirs; /* # of dir inodes */ 468 unsigned int n_dirty_dirs; /* # of dir inodes */
428#endif 469#endif
@@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page)
462 return (struct f2fs_node *)page_address(page); 503 return (struct f2fs_node *)page_address(page);
463} 504}
464 505
506static inline struct f2fs_inode *F2FS_INODE(struct page *page)
507{
508 return &((struct f2fs_node *)page_address(page))->i;
509}
510
465static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) 511static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
466{ 512{
467 return (struct f2fs_nm_info *)(sbi->nm_info); 513 return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
487 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); 533 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
488} 534}
489 535
536static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
537{
538 return sbi->meta_inode->i_mapping;
539}
540
541static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
542{
543 return sbi->node_inode->i_mapping;
544}
545
490static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) 546static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
491{ 547{
492 sbi->s_dirty = 1; 548 sbi->s_dirty = 1;
@@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
534 590
535static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 591static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
536{ 592{
537 down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); 593 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
538} 594}
539 595
540static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 596static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
548static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) 604static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
549{ 605{
550 WARN_ON((nid >= NM_I(sbi)->max_nid)); 606 WARN_ON((nid >= NM_I(sbi)->max_nid));
551 if (nid >= NM_I(sbi)->max_nid) 607 if (unlikely(nid >= NM_I(sbi)->max_nid))
552 return -EINVAL; 608 return -EINVAL;
553 return 0; 609 return 0;
554} 610}
@@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
561static inline int F2FS_HAS_BLOCKS(struct inode *inode) 617static inline int F2FS_HAS_BLOCKS(struct inode *inode)
562{ 618{
563 if (F2FS_I(inode)->i_xattr_nid) 619 if (F2FS_I(inode)->i_xattr_nid)
564 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); 620 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
565 else 621 else
566 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); 622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
567} 623}
568 624
569static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 625static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
@@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
574 spin_lock(&sbi->stat_lock); 630 spin_lock(&sbi->stat_lock);
575 valid_block_count = 631 valid_block_count =
576 sbi->total_valid_block_count + (block_t)count; 632 sbi->total_valid_block_count + (block_t)count;
577 if (valid_block_count > sbi->user_block_count) { 633 if (unlikely(valid_block_count > sbi->user_block_count)) {
578 spin_unlock(&sbi->stat_lock); 634 spin_unlock(&sbi->stat_lock);
579 return false; 635 return false;
580 } 636 }
@@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
585 return true; 641 return true;
586} 642}
587 643
588static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, 644static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
589 struct inode *inode, 645 struct inode *inode,
590 blkcnt_t count) 646 blkcnt_t count)
591{ 647{
@@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
595 inode->i_blocks -= count; 651 inode->i_blocks -= count;
596 sbi->total_valid_block_count -= (block_t)count; 652 sbi->total_valid_block_count -= (block_t)count;
597 spin_unlock(&sbi->stat_lock); 653 spin_unlock(&sbi->stat_lock);
598 return 0;
599} 654}
600 655
601static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) 656static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
686} 741}
687 742
688static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, 743static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
689 struct inode *inode, 744 struct inode *inode)
690 unsigned int count)
691{ 745{
692 block_t valid_block_count; 746 block_t valid_block_count;
693 unsigned int valid_node_count; 747 unsigned int valid_node_count;
694 748
695 spin_lock(&sbi->stat_lock); 749 spin_lock(&sbi->stat_lock);
696 750
697 valid_block_count = sbi->total_valid_block_count + (block_t)count; 751 valid_block_count = sbi->total_valid_block_count + 1;
698 sbi->alloc_valid_block_count += (block_t)count; 752 if (unlikely(valid_block_count > sbi->user_block_count)) {
699 valid_node_count = sbi->total_valid_node_count + count;
700
701 if (valid_block_count > sbi->user_block_count) {
702 spin_unlock(&sbi->stat_lock); 753 spin_unlock(&sbi->stat_lock);
703 return false; 754 return false;
704 } 755 }
705 756
706 if (valid_node_count > sbi->total_node_count) { 757 valid_node_count = sbi->total_valid_node_count + 1;
758 if (unlikely(valid_node_count > sbi->total_node_count)) {
707 spin_unlock(&sbi->stat_lock); 759 spin_unlock(&sbi->stat_lock);
708 return false; 760 return false;
709 } 761 }
710 762
711 if (inode) 763 if (inode)
712 inode->i_blocks += count; 764 inode->i_blocks++;
713 sbi->total_valid_node_count = valid_node_count; 765
714 sbi->total_valid_block_count = valid_block_count; 766 sbi->alloc_valid_block_count++;
767 sbi->total_valid_node_count++;
768 sbi->total_valid_block_count++;
715 spin_unlock(&sbi->stat_lock); 769 spin_unlock(&sbi->stat_lock);
716 770
717 return true; 771 return true;
718} 772}
719 773
720static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, 774static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
721 struct inode *inode, 775 struct inode *inode)
722 unsigned int count)
723{ 776{
724 spin_lock(&sbi->stat_lock); 777 spin_lock(&sbi->stat_lock);
725 778
726 f2fs_bug_on(sbi->total_valid_block_count < count); 779 f2fs_bug_on(!sbi->total_valid_block_count);
727 f2fs_bug_on(sbi->total_valid_node_count < count); 780 f2fs_bug_on(!sbi->total_valid_node_count);
728 f2fs_bug_on(inode->i_blocks < count); 781 f2fs_bug_on(!inode->i_blocks);
729 782
730 inode->i_blocks -= count; 783 inode->i_blocks--;
731 sbi->total_valid_node_count -= count; 784 sbi->total_valid_node_count--;
732 sbi->total_valid_block_count -= (block_t)count; 785 sbi->total_valid_block_count--;
733 786
734 spin_unlock(&sbi->stat_lock); 787 spin_unlock(&sbi->stat_lock);
735} 788}
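The rewritten inc_valid_node_count() reserves exactly one node and one block: each limit is checked under stat_lock before any counter moves, and a failed check unlocks and returns with nothing half-applied; the matching dec path now simply asserts the counters are non-zero. That check-then-commit shape as a runnable userspace sketch, with a pthread mutex standing in for the spinlock:

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
        static unsigned long valid_blocks, valid_nodes;
        static const unsigned long user_blocks = 100, total_nodes = 10;

        /* Reserve one node + one block atomically: verify both limits,
         * then commit both counters, or touch neither. */
        static bool inc_valid_node_count(void)
        {
                bool ok = false;

                pthread_mutex_lock(&stat_lock);
                if (valid_blocks + 1 <= user_blocks &&
                    valid_nodes + 1 <= total_nodes) {
                        valid_blocks++;
                        valid_nodes++;
                        ok = true;
                }
                pthread_mutex_unlock(&stat_lock);
                return ok;
        }

        int main(void)
        {
                int granted = 0;

                for (int i = 0; i < 20; i++)
                        if (inc_valid_node_count())
                                granted++;
                printf("granted %d of 20 reservations\n", granted);  /* 10 */
                return 0;
        }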
@@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
751 spin_unlock(&sbi->stat_lock); 804 spin_unlock(&sbi->stat_lock);
752} 805}
753 806
754static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) 807static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
755{ 808{
756 spin_lock(&sbi->stat_lock); 809 spin_lock(&sbi->stat_lock);
757 f2fs_bug_on(!sbi->total_valid_inode_count); 810 f2fs_bug_on(!sbi->total_valid_inode_count);
758 sbi->total_valid_inode_count--; 811 sbi->total_valid_inode_count--;
759 spin_unlock(&sbi->stat_lock); 812 spin_unlock(&sbi->stat_lock);
760 return 0;
761} 813}
762 814
763static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 815static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
@@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
771 823
772static inline void f2fs_put_page(struct page *page, int unlock) 824static inline void f2fs_put_page(struct page *page, int unlock)
773{ 825{
774 if (!page || IS_ERR(page)) 826 if (!page)
775 return; 827 return;
776 828
777 if (unlock) { 829 if (unlock) {
@@ -876,7 +928,9 @@ enum {
876 FI_NO_ALLOC, /* should not allocate any blocks */ 928 FI_NO_ALLOC, /* should not allocate any blocks */
877 FI_UPDATE_DIR, /* should update inode block for consistency */ 929 FI_UPDATE_DIR, /* should update inode block for consistency */
878 FI_DELAY_IPUT, /* used for the recovery */ 930 FI_DELAY_IPUT, /* used for the recovery */
931 FI_NO_EXTENT, /* not to use the extent cache */
879 FI_INLINE_XATTR, /* used for inline xattr */ 932 FI_INLINE_XATTR, /* used for inline xattr */
933 FI_INLINE_DATA, /* used for inline data */

880}; 934};
881 935
882static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 936static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
914{ 968{
915 if (ri->i_inline & F2FS_INLINE_XATTR) 969 if (ri->i_inline & F2FS_INLINE_XATTR)
916 set_inode_flag(fi, FI_INLINE_XATTR); 970 set_inode_flag(fi, FI_INLINE_XATTR);
971 if (ri->i_inline & F2FS_INLINE_DATA)
972 set_inode_flag(fi, FI_INLINE_DATA);
917} 973}
918 974
919static inline void set_raw_inline(struct f2fs_inode_info *fi, 975static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
923 979
924 if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 980 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
925 ri->i_inline |= F2FS_INLINE_XATTR; 981 ri->i_inline |= F2FS_INLINE_XATTR;
982 if (is_inode_flag_set(fi, FI_INLINE_DATA))
983 ri->i_inline |= F2FS_INLINE_DATA;
926} 984}
927 985
928static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 986static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
@@ -948,16 +1006,33 @@ static inline int inline_xattr_size(struct inode *inode)
948 return 0; 1006 return 0;
949} 1007}
950 1008
1009static inline int f2fs_has_inline_data(struct inode *inode)
1010{
1011 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1012}
1013
1014static inline void *inline_data_addr(struct page *page)
1015{
1016 struct f2fs_inode *ri;
1017 ri = (struct f2fs_inode *)page_address(page);
1018 return (void *)&(ri->i_addr[1]);
1019}
1020
951static inline int f2fs_readonly(struct super_block *sb) 1021static inline int f2fs_readonly(struct super_block *sb)
952{ 1022{
953 return sb->s_flags & MS_RDONLY; 1023 return sb->s_flags & MS_RDONLY;
954} 1024}
955 1025
1026#define get_inode_mode(i) \
1027 ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
1028 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
1029
956/* 1030/*
957 * file.c 1031 * file.c
958 */ 1032 */
959int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1033int f2fs_sync_file(struct file *, loff_t, loff_t, int);
960void truncate_data_blocks(struct dnode_of_data *); 1034void truncate_data_blocks(struct dnode_of_data *);
1035int truncate_blocks(struct inode *, u64);
961void f2fs_truncate(struct inode *); 1036void f2fs_truncate(struct inode *);
962int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1037int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
963int f2fs_setattr(struct dentry *, struct iattr *); 1038int f2fs_setattr(struct dentry *, struct iattr *);
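inline_data_addr() returning &ri->i_addr[1] pins down the on-disk layout: the inline payload starts at the second block-address slot, and inline.c (added later in this diff) relies on i_addr[0] staying free so a real block can be reserved without clobbering the data. Roughly what that implies for capacity; the constants below are assumptions for illustration, not quoted from f2fs_fs.h:

        #include <stdint.h>
        #include <stdio.h>

        /* Assumed values: i_addr[] slots per inode block, and slots the
         * inline-xattr area may reserve at the tail. */
        #define ADDRS_PER_INODE         923
        #define INLINE_XATTR_ADDRS       50

        int main(void)
        {
                /* i_addr[0] is skipped, so the payload spans the remaining
                 * 32-bit slots between it and any inline-xattr area. */
                size_t max_inline = sizeof(uint32_t) *
                                (ADDRS_PER_INODE - INLINE_XATTR_ADDRS - 1);
                printf("inline capacity: %zu bytes\n", max_inline);
                return 0;
        }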
@@ -1027,7 +1102,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1027int truncate_inode_blocks(struct inode *, pgoff_t); 1102int truncate_inode_blocks(struct inode *, pgoff_t);
1028int truncate_xattr_node(struct inode *, struct page *); 1103int truncate_xattr_node(struct inode *, struct page *);
1029int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1104int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1030int remove_inode_page(struct inode *); 1105void remove_inode_page(struct inode *);
1031struct page *new_inode_page(struct inode *, const struct qstr *); 1106struct page *new_inode_page(struct inode *, const struct qstr *);
1032struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1107struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1033void ra_node_page(struct f2fs_sb_info *, nid_t); 1108void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1059,19 +1134,19 @@ void clear_prefree_segments(struct f2fs_sb_info *);
1059int npages_for_summary_flush(struct f2fs_sb_info *); 1134int npages_for_summary_flush(struct f2fs_sb_info *);
1060void allocate_new_segments(struct f2fs_sb_info *); 1135void allocate_new_segments(struct f2fs_sb_info *);
1061struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1136struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1062struct bio *f2fs_bio_alloc(struct block_device *, int);
1063void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
1064void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1065void write_meta_page(struct f2fs_sb_info *, struct page *); 1137void write_meta_page(struct f2fs_sb_info *, struct page *);
1066void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, 1138void write_node_page(struct f2fs_sb_info *, struct page *,
1067 block_t, block_t *); 1139 struct f2fs_io_info *, unsigned int, block_t, block_t *);
1068void write_data_page(struct inode *, struct page *, struct dnode_of_data*, 1140void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1069 block_t, block_t *); 1141 struct f2fs_io_info *);
1070void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); 1142void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1071void recover_data_page(struct f2fs_sb_info *, struct page *, 1143void recover_data_page(struct f2fs_sb_info *, struct page *,
1072 struct f2fs_summary *, block_t, block_t); 1144 struct f2fs_summary *, block_t, block_t);
1073void rewrite_node_page(struct f2fs_sb_info *, struct page *, 1145void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1074 struct f2fs_summary *, block_t, block_t); 1146 struct f2fs_summary *, block_t, block_t);
1147void allocate_data_block(struct f2fs_sb_info *, struct page *,
1148 block_t, block_t *, struct f2fs_summary *, int);
1149void f2fs_wait_on_page_writeback(struct page *, enum page_type);
1075void write_data_summaries(struct f2fs_sb_info *, block_t); 1150void write_data_summaries(struct f2fs_sb_info *, block_t);
1076void write_node_summaries(struct f2fs_sb_info *, block_t); 1151void write_node_summaries(struct f2fs_sb_info *, block_t);
1077int lookup_journal_in_cursum(struct f2fs_summary_block *, 1152int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1079,6 +1154,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
1079void flush_sit_entries(struct f2fs_sb_info *); 1154void flush_sit_entries(struct f2fs_sb_info *);
1080int build_segment_manager(struct f2fs_sb_info *); 1155int build_segment_manager(struct f2fs_sb_info *);
1081void destroy_segment_manager(struct f2fs_sb_info *); 1156void destroy_segment_manager(struct f2fs_sb_info *);
1157int __init create_segment_manager_caches(void);
1158void destroy_segment_manager_caches(void);
1082 1159
1083/* 1160/*
1084 * checkpoint.c 1161 * checkpoint.c
@@ -1090,7 +1167,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *);
1090void release_orphan_inode(struct f2fs_sb_info *); 1167void release_orphan_inode(struct f2fs_sb_info *);
1091void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1168void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1092void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1169void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1093int recover_orphan_inodes(struct f2fs_sb_info *); 1170void recover_orphan_inodes(struct f2fs_sb_info *);
1094int get_valid_checkpoint(struct f2fs_sb_info *); 1171int get_valid_checkpoint(struct f2fs_sb_info *);
1095void set_dirty_dir_page(struct inode *, struct page *); 1172void set_dirty_dir_page(struct inode *, struct page *);
1096void add_dirty_dir_inode(struct inode *); 1173void add_dirty_dir_inode(struct inode *);
@@ -1105,13 +1182,17 @@ void destroy_checkpoint_caches(void);
1105/* 1182/*
1106 * data.c 1183 * data.c
1107 */ 1184 */
1185void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1186int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
1187void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
1188 struct f2fs_io_info *);
1108int reserve_new_block(struct dnode_of_data *); 1189int reserve_new_block(struct dnode_of_data *);
1190int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1109void update_extent_cache(block_t, struct dnode_of_data *); 1191void update_extent_cache(block_t, struct dnode_of_data *);
1110struct page *find_data_page(struct inode *, pgoff_t, bool); 1192struct page *find_data_page(struct inode *, pgoff_t, bool);
1111struct page *get_lock_data_page(struct inode *, pgoff_t); 1193struct page *get_lock_data_page(struct inode *, pgoff_t);
1112struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); 1194struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1113int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1195int do_write_data_page(struct page *, struct f2fs_io_info *);
1114int do_write_data_page(struct page *);
1115 1196
1116/* 1197/*
1117 * gc.c 1198 * gc.c
@@ -1144,7 +1225,7 @@ struct f2fs_stat_info {
1144 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1225 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1145 int nats, sits, fnids; 1226 int nats, sits, fnids;
1146 int total_count, utilization; 1227 int total_count, utilization;
1147 int bg_gc; 1228 int bg_gc, inline_inode;
1148 unsigned int valid_count, valid_node_count, valid_inode_count; 1229 unsigned int valid_count, valid_node_count, valid_inode_count;
1149 unsigned int bimodal, avg_vblocks; 1230 unsigned int bimodal, avg_vblocks;
1150 int util_free, util_valid, util_invalid; 1231 int util_free, util_valid, util_invalid;
@@ -1164,7 +1245,7 @@ struct f2fs_stat_info {
1164 1245
1165static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1246static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1166{ 1247{
1167 return (struct f2fs_stat_info*)sbi->stat_info; 1248 return (struct f2fs_stat_info *)sbi->stat_info;
1168} 1249}
1169 1250
1170#define stat_inc_call_count(si) ((si)->call_count++) 1251#define stat_inc_call_count(si) ((si)->call_count++)
@@ -1173,6 +1254,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1173#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) 1254#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
1174#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) 1255#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
1175#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) 1256#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
1257#define stat_inc_inline_inode(inode) \
1258 do { \
1259 if (f2fs_has_inline_data(inode)) \
1260 ((F2FS_SB(inode->i_sb))->inline_inode++); \
1261 } while (0)
1262#define stat_dec_inline_inode(inode) \
1263 do { \
1264 if (f2fs_has_inline_data(inode)) \
1265 ((F2FS_SB(inode->i_sb))->inline_inode--); \
1266 } while (0)
1267
1176#define stat_inc_seg_type(sbi, curseg) \ 1268#define stat_inc_seg_type(sbi, curseg) \
1177 ((sbi)->segment_count[(curseg)->alloc_type]++) 1269 ((sbi)->segment_count[(curseg)->alloc_type]++)
1178#define stat_inc_block_count(sbi, curseg) \ 1270#define stat_inc_block_count(sbi, curseg) \
@@ -1216,6 +1308,8 @@ void f2fs_destroy_root_stats(void);
1216#define stat_dec_dirty_dir(sbi) 1308#define stat_dec_dirty_dir(sbi)
1217#define stat_inc_total_hit(sb) 1309#define stat_inc_total_hit(sb)
1218#define stat_inc_read_hit(sb) 1310#define stat_inc_read_hit(sb)
1311#define stat_inc_inline_inode(inode)
1312#define stat_dec_inline_inode(inode)
1219#define stat_inc_seg_type(sbi, curseg) 1313#define stat_inc_seg_type(sbi, curseg)
1220#define stat_inc_block_count(sbi, curseg) 1314#define stat_inc_block_count(sbi, curseg)
1221#define stat_inc_seg_count(si, type) 1315#define stat_inc_seg_count(si, type)
@@ -1238,4 +1332,13 @@ extern const struct address_space_operations f2fs_meta_aops;
1238extern const struct inode_operations f2fs_dir_inode_operations; 1332extern const struct inode_operations f2fs_dir_inode_operations;
1239extern const struct inode_operations f2fs_symlink_inode_operations; 1333extern const struct inode_operations f2fs_symlink_inode_operations;
1240extern const struct inode_operations f2fs_special_inode_operations; 1334extern const struct inode_operations f2fs_special_inode_operations;
1335
1336/*
1337 * inline.c
1338 */
1339bool f2fs_may_inline(struct inode *);
1340int f2fs_read_inline_data(struct inode *, struct page *);
1341int f2fs_convert_inline_data(struct inode *, pgoff_t);
1342int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1343int recover_inline_data(struct inode *, struct page *);
1241#endif 1344#endif
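Among the f2fs.h additions above, stat_inc_inline_inode() and stat_dec_inline_inode() wrap a guarded statement in do { ... } while (0), the idiom that makes a multi-statement or conditional macro expand to exactly one statement. A runnable demonstration of why the wrapper matters:

        #include <stdio.h>

        /* Same shape as the stat macros above: the guarded increment is
         * wrapped so the expansion behaves as one single statement. */
        #define STAT_INC(cond, x)       do { if (cond) (x)++; } while (0)

        int main(void)
        {
                int hits = 0, misses = 0;

                /* Safe under an unbraced if/else.  Were the macro a bare
                 * "if (cond) (x)++", the else below would bind to the
                 * macro's inner if (dangling else) and miscount. */
                if (hits == 0)
                        STAT_INC(1, hits);
                else
                        misses++;

                printf("hits=%d misses=%d\n", hits, misses);  /* 1 and 0 */
                return 0;
        }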
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d714f4972d5..0dfcef53a6ed 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33 struct page *page = vmf->page; 33 struct page *page = vmf->page;
34 struct inode *inode = file_inode(vma->vm_file); 34 struct inode *inode = file_inode(vma->vm_file);
35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
36 block_t old_blk_addr;
37 struct dnode_of_data dn; 36 struct dnode_of_data dn;
38 int err; 37 int err;
39 38
@@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
44 /* block allocation */ 43 /* block allocation */
45 f2fs_lock_op(sbi); 44 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 45 set_new_dnode(&dn, inode, NULL, NULL, 0);
47 err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); 46 err = f2fs_reserve_block(&dn, page->index);
48 if (err) {
49 f2fs_unlock_op(sbi);
50 goto out;
51 }
52
53 old_blk_addr = dn.data_blkaddr;
54
55 if (old_blk_addr == NULL_ADDR) {
56 err = reserve_new_block(&dn);
57 if (err) {
58 f2fs_put_dnode(&dn);
59 f2fs_unlock_op(sbi);
60 goto out;
61 }
62 }
63 f2fs_put_dnode(&dn);
64 f2fs_unlock_op(sbi); 47 f2fs_unlock_op(sbi);
48 if (err)
49 goto out;
65 50
66 file_update_time(vma->vm_file); 51 file_update_time(vma->vm_file);
67 lock_page(page); 52 lock_page(page);
68 if (page->mapping != inode->i_mapping || 53 if (unlikely(page->mapping != inode->i_mapping ||
69 page_offset(page) > i_size_read(inode) || 54 page_offset(page) > i_size_read(inode) ||
70 !PageUptodate(page)) { 55 !PageUptodate(page))) {
71 unlock_page(page); 56 unlock_page(page);
72 err = -EFAULT; 57 err = -EFAULT;
73 goto out; 58 goto out;
@@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
130 int ret = 0; 115 int ret = 0;
131 bool need_cp = false; 116 bool need_cp = false;
132 struct writeback_control wbc = { 117 struct writeback_control wbc = {
133 .sync_mode = WB_SYNC_ALL, 118 .sync_mode = WB_SYNC_NONE,
134 .nr_to_write = LONG_MAX, 119 .nr_to_write = LONG_MAX,
135 .for_reclaim = 0, 120 .for_reclaim = 0,
136 }; 121 };
137 122
138 if (f2fs_readonly(inode->i_sb)) 123 if (unlikely(f2fs_readonly(inode->i_sb)))
139 return 0; 124 return 0;
140 125
141 trace_f2fs_sync_file_enter(inode); 126 trace_f2fs_sync_file_enter(inode);
@@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
217 raw_node = F2FS_NODE(dn->node_page); 202 raw_node = F2FS_NODE(dn->node_page);
218 addr = blkaddr_in_node(raw_node) + ofs; 203 addr = blkaddr_in_node(raw_node) + ofs;
219 204
220 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { 205 for (; count > 0; count--, addr++, dn->ofs_in_node++) {
221 block_t blkaddr = le32_to_cpu(*addr); 206 block_t blkaddr = le32_to_cpu(*addr);
222 if (blkaddr == NULL_ADDR) 207 if (blkaddr == NULL_ADDR)
223 continue; 208 continue;
@@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
256 return; 241 return;
257 242
258 lock_page(page); 243 lock_page(page);
259 if (page->mapping != inode->i_mapping) { 244 if (unlikely(page->mapping != inode->i_mapping)) {
260 f2fs_put_page(page, 1); 245 f2fs_put_page(page, 1);
261 return; 246 return;
262 } 247 }
@@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
266 f2fs_put_page(page, 1); 251 f2fs_put_page(page, 1);
267} 252}
268 253
269static int truncate_blocks(struct inode *inode, u64 from) 254int truncate_blocks(struct inode *inode, u64 from)
270{ 255{
271 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 256 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
272 unsigned int blocksize = inode->i_sb->s_blocksize; 257 unsigned int blocksize = inode->i_sb->s_blocksize;
273 struct dnode_of_data dn; 258 struct dnode_of_data dn;
274 pgoff_t free_from; 259 pgoff_t free_from;
275 int count = 0; 260 int count = 0, err = 0;
276 int err;
277 261
278 trace_f2fs_truncate_blocks_enter(inode, from); 262 trace_f2fs_truncate_blocks_enter(inode, from);
279 263
264 if (f2fs_has_inline_data(inode))
265 goto done;
266
280 free_from = (pgoff_t) 267 free_from = (pgoff_t)
281 ((from + blocksize - 1) >> (sbi->log_blocksize)); 268 ((from + blocksize - 1) >> (sbi->log_blocksize));
282 269
283 f2fs_lock_op(sbi); 270 f2fs_lock_op(sbi);
271
284 set_new_dnode(&dn, inode, NULL, NULL, 0); 272 set_new_dnode(&dn, inode, NULL, NULL, 0);
285 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 273 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
286 if (err) { 274 if (err) {
@@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
308free_next: 296free_next:
309 err = truncate_inode_blocks(inode, free_from); 297 err = truncate_inode_blocks(inode, free_from);
310 f2fs_unlock_op(sbi); 298 f2fs_unlock_op(sbi);
311 299done:
312 /* lastly zero out the first data page */ 300 /* lastly zero out the first data page */
313 truncate_partial_data_page(inode, from); 301 truncate_partial_data_page(inode, from);
314 302
@@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
382 370
383 if ((attr->ia_valid & ATTR_SIZE) && 371 if ((attr->ia_valid & ATTR_SIZE) &&
384 attr->ia_size != i_size_read(inode)) { 372 attr->ia_size != i_size_read(inode)) {
373 err = f2fs_convert_inline_data(inode, attr->ia_size);
374 if (err)
375 return err;
376
385 truncate_setsize(inode, attr->ia_size); 377 truncate_setsize(inode, attr->ia_size);
386 f2fs_truncate(inode); 378 f2fs_truncate(inode);
387 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 379 f2fs_balance_fs(F2FS_SB(inode->i_sb));
@@ -390,7 +382,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
390 __setattr_copy(inode, attr); 382 __setattr_copy(inode, attr);
391 383
392 if (attr->ia_valid & ATTR_MODE) { 384 if (attr->ia_valid & ATTR_MODE) {
393 err = f2fs_acl_chmod(inode); 385 err = posix_acl_chmod(inode, get_inode_mode(inode));
394 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { 386 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
395 inode->i_mode = fi->i_acl_mode; 387 inode->i_mode = fi->i_acl_mode;
396 clear_inode_flag(fi, FI_ACL_MODE); 388 clear_inode_flag(fi, FI_ACL_MODE);
@@ -405,6 +397,7 @@ const struct inode_operations f2fs_file_inode_operations = {
405 .getattr = f2fs_getattr, 397 .getattr = f2fs_getattr,
406 .setattr = f2fs_setattr, 398 .setattr = f2fs_setattr,
407 .get_acl = f2fs_get_acl, 399 .get_acl = f2fs_get_acl,
400 .set_acl = f2fs_set_acl,
408#ifdef CONFIG_F2FS_FS_XATTR 401#ifdef CONFIG_F2FS_FS_XATTR
409 .setxattr = generic_setxattr, 402 .setxattr = generic_setxattr,
410 .getxattr = generic_getxattr, 403 .getxattr = generic_getxattr,
@@ -459,12 +452,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
459 return 0; 452 return 0;
460} 453}
461 454
462static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) 455static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
463{ 456{
464 pgoff_t pg_start, pg_end; 457 pgoff_t pg_start, pg_end;
465 loff_t off_start, off_end; 458 loff_t off_start, off_end;
466 int ret = 0; 459 int ret = 0;
467 460
461 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
462 if (ret)
463 return ret;
464
468 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 465 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
469 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 466 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
470 467
@@ -499,12 +496,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
499 } 496 }
500 } 497 }
501 498
502 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
503 i_size_read(inode) <= (offset + len)) {
504 i_size_write(inode, offset);
505 mark_inode_dirty(inode);
506 }
507
508 return ret; 499 return ret;
509} 500}
510 501
@@ -521,6 +512,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
521 if (ret) 512 if (ret)
522 return ret; 513 return ret;
523 514
515 ret = f2fs_convert_inline_data(inode, offset + len);
516 if (ret)
517 return ret;
518
524 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 519 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
525 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 520 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
526 521
@@ -532,22 +527,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
532 527
533 f2fs_lock_op(sbi); 528 f2fs_lock_op(sbi);
534 set_new_dnode(&dn, inode, NULL, NULL, 0); 529 set_new_dnode(&dn, inode, NULL, NULL, 0);
535 ret = get_dnode_of_data(&dn, index, ALLOC_NODE); 530 ret = f2fs_reserve_block(&dn, index);
536 if (ret) {
537 f2fs_unlock_op(sbi);
538 break;
539 }
540
541 if (dn.data_blkaddr == NULL_ADDR) {
542 ret = reserve_new_block(&dn);
543 if (ret) {
544 f2fs_put_dnode(&dn);
545 f2fs_unlock_op(sbi);
546 break;
547 }
548 }
549 f2fs_put_dnode(&dn);
550 f2fs_unlock_op(sbi); 531 f2fs_unlock_op(sbi);
532 if (ret)
533 break;
551 534
552 if (pg_start == pg_end) 535 if (pg_start == pg_end)
553 new_size = offset + len; 536 new_size = offset + len;
@@ -578,7 +561,7 @@ static long f2fs_fallocate(struct file *file, int mode,
578 return -EOPNOTSUPP; 561 return -EOPNOTSUPP;
579 562
580 if (mode & FALLOC_FL_PUNCH_HOLE) 563 if (mode & FALLOC_FL_PUNCH_HOLE)
581 ret = punch_hole(inode, offset, len, mode); 564 ret = punch_hole(inode, offset, len);
582 else 565 else
583 ret = expand_inode_data(inode, offset, len, mode); 566 ret = expand_inode_data(inode, offset, len, mode);
584 567
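Both f2fs_vm_page_mkwrite() and expand_inode_data() above collapse the open-coded get_dnode_of_data(ALLOC_NODE) + conditional reserve_new_block() + f2fs_put_dnode() sequence into one f2fs_reserve_block() call, declared in the f2fs.h hunk and implemented in data.c (not shown in this section). Reconstructed from these call sites, the helper plausibly reads as below; note that the real version must also avoid dropping a caller-supplied inode page, since inline.c passes one in and releases it itself; that wrinkle is elided here:

        /* Kernel-context sketch reconstructed from the converted call
         * sites, not the literal data.c implementation. */
        int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
        {
                int err;

                err = get_dnode_of_data(dn, index, ALLOC_NODE);
                if (err)
                        return err;

                /* reserve only if no block is mapped at this index yet */
                if (dn->data_blkaddr == NULL_ADDR)
                        err = reserve_new_block(dn);

                f2fs_put_dnode(dn);
                return err;
        }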
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b7ad1ec7e4cc..ea0371e854b4 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
119 kfree(gc_th); 119 kfree(gc_th);
120 sbi->gc_thread = NULL; 120 sbi->gc_thread = NULL;
121 } 121 }
122
123out: 122out:
124 return err; 123 return err;
125} 124}
@@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
164 p->ofs_unit = sbi->segs_per_sec; 163 p->ofs_unit = sbi->segs_per_sec;
165 } 164 }
166 165
167 if (p->max_search > MAX_VICTIM_SEARCH) 166 if (p->max_search > sbi->max_victim_search)
168 p->max_search = MAX_VICTIM_SEARCH; 167 p->max_search = sbi->max_victim_search;
169 168
170 p->offset = sbi->last_victim[p->gc_mode]; 169 p->offset = sbi->last_victim[p->gc_mode];
171} 170}
@@ -429,7 +428,7 @@ next_step:
429 428
430 /* set page dirty and write it */ 429 /* set page dirty and write it */
431 if (gc_type == FG_GC) { 430 if (gc_type == FG_GC) {
432 f2fs_wait_on_page_writeback(node_page, NODE, true); 431 f2fs_wait_on_page_writeback(node_page, NODE);
433 set_page_dirty(node_page); 432 set_page_dirty(node_page);
434 } else { 433 } else {
435 if (!PageWriteback(node_page)) 434 if (!PageWriteback(node_page))
@@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
521 520
522static void move_data_page(struct inode *inode, struct page *page, int gc_type) 521static void move_data_page(struct inode *inode, struct page *page, int gc_type)
523{ 522{
523 struct f2fs_io_info fio = {
524 .type = DATA,
525 .rw = WRITE_SYNC,
526 };
527
524 if (gc_type == BG_GC) { 528 if (gc_type == BG_GC) {
525 if (PageWriteback(page)) 529 if (PageWriteback(page))
526 goto out; 530 goto out;
@@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
529 } else { 533 } else {
530 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 534 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
531 535
532 f2fs_wait_on_page_writeback(page, DATA, true); 536 f2fs_wait_on_page_writeback(page, DATA);
533 537
534 if (clear_page_dirty_for_io(page) && 538 if (clear_page_dirty_for_io(page) &&
535 S_ISDIR(inode->i_mode)) { 539 S_ISDIR(inode->i_mode)) {
@@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
537 inode_dec_dirty_dents(inode); 541 inode_dec_dirty_dents(inode);
538 } 542 }
539 set_cold_data(page); 543 set_cold_data(page);
540 do_write_data_page(page); 544 do_write_data_page(page, &fio);
541 clear_cold_data(page); 545 clear_cold_data(page);
542 } 546 }
543out: 547out:
@@ -631,7 +635,7 @@ next_iput:
631 goto next_step; 635 goto next_step;
632 636
633 if (gc_type == FG_GC) { 637 if (gc_type == FG_GC) {
634 f2fs_submit_bio(sbi, DATA, true); 638 f2fs_submit_merged_bio(sbi, DATA, WRITE);
635 639
636 /* 640 /*
637 * In the case of FG_GC, it'd be better to reclaim this victim 641 * In the case of FG_GC, it'd be better to reclaim this victim
@@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
664 668
665 /* read segment summary of victim */ 669 /* read segment summary of victim */
666 sum_page = get_sum_page(sbi, segno); 670 sum_page = get_sum_page(sbi, segno);
667 if (IS_ERR(sum_page))
668 return;
669 671
670 blk_start_plug(&plug); 672 blk_start_plug(&plug);
671 673
@@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
697 699
698 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
699gc_more: 701gc_more:
700 if (!(sbi->sb->s_flags & MS_ACTIVE)) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
701 goto stop; 703 goto stop;
702 704
703 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 507056d22205..5d5eb6047bf4 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -20,7 +20,7 @@
20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ 20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
21 21
22/* Search max. number of dirty segments to select a victim segment */ 22/* Search max. number of dirty segments to select a victim segment */
23#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ 23#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
24 24
25struct f2fs_gc_kthread { 25struct f2fs_gc_kthread {
26 struct task_struct *f2fs_gc_task; 26 struct task_struct *f2fs_gc_task;
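gc.h now carries only the default (DEF_MAX_VICTIM_SEARCH), while the live limit moves into the new f2fs_sb_info.max_victim_search field clamped in select_policy() above, turning a hard-coded cap into a per-mount tunable (presumably adjustable via sysfs, though the knob itself is outside this section). The compile-time-default-plus-runtime-override pattern, as a runnable sketch:

        #include <stdio.h>

        #define DEF_MAX_VICTIM_SEARCH 4096      /* compile-time default */

        struct sb_info {
                unsigned int max_victim_search; /* runtime tunable */
        };

        static unsigned int clamp_search(struct sb_info *sbi, unsigned int want)
        {
                return want > sbi->max_victim_search ?
                                sbi->max_victim_search : want;
        }

        int main(void)
        {
                struct sb_info sbi = { .max_victim_search = DEF_MAX_VICTIM_SEARCH };

                printf("depth: %u\n", clamp_search(&sbi, 10000)); /* 4096 */
                sbi.max_victim_search = 64;     /* lowered at runtime */
                printf("depth: %u\n", clamp_search(&sbi, 10000)); /* 64 */
                return 0;
        }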
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
new file mode 100644
index 000000000000..31ee5b164ff9
--- /dev/null
+++ b/fs/f2fs/inline.c
@@ -0,0 +1,222 @@
1/*
2 * fs/f2fs/inline.c
3 * Copyright (c) 2013, Intel Corporation
4 * Authors: Huajun Li <huajun.li@intel.com>
5 * Haicheng Li <haicheng.li@intel.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13
14#include "f2fs.h"
15
16bool f2fs_may_inline(struct inode *inode)
17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks;
20 loff_t i_size;
21
22 if (!test_opt(sbi, INLINE_DATA))
23 return false;
24
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
26 if (inode->i_blocks > nr_blocks)
27 return false;
28
29 i_size = i_size_read(inode);
30 if (i_size > MAX_INLINE_DATA)
31 return false;
32
33 return true;
34}
35
36int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage;
40 void *src_addr, *dst_addr;
41
42 if (page->index) {
43 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
44 goto out;
45 }
46
47 ipage = get_node_page(sbi, inode->i_ino);
48 if (IS_ERR(ipage))
49 return PTR_ERR(ipage);
50
51 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
52
53 /* Copy the whole inline data block */
54 src_addr = inline_data_addr(ipage);
55 dst_addr = kmap(page);
56 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
57 kunmap(page);
58 f2fs_put_page(ipage, 1);
59
60out:
61 SetPageUptodate(page);
62 unlock_page(page);
63
64 return 0;
65}
66
67static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
68{
69 int err;
70 struct page *ipage;
71 struct dnode_of_data dn;
72 void *src_addr, *dst_addr;
73 block_t new_blk_addr;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 struct f2fs_io_info fio = {
76 .type = DATA,
77 .rw = WRITE_SYNC | REQ_PRIO,
78 };
79
80 f2fs_lock_op(sbi);
81 ipage = get_node_page(sbi, inode->i_ino);
82 if (IS_ERR(ipage))
83 return PTR_ERR(ipage);
84
85 /*
86 * i_addr[0] is not used for inline data,
87 * so reserving a new block will not destroy inline data
88 */
89 set_new_dnode(&dn, inode, ipage, NULL, 0);
90 err = f2fs_reserve_block(&dn, 0);
91 if (err) {
92 f2fs_unlock_op(sbi);
93 return err;
94 }
95
96 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
97
98 /* Copy the whole inline data block */
99 src_addr = inline_data_addr(ipage);
100 dst_addr = kmap(page);
101 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
102 kunmap(page);
103 SetPageUptodate(page);
104
105 /* write the data page out to keep the data consistent */
106 set_page_writeback(page);
107 write_data_page(page, &dn, &new_blk_addr, &fio);
108 update_extent_cache(new_blk_addr, &dn);
109 f2fs_wait_on_page_writeback(page, DATA);
110
111 /* clear inline data and flag after data writeback */
112 zero_user_segment(ipage, INLINE_DATA_OFFSET,
113 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
114 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
115 stat_dec_inline_inode(inode);
116
117 sync_inode_page(&dn);
118 f2fs_put_dnode(&dn);
119 f2fs_unlock_op(sbi);
120 return err;
121}
122
123int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
124{
125 struct page *page;
126 int err;
127
128 if (!f2fs_has_inline_data(inode))
129 return 0;
130 else if (to_size <= MAX_INLINE_DATA)
131 return 0;
132
133 page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS);
134 if (!page)
135 return -ENOMEM;
136
137 err = __f2fs_convert_inline_data(inode, page);
138 f2fs_put_page(page, 1);
139 return err;
140}
141
142int f2fs_write_inline_data(struct inode *inode,
143 struct page *page, unsigned size)
144{
145 void *src_addr, *dst_addr;
146 struct page *ipage;
147 struct dnode_of_data dn;
148 int err;
149
150 set_new_dnode(&dn, inode, NULL, NULL, 0);
151 err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
152 if (err)
153 return err;
154 ipage = dn.inode_page;
155
156 zero_user_segment(ipage, INLINE_DATA_OFFSET,
157 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
158 src_addr = kmap(page);
159 dst_addr = inline_data_addr(ipage);
160 memcpy(dst_addr, src_addr, size);
161 kunmap(page);
162
163 /* Release the first data block if it is allocated */
164 if (!f2fs_has_inline_data(inode)) {
165 truncate_data_blocks_range(&dn, 1);
166 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
167 stat_inc_inline_inode(inode);
168 }
169
170 sync_inode_page(&dn);
171 f2fs_put_dnode(&dn);
172
173 return 0;
174}
175
176int recover_inline_data(struct inode *inode, struct page *npage)
177{
178 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
179 struct f2fs_inode *ri = NULL;
180 void *src_addr, *dst_addr;
181 struct page *ipage;
182
183 /*
184 * The inline_data recovery policy is as follows.
185 * [prev.] [next] of inline_data flag
186 * o o -> recover inline_data
187 * o x -> remove inline_data, and then recover data blocks
188 * x o -> remove inline_data, and then recover inline_data
189 * x x -> recover data blocks
190 */
191 if (IS_INODE(npage))
192 ri = F2FS_INODE(npage);
193
194 if (f2fs_has_inline_data(inode) &&
195 ri && ri->i_inline & F2FS_INLINE_DATA) {
196process_inline:
197 ipage = get_node_page(sbi, inode->i_ino);
198 f2fs_bug_on(IS_ERR(ipage));
199
200 src_addr = inline_data_addr(npage);
201 dst_addr = inline_data_addr(ipage);
202 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
203 update_inode(inode, ipage);
204 f2fs_put_page(ipage, 1);
205 return -1;
206 }
207
208 if (f2fs_has_inline_data(inode)) {
209 ipage = get_node_page(sbi, inode->i_ino);
210 f2fs_bug_on(IS_ERR(ipage));
211 zero_user_segment(ipage, INLINE_DATA_OFFSET,
212 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
213 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
214 update_inode(inode, ipage);
215 f2fs_put_page(ipage, 1);
216 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
217 truncate_blocks(inode, 0);
218 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
219 goto process_inline;
220 }
221 return 0;
222}
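The four-case table in the comment above can be restated as a small decision helper. This sketch is illustrative only (the enum and function are hypothetical); prev is the inline_data flag already set on the inode, next is the flag carried by the node page being replayed.

enum inline_recovery { RECOVER_INLINE, DROP_THEN_RECOVER_BLOCKS,
			DROP_THEN_RECOVER_INLINE, RECOVER_BLOCKS };

static enum inline_recovery inline_recovery_policy(bool prev, bool next)
{
	if (prev && next)	/* o o */
		return RECOVER_INLINE;
	if (prev)		/* o x */
		return DROP_THEN_RECOVER_BLOCKS;
	if (next)		/* x o */
		return DROP_THEN_RECOVER_INLINE;
	return RECOVER_BLOCKS;	/* x x */
}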
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d0eaa9faeca0..4d67ed736dca 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
44 if (ri->i_addr[0]) 44 if (ri->i_addr[0])
45 inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); 45 inode->i_rdev =
46 old_decode_dev(le32_to_cpu(ri->i_addr[0]));
46 else 47 else
47 inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); 48 inode->i_rdev =
49 new_decode_dev(le32_to_cpu(ri->i_addr[1]));
48 } 50 }
49} 51}
50 52
@@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
52{ 54{
53 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 55 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
54 if (old_valid_dev(inode->i_rdev)) { 56 if (old_valid_dev(inode->i_rdev)) {
55 ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); 57 ri->i_addr[0] =
58 cpu_to_le32(old_encode_dev(inode->i_rdev));
56 ri->i_addr[1] = 0; 59 ri->i_addr[1] = 0;
57 } else { 60 } else {
58 ri->i_addr[0] = 0; 61 ri->i_addr[0] = 0;
59 ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); 62 ri->i_addr[1] =
63 cpu_to_le32(new_encode_dev(inode->i_rdev));
60 ri->i_addr[2] = 0; 64 ri->i_addr[2] = 0;
61 } 65 }
62 } 66 }
@@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode)
67 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
68 struct f2fs_inode_info *fi = F2FS_I(inode); 72 struct f2fs_inode_info *fi = F2FS_I(inode);
69 struct page *node_page; 73 struct page *node_page;
70 struct f2fs_node *rn;
71 struct f2fs_inode *ri; 74 struct f2fs_inode *ri;
72 75
73 /* Check if ino is within scope */ 76 /* Check if ino is within scope */
@@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode)
81 if (IS_ERR(node_page)) 84 if (IS_ERR(node_page))
82 return PTR_ERR(node_page); 85 return PTR_ERR(node_page);
83 86
84 rn = F2FS_NODE(node_page); 87 ri = F2FS_INODE(node_page);
85 ri = &(rn->i);
86 88
87 inode->i_mode = le16_to_cpu(ri->i_mode); 89 inode->i_mode = le16_to_cpu(ri->i_mode);
88 i_uid_write(inode, le32_to_cpu(ri->i_uid)); 90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -175,13 +177,11 @@ bad_inode:
175 177
176void update_inode(struct inode *inode, struct page *node_page) 178void update_inode(struct inode *inode, struct page *node_page)
177{ 179{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri; 180 struct f2fs_inode *ri;
180 181
181 f2fs_wait_on_page_writeback(node_page, NODE, false); 182 f2fs_wait_on_page_writeback(node_page, NODE);
182 183
183 rn = F2FS_NODE(node_page); 184 ri = F2FS_INODE(node_page);
184 ri = &(rn->i);
185 185
186 ri->i_mode = cpu_to_le16(inode->i_mode); 186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise; 187 ri->i_advise = F2FS_I(inode)->i_advise;
@@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode)
281 281
282 f2fs_lock_op(sbi); 282 f2fs_lock_op(sbi);
283 remove_inode_page(inode); 283 remove_inode_page(inode);
284 stat_dec_inline_inode(inode);
284 f2fs_unlock_op(sbi); 285 f2fs_unlock_op(sbi);
285 286
286 sb_end_intwrite(inode->i_sb); 287 sb_end_intwrite(inode->i_sb);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 575adac17f8b..397d459e97bf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
424 } 424 }
425 425
426 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 426 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
427 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
427 428
428 new_inode->i_ctime = CURRENT_TIME; 429 new_inode->i_ctime = CURRENT_TIME;
429 if (old_dir_entry) 430 if (old_dir_entry)
430 drop_nlink(new_inode); 431 drop_nlink(new_inode);
431 drop_nlink(new_inode); 432 drop_nlink(new_inode);
433 mark_inode_dirty(new_inode);
432 434
433 if (!new_inode->i_nlink) 435 if (!new_inode->i_nlink)
434 add_orphan_inode(sbi, new_inode->i_ino); 436 add_orphan_inode(sbi, new_inode->i_ino);
@@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
457 if (old_dir != new_dir) { 459 if (old_dir != new_dir) {
458 f2fs_set_link(old_inode, old_dir_entry, 460 f2fs_set_link(old_inode, old_dir_entry,
459 old_dir_page, new_dir); 461 old_dir_page, new_dir);
462 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
463 update_inode_page(old_inode);
460 } else { 464 } else {
461 kunmap(old_dir_page); 465 kunmap(old_dir_page);
462 f2fs_put_page(old_dir_page, 0); 466 f2fs_put_page(old_dir_page, 0);
463 } 467 }
464 drop_nlink(old_dir); 468 drop_nlink(old_dir);
469 mark_inode_dirty(old_dir);
465 update_inode_page(old_dir); 470 update_inode_page(old_dir);
466 } 471 }
467 472
@@ -496,6 +501,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
496 .getattr = f2fs_getattr, 501 .getattr = f2fs_getattr,
497 .setattr = f2fs_setattr, 502 .setattr = f2fs_setattr,
498 .get_acl = f2fs_get_acl, 503 .get_acl = f2fs_get_acl,
504 .set_acl = f2fs_set_acl,
499#ifdef CONFIG_F2FS_FS_XATTR 505#ifdef CONFIG_F2FS_FS_XATTR
500 .setxattr = generic_setxattr, 506 .setxattr = generic_setxattr,
501 .getxattr = generic_getxattr, 507 .getxattr = generic_getxattr,
@@ -522,6 +528,7 @@ const struct inode_operations f2fs_special_inode_operations = {
522 .getattr = f2fs_getattr, 528 .getattr = f2fs_getattr,
523 .setattr = f2fs_setattr, 529 .setattr = f2fs_setattr,
524 .get_acl = f2fs_get_acl, 530 .get_acl = f2fs_get_acl,
531 .set_acl = f2fs_set_acl,
525#ifdef CONFIG_F2FS_FS_XATTR 532#ifdef CONFIG_F2FS_FS_XATTR
526 .setxattr = generic_setxattr, 533 .setxattr = generic_setxattr,
527 .getxattr = generic_getxattr, 534 .getxattr = generic_getxattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4ac4150d421d..b0649b76eb4f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
87 */ 87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) 88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{ 89{
90 struct address_space *mapping = sbi->meta_inode->i_mapping; 90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi); 91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct blk_plug plug;
93 struct page *page; 92 struct page *page;
94 pgoff_t index; 93 pgoff_t index;
95 int i; 94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
96 99
97 blk_start_plug(&plug);
98 100
99 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { 101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
100 if (nid >= nm_i->max_nid) 102 if (unlikely(nid >= nm_i->max_nid))
101 nid = 0; 103 nid = 0;
102 index = current_nat_addr(sbi, nid); 104 index = current_nat_addr(sbi, nid);
103 105
@@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
105 if (!page) 107 if (!page)
106 continue; 108 continue;
107 if (PageUptodate(page)) { 109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
108 f2fs_put_page(page, 1); 111 f2fs_put_page(page, 1);
109 continue; 112 continue;
110 } 113 }
111 if (f2fs_readpage(sbi, page, index, READ)) 114 f2fs_submit_page_mbio(sbi, page, index, &fio);
112 continue; 115 mark_page_accessed(page);
113
114 f2fs_put_page(page, 0); 116 f2fs_put_page(page, 0);
115 } 117 }
116 blk_finish_plug(&plug); 118 f2fs_submit_merged_bio(sbi, META, READ);
117} 119}
118 120
119static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
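The readahead above now follows the merged-bio pattern used throughout this series: describe the I/O once in an f2fs_io_info, queue each page with f2fs_submit_page_mbio(), then flush whatever is still pending with f2fs_submit_merged_bio(). A condensed sketch of the pattern (illustration only, variable names assumed):

struct page *page;
struct f2fs_io_info fio = {
	.type = META,
	.rw = READ_SYNC | REQ_META | REQ_PRIO
};
int i;

for (i = 0; i < nrpages; i++) {
	page = grab_cache_page(META_MAPPING(sbi), index + i);
	if (!page)
		continue;
	/* queued in the shared per-type bio, merged with contiguous I/O */
	f2fs_submit_page_mbio(sbi, page, index + i, &fio);
	f2fs_put_page(page, 0);
}
/* flush whatever is still sitting in the META read bio */
f2fs_submit_merged_bio(sbi, META, READ);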
@@ -391,8 +393,8 @@ got:
391 393
392/* 394/*
393 * Caller should call f2fs_put_dnode(dn). 395 * Caller should call f2fs_put_dnode(dn).
394 * Also, it should grab and release a mutex by calling mutex_lock_op() and 396 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
 395 * mutex_unlock_op() only if ro is not set RDONLY_NODE. 397 * f2fs_unlock_op() only if the mode is not set to RDONLY_NODE.
396 * In the case of RDONLY_NODE, we don't need to care about mutex. 398 * In the case of RDONLY_NODE, we don't need to care about mutex.
397 */ 399 */
398int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 400int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
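A typical caller honoring this contract has the shape below (sketch; it mirrors the do_recover_data() hunk later in this patch):

struct dnode_of_data dn;
int err;

f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err) {
	f2fs_unlock_op(sbi);
	return err;
}
/* ... use dn.node_page and dn.data_blkaddr under the rwsem ... */
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);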
@@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn)
502 504
503 /* Deallocate node address */ 505 /* Deallocate node address */
504 invalidate_blocks(sbi, ni.blk_addr); 506 invalidate_blocks(sbi, ni.blk_addr);
505 dec_valid_node_count(sbi, dn->inode, 1); 507 dec_valid_node_count(sbi, dn->inode);
506 set_node_addr(sbi, &ni, NULL_ADDR); 508 set_node_addr(sbi, &ni, NULL_ADDR);
507 509
508 if (dn->nid == dn->inode->i_ino) { 510 if (dn->nid == dn->inode->i_ino) {
@@ -516,6 +518,10 @@ invalidate:
516 F2FS_SET_SB_DIRT(sbi); 518 F2FS_SET_SB_DIRT(sbi);
517 519
518 f2fs_put_page(dn->node_page, 1); 520 f2fs_put_page(dn->node_page, 1);
521
522 invalidate_mapping_pages(NODE_MAPPING(sbi),
523 dn->node_page->index, dn->node_page->index);
524
519 dn->node_page = NULL; 525 dn->node_page = NULL;
520 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); 526 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
521} 527}
@@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
631 return 0; 637 return 0;
632 638
633 /* get indirect nodes in the path */ 639 /* get indirect nodes in the path */
634 for (i = 0; i < depth - 1; i++) { 640 for (i = 0; i < idx + 1; i++) {
 635 /* reference count will be increased */ 641 /* reference count will be increased */
636 pages[i] = get_node_page(sbi, nid[i]); 642 pages[i] = get_node_page(sbi, nid[i]);
637 if (IS_ERR(pages[i])) { 643 if (IS_ERR(pages[i])) {
638 depth = i + 1;
639 err = PTR_ERR(pages[i]); 644 err = PTR_ERR(pages[i]);
645 idx = i - 1;
640 goto fail; 646 goto fail;
641 } 647 }
642 nid[i + 1] = get_nid(pages[i], offset[i + 1], false); 648 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
643 } 649 }
644 650
645 /* free direct nodes linked to a partial indirect node */ 651 /* free direct nodes linked to a partial indirect node */
646 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { 652 for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
647 child_nid = get_nid(pages[idx], i, false); 653 child_nid = get_nid(pages[idx], i, false);
648 if (!child_nid) 654 if (!child_nid)
649 continue; 655 continue;
@@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
654 set_nid(pages[idx], i, 0, false); 660 set_nid(pages[idx], i, 0, false);
655 } 661 }
656 662
657 if (offset[depth - 1] == 0) { 663 if (offset[idx + 1] == 0) {
658 dn->node_page = pages[idx]; 664 dn->node_page = pages[idx];
659 dn->nid = nid[idx]; 665 dn->nid = nid[idx];
660 truncate_node(dn); 666 truncate_node(dn);
@@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
662 f2fs_put_page(pages[idx], 1); 668 f2fs_put_page(pages[idx], 1);
663 } 669 }
664 offset[idx]++; 670 offset[idx]++;
665 offset[depth - 1] = 0; 671 offset[idx + 1] = 0;
672 idx--;
666fail: 673fail:
667 for (i = depth - 3; i >= 0; i--) 674 for (i = idx; i >= 0; i--)
668 f2fs_put_page(pages[i], 1); 675 f2fs_put_page(pages[i], 1);
669 676
670 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); 677 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -678,11 +685,10 @@ fail:
678int truncate_inode_blocks(struct inode *inode, pgoff_t from) 685int truncate_inode_blocks(struct inode *inode, pgoff_t from)
679{ 686{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 687 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
681 struct address_space *node_mapping = sbi->node_inode->i_mapping;
682 int err = 0, cont = 1; 688 int err = 0, cont = 1;
683 int level, offset[4], noffset[4]; 689 int level, offset[4], noffset[4];
684 unsigned int nofs = 0; 690 unsigned int nofs = 0;
685 struct f2fs_node *rn; 691 struct f2fs_inode *ri;
686 struct dnode_of_data dn; 692 struct dnode_of_data dn;
687 struct page *page; 693 struct page *page;
688 694
@@ -699,7 +705,7 @@ restart:
699 set_new_dnode(&dn, inode, page, NULL, 0); 705 set_new_dnode(&dn, inode, page, NULL, 0);
700 unlock_page(page); 706 unlock_page(page);
701 707
702 rn = F2FS_NODE(page); 708 ri = F2FS_INODE(page);
703 switch (level) { 709 switch (level) {
704 case 0: 710 case 0:
705 case 1: 711 case 1:
@@ -709,7 +715,7 @@ restart:
709 nofs = noffset[1]; 715 nofs = noffset[1];
710 if (!offset[level - 1]) 716 if (!offset[level - 1])
711 goto skip_partial; 717 goto skip_partial;
712 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 718 err = truncate_partial_nodes(&dn, ri, offset, level);
713 if (err < 0 && err != -ENOENT) 719 if (err < 0 && err != -ENOENT)
714 goto fail; 720 goto fail;
715 nofs += 1 + NIDS_PER_BLOCK; 721 nofs += 1 + NIDS_PER_BLOCK;
@@ -718,7 +724,7 @@ restart:
718 nofs = 5 + 2 * NIDS_PER_BLOCK; 724 nofs = 5 + 2 * NIDS_PER_BLOCK;
719 if (!offset[level - 1]) 725 if (!offset[level - 1])
720 goto skip_partial; 726 goto skip_partial;
721 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 727 err = truncate_partial_nodes(&dn, ri, offset, level);
722 if (err < 0 && err != -ENOENT) 728 if (err < 0 && err != -ENOENT)
723 goto fail; 729 goto fail;
724 break; 730 break;
@@ -728,7 +734,7 @@ restart:
728 734
729skip_partial: 735skip_partial:
730 while (cont) { 736 while (cont) {
731 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); 737 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
732 switch (offset[0]) { 738 switch (offset[0]) {
733 case NODE_DIR1_BLOCK: 739 case NODE_DIR1_BLOCK:
734 case NODE_DIR2_BLOCK: 740 case NODE_DIR2_BLOCK:
@@ -751,14 +757,14 @@ skip_partial:
751 if (err < 0 && err != -ENOENT) 757 if (err < 0 && err != -ENOENT)
752 goto fail; 758 goto fail;
753 if (offset[1] == 0 && 759 if (offset[1] == 0 &&
754 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { 760 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
755 lock_page(page); 761 lock_page(page);
756 if (page->mapping != node_mapping) { 762 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
757 f2fs_put_page(page, 1); 763 f2fs_put_page(page, 1);
758 goto restart; 764 goto restart;
759 } 765 }
760 wait_on_page_writeback(page); 766 wait_on_page_writeback(page);
761 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
762 set_page_dirty(page); 768 set_page_dirty(page);
763 unlock_page(page); 769 unlock_page(page);
764 } 770 }
@@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
794 set_new_dnode(&dn, inode, page, npage, nid); 800 set_new_dnode(&dn, inode, page, npage, nid);
795 801
796 if (page) 802 if (page)
797 dn.inode_page_locked = 1; 803 dn.inode_page_locked = true;
798 truncate_node(&dn); 804 truncate_node(&dn);
799 return 0; 805 return 0;
800} 806}
801 807
802/* 808/*
803 * Caller should grab and release a mutex by calling mutex_lock_op() and 809 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
804 * mutex_unlock_op(). 810 * f2fs_unlock_op().
805 */ 811 */
806int remove_inode_page(struct inode *inode) 812void remove_inode_page(struct inode *inode)
807{ 813{
808 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 814 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
809 struct page *page; 815 struct page *page;
810 nid_t ino = inode->i_ino; 816 nid_t ino = inode->i_ino;
811 struct dnode_of_data dn; 817 struct dnode_of_data dn;
812 int err;
813 818
814 page = get_node_page(sbi, ino); 819 page = get_node_page(sbi, ino);
815 if (IS_ERR(page)) 820 if (IS_ERR(page))
816 return PTR_ERR(page); 821 return;
817 822
818 err = truncate_xattr_node(inode, page); 823 if (truncate_xattr_node(inode, page)) {
819 if (err) {
820 f2fs_put_page(page, 1); 824 f2fs_put_page(page, 1);
821 return err; 825 return;
822 } 826 }
823
 824 /* 0 is possible, after f2fs_new_inode() fails */ 827 /* 0 is possible, after f2fs_new_inode() fails */
825 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 828 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
826 set_new_dnode(&dn, inode, page, page, ino); 829 set_new_dnode(&dn, inode, page, page, ino);
827 truncate_node(&dn); 830 truncate_node(&dn);
828 return 0;
829} 831}
830 832
831struct page *new_inode_page(struct inode *inode, const struct qstr *name) 833struct page *new_inode_page(struct inode *inode, const struct qstr *name)
@@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
843 unsigned int ofs, struct page *ipage) 845 unsigned int ofs, struct page *ipage)
844{ 846{
845 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 847 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
846 struct address_space *mapping = sbi->node_inode->i_mapping;
847 struct node_info old_ni, new_ni; 848 struct node_info old_ni, new_ni;
848 struct page *page; 849 struct page *page;
849 int err; 850 int err;
850 851
851 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
852 return ERR_PTR(-EPERM); 853 return ERR_PTR(-EPERM);
853 854
854 page = grab_cache_page(mapping, dn->nid); 855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
855 if (!page) 856 if (!page)
856 return ERR_PTR(-ENOMEM); 857 return ERR_PTR(-ENOMEM);
857 858
858 if (!inc_valid_node_count(sbi, dn->inode, 1)) { 859 if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
859 err = -ENOSPC; 860 err = -ENOSPC;
860 goto fail; 861 goto fail;
861 } 862 }
@@ -898,14 +899,14 @@ fail:
898 * LOCKED_PAGE: f2fs_put_page(page, 1) 899 * LOCKED_PAGE: f2fs_put_page(page, 1)
899 * error: nothing 900 * error: nothing
900 */ 901 */
901static int read_node_page(struct page *page, int type) 902static int read_node_page(struct page *page, int rw)
902{ 903{
903 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 904 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
904 struct node_info ni; 905 struct node_info ni;
905 906
906 get_node_info(sbi, page->index, &ni); 907 get_node_info(sbi, page->index, &ni);
907 908
908 if (ni.blk_addr == NULL_ADDR) { 909 if (unlikely(ni.blk_addr == NULL_ADDR)) {
909 f2fs_put_page(page, 1); 910 f2fs_put_page(page, 1);
910 return -ENOENT; 911 return -ENOENT;
911 } 912 }
@@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type)
913 if (PageUptodate(page)) 914 if (PageUptodate(page))
914 return LOCKED_PAGE; 915 return LOCKED_PAGE;
915 916
916 return f2fs_readpage(sbi, page, ni.blk_addr, type); 917 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
917} 918}
918 919
919/* 920/*
@@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type)
921 */ 922 */
922void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) 923void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
923{ 924{
924 struct address_space *mapping = sbi->node_inode->i_mapping;
925 struct page *apage; 925 struct page *apage;
926 int err; 926 int err;
927 927
928 apage = find_get_page(mapping, nid); 928 apage = find_get_page(NODE_MAPPING(sbi), nid);
929 if (apage && PageUptodate(apage)) { 929 if (apage && PageUptodate(apage)) {
930 f2fs_put_page(apage, 0); 930 f2fs_put_page(apage, 0);
931 return; 931 return;
932 } 932 }
933 f2fs_put_page(apage, 0); 933 f2fs_put_page(apage, 0);
934 934
935 apage = grab_cache_page(mapping, nid); 935 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
936 if (!apage) 936 if (!apage)
937 return; 937 return;
938 938
@@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
945 945
946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) 946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
947{ 947{
948 struct address_space *mapping = sbi->node_inode->i_mapping;
949 struct page *page; 948 struct page *page;
950 int err; 949 int err;
951repeat: 950repeat:
952 page = grab_cache_page(mapping, nid); 951 page = grab_cache_page(NODE_MAPPING(sbi), nid);
953 if (!page) 952 if (!page)
954 return ERR_PTR(-ENOMEM); 953 return ERR_PTR(-ENOMEM);
955 954
@@ -960,11 +959,11 @@ repeat:
960 goto got_it; 959 goto got_it;
961 960
962 lock_page(page); 961 lock_page(page);
963 if (!PageUptodate(page)) { 962 if (unlikely(!PageUptodate(page))) {
964 f2fs_put_page(page, 1); 963 f2fs_put_page(page, 1);
965 return ERR_PTR(-EIO); 964 return ERR_PTR(-EIO);
966 } 965 }
967 if (page->mapping != mapping) { 966 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
968 f2fs_put_page(page, 1); 967 f2fs_put_page(page, 1);
969 goto repeat; 968 goto repeat;
970 } 969 }
@@ -981,7 +980,6 @@ got_it:
981struct page *get_node_page_ra(struct page *parent, int start) 980struct page *get_node_page_ra(struct page *parent, int start)
982{ 981{
983 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 982 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
984 struct address_space *mapping = sbi->node_inode->i_mapping;
985 struct blk_plug plug; 983 struct blk_plug plug;
986 struct page *page; 984 struct page *page;
987 int err, i, end; 985 int err, i, end;
@@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start)
992 if (!nid) 990 if (!nid)
993 return ERR_PTR(-ENOENT); 991 return ERR_PTR(-ENOENT);
994repeat: 992repeat:
995 page = grab_cache_page(mapping, nid); 993 page = grab_cache_page(NODE_MAPPING(sbi), nid);
996 if (!page) 994 if (!page)
997 return ERR_PTR(-ENOMEM); 995 return ERR_PTR(-ENOMEM);
998 996
@@ -1017,12 +1015,12 @@ repeat:
1017 blk_finish_plug(&plug); 1015 blk_finish_plug(&plug);
1018 1016
1019 lock_page(page); 1017 lock_page(page);
1020 if (page->mapping != mapping) { 1018 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1021 f2fs_put_page(page, 1); 1019 f2fs_put_page(page, 1);
1022 goto repeat; 1020 goto repeat;
1023 } 1021 }
1024page_hit: 1022page_hit:
1025 if (!PageUptodate(page)) { 1023 if (unlikely(!PageUptodate(page))) {
1026 f2fs_put_page(page, 1); 1024 f2fs_put_page(page, 1);
1027 return ERR_PTR(-EIO); 1025 return ERR_PTR(-EIO);
1028 } 1026 }
@@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn)
1048int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1046int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1049 struct writeback_control *wbc) 1047 struct writeback_control *wbc)
1050{ 1048{
1051 struct address_space *mapping = sbi->node_inode->i_mapping;
1052 pgoff_t index, end; 1049 pgoff_t index, end;
1053 struct pagevec pvec; 1050 struct pagevec pvec;
1054 int step = ino ? 2 : 0; 1051 int step = ino ? 2 : 0;
@@ -1062,7 +1059,7 @@ next_step:
1062 1059
1063 while (index <= end) { 1060 while (index <= end) {
1064 int i, nr_pages; 1061 int i, nr_pages;
1065 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1062 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1066 PAGECACHE_TAG_DIRTY, 1063 PAGECACHE_TAG_DIRTY,
1067 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1064 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1068 if (nr_pages == 0) 1065 if (nr_pages == 0)
@@ -1095,7 +1092,7 @@ next_step:
1095 else if (!trylock_page(page)) 1092 else if (!trylock_page(page))
1096 continue; 1093 continue;
1097 1094
1098 if (unlikely(page->mapping != mapping)) { 1095 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1099continue_unlock: 1096continue_unlock:
1100 unlock_page(page); 1097 unlock_page(page);
1101 continue; 1098 continue;
@@ -1122,7 +1119,7 @@ continue_unlock:
1122 set_fsync_mark(page, 0); 1119 set_fsync_mark(page, 0);
1123 set_dentry_mark(page, 0); 1120 set_dentry_mark(page, 0);
1124 } 1121 }
1125 mapping->a_ops->writepage(page, wbc); 1122 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1126 wrote++; 1123 wrote++;
1127 1124
1128 if (--wbc->nr_to_write == 0) 1125 if (--wbc->nr_to_write == 0)
@@ -1143,31 +1140,31 @@ continue_unlock:
1143 } 1140 }
1144 1141
1145 if (wrote) 1142 if (wrote)
1146 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); 1143 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1147
1148 return nwritten; 1144 return nwritten;
1149} 1145}
1150 1146
1151int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1147int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1152{ 1148{
1153 struct address_space *mapping = sbi->node_inode->i_mapping;
1154 pgoff_t index = 0, end = LONG_MAX; 1149 pgoff_t index = 0, end = LONG_MAX;
1155 struct pagevec pvec; 1150 struct pagevec pvec;
1156 int nr_pages;
1157 int ret2 = 0, ret = 0; 1151 int ret2 = 0, ret = 0;
1158 1152
1159 pagevec_init(&pvec, 0); 1153 pagevec_init(&pvec, 0);
1160 while ((index <= end) && 1154
1161 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1155 while (index <= end) {
1162 PAGECACHE_TAG_WRITEBACK, 1156 int i, nr_pages;
1163 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 1157 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1164 unsigned i; 1158 PAGECACHE_TAG_WRITEBACK,
1159 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1160 if (nr_pages == 0)
1161 break;
1165 1162
1166 for (i = 0; i < nr_pages; i++) { 1163 for (i = 0; i < nr_pages; i++) {
1167 struct page *page = pvec.pages[i]; 1164 struct page *page = pvec.pages[i];
1168 1165
1169 /* until radix tree lookup accepts end_index */ 1166 /* until radix tree lookup accepts end_index */
1170 if (page->index > end) 1167 if (unlikely(page->index > end))
1171 continue; 1168 continue;
1172 1169
1173 if (ino && ino_of_node(page) == ino) { 1170 if (ino && ino_of_node(page) == ino) {
@@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1180 cond_resched(); 1177 cond_resched();
1181 } 1178 }
1182 1179
1183 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 1180 if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1184 ret2 = -ENOSPC; 1181 ret2 = -ENOSPC;
1185 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 1182 if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1186 ret2 = -EIO; 1183 ret2 = -EIO;
1187 if (!ret) 1184 if (!ret)
1188 ret = ret2; 1185 ret = ret2;
@@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page,
1196 nid_t nid; 1193 nid_t nid;
1197 block_t new_addr; 1194 block_t new_addr;
1198 struct node_info ni; 1195 struct node_info ni;
1196 struct f2fs_io_info fio = {
1197 .type = NODE,
1198 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
1199 };
1199 1200
1200 if (sbi->por_doing) 1201 if (unlikely(sbi->por_doing))
1201 goto redirty_out; 1202 goto redirty_out;
1202 1203
1203 wait_on_page_writeback(page); 1204 wait_on_page_writeback(page);
@@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page,
1209 get_node_info(sbi, nid, &ni); 1210 get_node_info(sbi, nid, &ni);
1210 1211
1211 /* This page is already truncated */ 1212 /* This page is already truncated */
1212 if (ni.blk_addr == NULL_ADDR) { 1213 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1213 dec_page_count(sbi, F2FS_DIRTY_NODES); 1214 dec_page_count(sbi, F2FS_DIRTY_NODES);
1214 unlock_page(page); 1215 unlock_page(page);
1215 return 0; 1216 return 0;
@@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page,
1220 1221
1221 mutex_lock(&sbi->node_write); 1222 mutex_lock(&sbi->node_write);
1222 set_page_writeback(page); 1223 set_page_writeback(page);
1223 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); 1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1224 set_node_addr(sbi, &ni, new_addr); 1225 set_node_addr(sbi, &ni, new_addr);
1225 dec_page_count(sbi, F2FS_DIRTY_NODES); 1226 dec_page_count(sbi, F2FS_DIRTY_NODES);
1226 mutex_unlock(&sbi->node_write); 1227 mutex_unlock(&sbi->node_write);
@@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1255 1256
 1256 /* if mounting failed, skip writing node pages */ 1257 /* if mounting failed, skip writing node pages */
1257 wbc->nr_to_write = 3 * max_hw_blocks(sbi); 1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE;
1258 sync_node_pages(sbi, 0, wbc); 1260 sync_node_pages(sbi, 0, wbc);
1259 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1260 wbc->nr_to_write); 1262 wbc->nr_to_write);
@@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1333 return -1; 1335 return -1;
1334 1336
1335 /* 0 nid should not be used */ 1337 /* 0 nid should not be used */
1336 if (nid == 0) 1338 if (unlikely(nid == 0))
1337 return 0; 1339 return 0;
1338 1340
1339 if (build) { 1341 if (build) {
@@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
1386 1388
1387 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { 1389 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1388 1390
1389 if (start_nid >= nm_i->max_nid) 1391 if (unlikely(start_nid >= nm_i->max_nid))
1390 break; 1392 break;
1391 1393
1392 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1394 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
@@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1420 f2fs_put_page(page, 1); 1422 f2fs_put_page(page, 1);
1421 1423
1422 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); 1424 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1423 if (nid >= nm_i->max_nid) 1425 if (unlikely(nid >= nm_i->max_nid))
1424 nid = 0; 1426 nid = 0;
1425 1427
1426 if (i++ == FREE_NID_PAGES) 1428 if (i++ == FREE_NID_PAGES)
@@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454 struct free_nid *i = NULL; 1456 struct free_nid *i = NULL;
1455 struct list_head *this; 1457 struct list_head *this;
1456retry: 1458retry:
1457 if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) 1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1458 return false; 1460 return false;
1459 1461
1460 spin_lock(&nm_i->free_nid_list_lock); 1462 spin_lock(&nm_i->free_nid_list_lock);
@@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1535 1537
1536int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1537{ 1539{
1538 struct address_space *mapping = sbi->node_inode->i_mapping; 1540 struct f2fs_inode *src, *dst;
1539 struct f2fs_node *src, *dst;
1540 nid_t ino = ino_of_node(page); 1541 nid_t ino = ino_of_node(page);
1541 struct node_info old_ni, new_ni; 1542 struct node_info old_ni, new_ni;
1542 struct page *ipage; 1543 struct page *ipage;
1543 1544
1544 ipage = grab_cache_page(mapping, ino); 1545 ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
1545 if (!ipage) 1546 if (!ipage)
1546 return -ENOMEM; 1547 return -ENOMEM;
1547 1548
@@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1552 SetPageUptodate(ipage); 1553 SetPageUptodate(ipage);
1553 fill_node_footer(ipage, ino, ino, 0, true); 1554 fill_node_footer(ipage, ino, ino, 0, true);
1554 1555
1555 src = F2FS_NODE(page); 1556 src = F2FS_INODE(page);
1556 dst = F2FS_NODE(ipage); 1557 dst = F2FS_INODE(ipage);
1557 1558
1558 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); 1559 memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
1559 dst->i.i_size = 0; 1560 dst->i_size = 0;
1560 dst->i.i_blocks = cpu_to_le64(1); 1561 dst->i_blocks = cpu_to_le64(1);
1561 dst->i.i_links = cpu_to_le32(1); 1562 dst->i_links = cpu_to_le32(1);
1562 dst->i.i_xattr_nid = 0; 1563 dst->i_xattr_nid = 0;
1563 1564
1564 new_ni = old_ni; 1565 new_ni = old_ni;
1565 new_ni.ino = ino; 1566 new_ni.ino = ino;
1566 1567
1567 if (!inc_valid_node_count(sbi, NULL, 1)) 1568 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1568 WARN_ON(1); 1569 WARN_ON(1);
1569 set_node_addr(sbi, &new_ni, NEW_ADDR); 1570 set_node_addr(sbi, &new_ni, NEW_ADDR);
1570 inc_valid_inode_count(sbi); 1571 inc_valid_inode_count(sbi);
@@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1572 return 0; 1573 return 0;
1573} 1574}
1574 1575
1576/*
 1577 * ra_sum_pages() merges contiguous pages into one bio and submits it.
 1578 * These pre-read pages are linked in the pages list.
1579 */
1580static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1581 int start, int nrpages)
1582{
1583 struct page *page;
1584 int page_idx = start;
1585 struct f2fs_io_info fio = {
1586 .type = META,
1587 .rw = READ_SYNC | REQ_META | REQ_PRIO
1588 };
1589
1590 for (; page_idx < start + nrpages; page_idx++) {
 1591 /* allocate a temporary page to read node summary info */
1592 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) {
1594 struct page *tmp;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602
1603 lock_page(page);
1604 page->index = page_idx;
1605 list_add_tail(&page->lru, pages);
1606 }
1607
1608 list_for_each_entry(page, pages, lru)
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610
1611 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0;
1613}
1614
1575int restore_node_summary(struct f2fs_sb_info *sbi, 1615int restore_node_summary(struct f2fs_sb_info *sbi,
1576 unsigned int segno, struct f2fs_summary_block *sum) 1616 unsigned int segno, struct f2fs_summary_block *sum)
1577{ 1617{
1578 struct f2fs_node *rn; 1618 struct f2fs_node *rn;
1579 struct f2fs_summary *sum_entry; 1619 struct f2fs_summary *sum_entry;
1580 struct page *page; 1620 struct page *page, *tmp;
1581 block_t addr; 1621 block_t addr;
1582 int i, last_offset; 1622 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1583 1623 int i, last_offset, nrpages, err = 0;
1584 /* alloc temporal page for read node */ 1624 LIST_HEAD(page_list);
1585 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1586 if (!page)
1587 return -ENOMEM;
1588 lock_page(page);
1589 1625
1590 /* scan the node segment */ 1626 /* scan the node segment */
1591 last_offset = sbi->blocks_per_seg; 1627 last_offset = sbi->blocks_per_seg;
1592 addr = START_BLOCK(sbi, segno); 1628 addr = START_BLOCK(sbi, segno);
1593 sum_entry = &sum->entries[0]; 1629 sum_entry = &sum->entries[0];
1594 1630
1595 for (i = 0; i < last_offset; i++, sum_entry++) { 1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1596 /* 1632 nrpages = min(last_offset - i, bio_blocks);
1597 * In order to read next node page,
1598 * we must clear PageUptodate flag.
1599 */
1600 ClearPageUptodate(page);
1601 1633
1602 if (f2fs_readpage(sbi, page, addr, READ_SYNC)) 1634 /* read ahead node pages */
1603 goto out; 1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err)
1637 return err;
1604 1638
1605 lock_page(page); 1639 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1606 rn = F2FS_NODE(page); 1640
1607 sum_entry->nid = rn->footer.nid; 1641 lock_page(page);
1608 sum_entry->version = 0; 1642 if (unlikely(!PageUptodate(page))) {
1609 sum_entry->ofs_in_node = 0; 1643 err = -EIO;
1610 addr++; 1644 } else {
1645 rn = F2FS_NODE(page);
1646 sum_entry->nid = rn->footer.nid;
1647 sum_entry->version = 0;
1648 sum_entry->ofs_in_node = 0;
1649 sum_entry++;
1650 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page);
1654 __free_pages(page, 0);
1655 }
1611 } 1656 }
1612 unlock_page(page); 1657 return err;
1613out:
1614 __free_pages(page, 0);
1615 return 0;
1616} 1658}
1617 1659
1618static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1660static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3496bb3e15dc..c4c79885c993 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
224 * | `- direct node (5 + N => 5 + 2N - 1) 224 * | `- direct node (5 + N => 5 + 2N - 1)
225 * `- double indirect node (5 + 2N) 225 * `- double indirect node (5 + 2N)
226 * `- indirect node (6 + 2N) 226 * `- indirect node (6 + 2N)
227 * `- direct node (x(N + 1)) 227 * `- direct node
228 * ......
229 * `- indirect node ((6 + 2N) + x(N + 1))
230 * `- direct node
231 * ......
232 * `- indirect node ((6 + 2N) + (N - 1)(N + 1))
233 * `- direct node
228 */ 234 */
229static inline bool IS_DNODE(struct page *node_page) 235static inline bool IS_DNODE(struct page *node_page)
230{ 236{
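With N standing for NIDS_PER_BLOCK, the offsets enumerated in the expanded comment classify as below. The helper is purely illustrative (not in the patch) and simply restates the comment's arithmetic, including the repeat step (6 + 2N) + x(N + 1).

/* Hypothetical classifier for the node offsets listed above. */
static bool offset_is_indirect(unsigned int ofs, unsigned int N)
{
	if (ofs == 3 || ofs == 4 + N)	/* the two single-indirect nodes */
		return true;
	if (ofs == 5 + 2 * N)		/* the double-indirect node */
		return true;
	if (ofs > 5 + 2 * N)		/* its indirect children repeat every N + 1 */
		return (ofs - (6 + 2 * N)) % (N + 1) == 0;
	return false;			/* offset 0 is the inode, the rest are direct nodes */
}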
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fdc81161f254..976a7a934db5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = F2FS_NODE(ipage); 43 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 nid_t pino = le32_to_cpu(raw_inode->i_pino); 44 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 45 struct f2fs_dir_entry *de;
47 struct qstr name; 46 struct qstr name;
@@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 61
63 name.len = le32_to_cpu(raw_inode->i_namelen); 62 name.len = le32_to_cpu(raw_inode->i_namelen);
64 name.name = raw_inode->i_name; 63 name.name = raw_inode->i_name;
64
65 if (unlikely(name.len > F2FS_NAME_LEN)) {
66 WARN_ON(1);
67 err = -ENAMETOOLONG;
68 goto out;
69 }
65retry: 70retry:
66 de = f2fs_find_entry(dir, &name, &page); 71 de = f2fs_find_entry(dir, &name, &page);
67 if (de && inode->i_ino == le32_to_cpu(de->ino)) 72 if (de && inode->i_ino == le32_to_cpu(de->ino))
@@ -90,17 +95,16 @@ out_unmap_put:
90 kunmap(page); 95 kunmap(page);
91 f2fs_put_page(page, 0); 96 f2fs_put_page(page, 0);
92out: 97out:
93 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " 98 f2fs_msg(inode->i_sb, KERN_NOTICE,
94 "ino = %x, name = %s, dir = %lx, err = %d", 99 "%s: ino = %x, name = %s, dir = %lx, err = %d",
95 ino_of_node(ipage), raw_inode->i_name, 100 __func__, ino_of_node(ipage), raw_inode->i_name,
96 IS_ERR(dir) ? 0 : dir->i_ino, err); 101 IS_ERR(dir) ? 0 : dir->i_ino, err);
97 return err; 102 return err;
98} 103}
99 104
100static int recover_inode(struct inode *inode, struct page *node_page) 105static int recover_inode(struct inode *inode, struct page *node_page)
101{ 106{
102 struct f2fs_node *raw_node = F2FS_NODE(node_page); 107 struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
103 struct f2fs_inode *raw_inode = &(raw_node->i);
104 108
105 if (!IS_INODE(node_page)) 109 if (!IS_INODE(node_page))
106 return 0; 110 return 0;
@@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
143 while (1) { 147 while (1) {
144 struct fsync_inode_entry *entry; 148 struct fsync_inode_entry *entry;
145 149
146 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 150 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
147 if (err) 151 if (err)
148 goto out; 152 return err;
149 153
150 lock_page(page); 154 lock_page(page);
151 155
@@ -191,9 +195,10 @@ next:
191 /* check next segment */ 195 /* check next segment */
192 blkaddr = next_blkaddr_of_node(page); 196 blkaddr = next_blkaddr_of_node(page);
193 } 197 }
198
194 unlock_page(page); 199 unlock_page(page);
195out:
196 __free_pages(page, 0); 200 __free_pages(page, 0);
201
197 return err; 202 return err;
198} 203}
199 204
@@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
293 struct node_info ni; 298 struct node_info ni;
294 int err = 0, recovered = 0; 299 int err = 0, recovered = 0;
295 300
301 if (recover_inline_data(inode, page))
302 goto out;
303
296 start = start_bidx_of_node(ofs_of_node(page), fi); 304 start = start_bidx_of_node(ofs_of_node(page), fi);
297 if (IS_INODE(page)) 305 if (IS_INODE(page))
298 end = start + ADDRS_PER_INODE(fi); 306 end = start + ADDRS_PER_INODE(fi);
@@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 end = start + ADDRS_PER_BLOCK; 308 end = start + ADDRS_PER_BLOCK;
301 309
302 f2fs_lock_op(sbi); 310 f2fs_lock_op(sbi);
311
303 set_new_dnode(&dn, inode, NULL, NULL, 0); 312 set_new_dnode(&dn, inode, NULL, NULL, 0);
304 313
305 err = get_dnode_of_data(&dn, start, ALLOC_NODE); 314 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
306 if (err) { 315 if (err) {
307 f2fs_unlock_op(sbi); 316 f2fs_unlock_op(sbi);
308 return err; 317 goto out;
309 } 318 }
310 319
311 wait_on_page_writeback(dn.node_page); 320 wait_on_page_writeback(dn.node_page);
@@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
356err: 365err:
357 f2fs_put_dnode(&dn); 366 f2fs_put_dnode(&dn);
358 f2fs_unlock_op(sbi); 367 f2fs_unlock_op(sbi);
359 368out:
360 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " 369 f2fs_msg(sbi->sb, KERN_NOTICE,
361 "recovered_data = %d blocks, err = %d", 370 "recover_data: ino = %lx, recovered = %d blocks, err = %d",
362 inode->i_ino, recovered, err); 371 inode->i_ino, recovered, err);
363 return err; 372 return err;
364} 373}
365 374
@@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
377 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 386 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
378 387
379 /* read node page */ 388 /* read node page */
380 page = alloc_page(GFP_NOFS | __GFP_ZERO); 389 page = alloc_page(GFP_F2FS_ZERO);
381 if (!page) 390 if (!page)
382 return -ENOMEM; 391 return -ENOMEM;
383 392
@@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
386 while (1) { 395 while (1) {
387 struct fsync_inode_entry *entry; 396 struct fsync_inode_entry *entry;
388 397
389 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 398 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
390 if (err) 399 if (err)
391 goto out; 400 return err;
392 401
393 lock_page(page); 402 lock_page(page);
394 403
@@ -412,8 +421,8 @@ next:
412 /* check next segment */ 421 /* check next segment */
413 blkaddr = next_blkaddr_of_node(page); 422 blkaddr = next_blkaddr_of_node(page);
414 } 423 }
424
415 unlock_page(page); 425 unlock_page(page);
416out:
417 __free_pages(page, 0); 426 __free_pages(page, 0);
418 427
419 if (!err) 428 if (!err)
@@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
429 438
430 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
431 sizeof(struct fsync_inode_entry), NULL); 440 sizeof(struct fsync_inode_entry), NULL);
432 if (unlikely(!fsync_entry_slab)) 441 if (!fsync_entry_slab)
433 return -ENOMEM; 442 return -ENOMEM;
434 443
435 INIT_LIST_HEAD(&inode_list); 444 INIT_LIST_HEAD(&inode_list);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fa284d397199..7caac5f2ca9e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,12 +14,163 @@
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/swap.h>
17 18
18#include "f2fs.h" 19#include "f2fs.h"
19#include "segment.h" 20#include "segment.h"
20#include "node.h" 21#include "node.h"
21#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
22 23
24#define __reverse_ffz(x) __reverse_ffs(~(x))
25
26static struct kmem_cache *discard_entry_slab;
27
28/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
30 * MSB and LSB are reversed in a byte by f2fs_set_bit.
31 */
32static inline unsigned long __reverse_ffs(unsigned long word)
33{
34 int num = 0;
35
36#if BITS_PER_LONG == 64
37 if ((word & 0xffffffff) == 0) {
38 num += 32;
39 word >>= 32;
40 }
41#endif
42 if ((word & 0xffff) == 0) {
43 num += 16;
44 word >>= 16;
45 }
46 if ((word & 0xff) == 0) {
47 num += 8;
48 word >>= 8;
49 }
50 if ((word & 0xf0) == 0)
51 num += 4;
52 else
53 word >>= 4;
54 if ((word & 0xc) == 0)
55 num += 2;
56 else
57 word >>= 2;
58 if ((word & 0x2) == 0)
59 num += 1;
60 return num;
61}
62
63/*
 64 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
65 * f2fs_set_bit makes MSB and LSB reversed in a byte.
66 * Example:
67 * LSB <--> MSB
68 * f2fs_set_bit(0, bitmap) => 0000 0001
69 * f2fs_set_bit(7, bitmap) => 1000 0000
70 */
71static unsigned long __find_rev_next_bit(const unsigned long *addr,
72 unsigned long size, unsigned long offset)
73{
74 const unsigned long *p = addr + BIT_WORD(offset);
75 unsigned long result = offset & ~(BITS_PER_LONG - 1);
76 unsigned long tmp;
77 unsigned long mask, submask;
78 unsigned long quot, rest;
79
80 if (offset >= size)
81 return size;
82
83 size -= result;
84 offset %= BITS_PER_LONG;
85 if (!offset)
86 goto aligned;
87
88 tmp = *(p++);
89 quot = (offset >> 3) << 3;
90 rest = offset & 0x7;
91 mask = ~0UL << quot;
92 submask = (unsigned char)(0xff << rest) >> rest;
93 submask <<= quot;
94 mask &= submask;
95 tmp &= mask;
96 if (size < BITS_PER_LONG)
97 goto found_first;
98 if (tmp)
99 goto found_middle;
100
101 size -= BITS_PER_LONG;
102 result += BITS_PER_LONG;
103aligned:
104 while (size & ~(BITS_PER_LONG-1)) {
105 tmp = *(p++);
106 if (tmp)
107 goto found_middle;
108 result += BITS_PER_LONG;
109 size -= BITS_PER_LONG;
110 }
111 if (!size)
112 return result;
113 tmp = *p;
114found_first:
115 tmp &= (~0UL >> (BITS_PER_LONG - size));
116 if (tmp == 0UL) /* Are any bits set? */
117 return result + size; /* Nope. */
118found_middle:
119 return result + __reverse_ffs(tmp);
120}
121
122static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
123 unsigned long size, unsigned long offset)
124{
125 const unsigned long *p = addr + BIT_WORD(offset);
126 unsigned long result = offset & ~(BITS_PER_LONG - 1);
127 unsigned long tmp;
128 unsigned long mask, submask;
129 unsigned long quot, rest;
130
131 if (offset >= size)
132 return size;
133
134 size -= result;
135 offset %= BITS_PER_LONG;
136 if (!offset)
137 goto aligned;
138
139 tmp = *(p++);
140 quot = (offset >> 3) << 3;
141 rest = offset & 0x7;
142 mask = ~(~0UL << quot);
143 submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
144 submask <<= quot;
145 mask += submask;
146 tmp |= mask;
147 if (size < BITS_PER_LONG)
148 goto found_first;
149 if (~tmp)
150 goto found_middle;
151
152 size -= BITS_PER_LONG;
153 result += BITS_PER_LONG;
154aligned:
155 while (size & ~(BITS_PER_LONG - 1)) {
156 tmp = *(p++);
157 if (~tmp)
158 goto found_middle;
159 result += BITS_PER_LONG;
160 size -= BITS_PER_LONG;
161 }
162 if (!size)
163 return result;
164 tmp = *p;
165
166found_first:
167 tmp |= ~0UL << size;
168 if (tmp == ~0UL) /* Are any bits zero? */
169 return result + size; /* Nope. */
170found_middle:
171 return result + __reverse_ffz(tmp);
172}
173
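A worked illustration of the reversed search order (assumed little-endian test harness, not part of the patch): f2fs_set_bit(0, map) sets the most significant bit of byte 0, which these helpers report as position 0.

unsigned long map[1] = { 0 };

((unsigned char *)map)[0] = 0x80;	/* what f2fs_set_bit(0, map) stores */
/* __find_rev_next_bit(map, BITS_PER_LONG, 0) -> 0 */

((unsigned char *)map)[0] = 0x01;	/* what f2fs_set_bit(7, map) stores */
/* __find_rev_next_bit(map, BITS_PER_LONG, 0) -> 7 */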
23/* 174/*
24 * This function balances dirty node and dentry pages. 175 * This function balances dirty node and dentry pages.
25 * In addition, it controls garbage collection. 176 * In addition, it controls garbage collection.
@@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
116 mutex_unlock(&dirty_i->seglist_lock); 267 mutex_unlock(&dirty_i->seglist_lock);
117} 268}
118 269
270static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
271 block_t blkstart, block_t blklen)
272{
273 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
274 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
275 blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
276 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
277}
278
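Worked example of the block-to-sector conversion (assuming 4 KiB blocks over 512-byte sectors, i.e. log_sectors_per_block == 3):

/* SECTOR_FROM_BLOCK(sbi, 100) == 100 << 3 == 800, so discarding
 * blocks [100, 116) issues a discard for sectors [800, 928). */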
279static void add_discard_addrs(struct f2fs_sb_info *sbi,
280 unsigned int segno, struct seg_entry *se)
281{
282 struct list_head *head = &SM_I(sbi)->discard_list;
283 struct discard_entry *new;
284 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
285 int max_blocks = sbi->blocks_per_seg;
286 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
287 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
288 unsigned long dmap[entries];
289 unsigned int start = 0, end = -1;
290 int i;
291
292 if (!test_opt(sbi, DISCARD))
293 return;
294
295 /* zero block will be discarded through the prefree list */
296 if (!se->valid_blocks || se->valid_blocks == max_blocks)
297 return;
298
299 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
300 for (i = 0; i < entries; i++)
301 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
302
303 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
304 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
305 if (start >= max_blocks)
306 break;
307
308 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
309
310 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
311 INIT_LIST_HEAD(&new->list);
312 new->blkaddr = START_BLOCK(sbi, segno) + start;
313 new->len = end - start;
314
315 list_add_tail(&new->list, head);
316 SM_I(sbi)->nr_discards += end - start;
317 }
318}
319
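The candidate map built above keeps exactly the blocks that were valid at the last checkpoint but have been invalidated since. Per-block truth table (illustration):

/* dmap = (cur ^ ckpt) & ckpt, per block:
 *   cur=1 ckpt=1 -> 0	still valid, keep
 *   cur=0 ckpt=1 -> 1	freed since the checkpoint: discard candidate
 *   cur=1 ckpt=0 -> 0	newly written, keep
 *   cur=0 ckpt=0 -> 0	was never valid here
 */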
119/* 320/*
120 * Should call clear_prefree_segments after checkpoint is done. 321 * Should call clear_prefree_segments after checkpoint is done.
121 */ 322 */
@@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
138 339
139void clear_prefree_segments(struct f2fs_sb_info *sbi) 340void clear_prefree_segments(struct f2fs_sb_info *sbi)
140{ 341{
342 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next;
344 struct discard_entry *entry;
141 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
142 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
143 unsigned int total_segs = TOTAL_SEGS(sbi); 347 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
160 if (!test_opt(sbi, DISCARD)) 364 if (!test_opt(sbi, DISCARD))
161 continue; 365 continue;
162 366
163 blkdev_issue_discard(sbi->sb->s_bdev, 367 f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
164 START_BLOCK(sbi, start) << 368 (end - start) << sbi->log_blocks_per_seg);
165 sbi->log_sectors_per_block,
166 (1 << (sbi->log_sectors_per_block +
167 sbi->log_blocks_per_seg)) * (end - start),
168 GFP_NOFS, 0);
169 } 369 }
170 mutex_unlock(&dirty_i->seglist_lock); 370 mutex_unlock(&dirty_i->seglist_lock);
371
372 /* send small discards */
373 list_for_each_safe(this, next, head) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len;
378 kmem_cache_free(discard_entry_slab, entry);
379 }
171} 380}
172 381
173static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 382static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
459 struct curseg_info *seg, block_t start) 668 struct curseg_info *seg, block_t start)
460{ 669{
461 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 670 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
462 block_t ofs; 671 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
463 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { 672 unsigned long target_map[entries];
464 if (!f2fs_test_bit(ofs, se->ckpt_valid_map) 673 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
465 && !f2fs_test_bit(ofs, se->cur_valid_map)) 674 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
466 break; 675 int i, pos;
467 } 676
468 seg->next_blkoff = ofs; 677 for (i = 0; i < entries; i++)
678 target_map[i] = ckpt_map[i] | cur_map[i];
679
680 pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
681
682 seg->next_blkoff = pos;
469} 683}
470 684
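The rewrite replaces the per-bit loop: a block offset is free only when it is clear in both the checkpoint map and the current map, so OR-ing the two and finding the first zero yields next_blkoff in one pass. Worked example (illustration):

/* ckpt_map: 1 1 0 1 0 ...
 * cur_map:  1 0 1 0 0 ...
 * OR:       1 1 1 1 0 ...  -> first zero at offset 4 becomes next_blkoff
 */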
471/* 685/*
@@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = {
573 .allocate_segment = allocate_segment_by_default, 787 .allocate_segment = allocate_segment_by_default,
574}; 788};
575 789
576static void f2fs_end_io_write(struct bio *bio, int err)
577{
578 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
579 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
580 struct bio_private *p = bio->bi_private;
581
582 do {
583 struct page *page = bvec->bv_page;
584
585 if (--bvec >= bio->bi_io_vec)
586 prefetchw(&bvec->bv_page->flags);
587 if (!uptodate) {
588 SetPageError(page);
589 if (page->mapping)
590 set_bit(AS_EIO, &page->mapping->flags);
591 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
592 p->sbi->sb->s_flags |= MS_RDONLY;
593 }
594 end_page_writeback(page);
595 dec_page_count(p->sbi, F2FS_WRITEBACK);
596 } while (bvec >= bio->bi_io_vec);
597
598 if (p->is_sync)
599 complete(p->wait);
600
601 if (!get_pages(p->sbi, F2FS_WRITEBACK) &&
602 !list_empty(&p->sbi->cp_wait.task_list))
603 wake_up(&p->sbi->cp_wait);
604
605 kfree(p);
606 bio_put(bio);
607}
608
609struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
610{
611 struct bio *bio;
612
613 /* No failure on bio allocation */
614 bio = bio_alloc(GFP_NOIO, npages);
615 bio->bi_bdev = bdev;
616 bio->bi_private = NULL;
617
618 return bio;
619}
620
621static void do_submit_bio(struct f2fs_sb_info *sbi,
622 enum page_type type, bool sync)
623{
624 int rw = sync ? WRITE_SYNC : WRITE;
625 enum page_type btype = type > META ? META : type;
626
627 if (type >= META_FLUSH)
628 rw = WRITE_FLUSH_FUA;
629
630 if (btype == META)
631 rw |= REQ_META;
632
633 if (sbi->bio[btype]) {
634 struct bio_private *p = sbi->bio[btype]->bi_private;
635 p->sbi = sbi;
636 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
637
638 trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
639
640 if (type == META_FLUSH) {
641 DECLARE_COMPLETION_ONSTACK(wait);
642 p->is_sync = true;
643 p->wait = &wait;
644 submit_bio(rw, sbi->bio[btype]);
645 wait_for_completion(&wait);
646 } else {
647 p->is_sync = false;
648 submit_bio(rw, sbi->bio[btype]);
649 }
650 sbi->bio[btype] = NULL;
651 }
652}
653
654void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
655{
656 down_write(&sbi->bio_sem);
657 do_submit_bio(sbi, type, sync);
658 up_write(&sbi->bio_sem);
659}
660
661static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
662 block_t blk_addr, enum page_type type)
663{
664 struct block_device *bdev = sbi->sb->s_bdev;
665 int bio_blocks;
666
667 verify_block_addr(sbi, blk_addr);
668
669 down_write(&sbi->bio_sem);
670
671 inc_page_count(sbi, F2FS_WRITEBACK);
672
673 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
674 do_submit_bio(sbi, type, false);
675alloc_new:
676 if (sbi->bio[type] == NULL) {
677 struct bio_private *priv;
678retry:
679 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
680 if (!priv) {
681 cond_resched();
682 goto retry;
683 }
684
685 bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
686 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks);
687 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
688 sbi->bio[type]->bi_private = priv;
689 /*
690 * The end_io will be assigned at the sumbission phase.
691 * Until then, let bio_add_page() merge consecutive IOs as much
692 * as possible.
693 */
694 }
695
696 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
697 PAGE_CACHE_SIZE) {
698 do_submit_bio(sbi, type, false);
699 goto alloc_new;
700 }
701
702 sbi->last_block_in_bio[type] = blk_addr;
703
704 up_write(&sbi->bio_sem);
705 trace_f2fs_submit_write_page(page, blk_addr, type);
706}
707
708void f2fs_wait_on_page_writeback(struct page *page,
709 enum page_type type, bool sync)
710{
711 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
712 if (PageWriteback(page)) {
713 f2fs_submit_bio(sbi, type, sync);
714 wait_on_page_writeback(page);
715 }
716}
717
718static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 790static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
719{ 791{
720 struct curseg_info *curseg = CURSEG_I(sbi, type); 792 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
782 return __get_segment_type_6(page, p_type); 854 return __get_segment_type_6(page, p_type);
783} 855}
784 856
785static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 857void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
786 block_t old_blkaddr, block_t *new_blkaddr, 858 block_t old_blkaddr, block_t *new_blkaddr,
787 struct f2fs_summary *sum, enum page_type p_type) 859 struct f2fs_summary *sum, int type)
788{ 860{
789 struct sit_info *sit_i = SIT_I(sbi); 861 struct sit_info *sit_i = SIT_I(sbi);
790 struct curseg_info *curseg; 862 struct curseg_info *curseg;
791 unsigned int old_cursegno; 863 unsigned int old_cursegno;
792 int type;
793 864
794 type = __get_segment_type(page, p_type);
795 curseg = CURSEG_I(sbi, type); 865 curseg = CURSEG_I(sbi, type);
796 866
797 mutex_lock(&curseg->curseg_mutex); 867 mutex_lock(&curseg->curseg_mutex);
@@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
824 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
825 mutex_unlock(&sit_i->sentry_lock); 895 mutex_unlock(&sit_i->sentry_lock);
826 896
827 if (p_type == NODE) 897 if (page && IS_NODESEG(type))
828 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); 898 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
829 899
830 /* writeout dirty page into bdev */
831 submit_write_page(sbi, page, *new_blkaddr, p_type);
832
833 mutex_unlock(&curseg->curseg_mutex); 900 mutex_unlock(&curseg->curseg_mutex);
834} 901}
835 902
903static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
904 block_t old_blkaddr, block_t *new_blkaddr,
905 struct f2fs_summary *sum, struct f2fs_io_info *fio)
906{
907 int type = __get_segment_type(page, fio->type);
908
909 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
910
911 /* writeout dirty page into bdev */
912 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
913}
914
836void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 915void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
837{ 916{
917 struct f2fs_io_info fio = {
918 .type = META,
919 .rw = WRITE_SYNC | REQ_META | REQ_PRIO
920 };
921
838 set_page_writeback(page); 922 set_page_writeback(page);
839 submit_write_page(sbi, page, page->index, META); 923 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
840} 924}
841 925
842void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 926void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
927 struct f2fs_io_info *fio,
843 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) 928 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
844{ 929{
845 struct f2fs_summary sum; 930 struct f2fs_summary sum;
846 set_summary(&sum, nid, 0, 0); 931 set_summary(&sum, nid, 0, 0);
847 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); 932 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
848} 933}
849 934
850void write_data_page(struct inode *inode, struct page *page, 935void write_data_page(struct page *page, struct dnode_of_data *dn,
851 struct dnode_of_data *dn, block_t old_blkaddr, 936 block_t *new_blkaddr, struct f2fs_io_info *fio)
852 block_t *new_blkaddr)
853{ 937{
854 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 938 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
855 struct f2fs_summary sum; 939 struct f2fs_summary sum;
856 struct node_info ni; 940 struct node_info ni;
857 941
858 f2fs_bug_on(old_blkaddr == NULL_ADDR); 942 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
859 get_node_info(sbi, dn->nid, &ni); 943 get_node_info(sbi, dn->nid, &ni);
860 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 944 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
861 945
862 do_write_page(sbi, page, old_blkaddr, 946 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
863 new_blkaddr, &sum, DATA);
864} 947}
865 948
866void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, 949void rewrite_data_page(struct page *page, block_t old_blkaddr,
867 block_t old_blk_addr) 950 struct f2fs_io_info *fio)
868{ 951{
869 submit_write_page(sbi, page, old_blk_addr, DATA); 952 struct inode *inode = page->mapping->host;
953 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
954 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
870} 955}
871 956
872void recover_data_page(struct f2fs_sb_info *sbi, 957void recover_data_page(struct f2fs_sb_info *sbi,
@@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
925 unsigned int segno, old_cursegno; 1010 unsigned int segno, old_cursegno;
926 block_t next_blkaddr = next_blkaddr_of_node(page); 1011 block_t next_blkaddr = next_blkaddr_of_node(page);
927 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); 1012 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1013 struct f2fs_io_info fio = {
1014 .type = NODE,
1015 .rw = WRITE_SYNC,
1016 };
928 1017
929 curseg = CURSEG_I(sbi, type); 1018 curseg = CURSEG_I(sbi, type);
930 1019
@@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
953 1042
954 /* rewrite node page */ 1043 /* rewrite node page */
955 set_page_writeback(page); 1044 set_page_writeback(page);
956 submit_write_page(sbi, page, new_blkaddr, NODE); 1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
957 f2fs_submit_bio(sbi, NODE, true); 1046 f2fs_submit_merged_bio(sbi, NODE, WRITE);
958 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
959 1048
960 locate_dirty_segment(sbi, old_cursegno); 1049 locate_dirty_segment(sbi, old_cursegno);
@@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
964 mutex_unlock(&curseg->curseg_mutex); 1053 mutex_unlock(&curseg->curseg_mutex);
965} 1054}
966 1055
1056void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type)
1058{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page);
1063 }
1064}
1065
967static int read_compacted_summaries(struct f2fs_sb_info *sbi) 1066static int read_compacted_summaries(struct f2fs_sb_info *sbi)
968{ 1067{
969 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1068 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1314 1413
1315 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1414 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1316 1415
1416 /* add discard candidates */
1417 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
1418 add_discard_addrs(sbi, segno, se);
1419
1317 if (flushed) 1420 if (flushed)
1318 goto to_sit_page; 1421 goto to_sit_page;
1319 1422
@@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1480 return restore_curseg_summaries(sbi); 1583 return restore_curseg_summaries(sbi);
1481} 1584}
1482 1585
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
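
ra_sit_pages() reads SIT blocks ahead in batches, but a SIT block can live at either of two alternating on-disk locations, so current_sit_addr() can jump; the loop therefore stops the batch at the first discontiguous address and reports how far it got. A sketch of that early-exit shape, with an invented address table in place of current_sit_addr():

/* Sketch of the "read ahead while contiguous" loop; addrs[] is made up. */
#include <stdio.h>

int main(void)
{
	unsigned long addrs[] = { 40, 41, 42, 90, 91 }; /* pretend SIT addrs */
	int nrpages = 5, start = 0, blkno;
	unsigned long prev = 0;

	for (blkno = start; blkno < start + nrpages && blkno < 5; blkno++) {
		unsigned long blk_addr = addrs[blkno];

		/* same break condition as ra_sit_pages() */
		if (blkno != start && prev + 1 != blk_addr)
			break;
		prev = blk_addr;
		printf("read ahead block %lu\n", blk_addr);
	}
	printf("readed = %d\n", blkno - start); /* caller advances by this */
	return 0;
}
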
1626
1483static void build_sit_entries(struct f2fs_sb_info *sbi) 1627static void build_sit_entries(struct f2fs_sb_info *sbi)
1484{ 1628{
1485 struct sit_info *sit_i = SIT_I(sbi); 1629 struct sit_info *sit_i = SIT_I(sbi);
1486 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1630 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1487 struct f2fs_summary_block *sum = curseg->sum_blk; 1631 struct f2fs_summary_block *sum = curseg->sum_blk;
1488 unsigned int start; 1632 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1633 unsigned int i, start, end;
1634 unsigned int readed, start_blk = 0;
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1489 1636
1490 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1637 do {
1491 struct seg_entry *se = &sit_i->sentries[start]; 1638 readed = ra_sit_pages(sbi, start_blk, nrpages);
1492 struct f2fs_sit_block *sit_blk; 1639
1493 struct f2fs_sit_entry sit; 1640 start = start_blk * sit_i->sents_per_block;
1494 struct page *page; 1641 end = (start_blk + readed) * sit_i->sents_per_block;
1495 int i; 1642
1496 1643 for (; start < end && start < TOTAL_SEGS(sbi); start++) {
1497 mutex_lock(&curseg->curseg_mutex); 1644 struct seg_entry *se = &sit_i->sentries[start];
1498 for (i = 0; i < sits_in_cursum(sum); i++) { 1645 struct f2fs_sit_block *sit_blk;
1499 if (le32_to_cpu(segno_in_journal(sum, i)) == start) { 1646 struct f2fs_sit_entry sit;
1500 sit = sit_in_journal(sum, i); 1647 struct page *page;
1501 mutex_unlock(&curseg->curseg_mutex); 1648
1502 goto got_it; 1649 mutex_lock(&curseg->curseg_mutex);
1650 for (i = 0; i < sits_in_cursum(sum); i++) {
1651 if (le32_to_cpu(segno_in_journal(sum, i))
1652 == start) {
1653 sit = sit_in_journal(sum, i);
1654 mutex_unlock(&curseg->curseg_mutex);
1655 goto got_it;
1656 }
1503 } 1657 }
1504 } 1658 mutex_unlock(&curseg->curseg_mutex);
1505 mutex_unlock(&curseg->curseg_mutex); 1659
1506 page = get_current_sit_page(sbi, start); 1660 page = get_current_sit_page(sbi, start);
1507 sit_blk = (struct f2fs_sit_block *)page_address(page); 1661 sit_blk = (struct f2fs_sit_block *)page_address(page);
1508 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; 1662 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1509 f2fs_put_page(page, 1); 1663 f2fs_put_page(page, 1);
1510got_it: 1664got_it:
1511 check_block_count(sbi, start, &sit); 1665 check_block_count(sbi, start, &sit);
1512 seg_info_from_raw_sit(se, &sit); 1666 seg_info_from_raw_sit(se, &sit);
1513 if (sbi->segs_per_sec > 1) { 1667 if (sbi->segs_per_sec > 1) {
1514 struct sec_entry *e = get_sec_entry(sbi, start); 1668 struct sec_entry *e = get_sec_entry(sbi, start);
1515 e->valid_blocks += se->valid_blocks; 1669 e->valid_blocks += se->valid_blocks;
1670 }
1516 } 1671 }
1517 } 1672 start_blk += readed;
1673 } while (start_blk < sit_blk_cnt);
1518} 1674}
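
build_sit_entries() then consumes segments in windows sized by whatever ra_sit_pages() managed to read: segments [start_blk * sents_per_block, (start_blk + readed) * sents_per_block), bounded by TOTAL_SEGS(). The window arithmetic, modelled with assumed counts (55 entries per block mirrors SIT_ENTRY_PER_BLOCK; the other numbers are invented):

#include <stdio.h>

int main(void)
{
	int sents_per_block = 55;	/* SIT_ENTRY_PER_BLOCK in f2fs */
	int total_segs = 300;
	int sit_blk_cnt = (total_segs + sents_per_block - 1) / sents_per_block;
	int nrpages = 4, start_blk = 0;

	do {
		int readed = nrpages;	/* pretend ra_sit_pages() got them all */
		if (start_blk + readed > sit_blk_cnt)
			readed = sit_blk_cnt - start_blk;

		int start = start_blk * sents_per_block;
		int end = (start_blk + readed) * sents_per_block;

		if (end > total_segs)
			end = total_segs; /* kernel loop bounds by TOTAL_SEGS() */
		printf("window: segments [%d, %d)\n", start, end);
		start_blk += readed;
	} while (start_blk < sit_blk_cnt);
	return 0;
}
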
1519 1675
1520static void init_free_segmap(struct f2fs_sb_info *sbi) 1676static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1644 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1645 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1646 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805
1806 INIT_LIST_HEAD(&sm_info->discard_list);
1807 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0;
1647 1809
1648 err = build_sit_info(sbi); 1810 err = build_sit_info(sbi);
1649 if (err) 1811 if (err)
@@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1760 sbi->sm_info = NULL; 1922 sbi->sm_info = NULL;
1761 kfree(sm_info); 1923 kfree(sm_info);
1762} 1924}
1925
1926int __init create_segment_manager_caches(void)
1927{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL);
1930 if (!discard_entry_slab)
1931 return -ENOMEM;
1932 return 0;
1933}
1934
1935void destroy_segment_manager_caches(void)
1936{
1937 kmem_cache_destroy(discard_entry_slab);
1938}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 269f690b4e24..5731682d7516 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -20,13 +20,8 @@
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) 21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
22 22
23#define IS_DATASEG(t) \ 23#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
24 ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ 24#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
25 (t == CURSEG_WARM_DATA))
26
27#define IS_NODESEG(t) \
28 ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
29 (t == CURSEG_WARM_NODE))
30 25
31#define IS_CURSEG(sbi, seg) \ 26#define IS_CURSEG(sbi, seg) \
32 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ 27 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
@@ -83,25 +78,20 @@
83 (segno / SIT_ENTRY_PER_BLOCK) 78 (segno / SIT_ENTRY_PER_BLOCK)
84#define START_SEGNO(sit_i, segno) \ 79#define START_SEGNO(sit_i, segno) \
85 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 80 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
81#define SIT_BLK_CNT(sbi) \
82 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 83#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 84 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) 85#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections) 86#define TOTAL_SECS(sbi) (sbi->total_sections)
90 87
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 88#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
92 (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 89 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 90#define SECTOR_TO_BLOCK(sbi, sectors) \
94 (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 91 (sectors >> (sbi)->log_sectors_per_block)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 92#define MAX_BIO_BLOCKS(max_hw_blocks) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 93 (min((int)max_hw_blocks, BIO_MAX_PAGES))
97 94
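
The SECTOR_FROM_BLOCK() change is not just a rename: block_t is 32 bits wide, so shifting a large block address left by log_sectors_per_block overflows before the result is widened. Casting to sector_t first performs the shift in 64 bits. A small demonstration of the overflow class being fixed (the sample values are illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t blk_addr = 0x40000000;	/* a block address beyond 2^30 */
	int log_sectors_per_block = 3;	/* 4KB blocks, 512B sectors */

	uint32_t bad = blk_addr << log_sectors_per_block;	   /* wraps */
	uint64_t good = ((uint64_t)blk_addr) << log_sectors_per_block;

	printf("32-bit shift: %u\n", bad);	/* truncated result */
	printf("64-bit shift: %llu\n", (unsigned long long)good);
	return 0;
}
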
98/* during checkpoint, bio_private is used to synchronize the last bio */
99struct bio_private {
100 struct f2fs_sb_info *sbi;
101 bool is_sync;
102 void *wait;
103};
104
105/* 95/*
106 * indicate a block allocation direction: RIGHT and LEFT. 96 * indicate a block allocation direction: RIGHT and LEFT.
107 * RIGHT means allocating new sections towards the end of volume. 97 * RIGHT means allocating new sections towards the end of volume.
@@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
458 448
459static inline bool need_SSR(struct f2fs_sb_info *sbi) 449static inline bool need_SSR(struct f2fs_sb_info *sbi)
460{ 450{
461 return ((prefree_segments(sbi) / sbi->segs_per_sec) 451 return (prefree_segments(sbi) / sbi->segs_per_sec)
462 + free_sections(sbi) < overprovision_sections(sbi)); 452 + free_sections(sbi) < overprovision_sections(sbi);
463} 453}
464 454
465static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 455static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
467 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 457 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
468 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 458 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
469 459
470 if (sbi->por_doing) 460 if (unlikely(sbi->por_doing))
471 return false; 461 return false;
472 462
473 return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 463 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
474 reserved_sections(sbi))); 464 reserved_sections(sbi));
475} 465}
476 466
477static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) 467static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
478{ 468{
479 return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); 469 return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
480} 470}
481 471
482static inline int utilization(struct f2fs_sb_info *sbi) 472static inline int utilization(struct f2fs_sb_info *sbi)
483{ 473{
484 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); 474 return div_u64((u64)valid_user_blocks(sbi) * 100,
475 sbi->user_block_count);
485} 476}
486 477
487/* 478/*
 488 * Sometimes it is better for f2fs to drop the out-of-place update policy.	 479 * Sometimes it is better for f2fs to drop the out-of-place update policy.
489 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write 480 * And, users can control the policy through sysfs entries.
490 * data in the original place likewise other traditional file systems. 481 * There are five policies with triggering conditions as follows.
491 * But, currently set 100 in percentage, which means it is disabled. 482 * F2FS_IPU_FORCE - all the time,
492 * See below need_inplace_update(). 483 * F2FS_IPU_SSR - if SSR mode is activated,
 484 * F2FS_IPU_UTIL - if FS utilization is over threshold,
 485 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
 486 * threshold,
 487 * F2FS_IPU_DISABLE - disable IPU (the default option).
493 */ 488 */
494#define MIN_IPU_UTIL 100 489#define DEF_MIN_IPU_UTIL 70
490
491enum {
492 F2FS_IPU_FORCE,
493 F2FS_IPU_SSR,
494 F2FS_IPU_UTIL,
495 F2FS_IPU_SSR_UTIL,
496 F2FS_IPU_DISABLE,
497};
498
495static inline bool need_inplace_update(struct inode *inode) 499static inline bool need_inplace_update(struct inode *inode)
496{ 500{
497 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 501 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
502
503 /* IPU can be done only for the user data */
498 if (S_ISDIR(inode->i_mode)) 504 if (S_ISDIR(inode->i_mode))
499 return false; 505 return false;
500 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) 506
507 switch (SM_I(sbi)->ipu_policy) {
508 case F2FS_IPU_FORCE:
501 return true; 509 return true;
510 case F2FS_IPU_SSR:
511 if (need_SSR(sbi))
512 return true;
513 break;
514 case F2FS_IPU_UTIL:
515 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
516 return true;
517 break;
518 case F2FS_IPU_SSR_UTIL:
519 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
520 return true;
521 break;
522 case F2FS_IPU_DISABLE:
523 break;
524 }
502 return false; 525 return false;
503} 526}
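
The rewritten need_inplace_update() is a straight policy table: each sysfs-selectable mode names the condition under which f2fs writes user data in place rather than out of place. The same decision restated as a self-contained function (need_ssr and util are stubbed inputs here, not f2fs calls):

#include <stdbool.h>
#include <stdio.h>

enum { IPU_FORCE, IPU_SSR, IPU_UTIL, IPU_SSR_UTIL, IPU_DISABLE };

static bool need_ipu(int policy, bool need_ssr, int util, int min_util)
{
	switch (policy) {
	case IPU_FORCE:    return true;
	case IPU_SSR:      return need_ssr;
	case IPU_UTIL:     return util > min_util;
	case IPU_SSR_UTIL: return need_ssr && util > min_util;
	default:           return false; /* IPU_DISABLE */
	}
}

int main(void)
{
	printf("%d\n", need_ipu(IPU_SSR_UTIL, true, 80, 70)); /* 1 */
	printf("%d\n", need_ipu(IPU_DISABLE, true, 99, 70));  /* 0 */
	return 0;
}
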
504 527
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bafff72de8e8..1a85f83abd53 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -50,6 +50,7 @@ enum {
50 Opt_active_logs, 50 Opt_active_logs,
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data,
53 Opt_err, 54 Opt_err,
54}; 55};
55 56
@@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = {
65 {Opt_active_logs, "active_logs=%u"}, 66 {Opt_active_logs, "active_logs=%u"},
66 {Opt_disable_ext_identify, "disable_ext_identify"}, 67 {Opt_disable_ext_identify, "disable_ext_identify"},
67 {Opt_inline_xattr, "inline_xattr"}, 68 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"},
68 {Opt_err, NULL}, 70 {Opt_err, NULL},
69}; 71};
70 72
@@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = {
72enum { 74enum {
73 GC_THREAD, /* struct f2fs_gc_thread */ 75 GC_THREAD, /* struct f2fs_gc_thread */
74 SM_INFO, /* struct f2fs_sm_info */ 76 SM_INFO, /* struct f2fs_sm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */
75}; 78};
76 79
77struct f2fs_attr { 80struct f2fs_attr {
@@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
89 return (unsigned char *)sbi->gc_thread; 92 return (unsigned char *)sbi->gc_thread;
90 else if (struct_type == SM_INFO) 93 else if (struct_type == SM_INFO)
91 return (unsigned char *)SM_I(sbi); 94 return (unsigned char *)SM_I(sbi);
95 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi;
92 return NULL; 97 return NULL;
93} 98}
94 99
@@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
175F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 180F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
176F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 181F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
177F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 182F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
178 187
179#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 188#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
180static struct attribute *f2fs_attrs[] = { 189static struct attribute *f2fs_attrs[] = {
@@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = {
183 ATTR_LIST(gc_no_gc_sleep_time), 192 ATTR_LIST(gc_no_gc_sleep_time),
184 ATTR_LIST(gc_idle), 193 ATTR_LIST(gc_idle),
185 ATTR_LIST(reclaim_segments), 194 ATTR_LIST(reclaim_segments),
195 ATTR_LIST(max_small_discards),
196 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search),
186 NULL, 199 NULL,
187}; 200};
188 201
@@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options)
311 case Opt_disable_ext_identify: 324 case Opt_disable_ext_identify:
312 set_opt(sbi, DISABLE_EXT_IDENTIFY); 325 set_opt(sbi, DISABLE_EXT_IDENTIFY);
313 break; 326 break;
327 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA);
329 break;
314 default: 330 default:
315 f2fs_msg(sb, KERN_ERR, 331 f2fs_msg(sb, KERN_ERR,
316 "Unrecognized mount option \"%s\" or missing value", 332 "Unrecognized mount option \"%s\" or missing value",
@@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
325{ 341{
326 struct f2fs_inode_info *fi; 342 struct f2fs_inode_info *fi;
327 343
328 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); 344 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
329 if (!fi) 345 if (!fi)
330 return NULL; 346 return NULL;
331 347
@@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
508#endif 524#endif
509 if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) 525 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
510 seq_puts(seq, ",disable_ext_identify"); 526 seq_puts(seq, ",disable_ext_identify");
511 527 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data");
512 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 529 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
513 530
514 return 0; 531 return 0;
@@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
518{ 535{
519 struct super_block *sb = seq->private; 536 struct super_block *sb = seq->private;
520 struct f2fs_sb_info *sbi = F2FS_SB(sb); 537 struct f2fs_sb_info *sbi = F2FS_SB(sb);
521 unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); 538 unsigned int total_segs =
539 le32_to_cpu(sbi->raw_super->segment_count_main);
522 int i; 540 int i;
523 541
524 for (i = 0; i < total_segs; i++) { 542 for (i = 0; i < total_segs; i++) {
@@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
618 struct f2fs_sb_info *sbi = F2FS_SB(sb); 636 struct f2fs_sb_info *sbi = F2FS_SB(sb);
619 struct inode *inode; 637 struct inode *inode;
620 638
621 if (ino < F2FS_ROOT_INO(sbi)) 639 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
622 return ERR_PTR(-ESTALE); 640 return ERR_PTR(-ESTALE);
623 641
624 /* 642 /*
@@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
629 inode = f2fs_iget(sb, ino); 647 inode = f2fs_iget(sb, ino);
630 if (IS_ERR(inode)) 648 if (IS_ERR(inode))
631 return ERR_CAST(inode); 649 return ERR_CAST(inode);
632 if (generation && inode->i_generation != generation) { 650 if (unlikely(generation && inode->i_generation != generation)) {
633 /* we didn't find the right inode.. */ 651 /* we didn't find the right inode.. */
634 iput(inode); 652 iput(inode);
635 return ERR_PTR(-ESTALE); 653 return ERR_PTR(-ESTALE);
@@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
732 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); 750 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
733 fsmeta += le32_to_cpu(raw_super->segment_count_ssa); 751 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
734 752
735 if (fsmeta >= total) 753 if (unlikely(fsmeta >= total))
736 return 1; 754 return 1;
737 755
738 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 756 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
739 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 757 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
740 return 1; 758 return 1;
741 } 759 }
@@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
763 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); 781 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
764 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); 782 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
765 sbi->cur_victim_sec = NULL_SECNO; 783 sbi->cur_victim_sec = NULL_SECNO;
784 sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
766 785
767 for (i = 0; i < NR_COUNT_TYPE; i++) 786 for (i = 0; i < NR_COUNT_TYPE; i++)
768 atomic_set(&sbi->nr_pages[i], 0); 787 atomic_set(&sbi->nr_pages[i], 0);
@@ -798,9 +817,10 @@ retry:
798 /* sanity checking of raw super */ 817 /* sanity checking of raw super */
799 if (sanity_check_raw_super(sb, *raw_super)) { 818 if (sanity_check_raw_super(sb, *raw_super)) {
800 brelse(*raw_super_buf); 819 brelse(*raw_super_buf);
801 f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " 820 f2fs_msg(sb, KERN_ERR,
802 "in %dth superblock", block + 1); 821 "Can't find valid F2FS filesystem in %dth superblock",
803 if(block == 0) { 822 block + 1);
823 if (block == 0) {
804 block++; 824 block++;
805 goto retry; 825 goto retry;
806 } else { 826 } else {
@@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
818 struct buffer_head *raw_super_buf; 838 struct buffer_head *raw_super_buf;
819 struct inode *root; 839 struct inode *root;
820 long err = -EINVAL; 840 long err = -EINVAL;
841 int i;
821 842
822 /* allocate memory for f2fs-specific super block info */ 843 /* allocate memory for f2fs-specific super block info */
823 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 844 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
@@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
825 return -ENOMEM; 846 return -ENOMEM;
826 847
827 /* set a block size */ 848 /* set a block size */
828 if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { 849 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
829 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 850 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
830 goto free_sbi; 851 goto free_sbi;
831 } 852 }
@@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
874 mutex_init(&sbi->node_write); 895 mutex_init(&sbi->node_write);
875 sbi->por_doing = false; 896 sbi->por_doing = false;
876 spin_lock_init(&sbi->stat_lock); 897 spin_lock_init(&sbi->stat_lock);
877 init_rwsem(&sbi->bio_sem); 898
899 mutex_init(&sbi->read_io.io_mutex);
900 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex);
904 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL;
906 }
907
878 init_rwsem(&sbi->cp_rwsem); 908 init_rwsem(&sbi->cp_rwsem);
879 init_waitqueue_head(&sbi->cp_wait); 909 init_waitqueue_head(&sbi->cp_wait);
880 init_sb_info(sbi); 910 init_sb_info(sbi);
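
The single bio_sem is replaced here by one merge point for reads plus one per page type for writes, so unrelated page types no longer serialize on the same lock. A sketch of the per-type channel being initialized above — the struct layout is inferred from the patch context, not taken from the f2fs headers (build with -pthread):

#include <pthread.h>
#include <stdio.h>

#define NR_PAGE_TYPE 3 /* DATA, NODE, META */

/* Inferred shape of the per-type merged-bio channel replacing bio_sem. */
struct io_channel {
	pthread_mutex_t io_mutex; /* serializes merge vs. submit per type */
	void *bio;                /* bio open for merging; NULL when idle */
};

int main(void)
{
	struct io_channel read_io, write_io[NR_PAGE_TYPE];
	int i;

	pthread_mutex_init(&read_io.io_mutex, NULL);
	read_io.bio = NULL;
	for (i = 0; i < NR_PAGE_TYPE; i++) {
		pthread_mutex_init(&write_io[i].io_mutex, NULL);
		write_io[i].bio = NULL;
	}
	printf("initialized 1 read + %d write channels\n", NR_PAGE_TYPE);
	return 0;
}
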
@@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
939 } 969 }
940 970
 941 /* if there are any orphan nodes, free them */	 972 recover_orphan_inodes(sbi);
942 err = -EINVAL; 972 recover_orphan_inodes(sbi);
943 if (recover_orphan_inodes(sbi))
944 goto free_node_inode;
945 973
946 /* read root inode and dentry */ 974 /* read root inode and dentry */
947 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); 975 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
950 err = PTR_ERR(root); 978 err = PTR_ERR(root);
951 goto free_node_inode; 979 goto free_node_inode;
952 } 980 }
953 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) 981 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
982 err = -EINVAL;
954 goto free_root_inode; 983 goto free_root_inode;
984 }
955 985
956 sb->s_root = d_make_root(root); /* allocate root dentry */ 986 sb->s_root = d_make_root(root); /* allocate root dentry */
957 if (!sb->s_root) { 987 if (!sb->s_root) {
@@ -1053,7 +1083,7 @@ static int __init init_inodecache(void)
1053{ 1083{
1054 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1055 sizeof(struct f2fs_inode_info), NULL); 1085 sizeof(struct f2fs_inode_info), NULL);
1056 if (f2fs_inode_cachep == NULL) 1086 if (!f2fs_inode_cachep)
1057 return -ENOMEM; 1087 return -ENOMEM;
1058 return 0; 1088 return 0;
1059} 1089}
@@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void)
1078 err = create_node_manager_caches(); 1108 err = create_node_manager_caches();
1079 if (err) 1109 if (err)
1080 goto free_inodecache; 1110 goto free_inodecache;
1081 err = create_gc_caches(); 1111 err = create_segment_manager_caches();
1082 if (err) 1112 if (err)
1083 goto free_node_manager_caches; 1113 goto free_node_manager_caches;
1114 err = create_gc_caches();
1115 if (err)
1116 goto free_segment_manager_caches;
1084 err = create_checkpoint_caches(); 1117 err = create_checkpoint_caches();
1085 if (err) 1118 if (err)
1086 goto free_gc_caches; 1119 goto free_gc_caches;
@@ -1102,6 +1135,8 @@ free_checkpoint_caches:
1102 destroy_checkpoint_caches(); 1135 destroy_checkpoint_caches();
1103free_gc_caches: 1136free_gc_caches:
1104 destroy_gc_caches(); 1137 destroy_gc_caches();
1138free_segment_manager_caches:
1139 destroy_segment_manager_caches();
1105free_node_manager_caches: 1140free_node_manager_caches:
1106 destroy_node_manager_caches(); 1141 destroy_node_manager_caches();
1107free_inodecache: 1142free_inodecache:
@@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void)
1117 unregister_filesystem(&f2fs_fs_type); 1152 unregister_filesystem(&f2fs_fs_type);
1118 destroy_checkpoint_caches(); 1153 destroy_checkpoint_caches();
1119 destroy_gc_caches(); 1154 destroy_gc_caches();
1155 destroy_segment_manager_caches();
1120 destroy_node_manager_caches(); 1156 destroy_node_manager_caches();
1121 destroy_inodecache(); 1157 destroy_inodecache();
1122 kset_unregister(f2fs_kset); 1158 kset_unregister(f2fs_kset);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index aa7a3f139fe5..89d0422a91a8 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h> 22#include <linux/f2fs_fs.h>
23#include <linux/security.h> 23#include <linux/security.h>
24#include <linux/posix_acl_xattr.h>
24#include "f2fs.h" 25#include "f2fs.h"
25#include "xattr.h" 26#include "xattr.h"
26 27
@@ -216,8 +217,8 @@ const struct xattr_handler f2fs_xattr_security_handler = {
216static const struct xattr_handler *f2fs_xattr_handler_map[] = { 217static const struct xattr_handler *f2fs_xattr_handler_map[] = {
217 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, 218 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
218#ifdef CONFIG_F2FS_FS_POSIX_ACL 219#ifdef CONFIG_F2FS_FS_POSIX_ACL
219 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, 220 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
220 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, 221 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
221#endif 222#endif
222 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, 223 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
223#ifdef CONFIG_F2FS_FS_SECURITY 224#ifdef CONFIG_F2FS_FS_SECURITY
@@ -229,8 +230,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
229const struct xattr_handler *f2fs_xattr_handlers[] = { 230const struct xattr_handler *f2fs_xattr_handlers[] = {
230 &f2fs_xattr_user_handler, 231 &f2fs_xattr_user_handler,
231#ifdef CONFIG_F2FS_FS_POSIX_ACL 232#ifdef CONFIG_F2FS_FS_POSIX_ACL
232 &f2fs_xattr_acl_access_handler, 233 &posix_acl_access_xattr_handler,
233 &f2fs_xattr_acl_default_handler, 234 &posix_acl_default_xattr_handler,
234#endif 235#endif
235 &f2fs_xattr_trusted_handler, 236 &f2fs_xattr_trusted_handler,
236#ifdef CONFIG_F2FS_FS_SECURITY 237#ifdef CONFIG_F2FS_FS_SECURITY
@@ -522,7 +523,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index,
522 if (found) 523 if (found)
523 free = free + ENTRY_SIZE(here); 524 free = free + ENTRY_SIZE(here);
524 525
525 if (free < newsize) { 526 if (unlikely(free < newsize)) {
526 error = -ENOSPC; 527 error = -ENOSPC;
527 goto exit; 528 goto exit;
528 } 529 }
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 02a08fb88a15..b21d9ebdeff3 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -108,8 +108,6 @@ struct f2fs_xattr_entry {
108#ifdef CONFIG_F2FS_FS_XATTR 108#ifdef CONFIG_F2FS_FS_XATTR
109extern const struct xattr_handler f2fs_xattr_user_handler; 109extern const struct xattr_handler f2fs_xattr_user_handler;
110extern const struct xattr_handler f2fs_xattr_trusted_handler; 110extern const struct xattr_handler f2fs_xattr_trusted_handler;
111extern const struct xattr_handler f2fs_xattr_acl_access_handler;
112extern const struct xattr_handler f2fs_xattr_acl_default_handler;
113extern const struct xattr_handler f2fs_xattr_advise_handler; 111extern const struct xattr_handler f2fs_xattr_advise_handler;
114extern const struct xattr_handler f2fs_xattr_security_handler; 112extern const struct xattr_handler f2fs_xattr_security_handler;
115 113
diff --git a/fs/file.c b/fs/file.c
index 4a78f981557a..eb56a13dab3e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size)
34 * vmalloc() if the allocation size will be considered "large" by the VM. 34 * vmalloc() if the allocation size will be considered "large" by the VM.
35 */ 35 */
36 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 36 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
37 void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); 37 void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
38 if (data != NULL) 38 if (data != NULL)
39 return data; 39 return data;
40 } 40 }
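
With __GFP_NORETRY, a "small" fd-table allocation that cannot be satisfied cheaply now fails fast instead of invoking heavy reclaim, and the caller falls through to vmalloc(). A user-space analogue of the two-tier strategy (malloc and calloc stand in for kmalloc and vmalloc; the limit models PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER):

#include <stdio.h>
#include <stdlib.h>

#define CHEAP_LIMIT (4096 * 8) /* PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER */

static void *alloc_fdmem_model(size_t size)
{
	if (size <= CHEAP_LIMIT) {
		void *data = malloc(size); /* kmalloc(..., NOWARN|NORETRY) */
		if (data)
			return data;
		/* don't retry hard: fall through to the "vmalloc" path */
	}
	return calloc(1, size); /* stand-in for vmalloc() */
}

int main(void)
{
	void *p = alloc_fdmem_model(1024);

	printf("%s\n", p ? "allocated" : "failed");
	free(p);
	return 0;
}
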
@@ -348,21 +348,16 @@ out:
348 return NULL; 348 return NULL;
349} 349}
350 350
351static void close_files(struct files_struct * files) 351static struct fdtable *close_files(struct files_struct * files)
352{ 352{
353 int i, j;
354 struct fdtable *fdt;
355
356 j = 0;
357
358 /* 353 /*
359 * It is safe to dereference the fd table without RCU or 354 * It is safe to dereference the fd table without RCU or
360 * ->file_lock because this is the last reference to the 355 * ->file_lock because this is the last reference to the
361 * files structure. But use RCU to shut RCU-lockdep up. 356 * files structure.
362 */ 357 */
363 rcu_read_lock(); 358 struct fdtable *fdt = rcu_dereference_raw(files->fdt);
364 fdt = files_fdtable(files); 359 int i, j = 0;
365 rcu_read_unlock(); 360
366 for (;;) { 361 for (;;) {
367 unsigned long set; 362 unsigned long set;
368 i = j * BITS_PER_LONG; 363 i = j * BITS_PER_LONG;
@@ -381,6 +376,8 @@ static void close_files(struct files_struct * files)
381 set >>= 1; 376 set >>= 1;
382 } 377 }
383 } 378 }
379
380 return fdt;
384} 381}
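
close_files() walks the open_fds bitmap one long at a time: grab a word, test the low bit, shift, with the fd number recovered as j * BITS_PER_LONG + bit. The same walk as a standalone program with an invented bitmap:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

int main(void)
{
	/* pretend open_fds: fds 0, 1 and 70 are open */
	unsigned long open_fds[2] = { 0x3UL, 1UL << (70 % BITS_PER_LONG) };
	unsigned max_fds = 2 * BITS_PER_LONG;
	unsigned i, j = 0;

	for (;;) {
		unsigned long set;

		i = j * BITS_PER_LONG;
		if (i >= max_fds)
			break;
		set = open_fds[j++];
		while (set) {
			if (set & 1)
				printf("closing fd %u\n", i);
			i++;
			set >>= 1;
		}
	}
	return 0;
}
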
385 382
386struct files_struct *get_files_struct(struct task_struct *task) 383struct files_struct *get_files_struct(struct task_struct *task)
@@ -398,14 +395,9 @@ struct files_struct *get_files_struct(struct task_struct *task)
398 395
399void put_files_struct(struct files_struct *files) 396void put_files_struct(struct files_struct *files)
400{ 397{
401 struct fdtable *fdt;
402
403 if (atomic_dec_and_test(&files->count)) { 398 if (atomic_dec_and_test(&files->count)) {
404 close_files(files); 399 struct fdtable *fdt = close_files(files);
405 /* not really needed, since nobody can see us */ 400
406 rcu_read_lock();
407 fdt = files_fdtable(files);
408 rcu_read_unlock();
409 /* free the arrays if they are not embedded */ 401 /* free the arrays if they are not embedded */
410 if (fdt != &files->fdtab) 402 if (fdt != &files->fdtab)
411 __free_fdtable(fdt); 403 __free_fdtable(fdt);
@@ -645,16 +637,16 @@ void do_close_on_exec(struct files_struct *files)
645 spin_unlock(&files->file_lock); 637 spin_unlock(&files->file_lock);
646} 638}
647 639
648struct file *fget(unsigned int fd) 640static struct file *__fget(unsigned int fd, fmode_t mask)
649{ 641{
650 struct file *file;
651 struct files_struct *files = current->files; 642 struct files_struct *files = current->files;
643 struct file *file;
652 644
653 rcu_read_lock(); 645 rcu_read_lock();
654 file = fcheck_files(files, fd); 646 file = fcheck_files(files, fd);
655 if (file) { 647 if (file) {
656 /* File object ref couldn't be taken */ 648 /* File object ref couldn't be taken */
657 if (file->f_mode & FMODE_PATH || 649 if ((file->f_mode & mask) ||
658 !atomic_long_inc_not_zero(&file->f_count)) 650 !atomic_long_inc_not_zero(&file->f_count))
659 file = NULL; 651 file = NULL;
660 } 652 }
@@ -663,25 +655,16 @@ struct file *fget(unsigned int fd)
663 return file; 655 return file;
664} 656}
665 657
658struct file *fget(unsigned int fd)
659{
660 return __fget(fd, FMODE_PATH);
661}
666EXPORT_SYMBOL(fget); 662EXPORT_SYMBOL(fget);
667 663
668struct file *fget_raw(unsigned int fd) 664struct file *fget_raw(unsigned int fd)
669{ 665{
670 struct file *file; 666 return __fget(fd, 0);
671 struct files_struct *files = current->files;
672
673 rcu_read_lock();
674 file = fcheck_files(files, fd);
675 if (file) {
676 /* File object ref couldn't be taken */
677 if (!atomic_long_inc_not_zero(&file->f_count))
678 file = NULL;
679 }
680 rcu_read_unlock();
681
682 return file;
683} 667}
684
685EXPORT_SYMBOL(fget_raw); 668EXPORT_SYMBOL(fget_raw);
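
Folding fget() and fget_raw() into __fget() works because the only difference between them was whether FMODE_PATH files are rejected: pass FMODE_PATH as the mask to filter them out, or 0 to accept everything. A minimal model of the mask test (0x4000 matches the kernel's FMODE_PATH value at the time of this merge; the struct is simplified):

#include <stdio.h>

#define FMODE_PATH 0x4000

struct file { unsigned f_mode; };

static struct file *fget_model(struct file *f, unsigned mask)
{
	if (!f || (f->f_mode & mask))
		return NULL; /* filtered out, as __fget() does */
	return f;
}

int main(void)
{
	struct file pathonly = { FMODE_PATH }, normal = { 0 };

	/* fget(): mask = FMODE_PATH rejects O_PATH files */
	printf("%p\n", (void *)fget_model(&pathonly, FMODE_PATH)); /* NULL */
	/* fget_raw(): mask = 0 accepts them */
	printf("%p\n", (void *)fget_model(&pathonly, 0));
	printf("%p\n", (void *)fget_model(&normal, FMODE_PATH));
	return 0;
}
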
686 669
687/* 670/*
@@ -700,58 +683,54 @@ EXPORT_SYMBOL(fget_raw);
700 * The fput_needed flag returned by fget_light should be passed to the 683 * The fput_needed flag returned by fget_light should be passed to the
701 * corresponding fput_light. 684 * corresponding fput_light.
702 */ 685 */
703struct file *fget_light(unsigned int fd, int *fput_needed) 686static unsigned long __fget_light(unsigned int fd, fmode_t mask)
704{ 687{
705 struct file *file;
706 struct files_struct *files = current->files; 688 struct files_struct *files = current->files;
689 struct file *file;
707 690
708 *fput_needed = 0;
709 if (atomic_read(&files->count) == 1) { 691 if (atomic_read(&files->count) == 1) {
710 file = fcheck_files(files, fd); 692 file = __fcheck_files(files, fd);
711 if (file && (file->f_mode & FMODE_PATH)) 693 if (!file || unlikely(file->f_mode & mask))
712 file = NULL; 694 return 0;
695 return (unsigned long)file;
713 } else { 696 } else {
714 rcu_read_lock(); 697 file = __fget(fd, mask);
715 file = fcheck_files(files, fd); 698 if (!file)
716 if (file) { 699 return 0;
717 if (!(file->f_mode & FMODE_PATH) && 700 return FDPUT_FPUT | (unsigned long)file;
718 atomic_long_inc_not_zero(&file->f_count))
719 *fput_needed = 1;
720 else
721 /* Didn't get the reference, someone's freed */
722 file = NULL;
723 }
724 rcu_read_unlock();
725 } 701 }
702}
703unsigned long __fdget(unsigned int fd)
704{
705 return __fget_light(fd, FMODE_PATH);
706}
707EXPORT_SYMBOL(__fdget);
726 708
727 return file; 709unsigned long __fdget_raw(unsigned int fd)
710{
711 return __fget_light(fd, 0);
728} 712}
729EXPORT_SYMBOL(fget_light);
730 713
731struct file *fget_raw_light(unsigned int fd, int *fput_needed) 714unsigned long __fdget_pos(unsigned int fd)
732{ 715{
733 struct file *file; 716 unsigned long v = __fdget(fd);
734 struct files_struct *files = current->files; 717 struct file *file = (struct file *)(v & ~3);
735 718
736 *fput_needed = 0; 719 if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
737 if (atomic_read(&files->count) == 1) { 720 if (file_count(file) > 1) {
738 file = fcheck_files(files, fd); 721 v |= FDPUT_POS_UNLOCK;
739 } else { 722 mutex_lock(&file->f_pos_lock);
740 rcu_read_lock();
741 file = fcheck_files(files, fd);
742 if (file) {
743 if (atomic_long_inc_not_zero(&file->f_count))
744 *fput_needed = 1;
745 else
746 /* Didn't get the reference, someone's freed */
747 file = NULL;
748 } 723 }
749 rcu_read_unlock();
750 } 724 }
751 725 return v;
752 return file;
753} 726}
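
__fget_light() can return the struct file pointer with status packed into its low two bits, because files are allocated with at least 4-byte alignment: FDPUT_FPUT records that a reference was taken, FDPUT_POS_UNLOCK that f_pos_lock is held. A sketch of the encode/decode behind the `v & ~3` above (the flag values mirror fdtable.h; the struct is a stand-in):

#include <stdio.h>

#define FDPUT_FPUT       1UL
#define FDPUT_POS_UNLOCK 2UL

struct file { long dummy; } f; /* alignment >= 4 keeps the low bits free */

int main(void)
{
	unsigned long v = (unsigned long)&f | FDPUT_FPUT | FDPUT_POS_UNLOCK;

	struct file *file = (struct file *)(v & ~3UL); /* strip flag bits */
	printf("file recovered: %s\n", file == &f ? "yes" : "no");
	printf("need fput: %lu, need pos unlock: %lu\n",
	       v & FDPUT_FPUT, (v & FDPUT_POS_UNLOCK) >> 1);
	return 0;
}
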
754 727
728/*
729 * We only lock f_pos if we have threads or if the file might be
730 * shared with another process. In both cases we'll have an elevated
731 * file count (done either by fdget() or by fork()).
732 */
733
755void set_close_on_exec(unsigned int fd, int flag) 734void set_close_on_exec(unsigned int fd, int flag)
756{ 735{
757 struct files_struct *files = current->files; 736 struct files_struct *files = current->files;
diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
135 atomic_long_set(&f->f_count, 1); 135 atomic_long_set(&f->f_count, 1);
136 rwlock_init(&f->f_owner.lock); 136 rwlock_init(&f->f_owner.lock);
137 spin_lock_init(&f->f_lock); 137 spin_lock_init(&f->f_lock);
138 mutex_init(&f->f_pos_lock);
138 eventpoll_init_file(f); 139 eventpoll_init_file(f);
139 /* f->f_version: 0 */ 140 /* f->f_version: 0 */
140 return f; 141 return f;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f4a10ece2f1..d754e3cf99a8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -40,18 +40,13 @@
40struct wb_writeback_work { 40struct wb_writeback_work {
41 long nr_pages; 41 long nr_pages;
42 struct super_block *sb; 42 struct super_block *sb;
43 /* 43 unsigned long *older_than_this;
44 * Write only inodes dirtied before this time. Don't forget to set
45 * older_than_this_is_set when you set this.
46 */
47 unsigned long older_than_this;
48 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
49 unsigned int tagged_writepages:1; 45 unsigned int tagged_writepages:1;
50 unsigned int for_kupdate:1; 46 unsigned int for_kupdate:1;
51 unsigned int range_cyclic:1; 47 unsigned int range_cyclic:1;
52 unsigned int for_background:1; 48 unsigned int for_background:1;
53 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 49 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
54 unsigned int older_than_this_is_set:1;
55 enum wb_reason reason; /* why was writeback initiated? */ 50 enum wb_reason reason; /* why was writeback initiated? */
56 51
57 struct list_head list; /* pending work list */ 52 struct list_head list; /* pending work list */
@@ -252,10 +247,10 @@ static int move_expired_inodes(struct list_head *delaying_queue,
252 int do_sb_sort = 0; 247 int do_sb_sort = 0;
253 int moved = 0; 248 int moved = 0;
254 249
255 WARN_ON_ONCE(!work->older_than_this_is_set);
256 while (!list_empty(delaying_queue)) { 250 while (!list_empty(delaying_queue)) {
257 inode = wb_inode(delaying_queue->prev); 251 inode = wb_inode(delaying_queue->prev);
258 if (inode_dirtied_after(inode, work->older_than_this)) 252 if (work->older_than_this &&
253 inode_dirtied_after(inode, *work->older_than_this))
259 break; 254 break;
260 list_move(&inode->i_wb_list, &tmp); 255 list_move(&inode->i_wb_list, &tmp);
261 moved++; 256 moved++;
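
Replacing the older_than_this value plus its _is_set flag with a pointer makes "no cutoff" simply NULL: wb_writeback() points it at a stack variable it can retune each pass, and move_expired_inodes() checks the pointer before dereferencing it. The pattern in miniature:

#include <stdio.h>
#include <stddef.h>

/* NULL pointer == "no cutoff", replacing a value + is_set flag pair */
static int expired(unsigned long dirtied, const unsigned long *older_than_this)
{
	if (older_than_this && dirtied > *older_than_this)
		return 0; /* dirtied after the cutoff: not expired yet */
	return 1;
}

int main(void)
{
	unsigned long cutoff = 1000;

	printf("%d\n", expired(900, &cutoff));  /* 1: before the cutoff */
	printf("%d\n", expired(1100, &cutoff)); /* 0: too new */
	printf("%d\n", expired(1100, NULL));    /* 1: no cutoff at all */
	return 0;
}
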
@@ -516,13 +511,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
516 } 511 }
517 WARN_ON(inode->i_state & I_SYNC); 512 WARN_ON(inode->i_state & I_SYNC);
518 /* 513 /*
519 * Skip inode if it is clean. We don't want to mess with writeback 514 * Skip inode if it is clean and we have no outstanding writeback in
520 * lists in this function since flusher thread may be doing for example 515 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
521 * sync in parallel and if we move the inode, it could get skipped. So 516 * function since flusher thread may be doing for example sync in
522 * here we make sure inode is on some writeback list and leave it there 517 * parallel and if we move the inode, it could get skipped. So here we
523 * unless we have completely cleaned the inode. 518 * make sure inode is on some writeback list and leave it there unless
519 * we have completely cleaned the inode.
524 */ 520 */
525 if (!(inode->i_state & I_DIRTY)) 521 if (!(inode->i_state & I_DIRTY) &&
522 (wbc->sync_mode != WB_SYNC_ALL ||
523 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
526 goto out; 524 goto out;
527 inode->i_state |= I_SYNC; 525 inode->i_state |= I_SYNC;
528 spin_unlock(&inode->i_lock); 526 spin_unlock(&inode->i_lock);
@@ -739,8 +737,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
739 .sync_mode = WB_SYNC_NONE, 737 .sync_mode = WB_SYNC_NONE,
740 .range_cyclic = 1, 738 .range_cyclic = 1,
741 .reason = reason, 739 .reason = reason,
742 .older_than_this = jiffies,
743 .older_than_this_is_set = 1,
744 }; 740 };
745 741
746 spin_lock(&wb->list_lock); 742 spin_lock(&wb->list_lock);
@@ -799,13 +795,12 @@ static long wb_writeback(struct bdi_writeback *wb,
799{ 795{
800 unsigned long wb_start = jiffies; 796 unsigned long wb_start = jiffies;
801 long nr_pages = work->nr_pages; 797 long nr_pages = work->nr_pages;
798 unsigned long oldest_jif;
802 struct inode *inode; 799 struct inode *inode;
803 long progress; 800 long progress;
804 801
805 if (!work->older_than_this_is_set) { 802 oldest_jif = jiffies;
806 work->older_than_this = jiffies; 803 work->older_than_this = &oldest_jif;
807 work->older_than_this_is_set = 1;
808 }
809 804
810 spin_lock(&wb->list_lock); 805 spin_lock(&wb->list_lock);
811 for (;;) { 806 for (;;) {
@@ -839,10 +834,10 @@ static long wb_writeback(struct bdi_writeback *wb,
839 * safe. 834 * safe.
840 */ 835 */
841 if (work->for_kupdate) { 836 if (work->for_kupdate) {
842 work->older_than_this = jiffies - 837 oldest_jif = jiffies -
843 msecs_to_jiffies(dirty_expire_interval * 10); 838 msecs_to_jiffies(dirty_expire_interval * 10);
844 } else if (work->for_background) 839 } else if (work->for_background)
845 work->older_than_this = jiffies; 840 oldest_jif = jiffies;
846 841
847 trace_writeback_start(wb->bdi, work); 842 trace_writeback_start(wb->bdi, work);
848 if (list_empty(&wb->b_io)) 843 if (list_empty(&wb->b_io))
@@ -1354,21 +1349,18 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
1354 1349
1355/** 1350/**
1356 * sync_inodes_sb - sync sb inode pages 1351 * sync_inodes_sb - sync sb inode pages
1357 * @sb: the superblock 1352 * @sb: the superblock
1358 * @older_than_this: timestamp
1359 * 1353 *
1360 * This function writes and waits on any dirty inode belonging to this 1354 * This function writes and waits on any dirty inode belonging to this
1361 * superblock that has been dirtied before given timestamp. 1355 * super_block.
1362 */ 1356 */
1363void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this) 1357void sync_inodes_sb(struct super_block *sb)
1364{ 1358{
1365 DECLARE_COMPLETION_ONSTACK(done); 1359 DECLARE_COMPLETION_ONSTACK(done);
1366 struct wb_writeback_work work = { 1360 struct wb_writeback_work work = {
1367 .sb = sb, 1361 .sb = sb,
1368 .sync_mode = WB_SYNC_ALL, 1362 .sync_mode = WB_SYNC_ALL,
1369 .nr_pages = LONG_MAX, 1363 .nr_pages = LONG_MAX,
1370 .older_than_this = older_than_this,
1371 .older_than_this_is_set = 1,
1372 .range_cyclic = 0, 1364 .range_cyclic = 0,
1373 .done = &done, 1365 .done = &done,
1374 .reason = WB_REASON_SYNC, 1366 .reason = WB_REASON_SYNC,
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e1959efad64f..b5ebc2d7d80d 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -50,6 +50,8 @@ void fscache_objlist_add(struct fscache_object *obj)
50 struct fscache_object *xobj; 50 struct fscache_object *xobj;
51 struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; 51 struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
52 52
53 ASSERT(RB_EMPTY_NODE(&obj->objlist_link));
54
53 write_lock(&fscache_object_list_lock); 55 write_lock(&fscache_object_list_lock);
54 56
55 while (*p) { 57 while (*p) {
@@ -75,6 +77,9 @@ void fscache_objlist_add(struct fscache_object *obj)
75 */ 77 */
76void fscache_objlist_remove(struct fscache_object *obj) 78void fscache_objlist_remove(struct fscache_object *obj)
77{ 79{
80 if (RB_EMPTY_NODE(&obj->objlist_link))
81 return;
82
78 write_lock(&fscache_object_list_lock); 83 write_lock(&fscache_object_list_lock);
79 84
80 BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); 85 BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
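
The fscache fix pairs RB_CLEAR_NODE() at object-init time with an RB_EMPTY_NODE() test in the remove path, so removing an object that was never added to the list becomes a harmless no-op rather than rbtree corruption. The idiom in miniature, with a flag standing in for the rbtree's empty-node encoding:

#include <stdio.h>
#include <stdbool.h>

struct node { bool linked; }; /* models the RB_EMPTY_NODE() state */

static void objlist_add(struct node *n)    { n->linked = true; }

static void objlist_remove(struct node *n)
{
	if (!n->linked)		/* RB_EMPTY_NODE(&obj->objlist_link) */
		return;		/* never added: nothing to unlink */
	n->linked = false;	/* real code would rb_erase() here */
}

int main(void)
{
	struct node n = { false }; /* RB_CLEAR_NODE() at init time */

	objlist_remove(&n); /* now a safe no-op */
	objlist_add(&n);
	objlist_remove(&n);
	printf("ok\n");
	return 0;
}
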
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 53d35c504240..d3b4539f1651 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -314,6 +314,9 @@ void fscache_object_init(struct fscache_object *object,
314 object->cache = cache; 314 object->cache = cache;
315 object->cookie = cookie; 315 object->cookie = cookie;
316 object->parent = NULL; 316 object->parent = NULL;
317#ifdef CONFIG_FSCACHE_OBJECT_LIST
318 RB_CLEAR_NODE(&object->objlist_link);
319#endif
317 320
318 object->oob_event_mask = 0; 321 object->oob_event_mask = 0;
319 for (t = object->oob_table; t->events; t++) 322 for (t = object->oob_table; t->events; t++)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ef74ad5fd362..0a648bb455ae 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); 1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1297} 1297}
1298 1298
1299static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1300 struct pipe_buffer *buf)
1301{
1302 return 1;
1303}
1304
1305static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1306 .can_merge = 0,
1307 .map = generic_pipe_buf_map,
1308 .unmap = generic_pipe_buf_unmap,
1309 .confirm = generic_pipe_buf_confirm,
1310 .release = generic_pipe_buf_release,
1311 .steal = fuse_dev_pipe_buf_steal,
1312 .get = generic_pipe_buf_get,
1313};
1314
1315static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, 1299static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, 1300 struct pipe_inode_info *pipe,
1317 size_t len, unsigned int flags) 1301 size_t len, unsigned int flags)
@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1358 buf->page = bufs[page_nr].page; 1342 buf->page = bufs[page_nr].page;
1359 buf->offset = bufs[page_nr].offset; 1343 buf->offset = bufs[page_nr].offset;
1360 buf->len = bufs[page_nr].len; 1344 buf->len = bufs[page_nr].len;
1361 buf->ops = &fuse_dev_pipe_buf_ops; 1345 /*
1346 * Need to be careful about this. Having buf->ops in module
1347 * code can Oops if the buffer persists after module unload.
1348 */
1349 buf->ops = &nosteal_pipe_buf_ops;
1362 1350
1363 pipe->nrbufs++; 1351 pipe->nrbufs++;
1364 page_nr++; 1352 page_nr++;
@@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1599 1587
1600 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); 1588 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1601 err = fuse_copy_page(cs, &page, offset, this_num, 0); 1589 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1602 if (!err && offset == 0 && (num != 0 || file_size == end)) 1590 if (!err && offset == 0 &&
1591 (this_num == PAGE_CACHE_SIZE || file_size == end))
1603 SetPageUptodate(page); 1592 SetPageUptodate(page);
1604 unlock_page(page); 1593 unlock_page(page);
1605 page_cache_release(page); 1594 page_cache_release(page);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c3eb2c46c8f1..1d1292c581c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)
112 get_fuse_inode(inode)->i_time = 0; 112 get_fuse_inode(inode)->i_time = 0;
113} 113}
114 114
115/**
116 * Mark the attributes as stale due to an atime change. Avoid the invalidate if
117 * atime is not used.
118 */
119void fuse_invalidate_atime(struct inode *inode)
120{
121 if (!IS_RDONLY(inode))
122 fuse_invalidate_attr(inode);
123}
124
115/* 125/*
116 * Just mark the entry as stale, so that a next attempt to look it up 126 * Just mark the entry as stale, so that a next attempt to look it up
117 * will result in a new lookup call to userspace 127 * will result in a new lookup call to userspace
@@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
1371 } 1381 }
1372 1382
1373 __free_page(page); 1383 __free_page(page);
1374 fuse_invalidate_attr(inode); /* atime changed */ 1384 fuse_invalidate_atime(inode);
1375 return err; 1385 return err;
1376} 1386}
1377 1387
@@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry)
1404 link[req->out.args[0].size] = '\0'; 1414 link[req->out.args[0].size] = '\0';
1405 out: 1415 out:
1406 fuse_put_request(fc, req); 1416 fuse_put_request(fc, req);
1407 fuse_invalidate_attr(inode); /* atime changed */ 1417 fuse_invalidate_atime(inode);
1408 return link; 1418 return link;
1409} 1419}
1410 1420
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e70506297bc..77bcc303c3ae 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
127 if (atomic_dec_and_test(&ff->count)) { 127 if (atomic_dec_and_test(&ff->count)) {
128 struct fuse_req *req = ff->reserved_req; 128 struct fuse_req *req = ff->reserved_req;
129 129
130 if (sync) { 130 if (ff->fc->no_open) {
131 /*
132 * Drop the release request when client does not
133 * implement 'open'
134 */
135 req->background = 0;
136 path_put(&req->misc.release.path);
137 fuse_put_request(ff->fc, req);
138 } else if (sync) {
131 req->background = 0; 139 req->background = 0;
132 fuse_request_send(ff->fc, req); 140 fuse_request_send(ff->fc, req);
133 path_put(&req->misc.release.path); 141 path_put(&req->misc.release.path);
@@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
144int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 152int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
145 bool isdir) 153 bool isdir)
146{ 154{
147 struct fuse_open_out outarg;
148 struct fuse_file *ff; 155 struct fuse_file *ff;
149 int err;
150 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 156 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
151 157
152 ff = fuse_file_alloc(fc); 158 ff = fuse_file_alloc(fc);
153 if (!ff) 159 if (!ff)
154 return -ENOMEM; 160 return -ENOMEM;
155 161
156 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 162 ff->fh = 0;
157 if (err) { 163 ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
158 fuse_file_free(ff); 164 if (!fc->no_open || isdir) {
159 return err; 165 struct fuse_open_out outarg;
166 int err;
167
168 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
169 if (!err) {
170 ff->fh = outarg.fh;
171 ff->open_flags = outarg.open_flags;
172
173 } else if (err != -ENOSYS || isdir) {
174 fuse_file_free(ff);
175 return err;
176 } else {
177 fc->no_open = 1;
178 }
160 } 179 }
161 180
162 if (isdir) 181 if (isdir)
163 outarg.open_flags &= ~FOPEN_DIRECT_IO; 182 ff->open_flags &= ~FOPEN_DIRECT_IO;
164 183
165 ff->fh = outarg.fh;
166 ff->nodeid = nodeid; 184 ff->nodeid = nodeid;
167 ff->open_flags = outarg.open_flags;
168 file->private_data = fuse_file_get(ff); 185 file->private_data = fuse_file_get(ff);
169 186
170 return 0; 187 return 0;
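
fuse's new no_open bit caches an -ENOSYS reply: the first FUSE_OPEN the server fails with ENOSYS sets fc->no_open, and every later open skips the round trip and takes the FOPEN_KEEP_CACHE defaults. A condensed model of that negotiate-once pattern (the errno values are real; everything else is simplified):

#include <stdio.h>
#include <errno.h>

static int no_open; /* models fc->no_open */

static int send_open(void) { return -ENOSYS; } /* fs lacks an open method */

static int do_open(void)
{
	if (!no_open) {
		int err = send_open();

		if (!err)
			return 0;	/* use the server's reply */
		if (err != -ENOSYS)
			return err;	/* a real failure */
		no_open = 1;		/* remember: stop asking */
	}
	return 0; /* default open, FOPEN_KEEP_CACHE behaviour */
}

int main(void)
{
	do_open();	/* pays the ENOSYS round trip once */
	do_open();	/* short-circuits from now on */
	printf("no_open = %d\n", no_open);
	return 0;
}
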
@@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page)
687 SetPageUptodate(page); 704 SetPageUptodate(page);
688 } 705 }
689 706
690 fuse_invalidate_attr(inode); /* atime changed */ 707 fuse_invalidate_atime(inode);
691 out: 708 out:
692 unlock_page(page); 709 unlock_page(page);
693 return err; 710 return err;
@@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
716 fuse_read_update_size(inode, pos, 733 fuse_read_update_size(inode, pos,
717 req->misc.read.attr_ver); 734 req->misc.read.attr_ver);
718 } 735 }
719 fuse_invalidate_attr(inode); /* atime changed */ 736 fuse_invalidate_atime(inode);
720 } 737 }
721 738
722 for (i = 0; i < req->num_pages; i++) { 739 for (i = 0; i < req->num_pages; i++) {
@@ -2710,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2710 inode = file->f_mapping->host; 2727 inode = file->f_mapping->host;
2711 i_size = i_size_read(inode); 2728 i_size = i_size_read(inode);
2712 2729
2730 if ((rw == READ) && (offset > i_size))
2731 return 0;
2732
2713 /* optimization for short read */ 2733 /* optimization for short read */
2714 if (async_dio && rw != WRITE && offset + count > i_size) { 2734 if (async_dio && rw != WRITE && offset + count > i_size) {
2715 if (offset >= i_size) 2735 if (offset >= i_size)
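
The fuse_direct_IO hunk adds an early return for reads that start strictly beyond i_size; the offset == i_size case is already handled by the short-read optimization visible just below the added lines. A generic userspace sketch of the EOF clamp, folding both cases into one >= check for illustration:

#include <stdio.h>
#include <sys/types.h>

/* Generic EOF clamp for a direct read; illustrative only. */
static ssize_t direct_read(off_t offset, size_t count, off_t i_size)
{
	if (offset >= i_size)			/* nothing to read past EOF */
		return 0;
	if (offset + (off_t)count > i_size)	/* short read up to EOF */
		count = (size_t)(i_size - offset);
	return (ssize_t)count;	/* a real implementation would now do the I/O */
}

int main(void)
{
	printf("%zd\n", direct_read(100, 50, 80));	/* 0: starts past EOF */
	printf("%zd\n", direct_read(60, 50, 80));	/* 20: clamped to EOF */
	return 0;
}
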
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7d2730912667..2da5db2c8bdb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -485,6 +485,9 @@ struct fuse_conn {
485 * and hence races in setting them will not cause malfunction 485 * and hence races in setting them will not cause malfunction
486 */ 486 */
487 487
488 /** Is open/release not implemented by fs? */
489 unsigned no_open:1;
490
488 /** Is fsync not implemented by fs? */ 491 /** Is fsync not implemented by fs? */
489 unsigned no_fsync:1; 492 unsigned no_fsync:1;
490 493
@@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode);
788 791
789void fuse_invalidate_entry_cache(struct dentry *entry); 792void fuse_invalidate_entry_cache(struct dentry *entry);
790 793
794void fuse_invalidate_atime(struct inode *inode);
795
791/** 796/**
792 * Acquire reference to fuse_conn 797 * Acquire reference to fuse_conn
793 */ 798 */
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
deleted file mode 100644
index b3f3676796d3..000000000000
--- a/fs/generic_acl.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/*
2 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
3 *
4 * This file is released under the GPL.
5 *
6 * Generic ACL support for in-memory filesystems.
7 */
8
9#include <linux/sched.h>
10#include <linux/gfp.h>
11#include <linux/fs.h>
12#include <linux/generic_acl.h>
13#include <linux/posix_acl.h>
14#include <linux/posix_acl_xattr.h>
15
16
17static size_t
18generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
19 const char *name, size_t name_len, int type)
20{
21 struct posix_acl *acl;
22 const char *xname;
23 size_t size;
24
25 acl = get_cached_acl(dentry->d_inode, type);
26 if (!acl)
27 return 0;
28 posix_acl_release(acl);
29
30 switch (type) {
31 case ACL_TYPE_ACCESS:
32 xname = POSIX_ACL_XATTR_ACCESS;
33 break;
34 case ACL_TYPE_DEFAULT:
35 xname = POSIX_ACL_XATTR_DEFAULT;
36 break;
37 default:
38 return 0;
39 }
40 size = strlen(xname) + 1;
41 if (list && size <= list_size)
42 memcpy(list, xname, size);
43 return size;
44}
45
46static int
47generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
48 size_t size, int type)
49{
50 struct posix_acl *acl;
51 int error;
52
53 if (strcmp(name, "") != 0)
54 return -EINVAL;
55
56 acl = get_cached_acl(dentry->d_inode, type);
57 if (!acl)
58 return -ENODATA;
59 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
60 posix_acl_release(acl);
61
62 return error;
63}
64
65static int
66generic_acl_set(struct dentry *dentry, const char *name, const void *value,
67 size_t size, int flags, int type)
68{
69 struct inode *inode = dentry->d_inode;
70 struct posix_acl *acl = NULL;
71 int error;
72
73 if (strcmp(name, "") != 0)
74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP;
77 if (!inode_owner_or_capable(inode))
78 return -EPERM;
79 if (value) {
80 acl = posix_acl_from_xattr(&init_user_ns, value, size);
81 if (IS_ERR(acl))
82 return PTR_ERR(acl);
83 }
84 if (acl) {
85 error = posix_acl_valid(acl);
86 if (error)
87 goto failed;
88 switch (type) {
89 case ACL_TYPE_ACCESS:
90 error = posix_acl_equiv_mode(acl, &inode->i_mode);
91 if (error < 0)
92 goto failed;
93 inode->i_ctime = CURRENT_TIME;
94 if (error == 0) {
95 posix_acl_release(acl);
96 acl = NULL;
97 }
98 break;
99 case ACL_TYPE_DEFAULT:
100 if (!S_ISDIR(inode->i_mode)) {
101 error = -EINVAL;
102 goto failed;
103 }
104 break;
105 }
106 }
107 set_cached_acl(inode, type, acl);
108 error = 0;
109failed:
110 posix_acl_release(acl);
111 return error;
112}
113
114/**
115 * generic_acl_init - Take care of acl inheritance at @inode create time
116 *
117 * Files created inside a directory with a default ACL inherit the
118 * directory's default ACL.
119 */
120int
121generic_acl_init(struct inode *inode, struct inode *dir)
122{
123 struct posix_acl *acl = NULL;
124 int error;
125
126 if (!S_ISLNK(inode->i_mode))
127 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
128 if (acl) {
129 if (S_ISDIR(inode->i_mode))
130 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
131 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
132 if (error < 0)
133 return error;
134 if (error > 0)
135 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
136 } else {
137 inode->i_mode &= ~current_umask();
138 }
139 error = 0;
140
141 posix_acl_release(acl);
142 return error;
143}
144
145/**
146 * generic_acl_chmod - change the access acl of @inode upon chmod()
147 *
148 * A chmod also changes the permissions of the owner, group/mask, and
149 * other ACL entries.
150 */
151int
152generic_acl_chmod(struct inode *inode)
153{
154 struct posix_acl *acl;
155 int error = 0;
156
157 if (S_ISLNK(inode->i_mode))
158 return -EOPNOTSUPP;
159 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
160 if (acl) {
161 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
162 if (error)
163 return error;
164 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
165 posix_acl_release(acl);
166 }
167 return error;
168}
169
170const struct xattr_handler generic_acl_access_handler = {
171 .prefix = POSIX_ACL_XATTR_ACCESS,
172 .flags = ACL_TYPE_ACCESS,
173 .list = generic_acl_list,
174 .get = generic_acl_get,
175 .set = generic_acl_set,
176};
177
178const struct xattr_handler generic_acl_default_handler = {
179 .prefix = POSIX_ACL_XATTR_DEFAULT,
180 .flags = ACL_TYPE_DEFAULT,
181 .list = generic_acl_list,
182 .get = generic_acl_get,
183 .set = generic_acl_set,
184};
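
fs/generic_acl.c is removed as part of the 3.14 consolidation of POSIX ACL handling into generic VFS helpers; the gfs2 hunks later in this diff show the replacement pattern (get_acl/set_acl wired into inode_operations plus posix_acl_chmod()). One rule from generic_acl_set() above carries over: an ACL_TYPE_ACCESS ACL that is equivalent to plain mode bits is folded into i_mode and not stored. A small userspace model of that decision, where equiv_mode() stands in for posix_acl_equiv_mode():

#include <stdio.h>

typedef unsigned short umode_t;
struct acl { int extended; };	/* stand-in for struct posix_acl */

/* Stand-in for posix_acl_equiv_mode(): 0 if the ACL reduces to mode
 * bits, 1 if it carries extra (named user/group) entries. */
static int equiv_mode(const struct acl *a, umode_t *mode)
{
	(void)mode;
	return a->extended ? 1 : 0;
}

/* Mirrors the ACL_TYPE_ACCESS branch above: an ACL equivalent to the
 * mode is not cached; only i_mode is kept. */
static const struct acl *set_access_acl(const struct acl *a, umode_t *i_mode)
{
	if (equiv_mode(a, i_mode) == 0)
		return NULL;	/* drop the ACL, the mode says it all */
	return a;		/* keep the extended ACL cached */
}

int main(void)
{
	umode_t mode = 0644;
	struct acl plain = { .extended = 0 }, rich = { .extended = 1 };

	printf("plain -> %s\n", set_access_acl(&plain, &mode) ? "kept" : "dropped");
	printf("rich  -> %s\n", set_access_acl(&rich, &mode) ? "kept" : "dropped");
	return 0;
}
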
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f69ac0af5496..ba9456685f47 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
49 if (!ip->i_eattr) 49 if (!ip->i_eattr)
50 return NULL; 50 return NULL;
51 51
52 acl = get_cached_acl(&ip->i_inode, type);
53 if (acl != ACL_NOT_CACHED)
54 return acl;
55
56 name = gfs2_acl_name(type); 52 name = gfs2_acl_name(type);
57 if (name == NULL) 53 if (name == NULL)
58 return ERR_PTR(-EINVAL); 54 return ERR_PTR(-EINVAL);
@@ -80,7 +76,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
80 return error; 76 return error;
81} 77}
82 78
83static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) 79int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
84{ 80{
85 int error; 81 int error;
86 int len; 82 int len;
@@ -88,219 +84,49 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
88 const char *name = gfs2_acl_name(type); 84 const char *name = gfs2_acl_name(type);
89 85
90 BUG_ON(name == NULL); 86 BUG_ON(name == NULL);
91 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
92 if (len == 0)
93 return 0;
94 data = kmalloc(len, GFP_NOFS);
95 if (data == NULL)
96 return -ENOMEM;
97 error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
98 if (error < 0)
99 goto out;
100 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
101 if (!error)
102 set_cached_acl(inode, type, acl);
103out:
104 kfree(data);
105 return error;
106}
107
108int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
109{
110 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
111 struct posix_acl *acl;
112 umode_t mode = inode->i_mode;
113 int error = 0;
114
115 if (!sdp->sd_args.ar_posix_acl)
116 return 0;
117 if (S_ISLNK(inode->i_mode))
118 return 0;
119
120 acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT);
121 if (IS_ERR(acl))
122 return PTR_ERR(acl);
123 if (!acl) {
124 mode &= ~current_umask();
125 return gfs2_set_mode(inode, mode);
126 }
127
128 if (S_ISDIR(inode->i_mode)) {
129 error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
130 if (error)
131 goto out;
132 }
133
134 error = posix_acl_create(&acl, GFP_NOFS, &mode);
135 if (error < 0)
136 return error;
137 87
138 if (error == 0)
139 goto munge;
140
141 error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
142 if (error)
143 goto out;
144munge:
145 error = gfs2_set_mode(inode, mode);
146out:
147 posix_acl_release(acl);
148 return error;
149}
150
151int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
152{
153 struct inode *inode = &ip->i_inode;
154 struct posix_acl *acl;
155 char *data;
156 unsigned int len;
157 int error;
158
159 acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS);
160 if (IS_ERR(acl))
161 return PTR_ERR(acl);
162 if (!acl)
163 return gfs2_setattr_simple(inode, attr);
164
165 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
166 if (error)
167 return error;
168
169 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
170 data = kmalloc(len, GFP_NOFS);
171 error = -ENOMEM;
172 if (data == NULL)
173 goto out;
174 posix_acl_to_xattr(&init_user_ns, acl, data, len);
175 error = gfs2_xattr_acl_chmod(ip, attr, data);
176 kfree(data);
177 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
178
179out:
180 posix_acl_release(acl);
181 return error;
182}
183
184static int gfs2_acl_type(const char *name)
185{
186 if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
187 return ACL_TYPE_ACCESS;
188 if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
189 return ACL_TYPE_DEFAULT;
190 return -EINVAL;
191}
192
193static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
194 void *buffer, size_t size, int xtype)
195{
196 struct inode *inode = dentry->d_inode;
197 struct gfs2_sbd *sdp = GFS2_SB(inode);
198 struct posix_acl *acl;
199 int type;
200 int error;
201
202 if (!sdp->sd_args.ar_posix_acl)
203 return -EOPNOTSUPP;
204
205 type = gfs2_acl_type(name);
206 if (type < 0)
207 return type;
208
209 acl = gfs2_get_acl(inode, type);
210 if (IS_ERR(acl))
211 return PTR_ERR(acl);
212 if (acl == NULL)
213 return -ENODATA;
214
215 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
216 posix_acl_release(acl);
217
218 return error;
219}
220
221static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
222 const void *value, size_t size, int flags,
223 int xtype)
224{
225 struct inode *inode = dentry->d_inode;
226 struct gfs2_sbd *sdp = GFS2_SB(inode);
227 struct posix_acl *acl = NULL;
228 int error = 0, type;
229
230 if (!sdp->sd_args.ar_posix_acl)
231 return -EOPNOTSUPP;
232
233 type = gfs2_acl_type(name);
234 if (type < 0)
235 return type;
236 if (flags & XATTR_CREATE)
237 return -EINVAL;
238 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
239 return value ? -EACCES : 0;
240 if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
241 return -EPERM;
242 if (S_ISLNK(inode->i_mode))
243 return -EOPNOTSUPP;
244
245 if (!value)
246 goto set_acl;
247
248 acl = posix_acl_from_xattr(&init_user_ns, value, size);
249 if (!acl) {
250 /*
251 * acl_set_file(3) may request that we set default ACLs with
252 * zero length -- defend (gracefully) against that here.
253 */
254 goto out;
255 }
256 if (IS_ERR(acl)) {
257 error = PTR_ERR(acl);
258 goto out;
259 }
260
261 error = posix_acl_valid(acl);
262 if (error)
263 goto out_release;
264
265 error = -EINVAL;
266 if (acl->a_count > GFS2_ACL_MAX_ENTRIES) 88 if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
267 goto out_release; 89 return -EINVAL;
268 90
269 if (type == ACL_TYPE_ACCESS) { 91 if (type == ACL_TYPE_ACCESS) {
270 umode_t mode = inode->i_mode; 92 umode_t mode = inode->i_mode;
93
271 error = posix_acl_equiv_mode(acl, &mode); 94 error = posix_acl_equiv_mode(acl, &mode);
95 if (error < 0)
96 return error;
272 97
273 if (error <= 0) { 98 if (error == 0)
274 posix_acl_release(acl);
275 acl = NULL; 99 acl = NULL;
276 100
277 if (error < 0)
278 return error;
279 }
280
281 error = gfs2_set_mode(inode, mode); 101 error = gfs2_set_mode(inode, mode);
282 if (error) 102 if (error)
283 goto out_release; 103 return error;
284 } 104 }
285 105
286set_acl: 106 if (acl) {
287 error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); 107 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
288 if (!error) { 108 if (len == 0)
289 if (acl) 109 return 0;
290 set_cached_acl(inode, type, acl); 110 data = kmalloc(len, GFP_NOFS);
291 else 111 if (data == NULL)
292 forget_cached_acl(inode, type); 112 return -ENOMEM;
113 error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
114 if (error < 0)
115 goto out;
116 } else {
117 data = NULL;
118 len = 0;
293 } 119 }
294out_release: 120
295 posix_acl_release(acl); 121 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
122 if (error)
123 goto out;
124
125 if (acl)
126 set_cached_acl(inode, type, acl);
127 else
128 forget_cached_acl(inode, type);
296out: 129out:
130 kfree(data);
297 return error; 131 return error;
298} 132}
299
300const struct xattr_handler gfs2_xattr_system_handler = {
301 .prefix = XATTR_SYSTEM_PREFIX,
302 .flags = GFS2_EATYPE_SYS,
303 .get = gfs2_xattr_system_get,
304 .set = gfs2_xattr_system_set,
305};
306
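
The rewritten gfs2_set_acl() keeps the classic two-call serialization: posix_acl_to_xattr() with a NULL buffer reports the required size, the caller allocates, and a second call fills the buffer. A self-contained sketch of the same pattern, with a stand-in serializer instead of the real posix_acl_to_xattr():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in serializer: "encode" a string; NULL buffer means size probe. */
static long to_xattr(const char *acl, void *buf, size_t size)
{
	size_t need = strlen(acl) + 1;

	if (buf == NULL)
		return (long)need;
	if (size < need)
		return -1;		/* -ERANGE in the real API */
	memcpy(buf, acl, need);
	return (long)need;
}

int main(void)
{
	const char *acl = "u::rwx,g::r-x,o::r--";
	long len = to_xattr(acl, NULL, 0);	/* first call: size probe */
	char *data = malloc((size_t)len);

	if (data == NULL)
		return 1;
	to_xattr(acl, data, (size_t)len);	/* second call: fill buffer */
	printf("stored %ld bytes: %s\n", len, data);
	free(data);
	return 0;
}
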
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 0da38dc7efec..301260c999ba 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -17,8 +17,6 @@
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); 19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler;
23 21
24#endif /* __ACL_DOT_H__ */ 22#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 73f3e4ee4037..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1032,8 +1032,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); 1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1033 rv = filemap_write_and_wait_range(mapping, lstart, end); 1033 rv = filemap_write_and_wait_range(mapping, lstart, end);
1034 if (rv) 1034 if (rv)
1035 return rv; 1035 goto out;
1036 truncate_inode_pages_range(mapping, lstart, end); 1036 if (rw == WRITE)
1037 truncate_inode_pages_range(mapping, lstart, end);
1037 } 1038 }
1038 1039
1039 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
@@ -1080,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1080 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1081 } while(bh != head); 1082 } while(bh != head);
1082 spin_unlock(&sdp->sd_ail_lock); 1083 spin_unlock(&sdp->sd_ail_lock);
1083 gfs2_log_unlock(sdp);
1084 1084
1085 head = bh = page_buffers(page); 1085 head = bh = page_buffers(page);
1086 do { 1086 do {
1087 gfs2_log_lock(sdp);
1088 bd = bh->b_private; 1087 bd = bh->b_private;
1089 if (bd) { 1088 if (bd) {
1090 gfs2_assert_warn(sdp, bd->bd_bh == bh); 1089 gfs2_assert_warn(sdp, bd->bd_bh == bh);
1091 if (!list_empty(&bd->bd_list)) { 1090 if (!list_empty(&bd->bd_list))
1092 if (!buffer_pinned(bh)) 1091 list_del_init(&bd->bd_list);
1093 list_del_init(&bd->bd_list); 1092 bd->bd_bh = NULL;
1094 else
1095 bd = NULL;
1096 }
1097 if (bd)
1098 bd->bd_bh = NULL;
1099 bh->b_private = NULL; 1093 bh->b_private = NULL;
1100 }
1101 gfs2_log_unlock(sdp);
1102 if (bd)
1103 kmem_cache_free(gfs2_bufdata_cachep, bd); 1094 kmem_cache_free(gfs2_bufdata_cachep, bd);
1095 }
1104 1096
1105 bh = bh->b_this_page; 1097 bh = bh->b_this_page;
1106 } while (bh != head); 1098 } while (bh != head);
1099 gfs2_log_unlock(sdp);
1107 1100
1108 return try_to_free_buffers(page); 1101 return try_to_free_buffers(page);
1109 1102
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
834 struct gfs2_leaf *leaf; 834 struct gfs2_leaf *leaf;
835 struct gfs2_dirent *dent; 835 struct gfs2_dirent *dent;
836 struct qstr name = { .name = "" }; 836 struct qstr name = { .name = "" };
837 struct timespec tv = CURRENT_TIME;
837 838
838 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); 839 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
839 if (error) 840 if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
850 leaf->lf_entries = 0; 851 leaf->lf_entries = 0;
851 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); 852 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
852 leaf->lf_next = 0; 853 leaf->lf_next = 0;
853 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); 854 leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
855 leaf->lf_dist = cpu_to_be32(1);
856 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
857 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
858 memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
854 dent = (struct gfs2_dirent *)(leaf+1); 859 dent = (struct gfs2_dirent *)(leaf+1);
855 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); 860 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
856 *pbh = bh; 861 *pbh = bh;
@@ -1612,11 +1617,31 @@ out:
1612 return ret; 1617 return ret;
1613} 1618}
1614 1619
1620/**
1621 * dir_new_leaf - Add a new leaf onto hash chain
1622 * @inode: The directory
1623 * @name: The name we are adding
1624 *
1625 * This adds a new dir leaf onto an existing leaf when there is not
1626 * enough space to add a new dir entry. This is a last resort after
1627 * we've expanded the hash table to max size and also split existing
1628 * leaf blocks, so it will only occur for very large directories.
1629 *
1630 * The dist parameter is set to 1 for leaf blocks directly attached
1631 * to the hash table, 2 for one layer of indirection, 3 for two layers
1632 * etc. We are thus able to tell the difference between an old leaf
1633 * with dist set to zero (i.e. "don't know") and a new one where we
1634 * set this information for debug/fsck purposes.
1635 *
1636 * Returns: 0 on success, or -ve on error
1637 */
1638
1615static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1639static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1616{ 1640{
1617 struct buffer_head *bh, *obh; 1641 struct buffer_head *bh, *obh;
1618 struct gfs2_inode *ip = GFS2_I(inode); 1642 struct gfs2_inode *ip = GFS2_I(inode);
1619 struct gfs2_leaf *leaf, *oleaf; 1643 struct gfs2_leaf *leaf, *oleaf;
1644 u32 dist = 1;
1620 int error; 1645 int error;
1621 u32 index; 1646 u32 index;
1622 u64 bn; 1647 u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1626 if (error) 1651 if (error)
1627 return error; 1652 return error;
1628 do { 1653 do {
1654 dist++;
1629 oleaf = (struct gfs2_leaf *)obh->b_data; 1655 oleaf = (struct gfs2_leaf *)obh->b_data;
1630 bn = be64_to_cpu(oleaf->lf_next); 1656 bn = be64_to_cpu(oleaf->lf_next);
1631 if (!bn) 1657 if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1643 brelse(obh); 1669 brelse(obh);
1644 return -ENOSPC; 1670 return -ENOSPC;
1645 } 1671 }
1672 leaf->lf_dist = cpu_to_be32(dist);
1646 oleaf->lf_next = cpu_to_be64(bh->b_blocknr); 1673 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1647 brelse(bh); 1674 brelse(bh);
1648 brelse(obh); 1675 brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1659 1686
1660/** 1687/**
1661 * gfs2_dir_add - Add new filename into directory 1688 * gfs2_dir_add - Add new filename into directory
1662 * @dip: The GFS2 inode 1689 * @inode: The directory inode
1663 * @filename: The new name 1690 * @name: The new name
1664 * @inode: The inode number of the entry 1691 * @nip: The GFS2 inode to be linked in to the directory
1665 * @type: The type of the entry 1692 * @da: The directory addition info
1693 *
1694 * If the call to gfs2_diradd_alloc_required resulted in there being
1695 * no need to allocate any new directory blocks, then it will contain
1696 * a pointer to the directory entry and the bh in which it resides. We
1697 * can use that without having to repeat the search. If there was no
1698 * free space, then we must now create more space.
1666 * 1699 *
1667 * Returns: 0 on success, error code on failure 1700 * Returns: 0 on success, error code on failure
1668 */ 1701 */
1669 1702
1670int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1703int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1671 const struct gfs2_inode *nip) 1704 const struct gfs2_inode *nip, struct gfs2_diradd *da)
1672{ 1705{
1673 struct gfs2_inode *ip = GFS2_I(inode); 1706 struct gfs2_inode *ip = GFS2_I(inode);
1674 struct buffer_head *bh; 1707 struct buffer_head *bh = da->bh;
1675 struct gfs2_dirent *dent; 1708 struct gfs2_dirent *dent = da->dent;
1709 struct timespec tv;
1676 struct gfs2_leaf *leaf; 1710 struct gfs2_leaf *leaf;
1677 int error; 1711 int error;
1678 1712
1679 while(1) { 1713 while(1) {
1680 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 1714 if (da->bh == NULL) {
1681 &bh); 1715 dent = gfs2_dirent_search(inode, name,
1716 gfs2_dirent_find_space, &bh);
1717 }
1682 if (dent) { 1718 if (dent) {
1683 if (IS_ERR(dent)) 1719 if (IS_ERR(dent))
1684 return PTR_ERR(dent); 1720 return PTR_ERR(dent);
1685 dent = gfs2_init_dirent(inode, dent, name, bh); 1721 dent = gfs2_init_dirent(inode, dent, name, bh);
1686 gfs2_inum_out(nip, dent); 1722 gfs2_inum_out(nip, dent);
1687 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1724 tv = CURRENT_TIME;
1688 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1689 leaf = (struct gfs2_leaf *)bh->b_data; 1726 leaf = (struct gfs2_leaf *)bh->b_data;
1690 be16_add_cpu(&leaf->lf_entries, 1); 1727 be16_add_cpu(&leaf->lf_entries, 1);
1728 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1729 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1691 } 1730 }
1731 da->dent = NULL;
1732 da->bh = NULL;
1692 brelse(bh); 1733 brelse(bh);
1693 ip->i_entries++; 1734 ip->i_entries++;
1694 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1735 ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
1695 if (S_ISDIR(nip->i_inode.i_mode)) 1736 if (S_ISDIR(nip->i_inode.i_mode))
1696 inc_nlink(&ip->i_inode); 1737 inc_nlink(&ip->i_inode);
1697 mark_inode_dirty(inode); 1738 mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1742 const struct qstr *name = &dentry->d_name; 1783 const struct qstr *name = &dentry->d_name;
1743 struct gfs2_dirent *dent, *prev = NULL; 1784 struct gfs2_dirent *dent, *prev = NULL;
1744 struct buffer_head *bh; 1785 struct buffer_head *bh;
1786 struct timespec tv = CURRENT_TIME;
1745 1787
1746 /* Returns _either_ the entry (if its first in block) or the 1788 /* Returns _either_ the entry (if its first in block) or the
1747 previous entry otherwise */ 1789 previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1767 if (!entries) 1809 if (!entries)
1768 gfs2_consist_inode(dip); 1810 gfs2_consist_inode(dip);
1769 leaf->lf_entries = cpu_to_be16(--entries); 1811 leaf->lf_entries = cpu_to_be16(--entries);
1812 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1813 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1770 } 1814 }
1771 brelse(bh); 1815 brelse(bh);
1772 1816
1773 if (!dip->i_entries) 1817 if (!dip->i_entries)
1774 gfs2_consist_inode(dip); 1818 gfs2_consist_inode(dip);
1775 dip->i_entries--; 1819 dip->i_entries--;
1776 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1820 dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
1777 if (S_ISDIR(dentry->d_inode->i_mode)) 1821 if (S_ISDIR(dentry->d_inode->i_mode))
1778 drop_nlink(&dip->i_inode); 1822 drop_nlink(&dip->i_inode);
1779 mark_inode_dirty(&dip->i_inode); 1823 mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
2017 * gfs2_diradd_alloc_required - find if adding entry will require an allocation 2061 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
2018 * @ip: the file being written to 2062 * @ip: the file being written to
2019 * @filename: the filename that's going to be added 2063
2064 * @da: The structure to return dir alloc info
2020 * 2065 *
2021 * Returns: 1 if alloc required, 0 if not, -ve on error 2066 * Returns: 0 if ok, -ve on error
2022 */ 2067 */
2023 2068
2024int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) 2069int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2070 struct gfs2_diradd *da)
2025{ 2071{
2072 struct gfs2_inode *ip = GFS2_I(inode);
2073 struct gfs2_sbd *sdp = GFS2_SB(inode);
2074 const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
2026 struct gfs2_dirent *dent; 2075 struct gfs2_dirent *dent;
2027 struct buffer_head *bh; 2076 struct buffer_head *bh;
2028 2077
2078 da->nr_blocks = 0;
2079 da->bh = NULL;
2080 da->dent = NULL;
2081
2029 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); 2082 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2030 if (!dent) { 2083 if (!dent) {
2031 return 1; 2084 da->nr_blocks = sdp->sd_max_dirres;
2085 if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
2086 (GFS2_DIRENT_SIZE(name->len) < extra))
2087 da->nr_blocks = 1;
2088 return 0;
2032 } 2089 }
2033 if (IS_ERR(dent)) 2090 if (IS_ERR(dent))
2034 return PTR_ERR(dent); 2091 return PTR_ERR(dent);
2035 brelse(bh); 2092 da->bh = bh;
2093 da->dent = dent;
2036 return 0; 2094 return 0;
2037} 2095}
2038 2096
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
16struct inode; 16struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19struct buffer_head;
20struct gfs2_dirent;
21
22struct gfs2_diradd {
23 unsigned nr_blocks;
24 struct gfs2_dirent *dent;
25 struct buffer_head *bh;
26};
19 27
20extern struct inode *gfs2_dir_search(struct inode *dir, 28extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename, 29 const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 31extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
24 const struct gfs2_inode *ip); 32 const struct gfs2_inode *ip);
25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 33extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
26 const struct gfs2_inode *ip); 34 const struct gfs2_inode *ip, struct gfs2_diradd *da);
35static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
36{
37 if (da->bh)
38 brelse(da->bh);
39 da->bh = NULL;
40}
27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 41extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, 42extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
29 struct file_ra_state *f_ra); 43 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
33extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 47extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
34 48
35extern int gfs2_diradd_alloc_required(struct inode *dir, 49extern int gfs2_diradd_alloc_required(struct inode *dir,
36 const struct qstr *filename); 50 const struct qstr *filename,
51 struct gfs2_diradd *da);
37extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 52extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
38 struct buffer_head **bhp); 53 struct buffer_head **bhp);
39extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); 54extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
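
The new struct gfs2_diradd lets gfs2_diradd_alloc_required() hand its search result (the dirent plus the buffer head holding it) to gfs2_dir_add(), which then skips a second directory search; gfs2_dir_no_add() releases the cached buffer on error paths. A userspace model of this search-once, commit-later pattern; names are illustrative, not GFS2 code:

#include <stdio.h>

struct diradd { int slot; };	/* -1 == no cached result */

static const char *table[4] = { "a", NULL, "c", NULL };

/* Pre-flight: find a free slot and remember it
 * (plays the role of gfs2_diradd_alloc_required()). */
static int add_required(struct diradd *da)
{
	da->slot = -1;
	for (int i = 0; i < 4; i++)
		if (table[i] == NULL) { da->slot = i; return 0; }
	return 0;	/* no slot: the real code sets da->nr_blocks instead */
}

/* Commit: reuse the cached slot (plays the role of gfs2_dir_add()). */
static int dir_add(struct diradd *da, const char *name)
{
	if (da->slot < 0)
		return -1;	/* would need to allocate more space */
	table[da->slot] = name;
	da->slot = -1;		/* consume the cached result */
	return 0;
}

int main(void)
{
	struct diradd da;

	add_required(&da);
	printf("add: %d (cached slot reused, no second search)\n",
	       dir_add(&da, "b"));
	return 0;
}
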
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6f7a47c05259..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1552 glock_hash_walk(thaw_glock, sdp); 1552 glock_hash_walk(thaw_glock, sdp);
1553} 1553}
1554 1554
1555static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) 1555static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1556{ 1556{
1557 int ret;
1558 spin_lock(&gl->gl_spin); 1557 spin_lock(&gl->gl_spin);
1559 ret = gfs2_dump_glock(seq, gl); 1558 gfs2_dump_glock(seq, gl);
1560 spin_unlock(&gl->gl_spin); 1559 spin_unlock(&gl->gl_spin);
1561 return ret;
1562} 1560}
1563 1561
1564static void dump_glock_func(struct gfs2_glock *gl) 1562static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,10 +1645,9 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1647 * @seq: the seq_file struct 1645 * @seq: the seq_file struct
1648 * @gh: the glock holder 1646 * @gh: the glock holder
1649 * 1647 *
1650 * Returns: 0 on success, -ENOBUFS when we run out of space
1651 */ 1648 */
1652 1649
1653static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1650static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1654{ 1651{
1655 struct task_struct *gh_owner = NULL; 1652 struct task_struct *gh_owner = NULL;
1656 char flags_buf[32]; 1653 char flags_buf[32];
@@ -1666,7 +1663,6 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1666 gh_owner ? gh_owner->comm : "(ended)", 1663 gh_owner ? gh_owner->comm : "(ended)",
1667 (void *)gh->gh_ip); 1664 (void *)gh->gh_ip);
1668 rcu_read_unlock(); 1665 rcu_read_unlock();
1669 return 0;
1670} 1666}
1671 1667
1672static const char *gflags2str(char *buf, const struct gfs2_glock *gl) 1668static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1721,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1721 * example. The fields are n = number (id of the object), f = flags, 1717
1722 * t = type, s = state, r = refcount, e = error, p = pid. 1718 * t = type, s = state, r = refcount, e = error, p = pid.
1723 * 1719 *
1724 * Returns: 0 on success, -ENOBUFS when we run out of space
1725 */ 1720 */
1726 1721
1727int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1722void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1728{ 1723{
1729 const struct gfs2_glock_operations *glops = gl->gl_ops; 1724 const struct gfs2_glock_operations *glops = gl->gl_ops;
1730 unsigned long long dtime; 1725 unsigned long long dtime;
1731 const struct gfs2_holder *gh; 1726 const struct gfs2_holder *gh;
1732 char gflags_buf[32]; 1727 char gflags_buf[32];
1733 int error = 0;
1734 1728
1735 dtime = jiffies - gl->gl_demote_time; 1729 dtime = jiffies - gl->gl_demote_time;
1736 dtime *= 1000000/HZ; /* demote time in uSec */ 1730 dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1747,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1747 atomic_read(&gl->gl_revokes), 1741 atomic_read(&gl->gl_revokes),
1748 (int)gl->gl_lockref.count, gl->gl_hold_time); 1742 (int)gl->gl_lockref.count, gl->gl_hold_time);
1749 1743
1750 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1744 list_for_each_entry(gh, &gl->gl_holders, gh_list)
1751 error = dump_holder(seq, gh); 1745 dump_holder(seq, gh);
1752 if (error) 1746
1753 goto out;
1754 }
1755 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) 1747 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1756 error = glops->go_dump(seq, gl); 1748 glops->go_dump(seq, gl);
1757out:
1758 return error;
1759} 1749}
1760 1750
1761static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) 1751static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1953,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1953 1943
1954static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1944static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
1955{ 1945{
1956 return dump_glock(seq, iter_ptr); 1946 dump_glock(seq, iter_ptr);
1947 return 0;
1957} 1948}
1958 1949
1959static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) 1950static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
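
The int-to-void conversion of dump_glock(), dump_holder() and the ->go_dump() method works because seq_file detects buffer overflow itself and re-invokes ->show() with a larger buffer, so propagating -ENOBUFS through every dump helper carried no information. A compact model of a show-style callback that ignores overflow and leaves the retry to the core; the seq struct and seq_puts_() are stand-ins, not the kernel seq_file API:

#include <stdio.h>
#include <string.h>

struct seq { char buf[16]; size_t len; int overflow; };

/* Stand-in for seq_puts(): on overflow it just sets a flag. */
static void seq_puts_(struct seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n > sizeof(s->buf)) { s->overflow = 1; return; }
	memcpy(s->buf + s->len, str, n);
	s->len += n;
}

/* ->show()-style callback: returns nothing; overflow is the core's problem. */
static void dump(struct seq *s)
{
	seq_puts_(s, "G: state=EX holders=2\n");
}

int main(void)
{
	struct seq s = { .len = 0, .overflow = 0 };

	dump(&s);
	if (s.overflow)
		printf("core would retry with a bigger buffer\n");
	else
		printf("%.*s", (int)s.len, s.buf);
	return 0;
}
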
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 202extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) 203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
204extern __printf(2, 3) 204extern __printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f88dcd925010..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
133 133
134static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct gfs2_sbd *sdp = gl->gl_sbd;
137 struct address_space *mapping = &sdp->sd_aspace;
137 struct gfs2_rgrpd *rgd; 138 struct gfs2_rgrpd *rgd;
138 int error; 139 int error;
139 140
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
141 return; 142 return;
142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); 143 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 144
144 gfs2_log_flush(gl->gl_sbd, gl); 145 gfs2_log_flush(sdp, gl);
145 filemap_fdatawrite(metamapping); 146 filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
146 error = filemap_fdatawait(metamapping); 147 error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
147 mapping_set_error(metamapping, error); 148 mapping_set_error(mapping, error);
148 gfs2_ail_empty_gl(gl); 149 gfs2_ail_empty_gl(gl);
149 150
150 spin_lock(&gl->gl_spin); 151 spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
166 167
167static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 168static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 169{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 170 struct gfs2_sbd *sdp = gl->gl_sbd;
171 struct address_space *mapping = &sdp->sd_aspace;
170 172
171 WARN_ON_ONCE(!(flags & DIO_METADATA)); 173 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 174 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 175 truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
174 176
175 if (gl->gl_object) { 177 if (gl->gl_object) {
176 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; 178 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -435,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
435 * @seq: The iterator 437 * @seq: The iterator
436 * @ip: the inode 438 * @ip: the inode
437 * 439 *
438 * Returns: 0 on success, -ENOBUFS when we run out of space
439 */ 440 */
440 441
441static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) 442static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
442{ 443{
443 const struct gfs2_inode *ip = gl->gl_object; 444 const struct gfs2_inode *ip = gl->gl_object;
444 if (ip == NULL) 445 if (ip == NULL)
445 return 0; 446 return;
446 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", 447 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
447 (unsigned long long)ip->i_no_formal_ino, 448 (unsigned long long)ip->i_no_formal_ino,
448 (unsigned long long)ip->i_no_addr, 449 (unsigned long long)ip->i_no_addr,
449 IF2DT(ip->i_inode.i_mode), ip->i_flags, 450 IF2DT(ip->i_inode.i_mode), ip->i_flags,
450 (unsigned int)ip->i_diskflags, 451 (unsigned int)ip->i_diskflags,
451 (unsigned long long)i_size_read(&ip->i_inode)); 452 (unsigned long long)i_size_read(&ip->i_inode));
452 return 0;
453} 453}
454 454
455/** 455/**
@@ -558,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
558 .go_unlock = gfs2_rgrp_go_unlock, 558 .go_unlock = gfs2_rgrp_go_unlock,
559 .go_dump = gfs2_rgrp_dump, 559 .go_dump = gfs2_rgrp_dump,
560 .go_type = LM_TYPE_RGRP, 560 .go_type = LM_TYPE_RGRP,
561 .go_flags = GLOF_ASPACE | GLOF_LVB, 561 .go_flags = GLOF_LVB,
562}; 562};
563 563
564const struct gfs2_glock_operations gfs2_trans_glops = { 564const struct gfs2_glock_operations gfs2_trans_glops = {
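
These glops.c hunks follow from the incore.h change below: rgrp glocks lose their private address_space (GLOF_ASPACE is dropped) and share the per-superblock sd_aspace, so sync and invalidation switch to the ranged variants, filemap_fdatawrite_range() and truncate_inode_pages_range(), bounded by the new gl_vm window. A toy model of per-lock byte ranges over one shared mapping, purely for illustration:

#include <stdio.h>

struct window { long start, end; };

/* Stand-in for the ranged write/wait pair: each lock touches only
 * its own slice of the shared mapping. */
static void sync_range(const char *who, const struct window *w)
{
	printf("%s syncs bytes [%ld, %ld] of the shared mapping\n",
	       who, w->start, w->end);
}

int main(void)
{
	struct window rgrp_a = { 0, 4095 }, rgrp_b = { 4096, 8191 };

	sync_range("rgrp A", &rgrp_a);
	sync_range("rgrp B", &rgrp_b);
	return 0;
}
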
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
93 struct gfs2_rgrp_lvb *rd_rgl; 93 struct gfs2_rgrp_lvb *rd_rgl;
94 u32 rd_last_alloc; 94 u32 rd_last_alloc;
95 u32 rd_flags; 95 u32 rd_flags;
96 u32 rd_extfail_pt; /* extent failure point */
96#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 98#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 99#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
217 int (*go_demote_ok) (const struct gfs2_glock *gl); 218 int (*go_demote_ok) (const struct gfs2_glock *gl);
218 int (*go_lock) (struct gfs2_holder *gh); 219 int (*go_lock) (struct gfs2_holder *gh);
219 void (*go_unlock) (struct gfs2_holder *gh); 220 void (*go_unlock) (struct gfs2_holder *gh);
220 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 221 void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
221 void (*go_callback)(struct gfs2_glock *gl, bool remote); 222 void (*go_callback)(struct gfs2_glock *gl, bool remote);
222 const int go_type; 223 const int go_type;
223 const unsigned long go_flags; 224 const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
350 atomic_t gl_ail_count; 351 atomic_t gl_ail_count;
351 atomic_t gl_revokes; 352 atomic_t gl_revokes;
352 struct delayed_work gl_work; 353 struct delayed_work gl_work;
353 struct work_struct gl_delete; 354 union {
355 /* For inode and iopen glocks only */
356 struct work_struct gl_delete;
357 /* For rgrp glocks only */
358 struct {
359 loff_t start;
360 loff_t end;
361 } gl_vm;
362 };
354 struct rcu_head gl_rcu; 363 struct rcu_head gl_rcu;
355}; 364};
356 365
@@ -419,10 +428,13 @@ enum {
419}; 428};
420 429
421struct gfs2_quota_data { 430struct gfs2_quota_data {
431 struct hlist_bl_node qd_hlist;
422 struct list_head qd_list; 432 struct list_head qd_list;
423 struct kqid qd_id; 433 struct kqid qd_id;
434 struct gfs2_sbd *qd_sbd;
424 struct lockref qd_lockref; 435 struct lockref qd_lockref;
425 struct list_head qd_lru; 436 struct list_head qd_lru;
437 unsigned qd_hash;
426 438
427 unsigned long qd_flags; /* QDF_... */ 439 unsigned long qd_flags; /* QDF_... */
428 440
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
441 453
442 u64 qd_sync_gen; 454 u64 qd_sync_gen;
443 unsigned long qd_last_warn; 455 unsigned long qd_last_warn;
456 struct rcu_head qd_rcu;
444}; 457};
445 458
446struct gfs2_trans { 459struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
720 spinlock_t sd_trunc_lock; 733 spinlock_t sd_trunc_lock;
721 734
722 unsigned int sd_quota_slots; 735 unsigned int sd_quota_slots;
723 unsigned int sd_quota_chunks; 736 unsigned long *sd_quota_bitmap;
724 unsigned char **sd_quota_bitmap; 737 spinlock_t sd_bitmap_lock;
725 738
726 u64 sd_quota_sync_gen; 739 u64 sd_quota_sync_gen;
727 740
728 /* Log stuff */ 741 /* Log stuff */
729 742
743 struct address_space sd_aspace;
744
730 spinlock_t sd_log_lock; 745 spinlock_t sd_log_lock;
731 746
732 struct gfs2_trans *sd_log_tr; 747 struct gfs2_trans *sd_log_tr;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..5c524180c98e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
149 ip = GFS2_I(inode); 149 ip = GFS2_I(inode);
150 150
151 if (!inode) 151 if (!inode)
152 return ERR_PTR(-ENOBUFS); 152 return ERR_PTR(-ENOMEM);
153 153
154 if (inode->i_state & I_NEW) { 154 if (inode->i_state & I_NEW) {
155 struct gfs2_sbd *sdp = GFS2_SB(inode); 155 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
469 brelse(dibh); 469 brelse(dibh);
470} 470}
471 471
472/**
473 * gfs2_trans_da_blocks - Calculate number of blocks to link inode
474 * @dip: The directory we are linking into
475 * @da: The dir add information
476 * @nr_inodes: The number of inodes involved
477 *
 478 * This calculates the number of blocks we need to reserve in a
479 * transaction to link @nr_inodes into a directory. In most cases
480 * @nr_inodes will be 2 (the directory plus the inode being linked in)
481 * but in case of rename, 4 may be required.
482 *
483 * Returns: Number of blocks
484 */
485
486static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
487 const struct gfs2_diradd *da,
488 unsigned nr_inodes)
489{
490 return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
491 (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
492}
493
472static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, 494static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
473 struct gfs2_inode *ip, int arq) 495 struct gfs2_inode *ip, struct gfs2_diradd *da)
474{ 496{
475 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 497 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
476 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 498 struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
477 int error; 499 int error;
478 500
479 if (arq) { 501 if (da->nr_blocks) {
480 error = gfs2_quota_lock_check(dip); 502 error = gfs2_quota_lock_check(dip);
481 if (error) 503 if (error)
482 goto fail_quota_locks; 504 goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
485 if (error) 507 if (error)
486 goto fail_quota_locks; 508 goto fail_quota_locks;
487 509
488 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 510 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
489 dip->i_rgd->rd_length +
490 2 * RES_DINODE +
491 RES_STATFS + RES_QUOTA, 0);
492 if (error) 511 if (error)
493 goto fail_ipreserv; 512 goto fail_ipreserv;
494 } else { 513 } else {
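
gfs2_trans_da_blks() centralizes reservation arithmetic that was previously open-coded at each call site: da->nr_blocks, plus the resource-group blocks those allocations may dirty, plus RES_DINODE per inode touched, plus quota and statfs blocks. A worked example with assumed RES_* values (not GFS2's actual constants) and a fixed stand-in for gfs2_rg_blocks():

#include <stdio.h>

enum { RES_DINODE = 1, RES_QUOTA = 1, RES_STATFS = 1 };	/* assumed values */

static unsigned trans_da_blks(unsigned nr_blocks, unsigned rg_blocks,
			      unsigned nr_inodes)
{
	return nr_blocks + rg_blocks + nr_inodes * RES_DINODE
	       + RES_QUOTA + RES_STATFS;
}

int main(void)
{
	/* link: directory plus the inode being linked -> nr_inodes == 2 */
	printf("link  : %u blocks\n", trans_da_blks(3, 2, 2));
	/* rename worst case -> nr_inodes == 4, as the kernel-doc notes */
	printf("rename: %u blocks\n", trans_da_blks(3, 2, 4));
	return 0;
}
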
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
497 goto fail_quota_locks; 516 goto fail_quota_locks;
498 } 517 }
499 518
500 error = gfs2_dir_add(&dip->i_inode, name, ip); 519 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
501 if (error) 520 if (error)
502 goto fail_end_trans; 521 goto fail_end_trans;
503 522
@@ -552,6 +571,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
552 unsigned int size, int excl, int *opened) 571 unsigned int size, int excl, int *opened)
553{ 572{
554 const struct qstr *name = &dentry->d_name; 573 const struct qstr *name = &dentry->d_name;
574 struct posix_acl *default_acl, *acl;
555 struct gfs2_holder ghs[2]; 575 struct gfs2_holder ghs[2];
556 struct inode *inode = NULL; 576 struct inode *inode = NULL;
557 struct gfs2_inode *dip = GFS2_I(dir), *ip; 577 struct gfs2_inode *dip = GFS2_I(dir), *ip;
@@ -560,7 +580,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
560 struct dentry *d; 580 struct dentry *d;
561 int error; 581 int error;
562 u32 aflags = 0; 582 u32 aflags = 0;
563 int arq; 583 struct gfs2_diradd da = { .bh = NULL, };
564 584
565 if (!name->len || name->len > GFS2_FNAMESIZE) 585 if (!name->len || name->len > GFS2_FNAMESIZE)
566 return -ENAMETOOLONG; 586 return -ENAMETOOLONG;
@@ -585,6 +605,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
585 error = PTR_ERR(inode); 605 error = PTR_ERR(inode);
586 if (!IS_ERR(inode)) { 606 if (!IS_ERR(inode)) {
587 d = d_splice_alias(inode, dentry); 607 d = d_splice_alias(inode, dentry);
608 error = PTR_ERR(d);
609 if (IS_ERR(d))
610 goto fail_gunlock;
588 error = 0; 611 error = 0;
589 if (file) { 612 if (file) {
590 if (S_ISREG(inode->i_mode)) { 613 if (S_ISREG(inode->i_mode)) {
@@ -602,7 +625,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
602 goto fail_gunlock; 625 goto fail_gunlock;
603 } 626 }
604 627
605 arq = error = gfs2_diradd_alloc_required(dir, name); 628 error = gfs2_diradd_alloc_required(dir, name, &da);
606 if (error < 0) 629 if (error < 0)
607 goto fail_gunlock; 630 goto fail_gunlock;
608 631
@@ -611,10 +634,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
611 if (!inode) 634 if (!inode)
612 goto fail_gunlock; 635 goto fail_gunlock;
613 636
637 error = posix_acl_create(dir, &mode, &default_acl, &acl);
638 if (error)
639 goto fail_free_vfs_inode;
640
614 ip = GFS2_I(inode); 641 ip = GFS2_I(inode);
615 error = gfs2_rs_alloc(ip); 642 error = gfs2_rs_alloc(ip);
616 if (error) 643 if (error)
617 goto fail_free_inode; 644 goto fail_free_acls;
618 645
619 inode->i_mode = mode; 646 inode->i_mode = mode;
620 set_nlink(inode, S_ISDIR(mode) ? 2 : 1); 647 set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
@@ -682,7 +709,16 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
682 gfs2_set_iop(inode); 709 gfs2_set_iop(inode);
683 insert_inode_hash(inode); 710 insert_inode_hash(inode);
684 711
685 error = gfs2_acl_create(dip, inode); 712 if (default_acl) {
713 error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
714 posix_acl_release(default_acl);
715 }
716 if (acl) {
717 if (!error)
718 error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
719 posix_acl_release(acl);
720 }
721
686 if (error) 722 if (error)
687 goto fail_gunlock3; 723 goto fail_gunlock3;
688 724
@@ -690,7 +726,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
690 if (error) 726 if (error)
691 goto fail_gunlock3; 727 goto fail_gunlock3;
692 728
693 error = link_dinode(dip, name, ip, arq); 729 error = link_dinode(dip, name, ip, &da);
694 if (error) 730 if (error)
695 goto fail_gunlock3; 731 goto fail_gunlock3;
696 732
@@ -716,9 +752,16 @@ fail_free_inode:
716 if (ip->i_gl) 752 if (ip->i_gl)
717 gfs2_glock_put(ip->i_gl); 753 gfs2_glock_put(ip->i_gl);
718 gfs2_rs_delete(ip, NULL); 754 gfs2_rs_delete(ip, NULL);
755fail_free_acls:
756 if (default_acl)
757 posix_acl_release(default_acl);
758 if (acl)
759 posix_acl_release(acl);
760fail_free_vfs_inode:
719 free_inode_nonrcu(inode); 761 free_inode_nonrcu(inode);
720 inode = NULL; 762 inode = NULL;
721fail_gunlock: 763fail_gunlock:
764 gfs2_dir_no_add(&da);
722 gfs2_glock_dq_uninit(ghs); 765 gfs2_glock_dq_uninit(ghs);
723 if (inode && !IS_ERR(inode)) { 766 if (inode && !IS_ERR(inode)) {
724 clear_nlink(inode); 767 clear_nlink(inode);
@@ -779,6 +822,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
779 } 822 }
780 823
781 d = d_splice_alias(inode, dentry); 824 d = d_splice_alias(inode, dentry);
825 if (IS_ERR(d)) {
826 iput(inode);
827 gfs2_glock_dq_uninit(&gh);
828 return d;
829 }
782 if (file && S_ISREG(inode->i_mode)) 830 if (file && S_ISREG(inode->i_mode))
783 error = finish_open(file, dentry, gfs2_open_common, opened); 831 error = finish_open(file, dentry, gfs2_open_common, opened);
784 832
@@ -817,7 +865,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
817 struct gfs2_inode *ip = GFS2_I(inode); 865 struct gfs2_inode *ip = GFS2_I(inode);
818 struct gfs2_holder ghs[2]; 866 struct gfs2_holder ghs[2];
819 struct buffer_head *dibh; 867 struct buffer_head *dibh;
820 int alloc_required; 868 struct gfs2_diradd da = { .bh = NULL, };
821 int error; 869 int error;
822 870
823 if (S_ISDIR(inode->i_mode)) 871 if (S_ISDIR(inode->i_mode))
@@ -872,13 +920,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
872 if (ip->i_inode.i_nlink == (u32)-1) 920 if (ip->i_inode.i_nlink == (u32)-1)
873 goto out_gunlock; 921 goto out_gunlock;
874 922
875 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); 923 error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
876 if (error < 0) 924 if (error < 0)
877 goto out_gunlock; 925 goto out_gunlock;
878 error = 0;
879 926
880 if (alloc_required) { 927 if (da.nr_blocks) {
881 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 928 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
882 error = gfs2_quota_lock_check(dip); 929 error = gfs2_quota_lock_check(dip);
883 if (error) 930 if (error)
884 goto out_gunlock; 931 goto out_gunlock;
@@ -887,10 +934,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
887 if (error) 934 if (error)
888 goto out_gunlock_q; 935 goto out_gunlock_q;
889 936
890 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 937 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
891 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
892 2 * RES_DINODE + RES_STATFS +
893 RES_QUOTA, 0);
894 if (error) 938 if (error)
895 goto out_ipres; 939 goto out_ipres;
896 } else { 940 } else {
@@ -903,7 +947,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
903 if (error) 947 if (error)
904 goto out_end_trans; 948 goto out_end_trans;
905 949
906 error = gfs2_dir_add(dir, &dentry->d_name, ip); 950 error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
907 if (error) 951 if (error)
908 goto out_brelse; 952 goto out_brelse;
909 953
@@ -919,12 +963,13 @@ out_brelse:
919out_end_trans: 963out_end_trans:
920 gfs2_trans_end(sdp); 964 gfs2_trans_end(sdp);
921out_ipres: 965out_ipres:
922 if (alloc_required) 966 if (da.nr_blocks)
923 gfs2_inplace_release(dip); 967 gfs2_inplace_release(dip);
924out_gunlock_q: 968out_gunlock_q:
925 if (alloc_required) 969 if (da.nr_blocks)
926 gfs2_quota_unlock(dip); 970 gfs2_quota_unlock(dip);
927out_gunlock: 971out_gunlock:
972 gfs2_dir_no_add(&da);
928 gfs2_glock_dq(ghs + 1); 973 gfs2_glock_dq(ghs + 1);
929out_child: 974out_child:
930 gfs2_glock_dq(ghs); 975 gfs2_glock_dq(ghs);
@@ -1254,7 +1299,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1254 struct gfs2_rgrpd *nrgd; 1299 struct gfs2_rgrpd *nrgd;
1255 unsigned int num_gh; 1300 unsigned int num_gh;
1256 int dir_rename = 0; 1301 int dir_rename = 0;
1257 int alloc_required = 0; 1302 struct gfs2_diradd da = { .nr_blocks = 0, };
1258 unsigned int x; 1303 unsigned int x;
1259 int error; 1304 int error;
1260 1305
@@ -1388,14 +1433,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 goto out_gunlock; 1433 goto out_gunlock;
1389 } 1434 }
1390 1435
1391 if (nip == NULL) 1436 if (nip == NULL) {
1392 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 1437 error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
1393 error = alloc_required; 1438 if (error)
1394 if (error < 0) 1439 goto out_gunlock;
1395 goto out_gunlock; 1440 }
1396 1441
1397 if (alloc_required) { 1442 if (da.nr_blocks) {
1398 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 1443 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1399 error = gfs2_quota_lock_check(ndip); 1444 error = gfs2_quota_lock_check(ndip);
1400 if (error) 1445 if (error)
1401 goto out_gunlock; 1446 goto out_gunlock;
@@ -1404,10 +1449,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1404 if (error) 1449 if (error)
1405 goto out_gunlock_q; 1450 goto out_gunlock_q;
1406 1451
1407 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1452 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
1408 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 1453 4 * RES_LEAF + 4, 0);
1409 4 * RES_DINODE + 4 * RES_LEAF +
1410 RES_STATFS + RES_QUOTA + 4, 0);
1411 if (error) 1454 if (error)
1412 goto out_ipreserv; 1455 goto out_ipreserv;
1413 } else { 1456 } else {
@@ -1441,19 +1484,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1441 if (error) 1484 if (error)
1442 goto out_end_trans; 1485 goto out_end_trans;
1443 1486
1444 error = gfs2_dir_add(ndir, &ndentry->d_name, ip); 1487 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
1445 if (error) 1488 if (error)
1446 goto out_end_trans; 1489 goto out_end_trans;
1447 1490
1448out_end_trans: 1491out_end_trans:
1449 gfs2_trans_end(sdp); 1492 gfs2_trans_end(sdp);
1450out_ipreserv: 1493out_ipreserv:
1451 if (alloc_required) 1494 if (da.nr_blocks)
1452 gfs2_inplace_release(ndip); 1495 gfs2_inplace_release(ndip);
1453out_gunlock_q: 1496out_gunlock_q:
1454 if (alloc_required) 1497 if (da.nr_blocks)
1455 gfs2_quota_unlock(ndip); 1498 gfs2_quota_unlock(ndip);
1456out_gunlock: 1499out_gunlock:
1500 gfs2_dir_no_add(&da);
1457 while (x--) { 1501 while (x--) {
1458 gfs2_glock_dq(ghs + x); 1502 gfs2_glock_dq(ghs + x);
1459 gfs2_holder_uninit(ghs + x); 1503 gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1651,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1607 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) 1651 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
1608 ogid = ngid = NO_GID_QUOTA_CHANGE; 1652 ogid = ngid = NO_GID_QUOTA_CHANGE;
1609 1653
1610 error = gfs2_quota_lock(ip, nuid, ngid); 1654 error = get_write_access(inode);
1611 if (error) 1655 if (error)
1612 return error; 1656 return error;
1613 1657
1658 error = gfs2_rs_alloc(ip);
1659 if (error)
1660 goto out;
1661
1662 error = gfs2_rindex_update(sdp);
1663 if (error)
1664 goto out;
1665
1666 error = gfs2_quota_lock(ip, nuid, ngid);
1667 if (error)
1668 goto out;
1669
1614 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1670 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1615 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1671 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1616 error = gfs2_quota_check(ip, nuid, ngid); 1672 error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1693,8 @@ out_end_trans:
1637 gfs2_trans_end(sdp); 1693 gfs2_trans_end(sdp);
1638out_gunlock_q: 1694out_gunlock_q:
1639 gfs2_quota_unlock(ip); 1695 gfs2_quota_unlock(ip);
1696out:
1697 put_write_access(inode);
1640 return error; 1698 return error;
1641} 1699}
1642 1700
@@ -1678,10 +1736,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1678 error = gfs2_setattr_size(inode, attr->ia_size); 1736 error = gfs2_setattr_size(inode, attr->ia_size);
1679 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1737 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1680 error = setattr_chown(inode, attr); 1738 error = setattr_chown(inode, attr);
1681 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1739 else {
1682 error = gfs2_acl_chmod(ip, attr);
1683 else
1684 error = gfs2_setattr_simple(inode, attr); 1740 error = gfs2_setattr_simple(inode, attr);
1741 if (!error && attr->ia_valid & ATTR_MODE)
1742 error = posix_acl_chmod(inode, inode->i_mode);
1743 }
1685 1744
1686out: 1745out:
1687 if (!error) 1746 if (!error)
@@ -1841,6 +1900,7 @@ const struct inode_operations gfs2_file_iops = {
1841 .removexattr = gfs2_removexattr, 1900 .removexattr = gfs2_removexattr,
1842 .fiemap = gfs2_fiemap, 1901 .fiemap = gfs2_fiemap,
1843 .get_acl = gfs2_get_acl, 1902 .get_acl = gfs2_get_acl,
1903 .set_acl = gfs2_set_acl,
1844}; 1904};
1845 1905
1846const struct inode_operations gfs2_dir_iops = { 1906const struct inode_operations gfs2_dir_iops = {
@@ -1862,6 +1922,7 @@ const struct inode_operations gfs2_dir_iops = {
1862 .removexattr = gfs2_removexattr, 1922 .removexattr = gfs2_removexattr,
1863 .fiemap = gfs2_fiemap, 1923 .fiemap = gfs2_fiemap,
1864 .get_acl = gfs2_get_acl, 1924 .get_acl = gfs2_get_acl,
1925 .set_acl = gfs2_set_acl,
1865 .atomic_open = gfs2_atomic_open, 1926 .atomic_open = gfs2_atomic_open,
1866}; 1927};
1867 1928
@@ -1877,6 +1938,5 @@ const struct inode_operations gfs2_symlink_iops = {
1877 .listxattr = gfs2_listxattr, 1938 .listxattr = gfs2_listxattr,
1878 .removexattr = gfs2_removexattr, 1939 .removexattr = gfs2_removexattr,
1879 .fiemap = gfs2_fiemap, 1940 .fiemap = gfs2_fiemap,
1880 .get_acl = gfs2_get_acl,
1881}; 1941};
1882 1942
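
These inode_operations changes are gfs2's side of the generic POSIX ACL rework merged in v3.14: ->get_acl stays, ->set_acl is added, the private gfs2_acl_chmod path is replaced by the VFS helper posix_acl_chmod() (see the gfs2_setattr hunk above), and symlinks, which cannot carry ACLs, lose ->get_acl entirely. A sketch of the generic wiring for a hypothetical filesystem; the myfs_* names are placeholders, not gfs2 functions:

    #include <linux/fs.h>
    #include <linux/posix_acl.h>
    #include <linux/posix_acl_xattr.h>
    #include <linux/xattr.h>

    static int myfs_setattr(struct dentry *dentry, struct iattr *attr);
    static struct posix_acl *myfs_get_acl(struct inode *inode, int type);
    static int myfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

    static const struct inode_operations myfs_file_iops = {
        .setattr = myfs_setattr,
        .get_acl = myfs_get_acl,    /* read the ACL from fs-specific storage */
        .set_acl = myfs_set_acl,    /* called by the VFS to write it back */
    };

    /* The system.posix_acl_{access,default} xattrs no longer need a
     * fs-specific handler; the generic ones route through ->get_acl and
     * ->set_acl (compare the fs/gfs2/xattr.c hunk later in this diff): */
    static const struct xattr_handler *myfs_xattr_handlers[] = {
        &posix_acl_access_xattr_handler,
        &posix_acl_default_xattr_handler,
        NULL,
    };
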
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..76693793cedd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
84 clear_bit(GBF_FULL, &bi->bi_flags); 84 clear_bit(GBF_FULL, &bi->bi_flags);
85 rgd->rd_free_clone = rgd->rd_free; 85 rgd->rd_free_clone = rgd->rd_free;
86 rgd->rd_extfail_pt = rgd->rd_free;
86} 87}
87 88
88/** 89/**
@@ -272,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
272 nrvecs = max(nrvecs/2, 1U); 273 nrvecs = max(nrvecs/2, 1U);
273 } 274 }
274 275
275 bio->bi_sector = blkno * (sb->s_blocksize >> 9); 276 bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
276 bio->bi_bdev = sb->s_bdev; 277 bio->bi_bdev = sb->s_bdev;
277 bio->bi_end_io = gfs2_end_log_write; 278 bio->bi_end_io = gfs2_end_log_write;
278 bio->bi_private = sdp; 279 bio->bi_private = sdp;
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
588static void gfs2_meta_sync(struct gfs2_glock *gl) 589static void gfs2_meta_sync(struct gfs2_glock *gl)
589{ 590{
590 struct address_space *mapping = gfs2_glock2aspace(gl); 591 struct address_space *mapping = gfs2_glock2aspace(gl);
592 struct gfs2_sbd *sdp = gl->gl_sbd;
591 int error; 593 int error;
592 594
595 if (mapping == NULL)
596 mapping = &sdp->sd_aspace;
597
593 filemap_fdatawrite(mapping); 598 filemap_fdatawrite(mapping);
594 error = filemap_fdatawait(mapping); 599 error = filemap_fdatawait(mapping);
595 600
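
Two independent changes meet in this file: the 3.14 immutable-biovec work moved a bio's starting sector into the embedded struct bvec_iter (hence bio->bi_iter.bi_sector), and gfs2_meta_sync now falls back to the superblock-wide sd_aspace mapping for glocks that carry no address space of their own (see the init_sbd hunk in ops_fstype.c below). A sketch of bio setup on the new field layout; log_bio_prep is a hypothetical name, and the shift assumes the kernel's fixed 512-byte sector unit:

    #include <linux/bio.h>
    #include <linux/fs.h>

    static struct bio *log_bio_prep(struct super_block *sb, u64 blkno)
    {
        /* allocations from the fs bio set with GFP_NOFS do not fail */
        struct bio *bio = bio_alloc(GFP_NOFS, 1);

        /* fs blocks -> 512-byte sectors; bi_sector lives inside bi_iter
         * since the immutable-biovec conversion */
        bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        return bio;
    }
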
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
76 76
77 gfs2_str2qstr(&gfs2_qdot, "."); 77 gfs2_str2qstr(&gfs2_qdot, ".");
78 gfs2_str2qstr(&gfs2_qdotdot, ".."); 78 gfs2_str2qstr(&gfs2_qdotdot, "..");
79 gfs2_quota_hash_init();
79 80
80 error = gfs2_sys_init(); 81 error = gfs2_sys_init();
81 if (error) 82 if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
116 unsigned long index; 116 unsigned long index;
117 unsigned int bufnum; 117 unsigned int bufnum;
118 118
119 if (mapping == NULL)
120 mapping = &sdp->sd_aspace;
121
119 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; 122 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
120 index = blkno >> shift; /* convert block to page */ 123 index = blkno >> shift; /* convert block to page */
121 bufnum = blkno - (index << shift); /* block buf index within page */ 124 bufnum = blkno - (index << shift); /* block buf index within page */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fa88314f5c..c6872d09561a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
36#include "log.h" 36#include "log.h"
37#include "quota.h" 37#include "quota.h"
38#include "dir.h" 38#include "dir.h"
39#include "meta_io.h"
39#include "trace_gfs2.h" 40#include "trace_gfs2.h"
40 41
41#define DO 0 42#define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62static struct gfs2_sbd *init_sbd(struct super_block *sb) 63static struct gfs2_sbd *init_sbd(struct super_block *sb)
63{ 64{
64 struct gfs2_sbd *sdp; 65 struct gfs2_sbd *sdp;
66 struct address_space *mapping;
65 67
66 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); 68 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
67 if (!sdp) 69 if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
97 init_waitqueue_head(&sdp->sd_quota_wait); 99 init_waitqueue_head(&sdp->sd_quota_wait);
98 INIT_LIST_HEAD(&sdp->sd_trunc_list); 100 INIT_LIST_HEAD(&sdp->sd_trunc_list);
99 spin_lock_init(&sdp->sd_trunc_lock); 101 spin_lock_init(&sdp->sd_trunc_lock);
102 spin_lock_init(&sdp->sd_bitmap_lock);
103
104 mapping = &sdp->sd_aspace;
105
106 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops;
108 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS);
111 mapping->private_data = NULL;
112 mapping->backing_dev_info = sb->s_bdi;
113 mapping->writeback_index = 0;
100 114
101 spin_lock_init(&sdp->sd_log_lock); 115 spin_lock_init(&sdp->sd_log_lock);
102 atomic_set(&sdp->sd_log_pinned, 0); 116 atomic_set(&sdp->sd_log_pinned, 0);
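
init_sbd now embeds a single shared address_space (sd_aspace) in the superblock data for metadata buffers, which the NULL-mapping fallbacks added in lops.c and meta_io.c pick up. Since there is no inode behind it, every field is filled in by hand; the sequence below mirrors the hunk, using only helpers present on this kernel (backing_dev_info was still an address_space member at the time):

    #include <linux/fs.h>
    #include <linux/pagemap.h>

    static void meta_mapping_init(struct address_space *mapping,
                                  struct super_block *sb,
                                  const struct address_space_operations *aops)
    {
        address_space_init_once(mapping);       /* locks, lists, radix tree */
        mapping->a_ops = aops;
        mapping->host = sb->s_bdev->bd_inode;   /* metadata pages belong to the bdev */
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_NOFS); /* no fs recursion on page alloc */
        mapping->private_data = NULL;
        mapping->backing_dev_info = sb->s_bdi;
        mapping->writeback_index = 0;
    }
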
@@ -217,14 +231,14 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
217 231
218 page = alloc_page(GFP_NOFS); 232 page = alloc_page(GFP_NOFS);
219 if (unlikely(!page)) 233 if (unlikely(!page))
220 return -ENOBUFS; 234 return -ENOMEM;
221 235
222 ClearPageUptodate(page); 236 ClearPageUptodate(page);
223 ClearPageDirty(page); 237 ClearPageDirty(page);
224 lock_page(page); 238 lock_page(page);
225 239
226 bio = bio_alloc(GFP_NOFS, 1); 240 bio = bio_alloc(GFP_NOFS, 1);
227 bio->bi_sector = sector * (sb->s_blocksize >> 9); 241 bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
228 bio->bi_bdev = sb->s_bdev; 242 bio->bi_bdev = sb->s_bdev;
229 bio_add_page(bio, page, PAGE_SIZE, 0); 243 bio_add_page(bio, page, PAGE_SIZE, 0);
230 244
@@ -956,40 +970,6 @@ fail:
956 return error; 970 return error;
957} 971}
958 972
959static int init_threads(struct gfs2_sbd *sdp, int undo)
960{
961 struct task_struct *p;
962 int error = 0;
963
964 if (undo)
965 goto fail_quotad;
966
967 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
968 if (IS_ERR(p)) {
969 error = PTR_ERR(p);
970 fs_err(sdp, "can't start logd thread: %d\n", error);
971 return error;
972 }
973 sdp->sd_logd_process = p;
974
975 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
976 if (IS_ERR(p)) {
977 error = PTR_ERR(p);
978 fs_err(sdp, "can't start quotad thread: %d\n", error);
979 goto fail;
980 }
981 sdp->sd_quotad_process = p;
982
983 return 0;
984
985
986fail_quotad:
987 kthread_stop(sdp->sd_quotad_process);
988fail:
989 kthread_stop(sdp->sd_logd_process);
990 return error;
991}
992
993static const match_table_t nolock_tokens = { 973static const match_table_t nolock_tokens = {
994 { Opt_jid, "jid=%d\n", }, 974 { Opt_jid, "jid=%d\n", },
995 { Opt_err, NULL }, 975 { Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1254 goto fail_per_node; 1234 goto fail_per_node;
1255 } 1235 }
1256 1236
1257 error = init_threads(sdp, DO);
1258 if (error)
1259 goto fail_per_node;
1260
1261 if (!(sb->s_flags & MS_RDONLY)) { 1237 if (!(sb->s_flags & MS_RDONLY)) {
1262 error = gfs2_make_fs_rw(sdp); 1238 error = gfs2_make_fs_rw(sdp);
1263 if (error) { 1239 if (error) {
1264 fs_err(sdp, "can't make FS RW: %d\n", error); 1240 fs_err(sdp, "can't make FS RW: %d\n", error);
1265 goto fail_threads; 1241 goto fail_per_node;
1266 } 1242 }
1267 } 1243 }
1268 1244
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1270 gfs2_online_uevent(sdp); 1246 gfs2_online_uevent(sdp);
1271 return 0; 1247 return 0;
1272 1248
1273fail_threads:
1274 init_threads(sdp, UNDO);
1275fail_per_node: 1249fail_per_node:
1276 init_per_node(sdp, UNDO); 1250 init_per_node(sdp, UNDO);
1277fail_inodes: 1251fail_inodes:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
52#include <linux/dqblk_xfs.h> 52#include <linux/dqblk_xfs.h>
53#include <linux/lockref.h> 53#include <linux/lockref.h>
54#include <linux/list_lru.h> 54#include <linux/list_lru.h>
55#include <linux/rcupdate.h>
56#include <linux/rculist_bl.h>
57#include <linux/bit_spinlock.h>
58#include <linux/jhash.h>
59#include <linux/vmalloc.h>
55 60
56#include "gfs2.h" 61#include "gfs2.h"
57#include "incore.h" 62#include "incore.h"
@@ -67,16 +72,44 @@
67#include "inode.h" 72#include "inode.h"
68#include "util.h" 73#include "util.h"
69 74
70struct gfs2_quota_change_host { 75#define GFS2_QD_HASH_SHIFT 12
71 u64 qc_change; 76#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
72 u32 qc_flags; /* GFS2_QCF_... */ 77#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
73 struct kqid qc_id;
74};
75 78
76/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ 79/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
80/* -> sd_bitmap_lock */
77static DEFINE_SPINLOCK(qd_lock); 81static DEFINE_SPINLOCK(qd_lock);
78struct list_lru gfs2_qd_lru; 82struct list_lru gfs2_qd_lru;
79 83
84static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
85
86static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
87 const struct kqid qid)
88{
89 unsigned int h;
90
91 h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
92 h = jhash(&qid, sizeof(struct kqid), h);
93
94 return h & GFS2_QD_HASH_MASK;
95}
96
97static inline void spin_lock_bucket(unsigned int hash)
98{
99 hlist_bl_lock(&qd_hash_table[hash]);
100}
101
102static inline void spin_unlock_bucket(unsigned int hash)
103{
104 hlist_bl_unlock(&qd_hash_table[hash]);
105}
106
107static void gfs2_qd_dealloc(struct rcu_head *rcu)
108{
109 struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
110 kmem_cache_free(gfs2_quotad_cachep, qd);
111}
112
80static void gfs2_qd_dispose(struct list_head *list) 113static void gfs2_qd_dispose(struct list_head *list)
81{ 114{
82 struct gfs2_quota_data *qd; 115 struct gfs2_quota_data *qd;
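
Quota lookups move from a linear walk of sd_quota_list under qd_lock to a global 4096-bucket RCU hash table. Two details are worth spelling out: the key is the (superblock pointer, kqid) pair, mixed by chaining two jhash() passes, and each hlist_bl_head is a single pointer whose low bit doubles as a per-bucket lock, which is why the spin_lock_bucket() wrappers above are just hlist_bl_lock()/hlist_bl_unlock(). A reduced sketch of lookup and insert under that discipline; struct qitem and its fields are hypothetical stand-ins for gfs2_quota_data:

    #include <linux/jhash.h>
    #include <linux/list_bl.h>
    #include <linux/rculist_bl.h>
    #include <linux/rcupdate.h>

    #define HASH_BITS 12
    static struct hlist_bl_head table[1 << HASH_BITS];

    struct qitem {
        const void *owner;              /* stands in for the sbd pointer */
        u32 id;                         /* stands in for the kqid */
        struct hlist_bl_node node;
    };

    static unsigned int bucket_of(const void *owner, u32 id)
    {
        unsigned int h = jhash(&owner, sizeof(owner), 0); /* hash the pointer value */
        h = jhash(&id, sizeof(id), h);                    /* chain in the id */
        return h & ((1 << HASH_BITS) - 1);
    }

    /* Lookup: caller holds rcu_read_lock(); no bucket lock is taken. */
    static struct qitem *lookup(const void *owner, u32 id)
    {
        struct qitem *q;
        struct hlist_bl_node *pos;
        unsigned int h = bucket_of(owner, id);

        hlist_bl_for_each_entry_rcu(q, pos, &table[h], node)
            if (q->owner == owner && q->id == id)
                return q;
        return NULL;
    }

    /* Insert: the bucket head's low bit is the lock. */
    static void insert(struct qitem *q)
    {
        unsigned int h = bucket_of(q->owner, q->id);

        hlist_bl_lock(&table[h]);
        hlist_bl_add_head_rcu(&q->node, &table[h]);
        hlist_bl_unlock(&table[h]);
    }
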
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
93 list_del(&qd->qd_list); 126 list_del(&qd->qd_list);
94 spin_unlock(&qd_lock); 127 spin_unlock(&qd_lock);
95 128
129 spin_lock_bucket(qd->qd_hash);
130 hlist_bl_del_rcu(&qd->qd_hlist);
131 spin_unlock_bucket(qd->qd_hash);
132
96 gfs2_assert_warn(sdp, !qd->qd_change); 133 gfs2_assert_warn(sdp, !qd->qd_change);
97 gfs2_assert_warn(sdp, !qd->qd_slot_count); 134 gfs2_assert_warn(sdp, !qd->qd_slot_count);
98 gfs2_assert_warn(sdp, !qd->qd_bh_count); 135 gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
101 atomic_dec(&sdp->sd_quota_count); 138 atomic_dec(&sdp->sd_quota_count);
102 139
103 /* Delete it from the common reclaim list */ 140 /* Delete it from the common reclaim list */
104 kmem_cache_free(gfs2_quotad_cachep, qd); 141 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
105 } 142 }
106} 143}
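
Because lookups now traverse the buckets under rcu_read_lock(), an entry removed with hlist_bl_del_rcu() may still be visible to concurrent readers, so the final kmem_cache_free becomes an RCU callback (gfs2_qd_dealloc above, fired through the new qd_rcu head). The generic pattern, with struct obj as a placeholder:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct obj {
        int payload;
        struct rcu_head rcu;    /* storage for the deferred free */
    };

    static void obj_free_rcu(struct rcu_head *head)
    {
        /* runs after a grace period: no reader can still see the object */
        kfree(container_of(head, struct obj, rcu));
    }

    static void obj_release(struct obj *o)
    {
        /* ...unlink o from every RCU-visible structure first... */
        call_rcu(&o->rcu, obj_free_rcu);
    }
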
107 144
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
171 return offset; 208 return offset;
172} 209}
173 210
174static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, 211static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
175 struct gfs2_quota_data **qdp)
176{ 212{
177 struct gfs2_quota_data *qd; 213 struct gfs2_quota_data *qd;
178 int error; 214 int error;
179 215
180 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); 216 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
181 if (!qd) 217 if (!qd)
182 return -ENOMEM; 218 return NULL;
183 219
220 qd->qd_sbd = sdp;
184 qd->qd_lockref.count = 1; 221 qd->qd_lockref.count = 1;
185 spin_lock_init(&qd->qd_lockref.lock); 222 spin_lock_init(&qd->qd_lockref.lock);
186 qd->qd_id = qid; 223 qd->qd_id = qid;
187 qd->qd_slot = -1; 224 qd->qd_slot = -1;
188 INIT_LIST_HEAD(&qd->qd_lru); 225 INIT_LIST_HEAD(&qd->qd_lru);
226 qd->qd_hash = hash;
189 227
190 error = gfs2_glock_get(sdp, qd2index(qd), 228 error = gfs2_glock_get(sdp, qd2index(qd),
191 &gfs2_quota_glops, CREATE, &qd->qd_gl); 229 &gfs2_quota_glops, CREATE, &qd->qd_gl);
192 if (error) 230 if (error)
193 goto fail; 231 goto fail;
194 232
195 *qdp = qd; 233 return qd;
196
197 return 0;
198 234
199fail: 235fail:
200 kmem_cache_free(gfs2_quotad_cachep, qd); 236 kmem_cache_free(gfs2_quotad_cachep, qd);
201 return error; 237 return NULL;
202} 238}
203 239
204static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, 240static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
205 struct gfs2_quota_data **qdp) 241 const struct gfs2_sbd *sdp,
242 struct kqid qid)
206{ 243{
207 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 244 struct gfs2_quota_data *qd;
208 int error, found; 245 struct hlist_bl_node *h;
209
210 *qdp = NULL;
211 246
212 for (;;) { 247 hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
213 found = 0; 248 if (!qid_eq(qd->qd_id, qid))
214 spin_lock(&qd_lock); 249 continue;
215 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 250 if (qd->qd_sbd != sdp)
216 if (qid_eq(qd->qd_id, qid) && 251 continue;
217 lockref_get_not_dead(&qd->qd_lockref)) { 252 if (lockref_get_not_dead(&qd->qd_lockref)) {
218 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 253 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
219 found = 1; 254 return qd;
220 break;
221 }
222 } 255 }
256 }
223 257
224 if (!found) 258 return NULL;
225 qd = NULL; 259}
226 260
227 if (!qd && new_qd) {
228 qd = new_qd;
229 list_add(&qd->qd_list, &sdp->sd_quota_list);
230 atomic_inc(&sdp->sd_quota_count);
231 new_qd = NULL;
232 }
233 261
234 spin_unlock(&qd_lock); 262static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
263 struct gfs2_quota_data **qdp)
264{
265 struct gfs2_quota_data *qd, *new_qd;
266 unsigned int hash = gfs2_qd_hash(sdp, qid);
235 267
236 if (qd) { 268 rcu_read_lock();
237 if (new_qd) { 269 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
238 gfs2_glock_put(new_qd->qd_gl); 270 rcu_read_unlock();
239 kmem_cache_free(gfs2_quotad_cachep, new_qd);
240 }
241 *qdp = qd;
242 return 0;
243 }
244 271
245 error = qd_alloc(sdp, qid, &new_qd); 272 if (qd)
246 if (error) 273 return 0;
247 return error; 274
275 new_qd = qd_alloc(hash, sdp, qid);
276 if (!new_qd)
277 return -ENOMEM;
278
279 spin_lock(&qd_lock);
280 spin_lock_bucket(hash);
281 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
282 if (qd == NULL) {
283 *qdp = new_qd;
284 list_add(&new_qd->qd_list, &sdp->sd_quota_list);
285 hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
286 atomic_inc(&sdp->sd_quota_count);
248 } 287 }
288 spin_unlock_bucket(hash);
289 spin_unlock(&qd_lock);
290
291 if (qd) {
292 gfs2_glock_put(new_qd->qd_gl);
293 kmem_cache_free(gfs2_quotad_cachep, new_qd);
294 }
295
296 return 0;
249} 297}
250 298
299
251static void qd_hold(struct gfs2_quota_data *qd) 300static void qd_hold(struct gfs2_quota_data *qd)
252{ 301{
253 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 302 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
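
The rewritten qd_get follows the standard optimistic-lookup pattern: try an unlocked RCU search, allocate a candidate outside all locks, then repeat the search under the bucket lock and discard the candidate if another CPU inserted first. A condensed skeleton, reusing the hypothetical helpers from the hash-table sketch above; the global qd_lock ordering and the lockref acquisition of the real code are elided:

    static int get_obj(const void *owner, u32 id, struct qitem **out)
    {
        struct qitem *q, *new;
        unsigned int h = bucket_of(owner, id);

        rcu_read_lock();
        *out = q = lookup(owner, id);       /* fast path, lock-free */
        rcu_read_unlock();
        if (q)
            return 0;

        new = alloc_obj(owner, id);         /* may sleep; no locks held */
        if (!new)
            return -ENOMEM;

        hlist_bl_lock(&table[h]);
        /* re-check: the bucket lock excludes concurrent inserters */
        *out = q = lookup(owner, id);
        if (!q) {
            hlist_bl_add_head_rcu(&new->node, &table[h]);
            *out = new;
        }
        hlist_bl_unlock(&table[h]);

        if (q)
            free_obj(new);                  /* lost the race; drop the spare */
        return 0;
    }
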
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
268 317
269static int slot_get(struct gfs2_quota_data *qd) 318static int slot_get(struct gfs2_quota_data *qd)
270{ 319{
271 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 320 struct gfs2_sbd *sdp = qd->qd_sbd;
272 unsigned int c, o = 0, b; 321 unsigned int bit;
273 unsigned char byte = 0; 322 int error = 0;
274 323
275 spin_lock(&qd_lock); 324 spin_lock(&sdp->sd_bitmap_lock);
325 if (qd->qd_slot_count != 0)
326 goto out;
276 327
277 if (qd->qd_slot_count++) { 328 error = -ENOSPC;
278 spin_unlock(&qd_lock); 329 bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
279 return 0; 330 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit;
333out:
334 qd->qd_slot_count++;
280 } 335 }
336 spin_unlock(&sdp->sd_bitmap_lock);
281 337
282 for (c = 0; c < sdp->sd_quota_chunks; c++) 338 return error;
283 for (o = 0; o < PAGE_SIZE; o++) {
284 byte = sdp->sd_quota_bitmap[c][o];
285 if (byte != 0xFF)
286 goto found;
287 }
288
289 goto fail;
290
291found:
292 for (b = 0; b < 8; b++)
293 if (!(byte & (1 << b)))
294 break;
295 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
296
297 if (qd->qd_slot >= sdp->sd_quota_slots)
298 goto fail;
299
300 sdp->sd_quota_bitmap[c][o] |= 1 << b;
301
302 spin_unlock(&qd_lock);
303
304 return 0;
305
306fail:
307 qd->qd_slot_count--;
308 spin_unlock(&qd_lock);
309 return -ENOSPC;
310} 339}
311 340
312static void slot_hold(struct gfs2_quota_data *qd) 341static void slot_hold(struct gfs2_quota_data *qd)
313{ 342{
314 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 343 struct gfs2_sbd *sdp = qd->qd_sbd;
315 344
316 spin_lock(&qd_lock); 345 spin_lock(&sdp->sd_bitmap_lock);
317 gfs2_assert(sdp, qd->qd_slot_count); 346 gfs2_assert(sdp, qd->qd_slot_count);
318 qd->qd_slot_count++; 347 qd->qd_slot_count++;
319 spin_unlock(&qd_lock); 348 spin_unlock(&sdp->sd_bitmap_lock);
320}
321
322static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
323 unsigned int bit, int new_value)
324{
325 unsigned int c, o, b = bit;
326 int old_value;
327
328 c = b / (8 * PAGE_SIZE);
329 b %= 8 * PAGE_SIZE;
330 o = b / 8;
331 b %= 8;
332
333 old_value = (bitmap[c][o] & (1 << b));
334 gfs2_assert_withdraw(sdp, !old_value != !new_value);
335
336 if (new_value)
337 bitmap[c][o] |= 1 << b;
338 else
339 bitmap[c][o] &= ~(1 << b);
340} 349}
341 350
342static void slot_put(struct gfs2_quota_data *qd) 351static void slot_put(struct gfs2_quota_data *qd)
343{ 352{
344 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 353 struct gfs2_sbd *sdp = qd->qd_sbd;
345 354
346 spin_lock(&qd_lock); 355 spin_lock(&sdp->sd_bitmap_lock);
347 gfs2_assert(sdp, qd->qd_slot_count); 356 gfs2_assert(sdp, qd->qd_slot_count);
348 if (!--qd->qd_slot_count) { 357 if (!--qd->qd_slot_count) {
349 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 358 BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
350 qd->qd_slot = -1; 359 qd->qd_slot = -1;
351 } 360 }
352 spin_unlock(&qd_lock); 361 spin_unlock(&sdp->sd_bitmap_lock);
353} 362}
354 363
355static int bh_get(struct gfs2_quota_data *qd) 364static int bh_get(struct gfs2_quota_data *qd)
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
427 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 436 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
428 set_bit(QDF_LOCKED, &qd->qd_flags); 437 set_bit(QDF_LOCKED, &qd->qd_flags);
429 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
430 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 slot_hold(qd);
431 qd->qd_slot_count++;
432 return 1; 440 return 1;
433} 441}
434 442
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1214 return error; 1222 return error;
1215} 1223}
1216 1224
1217static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1218{
1219 const struct gfs2_quota_change *str = buf;
1220
1221 qc->qc_change = be64_to_cpu(str->qc_change);
1222 qc->qc_flags = be32_to_cpu(str->qc_flags);
1223 qc->qc_id = make_kqid(&init_user_ns,
1224 (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
1225 be32_to_cpu(str->qc_id));
1226}
1227
1228int gfs2_quota_init(struct gfs2_sbd *sdp) 1225int gfs2_quota_init(struct gfs2_sbd *sdp)
1229{ 1226{
1230 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1227 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1232 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; 1229 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1233 unsigned int x, slot = 0; 1230 unsigned int x, slot = 0;
1234 unsigned int found = 0; 1231 unsigned int found = 0;
1232 unsigned int hash;
1233 unsigned int bm_size;
1235 u64 dblock; 1234 u64 dblock;
1236 u32 extlen = 0; 1235 u32 extlen = 0;
1237 int error; 1236 int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1240 return -EIO; 1239 return -EIO;
1241 1240
1242 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1241 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1243 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1244 1243 bm_size *= sizeof(unsigned long);
1245 error = -ENOMEM; 1244 error = -ENOMEM;
1246 1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
1247 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1246 if (sdp->sd_quota_bitmap == NULL)
1248 sizeof(unsigned char *), GFP_NOFS); 1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
1249 if (!sdp->sd_quota_bitmap) 1248 if (!sdp->sd_quota_bitmap)
1250 return error; 1249 return error;
1251 1250
1252 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1253 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1254 if (!sdp->sd_quota_bitmap[x])
1255 goto fail;
1256 }
1257 1252
1258 for (x = 0; x < blocks; x++) { 1253 for (x = 0; x < blocks; x++) {
1259 struct buffer_head *bh; 1254 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc;
1260 unsigned int y; 1256 unsigned int y;
1261 1257
1262 if (!extlen) { 1258 if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1274 goto fail; 1270 goto fail;
1275 } 1271 }
1276 1272
1273 qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
1277 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; 1274 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1278 y++, slot++) { 1275 y++, slot++) {
1279 struct gfs2_quota_change_host qc;
1280 struct gfs2_quota_data *qd; 1276 struct gfs2_quota_data *qd;
1281 1277 s64 qc_change = be64_to_cpu(qc->qc_change);
1282 gfs2_quota_change_in(&qc, bh->b_data + 1278 u32 qc_flags = be32_to_cpu(qc->qc_flags);
1283 sizeof(struct gfs2_meta_header) + 1279 enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
1284 y * sizeof(struct gfs2_quota_change)); 1280 USRQUOTA : GRPQUOTA;
1285 if (!qc.qc_change) 1281 struct kqid qc_id = make_kqid(&init_user_ns, qtype,
1282 be32_to_cpu(qc->qc_id));
1283 qc++;
1284 if (!qc_change)
1286 continue; 1285 continue;
1287 1286
1288 error = qd_alloc(sdp, qc.qc_id, &qd); 1287 hash = gfs2_qd_hash(sdp, qc_id);
1289 if (error) { 1288 qd = qd_alloc(hash, sdp, qc_id);
1289 if (qd == NULL) {
1290 brelse(bh); 1290 brelse(bh);
1291 goto fail; 1291 goto fail;
1292 } 1292 }
1293 1293
1294 set_bit(QDF_CHANGE, &qd->qd_flags); 1294 set_bit(QDF_CHANGE, &qd->qd_flags);
1295 qd->qd_change = qc.qc_change; 1295 qd->qd_change = qc_change;
1296 qd->qd_slot = slot; 1296 qd->qd_slot = slot;
1297 qd->qd_slot_count = 1; 1297 qd->qd_slot_count = 1;
1298 1298
1299 spin_lock(&qd_lock); 1299 spin_lock(&qd_lock);
1300 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1300 BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
1301 list_add(&qd->qd_list, &sdp->sd_quota_list); 1301 list_add(&qd->qd_list, &sdp->sd_quota_list);
1302 atomic_inc(&sdp->sd_quota_count); 1302 atomic_inc(&sdp->sd_quota_count);
1303 spin_unlock(&qd_lock); 1303 spin_unlock(&qd_lock);
1304 1304
1305 spin_lock_bucket(hash);
1306 hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
1307 spin_unlock_bucket(hash);
1308
1305 found++; 1309 found++;
1306 } 1310 }
1307 1311
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1324{ 1328{
1325 struct list_head *head = &sdp->sd_quota_list; 1329 struct list_head *head = &sdp->sd_quota_list;
1326 struct gfs2_quota_data *qd; 1330 struct gfs2_quota_data *qd;
1327 unsigned int x;
1328 1331
1329 spin_lock(&qd_lock); 1332 spin_lock(&qd_lock);
1330 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1331 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1334 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1332 1335
1333 /*
1334 * To be removed in due course... we should be able to
1335 * ensure that all refs to the qd have done by this point
1336 * so that this rather odd test is not required
1337 */
1338 spin_lock(&qd->qd_lockref.lock);
1339 if (qd->qd_lockref.count > 1 ||
1340 (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1341 spin_unlock(&qd->qd_lockref.lock);
1342 list_move(&qd->qd_list, head);
1343 spin_unlock(&qd_lock);
1344 schedule();
1345 spin_lock(&qd_lock);
1346 continue;
1347 }
1348 spin_unlock(&qd->qd_lockref.lock);
1349
1350 list_del(&qd->qd_list); 1336 list_del(&qd->qd_list);
1337
1351 /* Also remove if this qd exists in the reclaim list */ 1338 /* Also remove if this qd exists in the reclaim list */
1352 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 1339 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
1353 atomic_dec(&sdp->sd_quota_count); 1340 atomic_dec(&sdp->sd_quota_count);
1354 spin_unlock(&qd_lock); 1341 spin_unlock(&qd_lock);
1355 1342
1356 if (!qd->qd_lockref.count) { 1343 spin_lock_bucket(qd->qd_hash);
1357 gfs2_assert_warn(sdp, !qd->qd_change); 1344 hlist_bl_del_rcu(&qd->qd_hlist);
1358 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1345 spin_unlock_bucket(qd->qd_hash);
1359 } else 1346
1360 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1347 gfs2_assert_warn(sdp, !qd->qd_change);
1348 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1361 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1349 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1362 1350
1363 gfs2_glock_put(qd->qd_gl); 1351 gfs2_glock_put(qd->qd_gl);
1364 kmem_cache_free(gfs2_quotad_cachep, qd); 1352 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
1365 1353
1366 spin_lock(&qd_lock); 1354 spin_lock(&qd_lock);
1367 } 1355 }
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1370 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1358 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1371 1359
1372 if (sdp->sd_quota_bitmap) { 1360 if (sdp->sd_quota_bitmap) {
1373 for (x = 0; x < sdp->sd_quota_chunks; x++) 1361 if (is_vmalloc_addr(sdp->sd_quota_bitmap))
1374 kfree(sdp->sd_quota_bitmap[x]); 1362 vfree(sdp->sd_quota_bitmap);
1375 kfree(sdp->sd_quota_bitmap); 1363 else
1364 kfree(sdp->sd_quota_bitmap);
1365 sdp->sd_quota_bitmap = NULL;
1376 } 1366 }
1377} 1367}
1378 1368
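
Two details of the new bitmap allocation: bm_size is rounded up to whole unsigned longs, which find_first_zero_bit and friends require, and the buffer is requested with kmalloc plus __GFP_NOWARN first, falling back to __vmalloc when the slot count makes the slab path impractical. The matching free keys off is_vmalloc_addr(); kvfree() did not exist yet on this kernel. The pattern in isolation, under those assumptions:

    #include <linux/bitops.h>
    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/vmalloc.h>

    static unsigned long *bitmap_buf_alloc(unsigned int nbits)
    {
        size_t size = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
        unsigned long *p;

        p = kmalloc(size, GFP_NOFS | __GFP_NOWARN);  /* stay quiet on failure */
        if (!p)
            p = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
        if (p)
            memset(p, 0, size);
        return p;
    }

    static void bitmap_buf_free(unsigned long *p)
    {
        if (is_vmalloc_addr(p))
            vfree(p);
        else
            kfree(p);
    }
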
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1656 .get_dqblk = gfs2_get_dqblk, 1646 .get_dqblk = gfs2_get_dqblk,
1657 .set_dqblk = gfs2_set_dqblk, 1647 .set_dqblk = gfs2_set_dqblk,
1658}; 1648};
1649
1650void __init gfs2_quota_hash_init(void)
1651{
1652 unsigned i;
1653
1654 for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
1655 INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
1656}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
57extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
58extern struct shrinker gfs2_qd_shrinker; 58extern struct shrinker gfs2_qd_shrinker;
59extern struct list_lru gfs2_qd_lru; 59extern struct list_lru gfs2_qd_lru;
60extern void __init gfs2_quota_hash_init(void);
60 61
61#endif /* __QUOTA_DOT_H__ */ 62#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
57 * 3 = Used (metadata) 57 * 3 = Used (metadata)
58 */ 58 */
59 59
60struct gfs2_extent {
61 struct gfs2_rbm rbm;
62 u32 len;
63};
64
60static const char valid_change[16] = { 65static const char valid_change[16] = {
61 /* current */ 66 /* current */
62 /* n */ 0, 1, 1, 1, 67 /* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
65 1, 0, 0, 0 70 1, 0, 0, 0
66}; 71};
67 72
68static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 73static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
69 const struct gfs2_inode *ip, bool nowrap); 74 const struct gfs2_inode *ip, bool nowrap,
75 const struct gfs2_alloc_parms *ap);
70 76
71 77
72/** 78/**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
635 /* return reserved blocks to the rgrp */ 641 /* return reserved blocks to the rgrp */
636 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); 642 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; 643 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
644 /* The rgrp extent failure point is likely not to increase;
645 it will only do so if the freed blocks are somehow
646 contiguous with a span of free blocks that follows. Still,
647 it will force the number to be recalculated later. */
648 rgd->rd_extfail_pt += rs->rs_free;
638 rs->rs_free = 0; 649 rs->rs_free = 0;
639 clear_bit(GBF_FULL, &bi->bi_flags); 650 clear_bit(GBF_FULL, &bi->bi_flags);
640 smp_mb__after_clear_bit();
641 } 651 }
642} 652}
643 653
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
876static int read_rindex_entry(struct gfs2_inode *ip) 886static int read_rindex_entry(struct gfs2_inode *ip)
877{ 887{
878 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 888 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
889 const unsigned bsize = sdp->sd_sb.sb_bsize;
879 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 890 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
880 struct gfs2_rindex buf; 891 struct gfs2_rindex buf;
881 int error; 892 int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
913 goto fail; 924 goto fail;
914 925
915 rgd->rd_gl->gl_object = rgd; 926 rgd->rd_gl->gl_object = rgd;
927 rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
928 rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
916 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; 929 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
917 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 930 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
918 if (rgd->rd_data > sdp->sd_max_rg_data) 931 if (rgd->rd_data > sdp->sd_max_rg_data)
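
read_rindex_entry now records, on each resource group's glock, the byte range that rgrp occupies in the shared metadata mapping (gl_vm.start/end), so invalidation on lock demotion can target just those pages instead of the whole mapping. The arithmetic is plain block-to-byte conversion with an inclusive end:

    u64 start, end;
    const unsigned bsize = sdp->sd_sb.sb_bsize;     /* fs block size in bytes */

    start = rgd->rd_addr * bsize;                   /* first byte of the rgrp */
    end   = start + (rgd->rd_length * bsize) - 1;   /* last byte, inclusive */
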
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1126 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 1139 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1140 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1141 rgd->rd_free_clone = rgd->rd_free;
1142 /* max out the rgrp allocation failure point */
1143 rgd->rd_extfail_pt = rgd->rd_free;
1129 } 1144 }
1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1145 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1146 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1184 1199
1185 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) 1200 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1186 return 0; 1201 return 0;
1187 return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); 1202 return gfs2_rgrp_bh_get(rgd);
1188} 1203}
1189 1204
1190/** 1205/**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1455 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) 1470 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1456 return; 1471 return;
1457 1472
1458 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); 1473 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
1459 if (ret == 0) { 1474 if (ret == 0) {
1460 rs->rs_rbm = rbm; 1475 rs->rs_rbm = rbm;
1461 rs->rs_free = extlen; 1476 rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1520 * @rbm: The current position in the resource group 1535 * @rbm: The current position in the resource group
1521 * @ip: The inode for which we are searching for blocks 1536 * @ip: The inode for which we are searching for blocks
1522 * @minext: The minimum extent length 1537 * @minext: The minimum extent length
1538 * @maxext: A pointer to the maximum extent structure
1523 * 1539 *
1524 * This checks the current position in the rgrp to see whether there is 1540 * This checks the current position in the rgrp to see whether there is
1525 * a reservation covering this block. If not then this function is a 1541 * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 1548
1533static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, 1549static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1534 const struct gfs2_inode *ip, 1550 const struct gfs2_inode *ip,
1535 u32 minext) 1551 u32 minext,
1552 struct gfs2_extent *maxext)
1536{ 1553{
1537 u64 block = gfs2_rbm_to_block(rbm); 1554 u64 block = gfs2_rbm_to_block(rbm);
1538 u32 extlen = 1; 1555 u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1545 */ 1562 */
1546 if (minext) { 1563 if (minext) {
1547 extlen = gfs2_free_extlen(rbm, minext); 1564 extlen = gfs2_free_extlen(rbm, minext);
1548 nblock = block + extlen; 1565 if (extlen <= maxext->len)
1549 if (extlen < minext)
1550 goto fail; 1566 goto fail;
1551 } 1567 }
1552 1568
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1555 * and skip if parts of it are already reserved 1571 * and skip if parts of it are already reserved
1556 */ 1572 */
1557 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); 1573 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1558 if (nblock == block) 1574 if (nblock == block) {
1559 return 0; 1575 if (!minext || extlen >= minext)
1576 return 0;
1577
1578 if (extlen > maxext->len) {
1579 maxext->len = extlen;
1580 maxext->rbm = *rbm;
1581 }
1560fail: 1582fail:
1583 nblock = block + extlen;
1584 }
1561 ret = gfs2_rbm_from_block(rbm, nblock); 1585 ret = gfs2_rbm_from_block(rbm, nblock);
1562 if (ret < 0) 1586 if (ret < 0)
1563 return ret; 1587 return ret;
@@ -1568,30 +1592,38 @@ fail:
1568 * gfs2_rbm_find - Look for blocks of a particular state 1592 * gfs2_rbm_find - Look for blocks of a particular state
1569 * @rbm: Value/result starting position and final position 1593 * @rbm: Value/result starting position and final position
1570 * @state: The state which we want to find 1594 * @state: The state which we want to find
1571 * @minext: The requested extent length (0 for a single block) 1595 * @minext: Pointer to the requested extent length (NULL for a single block)
1596 * This is updated to be the actual reservation size.
1572 * @ip: If set, check for reservations 1597 * @ip: If set, check for reservations
1573 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping 1598 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1574 * around until we've reached the starting point. 1599 * around until we've reached the starting point.
1600 * @ap: the allocation parameters
1575 * 1601 *
1576 * Side effects: 1602 * Side effects:
1577 * - If looking for free blocks, we set GBF_FULL on each bitmap which 1603 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1578 * has no free blocks in it. 1604 * has no free blocks in it.
1605 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1606 * has come up short on a free block search.
1579 * 1607 *
1580 * Returns: 0 on success, -ENOSPC if there is no block of the requested state 1608 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1581 */ 1609 */
1582 1610
1583static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 1611static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1584 const struct gfs2_inode *ip, bool nowrap) 1612 const struct gfs2_inode *ip, bool nowrap,
1613 const struct gfs2_alloc_parms *ap)
1585{ 1614{
1586 struct buffer_head *bh; 1615 struct buffer_head *bh;
1587 int initial_bii; 1616 int initial_bii;
1588 u32 initial_offset; 1617 u32 initial_offset;
1618 int first_bii = rbm->bii;
1619 u32 first_offset = rbm->offset;
1589 u32 offset; 1620 u32 offset;
1590 u8 *buffer; 1621 u8 *buffer;
1591 int n = 0; 1622 int n = 0;
1592 int iters = rbm->rgd->rd_length; 1623 int iters = rbm->rgd->rd_length;
1593 int ret; 1624 int ret;
1594 struct gfs2_bitmap *bi; 1625 struct gfs2_bitmap *bi;
1626 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1595 1627
1596 /* If we are not starting at the beginning of a bitmap, then we 1628 /* If we are not starting at the beginning of a bitmap, then we
1597 * need to add one to the bitmap count to ensure that we search 1629 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1620 return 0; 1652 return 0;
1621 1653
1622 initial_bii = rbm->bii; 1654 initial_bii = rbm->bii;
1623 ret = gfs2_reservation_check_and_update(rbm, ip, minext); 1655 ret = gfs2_reservation_check_and_update(rbm, ip,
1656 minext ? *minext : 0,
1657 &maxext);
1624 if (ret == 0) 1658 if (ret == 0)
1625 return 0; 1659 return 0;
1626 if (ret > 0) { 1660 if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
1655 break; 1689 break;
1656 } 1690 }
1657 1691
1692 if (minext == NULL || state != GFS2_BLKST_FREE)
1693 return -ENOSPC;
1694
1695 /* If the extent was too small, and it's smaller than the smallest
1696 to have failed before, remember for future reference that it's
1697 useless to search this rgrp again for this amount or more. */
1698 if ((first_offset == 0) && (first_bii == 0) &&
1699 (*minext < rbm->rgd->rd_extfail_pt))
1700 rbm->rgd->rd_extfail_pt = *minext;
1701
1702 /* If the maximum extent we found is big enough to fulfill the
1703 minimum requirements, use it anyway. */
1704 if (maxext.len) {
1705 *rbm = maxext.rbm;
1706 *minext = maxext.len;
1707 return 0;
1708 }
1709
1658 return -ENOSPC; 1710 return -ENOSPC;
1659} 1711}
1660 1712
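
The multi-block search now degrades gracefully instead of failing hard. gfs2_reservation_check_and_update remembers the largest free extent it sees (maxext); when the requested minimum was never met, gfs2_rbm_find first records the failed size in rd_extfail_pt (only if the scan covered the rgrp from bit zero, so the result is authoritative), then falls back to the best extent found, shrinking *minext to tell the caller what it actually got. gfs2_inplace_reserve uses rd_extfail_pt to skip rgrps whose recorded failure point is below the allocation target. The tail of the search, condensed; scanned_from_start stands for (first_bii == 0 && first_offset == 0) and rgd for rbm->rgd:

    /* Condensed shape of the gfs2_rbm_find fallback above. */
    if (minext == NULL || state != GFS2_BLKST_FREE)
        return -ENOSPC;                  /* single-block or non-free search */

    if (scanned_from_start && *minext < rgd->rd_extfail_pt)
        rgd->rd_extfail_pt = *minext;    /* this size is now known to fail */

    if (best.len) {                      /* under-sized, but usable */
        *rbm = best.rbm;
        *minext = best.len;              /* caller sees the reduced length */
        return 0;
    }
    return -ENOSPC;
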
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1680 1732
1681 while (1) { 1733 while (1) {
1682 down_write(&sdp->sd_log_flush_lock); 1734 down_write(&sdp->sd_log_flush_lock);
1683 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); 1735 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1736 true, NULL);
1684 up_write(&sdp->sd_log_flush_lock); 1737 up_write(&sdp->sd_log_flush_lock);
1685 if (error == -ENOSPC) 1738 if (error == -ENOSPC)
1686 break; 1739 break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1891 } 1944 }
1892 1945
1893 /* Skip unusable resource groups */ 1946 /* Skip unusable resource groups */
1894 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1947 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
1948 GFS2_RDF_ERROR)) ||
1949 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
1895 goto skip_rgrp; 1950 goto skip_rgrp;
1896 1951
1897 if (sdp->sd_args.ar_rgrplvb) 1952 if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1911 return 0; 1966 return 0;
1912 } 1967 }
1913 1968
1914 /* Drop reservation, if we couldn't use reserved rgrp */
1915 if (gfs2_rs_active(rs))
1916 gfs2_rs_deltree(rs);
1917check_rgrp: 1969check_rgrp:
1918 /* Check for unlinked inodes which can be reclaimed */ 1970 /* Check for unlinked inodes which can be reclaimed */
1919 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 1971 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1920 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, 1972 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1921 ip->i_no_addr); 1973 ip->i_no_addr);
1922skip_rgrp: 1974skip_rgrp:
1975 /* Drop reservation, if we couldn't use reserved rgrp */
1976 if (gfs2_rs_active(rs))
1977 gfs2_rs_deltree(rs);
1978
1923 /* Unlock rgrp if required */ 1979 /* Unlock rgrp if required */
1924 if (!rg_locked) 1980 if (!rg_locked)
1925 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1981 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2064 * 2120 *
2065 */ 2121 */
2066 2122
2067int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 2123void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2068{ 2124{
2069 struct gfs2_rgrpd *rgd = gl->gl_object; 2125 struct gfs2_rgrpd *rgd = gl->gl_object;
2070 struct gfs2_blkreserv *trs; 2126 struct gfs2_blkreserv *trs;
2071 const struct rb_node *n; 2127 const struct rb_node *n;
2072 2128
2073 if (rgd == NULL) 2129 if (rgd == NULL)
2074 return 0; 2130 return;
2075 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", 2131 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2076 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 2132 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2077 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, 2133 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2078 rgd->rd_reserved); 2134 rgd->rd_reserved, rgd->rd_extfail_pt);
2079 spin_lock(&rgd->rd_rsspin); 2135 spin_lock(&rgd->rd_rsspin);
2080 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { 2136 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2081 trs = rb_entry(n, struct gfs2_blkreserv, rs_node); 2137 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2082 dump_rs(seq, trs); 2138 dump_rs(seq, trs);
2083 } 2139 }
2084 spin_unlock(&rgd->rd_rsspin); 2140 spin_unlock(&rgd->rd_rsspin);
2085 return 0;
2086} 2141}
2087 2142
2088static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) 2143static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2184 int error; 2239 int error;
2185 2240
2186 gfs2_set_alloc_start(&rbm, ip, dinode); 2241 gfs2_set_alloc_start(&rbm, ip, dinode);
2187 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); 2242 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
2188 2243
2189 if (error == -ENOSPC) { 2244 if (error == -ENOSPC) {
2190 gfs2_set_alloc_start(&rbm, ip, dinode); 2245 gfs2_set_alloc_start(&rbm, ip, dinode);
2191 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); 2246 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
2247 NULL);
2192 } 2248 }
2193 2249
2194 /* Since all blocks are reserved in advance, this shouldn't happen */ 2250 /* Since all blocks are reserved in advance, this shouldn't happen */
2195 if (error) { 2251 if (error) {
2196 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", 2252 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2197 (unsigned long long)ip->i_no_addr, error, *nblocks, 2253 (unsigned long long)ip->i_no_addr, error, *nblocks,
2198 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); 2254 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2255 rbm.rgd->rd_extfail_pt);
2199 goto rgrp_error; 2256 goto rgrp_error;
2200 } 2257 }
2201 2258
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 71extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 struct buffer_head *bh, 73 struct buffer_head *bh,
74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
369 return 0; 369 return 0;
370} 370}
371 371
372static int init_threads(struct gfs2_sbd *sdp)
373{
374 struct task_struct *p;
375 int error = 0;
376
377 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
378 if (IS_ERR(p)) {
379 error = PTR_ERR(p);
380 fs_err(sdp, "can't start logd thread: %d\n", error);
381 return error;
382 }
383 sdp->sd_logd_process = p;
384
385 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
386 if (IS_ERR(p)) {
387 error = PTR_ERR(p);
388 fs_err(sdp, "can't start quotad thread: %d\n", error);
389 goto fail;
390 }
391 sdp->sd_quotad_process = p;
392 return 0;
393
394fail:
395 kthread_stop(sdp->sd_logd_process);
396 return error;
397}
398
372/** 399/**
373 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one 400 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
374 * @sdp: the filesystem 401 * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
384 struct gfs2_log_header_host head; 411 struct gfs2_log_header_host head;
385 int error; 412 int error;
386 413
387 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); 414 error = init_threads(sdp);
388 if (error) 415 if (error)
389 return error; 416 return error;
390 417
418 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
419 if (error)
420 goto fail_threads;
421
391 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 422 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
392 423
393 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 424 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
417fail: 448fail:
418 t_gh.gh_flags |= GL_NOCACHE; 449 t_gh.gh_flags |= GL_NOCACHE;
419 gfs2_glock_dq_uninit(&t_gh); 450 gfs2_glock_dq_uninit(&t_gh);
420 451fail_threads:
452 kthread_stop(sdp->sd_quotad_process);
453 kthread_stop(sdp->sd_logd_process);
421 return error; 454 return error;
422} 455}
423 456
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
800 struct gfs2_holder t_gh; 833 struct gfs2_holder t_gh;
801 int error; 834 int error;
802 835
836 kthread_stop(sdp->sd_quotad_process);
837 kthread_stop(sdp->sd_logd_process);
838
803 flush_workqueue(gfs2_delete_workqueue); 839 flush_workqueue(gfs2_delete_workqueue);
804 gfs2_quota_sync(sdp->sd_vfs, 0); 840 gfs2_quota_sync(sdp->sd_vfs, 0);
805 gfs2_statfs_sync(sdp->sd_vfs, 0); 841 gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
857 } 893 }
858 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
859 895
860 kthread_stop(sdp->sd_quotad_process);
861 kthread_stop(sdp->sd_logd_process);
862
863 if (!(sb->s_flags & MS_RDONLY)) { 896 if (!(sb->s_flags & MS_RDONLY)) {
864 error = gfs2_make_fs_ro(sdp); 897 error = gfs2_make_fs_ro(sdp);
865 if (error) 898 if (error)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8c6a6f6bdba9..0b81f783f787 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/xattr.h> 14#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/posix_acl_xattr.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
18#include "gfs2.h" 19#include "gfs2.h"
@@ -1500,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = {
1500const struct xattr_handler *gfs2_xattr_handlers[] = { 1501const struct xattr_handler *gfs2_xattr_handlers[] = {
1501 &gfs2_xattr_user_handler, 1502 &gfs2_xattr_user_handler,
1502 &gfs2_xattr_security_handler, 1503 &gfs2_xattr_security_handler,
1503 &gfs2_xattr_system_handler, 1504 &posix_acl_access_xattr_handler,
1505 &posix_acl_default_xattr_handler,
1504 NULL, 1506 NULL,
1505}; 1507};
1506 1508
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
index 07c0d4947527..95c8ed9ec17f 100644
--- a/fs/hfsplus/acl.h
+++ b/fs/hfsplus/acl.h
@@ -12,16 +12,13 @@
12 12
13/* posix_acl.c */ 13/* posix_acl.c */
14struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); 14struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
15extern int hfsplus_posix_acl_chmod(struct inode *); 15int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
16 int type);
16extern int hfsplus_init_posix_acl(struct inode *, struct inode *); 17extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
17 18
18#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ 19#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */
19#define hfsplus_get_posix_acl NULL 20#define hfsplus_get_posix_acl NULL
20 21#define hfsplus_set_posix_acl NULL
21static inline int hfsplus_posix_acl_chmod(struct inode *inode)
22{
23 return 0;
24}
25 22
26static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) 23static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
27{ 24{
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 968ce411db53..32602c667b4a 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -103,6 +103,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
103 folder = &entry->folder; 103 folder = &entry->folder;
104 memset(folder, 0, sizeof(*folder)); 104 memset(folder, 0, sizeof(*folder));
105 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 105 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
106 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))
107 folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT);
106 folder->id = cpu_to_be32(inode->i_ino); 108 folder->id = cpu_to_be32(inode->i_ino);
107 HFSPLUS_I(inode)->create_date = 109 HFSPLUS_I(inode)->create_date =
108 folder->create_date = 110 folder->create_date =
@@ -203,6 +205,36 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
203 return hfs_brec_find(fd, hfs_find_rec_by_key); 205 return hfs_brec_find(fd, hfs_find_rec_by_key);
204} 206}
205 207
208static void hfsplus_subfolders_inc(struct inode *dir)
209{
210 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
211
212 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
213 /*
214 * Increment subfolder count. Note, the value is only meaningful
215 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
216 */
217 HFSPLUS_I(dir)->subfolders++;
218 }
219}
220
221static void hfsplus_subfolders_dec(struct inode *dir)
222{
223 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
224
225 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
226 /*
227 * Decrement subfolder count. Note, the value is only meaningful
228 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
229 *
230 * Check for zero. Some subfolders may have been created
231 * by an implementation ignorant of this counter.
232 */
233 if (HFSPLUS_I(dir)->subfolders)
234 HFSPLUS_I(dir)->subfolders--;
235 }
236}
237
206int hfsplus_create_cat(u32 cnid, struct inode *dir, 238int hfsplus_create_cat(u32 cnid, struct inode *dir,
207 struct qstr *str, struct inode *inode) 239 struct qstr *str, struct inode *inode)
208{ 240{
@@ -247,6 +279,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
247 goto err1; 279 goto err1;
248 280
249 dir->i_size++; 281 dir->i_size++;
282 if (S_ISDIR(inode->i_mode))
283 hfsplus_subfolders_inc(dir);
250 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 284 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
251 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); 285 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
252 286
@@ -336,6 +370,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
336 goto out; 370 goto out;
337 371
338 dir->i_size--; 372 dir->i_size--;
373 if (type == HFSPLUS_FOLDER)
374 hfsplus_subfolders_dec(dir);
339 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 375 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
340 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); 376 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
341 377
@@ -380,6 +416,7 @@ int hfsplus_rename_cat(u32 cnid,
380 416
381 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, 417 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
382 src_fd.entrylength); 418 src_fd.entrylength);
419 type = be16_to_cpu(entry.type);
383 420
384 /* create new dir entry with the data from the old entry */ 421 /* create new dir entry with the data from the old entry */
385 hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); 422 hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
@@ -394,6 +431,8 @@ int hfsplus_rename_cat(u32 cnid,
394 if (err) 431 if (err)
395 goto out; 432 goto out;
396 dst_dir->i_size++; 433 dst_dir->i_size++;
434 if (type == HFSPLUS_FOLDER)
435 hfsplus_subfolders_inc(dst_dir);
397 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; 436 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
398 437
399 /* finally remove the old entry */ 438 /* finally remove the old entry */
@@ -405,6 +444,8 @@ int hfsplus_rename_cat(u32 cnid,
405 if (err) 444 if (err)
406 goto out; 445 goto out;
407 src_dir->i_size--; 446 src_dir->i_size--;
447 if (type == HFSPLUS_FOLDER)
448 hfsplus_subfolders_dec(src_dir);
408 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; 449 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
409 450
410 /* remove old thread entry */ 451 /* remove old thread entry */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4a4fea002673..bdec66522de3 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -529,9 +529,10 @@ const struct inode_operations hfsplus_dir_inode_operations = {
529 .setxattr = generic_setxattr, 529 .setxattr = generic_setxattr,
530 .getxattr = generic_getxattr, 530 .getxattr = generic_getxattr,
531 .listxattr = hfsplus_listxattr, 531 .listxattr = hfsplus_listxattr,
532 .removexattr = hfsplus_removexattr, 532 .removexattr = generic_removexattr,
533#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 533#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
534 .get_acl = hfsplus_get_posix_acl, 534 .get_acl = hfsplus_get_posix_acl,
535 .set_acl = hfsplus_set_posix_acl,
535#endif 536#endif
536}; 537};
537 538
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 08846425b67f..62d571eb69ba 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -242,6 +242,7 @@ struct hfsplus_inode_info {
242 */ 242 */
243 sector_t fs_blocks; 243 sector_t fs_blocks;
244 u8 userflags; /* BSD user file flags */ 244 u8 userflags; /* BSD user file flags */
245 u32 subfolders; /* Subfolder count (HFSX only) */
245 struct list_head open_dir_list; 246 struct list_head open_dir_list;
246 loff_t phys_size; 247 loff_t phys_size;
247 248
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 8ffb3a8ffe75..5a126828d85e 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -261,7 +261,7 @@ struct hfsplus_cat_folder {
261 struct DInfo user_info; 261 struct DInfo user_info;
262 struct DXInfo finder_info; 262 struct DXInfo finder_info;
263 __be32 text_encoding; 263 __be32 text_encoding;
264 u32 reserved; 264 __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
265} __packed; 265} __packed;
266 266
267/* HFS file info (stolen from hfs.h) */ 267/* HFS file info (stolen from hfs.h) */
@@ -301,11 +301,13 @@ struct hfsplus_cat_file {
301 struct hfsplus_fork_raw rsrc_fork; 301 struct hfsplus_fork_raw rsrc_fork;
302} __packed; 302} __packed;
303 303
304/* File attribute bits */ 304/* File and folder flag bits */
305#define HFSPLUS_FILE_LOCKED 0x0001 305#define HFSPLUS_FILE_LOCKED 0x0001
306#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 306#define HFSPLUS_FILE_THREAD_EXISTS 0x0002
307#define HFSPLUS_XATTR_EXISTS 0x0004 307#define HFSPLUS_XATTR_EXISTS 0x0004
308#define HFSPLUS_ACL_EXISTS 0x0008 308#define HFSPLUS_ACL_EXISTS 0x0008
309#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count
310 * (HFSX only) */
309 311
310/* HFS+ catalog thread (part of a cat_entry) */ 312/* HFS+ catalog thread (part of a cat_entry) */
311struct hfsplus_cat_thread { 313struct hfsplus_cat_thread {
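
HFSX repurposes the folder record's reserved word as a big-endian subfolder count, and HFSPLUS_HAS_FOLDER_COUNT marks folders that actually maintain it (set at creation time in the catalog.c hunk above); on plain HFS+ the word stays reserved and the in-memory counter is never consulted. Updating any such on-disk __be32 goes through the usual endian helpers, sketched here against the hfsplus_cat_folder layout shown above:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /* folder points at an hfsplus_cat_folder read from the catalog tree */
    static u32 folder_count_load(const struct hfsplus_cat_folder *folder)
    {
        return be32_to_cpu(folder->subfolders);
    }

    static void folder_count_store(struct hfsplus_cat_folder *folder, u32 count)
    {
        folder->subfolders = cpu_to_be32(count);
    }
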
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 37213d075f3c..a4f45bd88a63 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = {
178 .d_compare = hfsplus_compare_dentry, 178 .d_compare = hfsplus_compare_dentry,
179}; 179};
180 180
181static struct dentry *hfsplus_file_lookup(struct inode *dir,
182 struct dentry *dentry, unsigned int flags)
183{
184 struct hfs_find_data fd;
185 struct super_block *sb = dir->i_sb;
186 struct inode *inode = NULL;
187 struct hfsplus_inode_info *hip;
188 int err;
189
190 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
191 goto out;
192
193 inode = HFSPLUS_I(dir)->rsrc_inode;
194 if (inode)
195 goto out;
196
197 inode = new_inode(sb);
198 if (!inode)
199 return ERR_PTR(-ENOMEM);
200
201 hip = HFSPLUS_I(inode);
202 inode->i_ino = dir->i_ino;
203 INIT_LIST_HEAD(&hip->open_dir_list);
204 mutex_init(&hip->extents_lock);
205 hip->extent_state = 0;
206 hip->flags = 0;
207 hip->userflags = 0;
208 set_bit(HFSPLUS_I_RSRC, &hip->flags);
209
210 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
211 if (!err) {
212 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
213 if (!err)
214 err = hfsplus_cat_read_inode(inode, &fd);
215 hfs_find_exit(&fd);
216 }
217 if (err) {
218 iput(inode);
219 return ERR_PTR(err);
220 }
221 hip->rsrc_inode = dir;
222 HFSPLUS_I(dir)->rsrc_inode = inode;
223 igrab(dir);
224
225 /*
226 * __mark_inode_dirty expects inodes to be hashed. Since we don't
227 * want resource fork inodes in the regular inode space, we make them
228 * appear hashed, but do not put on any lists. hlist_del()
229 * will work fine and require no locking.
230 */
231 hlist_add_fake(&inode->i_hash);
232
233 mark_inode_dirty(inode);
234out:
235 d_add(dentry, inode);
236 return NULL;
237}
238
239static void hfsplus_get_perms(struct inode *inode, 181static void hfsplus_get_perms(struct inode *inode,
240 struct hfsplus_perm *perms, int dir) 182 struct hfsplus_perm *perms, int dir)
241{ 183{
@@ -319,7 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
319 mark_inode_dirty(inode); 261 mark_inode_dirty(inode);
320 262
321 if (attr->ia_valid & ATTR_MODE) { 263 if (attr->ia_valid & ATTR_MODE) {
322 error = hfsplus_posix_acl_chmod(inode); 264 error = posix_acl_chmod(inode, inode->i_mode);
323 if (unlikely(error)) 265 if (unlikely(error))
324 return error; 266 return error;
325 } 267 }
@@ -385,14 +327,14 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
385} 327}
386 328
387static const struct inode_operations hfsplus_file_inode_operations = { 329static const struct inode_operations hfsplus_file_inode_operations = {
388 .lookup = hfsplus_file_lookup,
389 .setattr = hfsplus_setattr, 330 .setattr = hfsplus_setattr,
390 .setxattr = generic_setxattr, 331 .setxattr = generic_setxattr,
391 .getxattr = generic_getxattr, 332 .getxattr = generic_getxattr,
392 .listxattr = hfsplus_listxattr, 333 .listxattr = hfsplus_listxattr,
393 .removexattr = hfsplus_removexattr, 334 .removexattr = generic_removexattr,
394#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 335#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
395 .get_acl = hfsplus_get_posix_acl, 336 .get_acl = hfsplus_get_posix_acl,
337 .set_acl = hfsplus_set_posix_acl,
396#endif 338#endif
397}; 339};
398 340
@@ -433,6 +375,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
433 hip->extent_state = 0; 375 hip->extent_state = 0;
434 hip->flags = 0; 376 hip->flags = 0;
435 hip->userflags = 0; 377 hip->userflags = 0;
378 hip->subfolders = 0;
436 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); 379 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
437 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 380 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
438 hip->alloc_blocks = 0; 381 hip->alloc_blocks = 0;
@@ -552,6 +495,10 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
552 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 495 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
553 HFSPLUS_I(inode)->create_date = folder->create_date; 496 HFSPLUS_I(inode)->create_date = folder->create_date;
554 HFSPLUS_I(inode)->fs_blocks = 0; 497 HFSPLUS_I(inode)->fs_blocks = 0;
498 if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
499 HFSPLUS_I(inode)->subfolders =
500 be32_to_cpu(folder->subfolders);
501 }
555 inode->i_op = &hfsplus_dir_inode_operations; 502 inode->i_op = &hfsplus_dir_inode_operations;
556 inode->i_fop = &hfsplus_dir_operations; 503 inode->i_fop = &hfsplus_dir_operations;
557 } else if (type == HFSPLUS_FILE) { 504 } else if (type == HFSPLUS_FILE) {
@@ -624,6 +571,10 @@ int hfsplus_cat_write_inode(struct inode *inode)
624 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 571 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
625 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 572 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
626 folder->valence = cpu_to_be32(inode->i_size - 2); 573 folder->valence = cpu_to_be32(inode->i_size - 2);
574 if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
575 folder->subfolders =
576 cpu_to_be32(HFSPLUS_I(inode)->subfolders);
577 }
627 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 578 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
628 sizeof(struct hfsplus_cat_folder)); 579 sizeof(struct hfsplus_cat_folder));
629 } else if (HFSPLUS_IS_RSRC(inode)) { 580 } else if (HFSPLUS_IS_RSRC(inode)) {
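
The hfsplus hunks above only read and write the on-disk subfolders field when the HFSPLUS_HAS_FOLDER_COUNT flag is set, and they test the flag by byte-swapping the constant rather than the disk value: folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT). Below is a minimal userspace analogue of that idiom, assuming glibc's <endian.h>; htobe16()/be32toh() stand in for the kernel's cpu_to_be16()/be32_to_cpu(), and struct cat_folder is a cut-down hypothetical stand-in for hfsplus_cat_folder.

/*
 * Userspace analogue of the flag test used above. Testing the flag as
 * (disk_flags & htobe16(FLAG)) swaps the constant once, at compile
 * time, instead of swapping the on-disk value on every check.
 */
#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define HFSPLUS_HAS_FOLDER_COUNT 0x0010

struct cat_folder {            /* cut-down stand-in for hfsplus_cat_folder */
	uint16_t flags;        /* big-endian on disk */
	uint32_t subfolders;   /* big-endian on disk */
};

int main(void)
{
	struct cat_folder folder = {
		.flags = htobe16(HFSPLUS_HAS_FOLDER_COUNT),
		.subfolders = htobe32(42),
	};
	uint32_t subfolders = 0;

	if (folder.flags & htobe16(HFSPLUS_HAS_FOLDER_COUNT))
		subfolders = be32toh(folder.subfolders);

	printf("subfolders = %u\n", subfolders);   /* prints 42 */
	return 0;
}
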
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 968eab5bc1f5..68537e8b7a09 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -75,7 +75,7 @@ int hfsplus_parse_options_remount(char *input, int *force)
75 int token; 75 int token;
76 76
77 if (!input) 77 if (!input)
78 return 0; 78 return 1;
79 79
80 while ((p = strsep(&input, ",")) != NULL) { 80 while ((p = strsep(&input, ",")) != NULL) {
81 if (!*p) 81 if (!*p)
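
The one-line options.c fix above restores this parser's convention that nonzero means success: a NULL remount string has nothing to parse and must not be reported as a failure. A minimal userspace sketch of the same strsep()-based comma walk follows; the "force" token mirrors the hfsplus mount option of that name, but the handler body here is illustrative only.

/*
 * Minimal userspace sketch of the strsep() option walk used by
 * hfsplus_parse_options_remount(). Returns 1 on success, 0 on error,
 * so an empty option string counts as success.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

static int parse_options(char *input, int *force)
{
	char *p;

	if (!input)
		return 1;	/* nothing to parse is not an error */

	while ((p = strsep(&input, ",")) != NULL) {
		if (!*p)
			continue;
		if (strcmp(p, "force") == 0)
			*force = 1;
		/* unknown tokens are ignored in this sketch */
	}
	return 1;
}

int main(void)
{
	char opts[] = "ro,force";
	int force = 0;

	parse_options(opts, &force);
	printf("force = %d\n", force);	/* prints 1 */
	return 0;
}
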
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index b609cc14c72e..df0c9af68d05 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
17 char *value = NULL; 17 char *value = NULL;
18 ssize_t size; 18 ssize_t size;
19 19
20 acl = get_cached_acl(inode, type); 20 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
21 if (acl != ACL_NOT_CACHED)
22 return acl;
23 21
24 switch (type) { 22 switch (type) {
25 case ACL_TYPE_ACCESS: 23 case ACL_TYPE_ACCESS:
@@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
56 return acl; 54 return acl;
57} 55}
58 56
59static int hfsplus_set_posix_acl(struct inode *inode, 57int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
60 int type, 58 int type)
61 struct posix_acl *acl)
62{ 59{
63 int err; 60 int err;
64 char *xattr_name; 61 char *xattr_name;
65 size_t size = 0; 62 size_t size = 0;
66 char *value = NULL; 63 char *value = NULL;
67 64
68 if (S_ISLNK(inode->i_mode)) 65 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
69 return -EOPNOTSUPP;
70 66
71 switch (type) { 67 switch (type) {
72 case ACL_TYPE_ACCESS: 68 case ACL_TYPE_ACCESS:
@@ -115,7 +111,7 @@ end_set_acl:
115int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) 111int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
116{ 112{
117 int err = 0; 113 int err = 0;
118 struct posix_acl *acl = NULL; 114 struct posix_acl *default_acl, *acl;
119 115
120 hfs_dbg(ACL_MOD, 116 hfs_dbg(ACL_MOD,
121 "[%s]: ino %lu, dir->ino %lu\n", 117 "[%s]: ino %lu, dir->ino %lu\n",
@@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
124 if (S_ISLNK(inode->i_mode)) 120 if (S_ISLNK(inode->i_mode))
125 return 0; 121 return 0;
126 122
127 acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); 123 err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
128 if (IS_ERR(acl)) 124 if (err)
129 return PTR_ERR(acl);
130
131 if (acl) {
132 if (S_ISDIR(inode->i_mode)) {
133 err = hfsplus_set_posix_acl(inode,
134 ACL_TYPE_DEFAULT,
135 acl);
136 if (unlikely(err))
137 goto init_acl_cleanup;
138 }
139
140 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
141 if (unlikely(err < 0))
142 return err;
143
144 if (err > 0)
145 err = hfsplus_set_posix_acl(inode,
146 ACL_TYPE_ACCESS,
147 acl);
148 } else
149 inode->i_mode &= ~current_umask();
150
151init_acl_cleanup:
152 posix_acl_release(acl);
153 return err;
154}
155
156int hfsplus_posix_acl_chmod(struct inode *inode)
157{
158 int err;
159 struct posix_acl *acl;
160
161 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
162
163 if (S_ISLNK(inode->i_mode))
164 return -EOPNOTSUPP;
165
166 acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS);
167 if (IS_ERR(acl) || !acl)
168 return PTR_ERR(acl);
169
170 err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
171 if (unlikely(err))
172 return err; 125 return err;
173 126
174 err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); 127 if (default_acl) {
175 posix_acl_release(acl); 128 err = hfsplus_set_posix_acl(inode, default_acl,
176 return err; 129 ACL_TYPE_DEFAULT);
177} 130 posix_acl_release(default_acl);
178
179static int hfsplus_xattr_get_posix_acl(struct dentry *dentry,
180 const char *name,
181 void *buffer,
182 size_t size,
183 int type)
184{
185 int err = 0;
186 struct posix_acl *acl;
187
188 hfs_dbg(ACL_MOD,
189 "[%s]: ino %lu, buffer %p, size %zu, type %#x\n",
190 __func__, dentry->d_inode->i_ino, buffer, size, type);
191
192 if (strcmp(name, "") != 0)
193 return -EINVAL;
194
195 acl = hfsplus_get_posix_acl(dentry->d_inode, type);
196 if (IS_ERR(acl))
197 return PTR_ERR(acl);
198 if (acl == NULL)
199 return -ENODATA;
200
201 err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
202 posix_acl_release(acl);
203
204 return err;
205}
206
207static int hfsplus_xattr_set_posix_acl(struct dentry *dentry,
208 const char *name,
209 const void *value,
210 size_t size,
211 int flags,
212 int type)
213{
214 int err = 0;
215 struct inode *inode = dentry->d_inode;
216 struct posix_acl *acl = NULL;
217
218 hfs_dbg(ACL_MOD,
219 "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n",
220 __func__, inode->i_ino, value, size, flags, type);
221
222 if (strcmp(name, "") != 0)
223 return -EINVAL;
224
225 if (!inode_owner_or_capable(inode))
226 return -EPERM;
227
228 if (value) {
229 acl = posix_acl_from_xattr(&init_user_ns, value, size);
230 if (IS_ERR(acl))
231 return PTR_ERR(acl);
232 else if (acl) {
233 err = posix_acl_valid(acl);
234 if (err)
235 goto end_xattr_set_acl;
236 }
237 } 131 }
238 132
239 err = hfsplus_set_posix_acl(inode, type, acl); 133 if (acl) {
240 134 if (!err)
241end_xattr_set_acl: 135 err = hfsplus_set_posix_acl(inode, acl,
242 posix_acl_release(acl); 136 ACL_TYPE_ACCESS);
137 posix_acl_release(acl);
138 }
243 return err; 139 return err;
244} 140}
245
246static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry,
247 char *list,
248 size_t list_size,
249 const char *name,
250 size_t name_len,
251 int type)
252{
253 /*
254 * This method is not used.
255 * It is used hfsplus_listxattr() instead of generic_listxattr().
256 */
257 return -EOPNOTSUPP;
258}
259
260const struct xattr_handler hfsplus_xattr_acl_access_handler = {
261 .prefix = POSIX_ACL_XATTR_ACCESS,
262 .flags = ACL_TYPE_ACCESS,
263 .list = hfsplus_xattr_list_posix_acl,
264 .get = hfsplus_xattr_get_posix_acl,
265 .set = hfsplus_xattr_set_posix_acl,
266};
267
268const struct xattr_handler hfsplus_xattr_acl_default_handler = {
269 .prefix = POSIX_ACL_XATTR_DEFAULT,
270 .flags = ACL_TYPE_DEFAULT,
271 .list = hfsplus_xattr_list_posix_acl,
272 .get = hfsplus_xattr_get_posix_acl,
273 .set = hfsplus_xattr_set_posix_acl,
274};
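
The posix_acl.c rewrite above moves hfsplus onto the generic POSIX ACL helpers of this kernel generation: posix_acl_create() now adjusts the inode mode itself and hands back both a default and an access ACL, so the filesystem only stores whichever is non-NULL. A sketch of that init pattern follows; kernel context is assumed and myfs_set_posix_acl() is a hypothetical stand-in for the filesystem's own setter.

/*
 * Sketch of the v3.14-era ACL init pattern adopted above; kernel
 * context assumed, myfs_set_posix_acl() is hypothetical.
 */
int myfs_init_acl(struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	int err;

	/* Computes inode->i_mode and returns the ACLs to store (or NULL). */
	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
	if (err)
		return err;

	if (default_acl) {
		err = myfs_set_posix_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!err)
			err = myfs_set_posix_acl(inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}
	return err;
}
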
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index e9a97a0d4314..3f999649587f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -63,7 +63,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
63 sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); 63 sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
64 64
65 bio = bio_alloc(GFP_NOIO, 1); 65 bio = bio_alloc(GFP_NOIO, 1);
66 bio->bi_sector = sector; 66 bio->bi_iter.bi_sector = sector;
67 bio->bi_bdev = sb->s_bdev; 67 bio->bi_bdev = sb->s_bdev;
68 68
69 if (!(rw & WRITE) && data) 69 if (!(rw & WRITE) && data)
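
The wrapper.c one-liner is part of the v3.14 immutable-biovec conversion: the submission sector moved from bio->bi_sector into the iterator, bio->bi_iter.bi_sector. A kernel-context sketch of single-sector I/O under that layout follows; error handling is trimmed and the function is illustrative, not taken from this patch.

/*
 * Kernel-context sketch of one-sector I/O under the v3.14 bio layout
 * converted to above; bio_add_page()'s return is ignored for brevity.
 */
static int read_one_sector(struct super_block *sb, sector_t sector,
			   struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	if (!bio)
		return -ENOMEM;
	bio->bi_iter.bi_sector = sector;   /* was bio->bi_sector pre-3.14 */
	bio->bi_bdev = sb->s_bdev;
	bio_add_page(bio, page, 512, 0);
	err = submit_bio_wait(READ, bio);  /* v3.14 signature: (rw, bio) */
	bio_put(bio);
	return err;
}
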
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 3c6136f98c73..4e27edc082a4 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -7,16 +7,19 @@
7 */ 7 */
8 8
9#include "hfsplus_fs.h" 9#include "hfsplus_fs.h"
10#include <linux/posix_acl_xattr.h>
10#include "xattr.h" 11#include "xattr.h"
11#include "acl.h" 12#include "acl.h"
12 13
14static int hfsplus_removexattr(struct inode *inode, const char *name);
15
13const struct xattr_handler *hfsplus_xattr_handlers[] = { 16const struct xattr_handler *hfsplus_xattr_handlers[] = {
14 &hfsplus_xattr_osx_handler, 17 &hfsplus_xattr_osx_handler,
15 &hfsplus_xattr_user_handler, 18 &hfsplus_xattr_user_handler,
16 &hfsplus_xattr_trusted_handler, 19 &hfsplus_xattr_trusted_handler,
17#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 20#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
18 &hfsplus_xattr_acl_access_handler, 21 &posix_acl_access_xattr_handler,
19 &hfsplus_xattr_acl_default_handler, 22 &posix_acl_default_xattr_handler,
20#endif 23#endif
21 &hfsplus_xattr_security_handler, 24 &hfsplus_xattr_security_handler,
22 NULL 25 NULL
@@ -51,82 +54,6 @@ static inline int is_known_namespace(const char *name)
51 return true; 54 return true;
52} 55}
53 56
54static int can_set_system_xattr(struct inode *inode, const char *name,
55 const void *value, size_t size)
56{
57#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
58 struct posix_acl *acl;
59 int err;
60
61 if (!inode_owner_or_capable(inode))
62 return -EPERM;
63
64 /*
65 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
66 */
67 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
68 acl = posix_acl_from_xattr(&init_user_ns, value, size);
69 if (IS_ERR(acl))
70 return PTR_ERR(acl);
71 if (acl) {
72 err = posix_acl_equiv_mode(acl, &inode->i_mode);
73 posix_acl_release(acl);
74 if (err < 0)
75 return err;
76 mark_inode_dirty(inode);
77 }
78 /*
79 * We're changing the ACL. Get rid of the cached one
80 */
81 forget_cached_acl(inode, ACL_TYPE_ACCESS);
82
83 return 0;
84 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
85 acl = posix_acl_from_xattr(&init_user_ns, value, size);
86 if (IS_ERR(acl))
87 return PTR_ERR(acl);
88 posix_acl_release(acl);
89
90 /*
91 * We're changing the default ACL. Get rid of the cached one
92 */
93 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
94
95 return 0;
96 }
97#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
98 return -EOPNOTSUPP;
99}
100
101static int can_set_xattr(struct inode *inode, const char *name,
102 const void *value, size_t value_len)
103{
104 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
105 return can_set_system_xattr(inode, name, value, value_len);
106
107 if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
108 /*
109 * This makes sure that we aren't trying to set an
110 * attribute in a different namespace by prefixing it
111 * with "osx."
112 */
113 if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN))
114 return -EOPNOTSUPP;
115
116 return 0;
117 }
118
119 /*
120 * Don't allow setting an attribute in an unknown namespace.
121 */
122 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
123 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
124 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
125 return -EOPNOTSUPP;
126
127 return 0;
128}
129
130static void hfsplus_init_header_node(struct inode *attr_file, 57static void hfsplus_init_header_node(struct inode *attr_file,
131 u32 clump_size, 58 u32 clump_size,
132 char *buf, u16 node_size) 59 char *buf, u16 node_size)
@@ -349,18 +276,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
349 HFSPLUS_IS_RSRC(inode)) 276 HFSPLUS_IS_RSRC(inode))
350 return -EOPNOTSUPP; 277 return -EOPNOTSUPP;
351 278
352 err = can_set_xattr(inode, name, value, size); 279 if (value == NULL)
353 if (err) 280 return hfsplus_removexattr(inode, name);
354 return err;
355
356 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
357 XATTR_MAC_OSX_PREFIX_LEN) == 0)
358 name += XATTR_MAC_OSX_PREFIX_LEN;
359
360 if (value == NULL) {
361 value = "";
362 size = 0;
363 }
364 281
365 err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); 282 err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
366 if (err) { 283 if (err) {
@@ -478,16 +395,11 @@ end_setxattr:
478 return err; 395 return err;
479} 396}
480 397
481static inline int is_osx_xattr(const char *xattr_name)
482{
483 return !is_known_namespace(xattr_name);
484}
485
486static int name_len(const char *xattr_name, int xattr_name_len) 398static int name_len(const char *xattr_name, int xattr_name_len)
487{ 399{
488 int len = xattr_name_len + 1; 400 int len = xattr_name_len + 1;
489 401
490 if (is_osx_xattr(xattr_name)) 402 if (!is_known_namespace(xattr_name))
491 len += XATTR_MAC_OSX_PREFIX_LEN; 403 len += XATTR_MAC_OSX_PREFIX_LEN;
492 404
493 return len; 405 return len;
@@ -498,7 +410,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
498 int len = name_len; 410 int len = name_len;
499 int offset = 0; 411 int offset = 0;
500 412
501 if (is_osx_xattr(xattr_name)) { 413 if (!is_known_namespace(xattr_name)) {
502 strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); 414 strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
503 offset += XATTR_MAC_OSX_PREFIX_LEN; 415 offset += XATTR_MAC_OSX_PREFIX_LEN;
504 len += XATTR_MAC_OSX_PREFIX_LEN; 416 len += XATTR_MAC_OSX_PREFIX_LEN;
@@ -576,18 +488,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
576 HFSPLUS_IS_RSRC(inode)) 488 HFSPLUS_IS_RSRC(inode))
577 return -EOPNOTSUPP; 489 return -EOPNOTSUPP;
578 490
579 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
580 XATTR_MAC_OSX_PREFIX_LEN) == 0) {
581 /* skip "osx." prefix */
582 name += XATTR_MAC_OSX_PREFIX_LEN;
583 /*
584 * Don't allow retrieving properly prefixed attributes
585 * by prepending them with "osx."
586 */
587 if (is_known_namespace(name))
588 return -EOPNOTSUPP;
589 }
590
591 if (!strcmp_xattr_finder_info(name)) 491 if (!strcmp_xattr_finder_info(name))
592 return hfsplus_getxattr_finder_info(inode, value, size); 492 return hfsplus_getxattr_finder_info(inode, value, size);
593 493
@@ -822,32 +722,18 @@ end_listxattr:
822 return res; 722 return res;
823} 723}
824 724
825int hfsplus_removexattr(struct dentry *dentry, const char *name) 725static int hfsplus_removexattr(struct inode *inode, const char *name)
826{ 726{
827 int err = 0; 727 int err = 0;
828 struct inode *inode = dentry->d_inode;
829 struct hfs_find_data cat_fd; 728 struct hfs_find_data cat_fd;
830 u16 flags; 729 u16 flags;
831 u16 cat_entry_type; 730 u16 cat_entry_type;
832 int is_xattr_acl_deleted = 0; 731 int is_xattr_acl_deleted = 0;
833 int is_all_xattrs_deleted = 0; 732 int is_all_xattrs_deleted = 0;
834 733
835 if ((!S_ISREG(inode->i_mode) &&
836 !S_ISDIR(inode->i_mode)) ||
837 HFSPLUS_IS_RSRC(inode))
838 return -EOPNOTSUPP;
839
840 if (!HFSPLUS_SB(inode->i_sb)->attr_tree) 734 if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
841 return -EOPNOTSUPP; 735 return -EOPNOTSUPP;
842 736
843 err = can_set_xattr(inode, name, NULL, 0);
844 if (err)
845 return err;
846
847 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
848 XATTR_MAC_OSX_PREFIX_LEN) == 0)
849 name += XATTR_MAC_OSX_PREFIX_LEN;
850
851 if (!strcmp_xattr_finder_info(name)) 737 if (!strcmp_xattr_finder_info(name))
852 return -EOPNOTSUPP; 738 return -EOPNOTSUPP;
853 739
@@ -921,8 +807,12 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
921 if (len > HFSPLUS_ATTR_MAX_STRLEN) 807 if (len > HFSPLUS_ATTR_MAX_STRLEN)
922 return -EOPNOTSUPP; 808 return -EOPNOTSUPP;
923 809
924 strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); 810 /*
925 strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); 811 * Don't allow retrieving properly prefixed attributes
812 * by prepending them with "osx."
813 */
814 if (is_known_namespace(name))
815 return -EOPNOTSUPP;
926 816
927 return hfsplus_getxattr(dentry, xattr_name, buffer, size); 817 return hfsplus_getxattr(dentry, xattr_name, buffer, size);
928} 818}
@@ -940,8 +830,12 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
940 if (len > HFSPLUS_ATTR_MAX_STRLEN) 830 if (len > HFSPLUS_ATTR_MAX_STRLEN)
941 return -EOPNOTSUPP; 831 return -EOPNOTSUPP;
942 832
943 strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); 833 /*
944 strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); 834 * Don't allow setting properly prefixed attributes
835 * by prepending them with "osx."
836 */
837 if (is_known_namespace(name))
838 return -EOPNOTSUPP;
945 839
946 return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); 840 return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
947} 841}
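
With can_set_xattr() gone, both hfsplus_osx_getxattr() and hfsplus_osx_setxattr() now carry the namespace guard themselves: a name arriving through the "osx." handler is rejected if the remainder already names a known namespace, so "osx.user.foo" can no longer alias "user.foo". A runnable userspace analogue of the guard follows; the prefix list mirrors the kernel's standard xattr namespaces and -1 stands in for -EOPNOTSUPP.

/*
 * Userspace analogue of the namespace guard added above: an "osx."
 * request is refused when the remainder already carries a known
 * prefix, so properly prefixed attributes cannot be smuggled in twice.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool is_known_namespace(const char *name)
{
	return !strncmp(name, "system.", 7) ||
	       !strncmp(name, "user.", 5) ||
	       !strncmp(name, "security.", 9) ||
	       !strncmp(name, "trusted.", 8);
}

static int osx_xattr_check(const char *name)
{
	return is_known_namespace(name) ? -1 /* -EOPNOTSUPP */ : 0;
}

int main(void)
{
	printf("%d\n", osx_xattr_check("com.apple.FinderInfo")); /* 0 */
	printf("%d\n", osx_xattr_check("user.foo"));             /* -1 */
	return 0;
}
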
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 841b5698c0fc..288530cf80b5 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -14,8 +14,6 @@
14extern const struct xattr_handler hfsplus_xattr_osx_handler; 14extern const struct xattr_handler hfsplus_xattr_osx_handler;
15extern const struct xattr_handler hfsplus_xattr_user_handler; 15extern const struct xattr_handler hfsplus_xattr_user_handler;
16extern const struct xattr_handler hfsplus_xattr_trusted_handler; 16extern const struct xattr_handler hfsplus_xattr_trusted_handler;
17extern const struct xattr_handler hfsplus_xattr_acl_access_handler;
18extern const struct xattr_handler hfsplus_xattr_acl_default_handler;
19extern const struct xattr_handler hfsplus_xattr_security_handler; 17extern const struct xattr_handler hfsplus_xattr_security_handler;
20 18
21extern const struct xattr_handler *hfsplus_xattr_handlers[]; 19extern const struct xattr_handler *hfsplus_xattr_handlers[];
@@ -42,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
42 40
43ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); 41ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
44 42
45int hfsplus_removexattr(struct dentry *dentry, const char *name);
46
47int hfsplus_init_security(struct inode *inode, struct inode *dir, 43int hfsplus_init_security(struct inode *inode, struct inode *dir,
48 const struct qstr *qstr); 44 const struct qstr *qstr);
49 45
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index db23ce1bd903..fe649d325b1f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -186,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb)
186 return inode; 186 return inode;
187} 187}
188 188
189int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) 189static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
190{ 190{
191 /* 191 /*
192 * do_statfs uses struct statfs64 internally, but the linux kernel 192 * do_statfs uses struct statfs64 internally, but the linux kernel
@@ -268,7 +268,7 @@ static const struct super_operations hostfs_sbops = {
268 .show_options = hostfs_show_options, 268 .show_options = hostfs_show_options,
269}; 269};
270 270
271int hostfs_readdir(struct file *file, struct dir_context *ctx) 271static int hostfs_readdir(struct file *file, struct dir_context *ctx)
272{ 272{
273 void *dir; 273 void *dir;
274 char *name; 274 char *name;
@@ -293,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx)
293 return 0; 293 return 0;
294} 294}
295 295
296int hostfs_file_open(struct inode *ino, struct file *file) 296static int hostfs_file_open(struct inode *ino, struct file *file)
297{ 297{
298 static DEFINE_MUTEX(open_mutex); 298 static DEFINE_MUTEX(open_mutex);
299 char *name; 299 char *name;
@@ -359,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file)
359 return 0; 359 return 0;
360} 360}
361 361
362int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 362static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
363 int datasync)
363{ 364{
364 struct inode *inode = file->f_mapping->host; 365 struct inode *inode = file->f_mapping->host;
365 int ret; 366 int ret;
@@ -394,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = {
394 .read = generic_read_dir, 395 .read = generic_read_dir,
395}; 396};
396 397
397int hostfs_writepage(struct page *page, struct writeback_control *wbc) 398static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
398{ 399{
399 struct address_space *mapping = page->mapping; 400 struct address_space *mapping = page->mapping;
400 struct inode *inode = mapping->host; 401 struct inode *inode = mapping->host;
@@ -430,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc)
430 return err; 431 return err;
431} 432}
432 433
433int hostfs_readpage(struct file *file, struct page *page) 434static int hostfs_readpage(struct file *file, struct page *page)
434{ 435{
435 char *buffer; 436 char *buffer;
436 long long start; 437 long long start;
@@ -455,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page)
455 return err; 456 return err;
456} 457}
457 458
458int hostfs_write_begin(struct file *file, struct address_space *mapping, 459static int hostfs_write_begin(struct file *file, struct address_space *mapping,
459 loff_t pos, unsigned len, unsigned flags, 460 loff_t pos, unsigned len, unsigned flags,
460 struct page **pagep, void **fsdata) 461 struct page **pagep, void **fsdata)
461{ 462{
462 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 463 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
463 464
@@ -467,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
467 return 0; 468 return 0;
468} 469}
469 470
470int hostfs_write_end(struct file *file, struct address_space *mapping, 471static int hostfs_write_end(struct file *file, struct address_space *mapping,
471 loff_t pos, unsigned len, unsigned copied, 472 loff_t pos, unsigned len, unsigned copied,
472 struct page *page, void *fsdata) 473 struct page *page, void *fsdata)
473{ 474{
474 struct inode *inode = mapping->host; 475 struct inode *inode = mapping->host;
475 void *buffer; 476 void *buffer;
@@ -549,8 +550,8 @@ static int read_name(struct inode *ino, char *name)
549 return 0; 550 return 0;
550} 551}
551 552
552int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 553static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
553 bool excl) 554 bool excl)
554{ 555{
555 struct inode *inode; 556 struct inode *inode;
556 char *name; 557 char *name;
@@ -591,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
591 return error; 592 return error;
592} 593}
593 594
594struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, 595static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
595 unsigned int flags) 596 unsigned int flags)
596{ 597{
597 struct inode *inode; 598 struct inode *inode;
598 char *name; 599 char *name;
@@ -628,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
628 return ERR_PTR(err); 629 return ERR_PTR(err);
629} 630}
630 631
631int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) 632static int hostfs_link(struct dentry *to, struct inode *ino,
633 struct dentry *from)
632{ 634{
633 char *from_name, *to_name; 635 char *from_name, *to_name;
634 int err; 636 int err;
@@ -646,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
646 return err; 648 return err;
647} 649}
648 650
649int hostfs_unlink(struct inode *ino, struct dentry *dentry) 651static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
650{ 652{
651 char *file; 653 char *file;
652 int err; 654 int err;
@@ -662,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
662 return err; 664 return err;
663} 665}
664 666
665int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) 667static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
668 const char *to)
666{ 669{
667 char *file; 670 char *file;
668 int err; 671 int err;
@@ -674,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
674 return err; 677 return err;
675} 678}
676 679
677int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) 680static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
678{ 681{
679 char *file; 682 char *file;
680 int err; 683 int err;
@@ -686,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
686 return err; 689 return err;
687} 690}
688 691
689int hostfs_rmdir(struct inode *ino, struct dentry *dentry) 692static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
690{ 693{
691 char *file; 694 char *file;
692 int err; 695 int err;
@@ -738,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
738 return err; 741 return err;
739} 742}
740 743
741int hostfs_rename(struct inode *from_ino, struct dentry *from, 744static int hostfs_rename(struct inode *from_ino, struct dentry *from,
742 struct inode *to_ino, struct dentry *to) 745 struct inode *to_ino, struct dentry *to)
743{ 746{
744 char *from_name, *to_name; 747 char *from_name, *to_name;
745 int err; 748 int err;
@@ -756,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
756 return err; 759 return err;
757} 760}
758 761
759int hostfs_permission(struct inode *ino, int desired) 762static int hostfs_permission(struct inode *ino, int desired)
760{ 763{
761 char *name; 764 char *name;
762 int r = 0, w = 0, x = 0, err; 765 int r = 0, w = 0, x = 0, err;
@@ -782,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired)
782 return err; 785 return err;
783} 786}
784 787
785int hostfs_setattr(struct dentry *dentry, struct iattr *attr) 788static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
786{ 789{
787 struct inode *inode = dentry->d_inode; 790 struct inode *inode = dentry->d_inode;
788 struct hostfs_iattr attrs; 791 struct hostfs_iattr attrs;
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index cdb84a838068..58b5106186d0 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -8,6 +8,58 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static void hpfs_claim_alloc(struct super_block *s, secno sec)
12{
13 struct hpfs_sb_info *sbi = hpfs_sb(s);
14 if (sbi->sb_n_free != (unsigned)-1) {
15 if (unlikely(!sbi->sb_n_free)) {
16 hpfs_error(s, "free count underflow, allocating sector %08x", sec);
17 sbi->sb_n_free = -1;
18 return;
19 }
20 sbi->sb_n_free--;
21 }
22}
23
24static void hpfs_claim_free(struct super_block *s, secno sec)
25{
26 struct hpfs_sb_info *sbi = hpfs_sb(s);
27 if (sbi->sb_n_free != (unsigned)-1) {
28 if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) {
29 hpfs_error(s, "free count overflow, freeing sector %08x", sec);
30 sbi->sb_n_free = -1;
31 return;
32 }
33 sbi->sb_n_free++;
34 }
35}
36
37static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec)
38{
39 struct hpfs_sb_info *sbi = hpfs_sb(s);
40 if (sbi->sb_n_free_dnodes != (unsigned)-1) {
41 if (unlikely(!sbi->sb_n_free_dnodes)) {
42 hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec);
43 sbi->sb_n_free_dnodes = -1;
44 return;
45 }
46 sbi->sb_n_free_dnodes--;
47 }
48}
49
50static void hpfs_claim_dirband_free(struct super_block *s, secno sec)
51{
52 struct hpfs_sb_info *sbi = hpfs_sb(s);
53 if (sbi->sb_n_free_dnodes != (unsigned)-1) {
54 if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) {
55 hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec);
56 sbi->sb_n_free_dnodes = -1;
57 return;
58 }
59 sbi->sb_n_free_dnodes++;
60 }
61}
62
11/* 63/*
12 * Check if a sector is allocated in bitmap 64 * Check if a sector is allocated in bitmap
13 * This is really slow. Turned on only if chk==2 65 * This is really slow. Turned on only if chk==2
@@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa
203 } 255 }
204 sec = 0; 256 sec = 0;
205 ret: 257 ret:
258 if (sec) {
259 i = 0;
260 do
261 hpfs_claim_alloc(s, sec + i);
262 while (unlikely(++i < n));
263 }
206 if (sec && f_p) { 264 if (sec && f_p) {
207 for (i = 0; i < forward; i++) { 265 for (i = 0; i < forward; i++) {
208 if (!hpfs_alloc_if_possible(s, sec + i + 1)) { 266 if (!hpfs_alloc_if_possible(s, sec + n + i)) {
209 hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); 267 hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i);
210 sec = 0; 268 sec = 0;
211 break; 269 break;
@@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
228 nr >>= 2; 286 nr >>= 2;
229 sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); 287 sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0);
230 if (!sec) return 0; 288 if (!sec) return 0;
289 hpfs_claim_dirband_alloc(s, sec);
231 return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; 290 return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start;
232} 291}
233 292
@@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
242 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); 301 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
243 hpfs_mark_4buffers_dirty(&qbh); 302 hpfs_mark_4buffers_dirty(&qbh);
244 hpfs_brelse4(&qbh); 303 hpfs_brelse4(&qbh);
304 hpfs_claim_alloc(s, sec);
245 return 1; 305 return 1;
246 } 306 }
247 hpfs_brelse4(&qbh); 307 hpfs_brelse4(&qbh);
@@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
275 return; 335 return;
276 } 336 }
277 bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); 337 bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f));
338 hpfs_claim_free(s, sec);
278 if (!--n) { 339 if (!--n) {
279 hpfs_mark_4buffers_dirty(&qbh); 340 hpfs_mark_4buffers_dirty(&qbh);
280 hpfs_brelse4(&qbh); 341 hpfs_brelse4(&qbh);
@@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
359 bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); 420 bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f));
360 hpfs_mark_4buffers_dirty(&qbh); 421 hpfs_mark_4buffers_dirty(&qbh);
361 hpfs_brelse4(&qbh); 422 hpfs_brelse4(&qbh);
423 hpfs_claim_dirband_free(s, dno);
362 } 424 }
363} 425}
364 426
@@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near,
366 dnode_secno *dno, struct quad_buffer_head *qbh) 428 dnode_secno *dno, struct quad_buffer_head *qbh)
367{ 429{
368 struct dnode *d; 430 struct dnode *d;
369 if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { 431 if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) {
370 if (!(*dno = alloc_in_dirband(s, near))) 432 if (!(*dno = alloc_in_dirband(s, near)))
371 if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; 433 if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL;
372 } else { 434 } else {
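
The new hpfs_claim_*() helpers above keep the cached free counts honest: (unsigned)-1 means "count unknown", and any impossible transition (decrementing zero, growing past the filesystem size) poisons the cache back to unknown instead of letting it wrap. A userspace sketch of that discipline follows; the recount body is a stand-in for the real bitmap scan.

/*
 * Userspace sketch of the cached-counter discipline above: UINT_MAX is
 * the "count unknown" sentinel, and an impossible decrement invalidates
 * the cache rather than corrupting it.
 */
#include <limits.h>
#include <stdio.h>

#define COUNT_UNKNOWN UINT_MAX

static unsigned n_free = COUNT_UNKNOWN;

static void claim_alloc(void)
{
	if (n_free == COUNT_UNKNOWN)
		return;			/* nothing cached to maintain */
	if (n_free == 0) {
		fprintf(stderr, "free count underflow\n");
		n_free = COUNT_UNKNOWN;	/* force a recount later */
		return;
	}
	n_free--;
}

static unsigned get_free(void)
{
	if (n_free == COUNT_UNKNOWN)
		n_free = 100;		/* stand-in for a bitmap recount */
	return n_free;
}

int main(void)
{
	printf("%u\n", get_free());	/* 100: recounted on demand */
	claim_alloc();
	printf("%u\n", get_free());	/* 99: cache maintained */
	return 0;
}
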
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 4d0a1afa058c..139ef1684d07 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head
86void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, 86void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh,
87 int ahead) 87 int ahead)
88{ 88{
89 struct buffer_head *bh;
90 char *data; 89 char *data;
91 90
92 hpfs_lock_assert(s); 91 hpfs_lock_assert(s);
@@ -100,34 +99,32 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
100 99
101 hpfs_prefetch_sectors(s, secno, 4 + ahead); 100 hpfs_prefetch_sectors(s, secno, 4 + ahead);
102 101
102 if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0;
103 if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1;
104 if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2;
105 if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3;
106
107 if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
108 likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
109 likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
110 return qbh->data = qbh->bh[0]->b_data;
111 }
112
103 qbh->data = data = kmalloc(2048, GFP_NOFS); 113 qbh->data = data = kmalloc(2048, GFP_NOFS);
104 if (!data) { 114 if (!data) {
105 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 115 printk("HPFS: hpfs_map_4sectors: out of memory\n");
106 goto bail; 116 goto bail4;
107 } 117 }
108 118
109 qbh->bh[0] = bh = sb_bread(s, secno); 119 memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512);
110 if (!bh) 120 memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512);
111 goto bail0; 121 memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512);
112 memcpy(data, bh->b_data, 512); 122 memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512);
113
114 qbh->bh[1] = bh = sb_bread(s, secno + 1);
115 if (!bh)
116 goto bail1;
117 memcpy(data + 512, bh->b_data, 512);
118
119 qbh->bh[2] = bh = sb_bread(s, secno + 2);
120 if (!bh)
121 goto bail2;
122 memcpy(data + 2 * 512, bh->b_data, 512);
123
124 qbh->bh[3] = bh = sb_bread(s, secno + 3);
125 if (!bh)
126 goto bail3;
127 memcpy(data + 3 * 512, bh->b_data, 512);
128 123
129 return data; 124 return data;
130 125
126 bail4:
127 brelse(qbh->bh[3]);
131 bail3: 128 bail3:
132 brelse(qbh->bh[2]); 129 brelse(qbh->bh[2]);
133 bail2: 130 bail2:
@@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
135 bail1: 132 bail1:
136 brelse(qbh->bh[0]); 133 brelse(qbh->bh[0]);
137 bail0: 134 bail0:
138 kfree(data);
139 printk("HPFS: hpfs_map_4sectors: read error\n");
140 bail:
141 return NULL; 135 return NULL;
142} 136}
143 137
@@ -155,44 +149,54 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno,
155 return NULL; 149 return NULL;
156 } 150 }
157 151
158 /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ 152 if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0;
153 if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1;
154 if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2;
155 if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3;
156
157 if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
158 likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
159 likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
160 return qbh->data = qbh->bh[0]->b_data;
161 }
162
159 if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { 163 if (!(qbh->data = kmalloc(2048, GFP_NOFS))) {
160 printk("HPFS: hpfs_get_4sectors: out of memory\n"); 164 printk("HPFS: hpfs_get_4sectors: out of memory\n");
161 return NULL; 165 goto bail4;
162 } 166 }
163 if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0;
164 if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1;
165 if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2;
166 if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3;
167 memcpy(qbh->data, qbh->bh[0]->b_data, 512);
168 memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512);
169 memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512);
170 memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512);
171 return qbh->data; 167 return qbh->data;
172 168
173 bail3: brelse(qbh->bh[2]); 169bail4:
174 bail2: brelse(qbh->bh[1]); 170 brelse(qbh->bh[3]);
175 bail1: brelse(qbh->bh[0]); 171bail3:
176 bail0: 172 brelse(qbh->bh[2]);
173bail2:
174 brelse(qbh->bh[1]);
175bail1:
176 brelse(qbh->bh[0]);
177bail0:
177 return NULL; 178 return NULL;
178} 179}
179 180
180 181
181void hpfs_brelse4(struct quad_buffer_head *qbh) 182void hpfs_brelse4(struct quad_buffer_head *qbh)
182{ 183{
183 brelse(qbh->bh[3]); 184 if (unlikely(qbh->data != qbh->bh[0]->b_data))
184 brelse(qbh->bh[2]); 185 kfree(qbh->data);
185 brelse(qbh->bh[1]);
186 brelse(qbh->bh[0]); 186 brelse(qbh->bh[0]);
187 kfree(qbh->data); 187 brelse(qbh->bh[1]);
188 brelse(qbh->bh[2]);
189 brelse(qbh->bh[3]);
188} 190}
189 191
190void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) 192void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh)
191{ 193{
192 memcpy(qbh->bh[0]->b_data, qbh->data, 512); 194 if (unlikely(qbh->data != qbh->bh[0]->b_data)) {
193 memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); 195 memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512);
194 memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); 196 memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512);
195 memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); 197 memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512);
198 memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512);
199 }
196 mark_buffer_dirty(qbh->bh[0]); 200 mark_buffer_dirty(qbh->bh[0]);
197 mark_buffer_dirty(qbh->bh[1]); 201 mark_buffer_dirty(qbh->bh[1]);
198 mark_buffer_dirty(qbh->bh[2]); 202 mark_buffer_dirty(qbh->bh[2]);
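
Both hpfs_map_4sectors() and hpfs_get_4sectors() now test whether the four 512-byte buffer heads landed back to back in memory; if so they expose bh[0]->b_data directly and skip the 2 KiB bounce buffer, and hpfs_brelse4()/hpfs_mark_4buffers_dirty() detect the same case by comparing qbh->data against bh[0]->b_data. A runnable userspace sketch of the fast-path test follows; struct quad is a hypothetical stand-in for quad_buffer_head.

/*
 * Userspace sketch of the contiguity fast path above: when four
 * 512-byte pieces are already adjacent, hand them out in place;
 * otherwise fall back to an allocated bounce buffer and copy.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct quad {
	char *piece[4];		/* stand-ins for bh[i]->b_data */
	char *data;		/* what callers actually read */
};

static char *map4(struct quad *q)
{
	if (q->piece[1] == q->piece[0] + 1 * 512 &&
	    q->piece[2] == q->piece[0] + 2 * 512 &&
	    q->piece[3] == q->piece[0] + 3 * 512)
		return q->data = q->piece[0];	/* zero-copy fast path */

	q->data = malloc(2048);
	if (!q->data)
		return NULL;
	for (int i = 0; i < 4; i++)
		memcpy(q->data + i * 512, q->piece[i], 512);
	return q->data;
}

static void release4(struct quad *q)
{
	if (q->data != q->piece[0])	/* only the slow path allocated */
		free(q->data);
}

int main(void)
{
	static char contiguous[2048];
	struct quad q = { { contiguous, contiguous + 512,
			    contiguous + 1024, contiguous + 1536 } };

	printf("fast path: %s\n", map4(&q) == contiguous ? "yes" : "no");
	release4(&q);
	return 0;
}
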
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 6797bf80f6e2..3ba49c080e42 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -312,7 +312,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
312__printf(2, 3) 312__printf(2, 3)
313void hpfs_error(struct super_block *, const char *, ...); 313void hpfs_error(struct super_block *, const char *, ...);
314int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); 314int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
315unsigned hpfs_count_one_bitmap(struct super_block *, secno); 315unsigned hpfs_get_free_dnodes(struct super_block *);
316 316
317/* 317/*
318 * local time (HPFS) to GMT (Unix) 318 * local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b8d01ef6f531..4534ff688b76 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ static void hpfs_put_super(struct super_block *s)
121 call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); 121 call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi);
122} 122}
123 123
124unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 124static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
125{ 125{
126 struct quad_buffer_head qbh; 126 struct quad_buffer_head qbh;
127 unsigned long *bits; 127 unsigned long *bits;
@@ -129,7 +129,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
129 129
130 bits = hpfs_map_4sectors(s, secno, &qbh, 0); 130 bits = hpfs_map_4sectors(s, secno, &qbh, 0);
131 if (!bits) 131 if (!bits)
132 return 0; 132 return (unsigned)-1;
133 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); 133 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
134 hpfs_brelse4(&qbh); 134 hpfs_brelse4(&qbh);
135 return count; 135 return count;
@@ -144,30 +144,45 @@ static unsigned count_bitmaps(struct super_block *s)
144 hpfs_prefetch_bitmap(s, n); 144 hpfs_prefetch_bitmap(s, n);
145 } 145 }
146 for (n = 0; n < n_bands; n++) { 146 for (n = 0; n < n_bands; n++) {
147 unsigned c;
147 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); 148 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
148 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); 149 c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
150 if (c != (unsigned)-1)
151 count += c;
149 } 152 }
150 return count; 153 return count;
151} 154}
152 155
156unsigned hpfs_get_free_dnodes(struct super_block *s)
157{
158 struct hpfs_sb_info *sbi = hpfs_sb(s);
159 if (sbi->sb_n_free_dnodes == (unsigned)-1) {
160 unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap);
161 if (c == (unsigned)-1)
162 return 0;
163 sbi->sb_n_free_dnodes = c;
164 }
165 return sbi->sb_n_free_dnodes;
166}
167
153static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) 168static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
154{ 169{
155 struct super_block *s = dentry->d_sb; 170 struct super_block *s = dentry->d_sb;
156 struct hpfs_sb_info *sbi = hpfs_sb(s); 171 struct hpfs_sb_info *sbi = hpfs_sb(s);
157 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 172 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
173
158 hpfs_lock(s); 174 hpfs_lock(s);
159 175
160 /*if (sbi->sb_n_free == -1) {*/ 176 if (sbi->sb_n_free == (unsigned)-1)
161 sbi->sb_n_free = count_bitmaps(s); 177 sbi->sb_n_free = count_bitmaps(s);
162 sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); 178
163 /*}*/
164 buf->f_type = s->s_magic; 179 buf->f_type = s->s_magic;
165 buf->f_bsize = 512; 180 buf->f_bsize = 512;
166 buf->f_blocks = sbi->sb_fs_size; 181 buf->f_blocks = sbi->sb_fs_size;
167 buf->f_bfree = sbi->sb_n_free; 182 buf->f_bfree = sbi->sb_n_free;
168 buf->f_bavail = sbi->sb_n_free; 183 buf->f_bavail = sbi->sb_n_free;
169 buf->f_files = sbi->sb_dirband_size / 4; 184 buf->f_files = sbi->sb_dirband_size / 4;
170 buf->f_ffree = sbi->sb_n_free_dnodes; 185 buf->f_ffree = hpfs_get_free_dnodes(s);
171 buf->f_fsid.val[0] = (u32)id; 186 buf->f_fsid.val[0] = (u32)id;
172 buf->f_fsid.val[1] = (u32)(id >> 32); 187 buf->f_fsid.val[1] = (u32)(id >> 32);
173 buf->f_namelen = 254; 188 buf->f_namelen = 254;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2d04f9afafd7..06fe11e0abfa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
573#ifdef CONFIG_JBD_DEBUG 573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) { 575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_EMERG 576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n", 577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid); 578 __func__, journal->j_commit_request, tid);
579 } 579 }
@@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
604out_unlock: 604out_unlock:
605 spin_unlock(&journal->j_state_lock); 605 spin_unlock(&journal->j_state_lock);
606 606
607 if (unlikely(is_journal_aborted(journal))) { 607 if (unlikely(is_journal_aborted(journal)))
608 printk(KERN_EMERG "journal commit I/O error\n");
609 err = -EIO; 608 err = -EIO;
610 }
611 return err; 609 return err;
612} 610}
613 611
@@ -2136,7 +2134,7 @@ static void __exit journal_exit(void)
2136#ifdef CONFIG_JBD_DEBUG 2134#ifdef CONFIG_JBD_DEBUG
2137 int n = atomic_read(&nr_journal_heads); 2135 int n = atomic_read(&nr_journal_heads);
2138 if (n) 2136 if (n)
2139 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2137 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2140#endif 2138#endif
2141 jbd_remove_debugfs_entry(); 2139 jbd_remove_debugfs_entry();
2142 journal_destroy_caches(); 2140 journal_destroy_caches();
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index aa603e017d22..1695ba8334a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -675,7 +675,7 @@ repeat:
675 jbd_alloc(jh2bh(jh)->b_size, 675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS); 676 GFP_NOFS);
677 if (!frozen_buffer) { 677 if (!frozen_buffer) {
678 printk(KERN_EMERG 678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n", 679 "%s: OOM for frozen_buffer\n",
680 __func__); 680 __func__);
681 JBUFFER_TRACE(jh, "oom!"); 681 JBUFFER_TRACE(jh, "oom!");
@@ -898,7 +898,7 @@ repeat:
898 if (!jh->b_committed_data) { 898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); 899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) { 900 if (!committed_data) {
901 printk(KERN_EMERG "%s: No memory for committed data\n", 901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__); 902 __func__);
903 err = -ENOMEM; 903 err = -ENOMEM;
904 goto out; 904 goto out;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8360674c85bc..60bb365f54a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -514,11 +514,13 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
514 * similarly constrained call sites 514 * similarly constrained call sites
515 */ 515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS); 516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0) 517 if (ret < 0) {
518 jbd2_journal_free_reserved(handle); 518 jbd2_journal_free_reserved(handle);
519 return ret;
520 }
519 handle->h_type = type; 521 handle->h_type = type;
520 handle->h_line_no = line_no; 522 handle->h_line_no = line_no;
521 return ret; 523 return 0;
522} 524}
523EXPORT_SYMBOL(jbd2_journal_start_reserved); 525EXPORT_SYMBOL(jbd2_journal_start_reserved);
524 526
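
The jbd2 hunk fixes a use-after-free class of bug: once start_this_handle() fails, jbd2_journal_free_reserved() has torn the handle down, so the function must return at once instead of falling through and writing h_type/h_line_no into freed memory. A compact userspace sketch of the corrected control flow follows; the types and failure are contrived for illustration.

/*
 * Userspace sketch of the control-flow bug fixed above: after the
 * resource is freed on failure, any fall-through that still touches
 * it is a use-after-free, so the error path must return immediately.
 */
#include <stdlib.h>

struct handle { int type; };

static int start(struct handle *h) { (void)h; return -1; /* contrived */ }

static int start_reserved(struct handle *h, int type)
{
	int ret = start(h);

	if (ret < 0) {
		free(h);	/* handle is gone ... */
		return ret;	/* ... so bail out before the writes below */
	}
	h->type = type;
	return 0;
}

int main(void)
{
	struct handle *h = malloc(sizeof(*h));

	if (!h)
		return 1;
	return start_reserved(h, 1) ? 1 : 0;
}
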
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 223283c30111..009ec0b5993d 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
178 char *value = NULL; 178 char *value = NULL;
179 int rc, xprefix; 179 int rc, xprefix;
180 180
181 acl = get_cached_acl(inode, type);
182 if (acl != ACL_NOT_CACHED)
183 return acl;
184
185 switch (type) { 181 switch (type) {
186 case ACL_TYPE_ACCESS: 182 case ACL_TYPE_ACCESS:
187 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 183 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
232 return rc; 228 return rc;
233} 229}
234 230
235static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) 231int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
236{ 232{
237 int rc, xprefix; 233 int rc, xprefix;
238 234
239 if (S_ISLNK(inode->i_mode))
240 return -EOPNOTSUPP;
241
242 switch (type) { 235 switch (type) {
243 case ACL_TYPE_ACCESS: 236 case ACL_TYPE_ACCESS:
244 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 237 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
277 270
278int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) 271int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
279{ 272{
280 struct posix_acl *acl; 273 struct posix_acl *default_acl, *acl;
281 int rc; 274 int rc;
282 275
283 cache_no_acl(inode); 276 cache_no_acl(inode);
284 277
285 if (S_ISLNK(*i_mode)) 278 rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl);
286 return 0; /* Symlink always has no-ACL */ 279 if (rc)
287 280 return rc;
288 acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT);
289 if (IS_ERR(acl))
290 return PTR_ERR(acl);
291
292 if (!acl) {
293 *i_mode &= ~current_umask();
294 } else {
295 if (S_ISDIR(*i_mode))
296 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
297
298 rc = posix_acl_create(&acl, GFP_KERNEL, i_mode);
299 if (rc < 0)
300 return rc;
301 if (rc > 0)
302 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
303 281
282 if (default_acl) {
283 set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
284 posix_acl_release(default_acl);
285 }
286 if (acl) {
287 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
304 posix_acl_release(acl); 288 posix_acl_release(acl);
305 } 289 }
306 return 0; 290 return 0;
@@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode)
324 308
325 return 0; 309 return 0;
326} 310}
327
328int jffs2_acl_chmod(struct inode *inode)
329{
330 struct posix_acl *acl;
331 int rc;
332
333 if (S_ISLNK(inode->i_mode))
334 return -EOPNOTSUPP;
335 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
336 if (IS_ERR(acl) || !acl)
337 return PTR_ERR(acl);
338 rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
339 if (rc)
340 return rc;
341 rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl);
342 posix_acl_release(acl);
343 return rc;
344}
345
346static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
347 size_t list_size, const char *name, size_t name_len, int type)
348{
349 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
350
351 if (list && retlen <= list_size)
352 strcpy(list, POSIX_ACL_XATTR_ACCESS);
353 return retlen;
354}
355
356static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
357 size_t list_size, const char *name, size_t name_len, int type)
358{
359 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
360
361 if (list && retlen <= list_size)
362 strcpy(list, POSIX_ACL_XATTR_DEFAULT);
363 return retlen;
364}
365
366static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
367 void *buffer, size_t size, int type)
368{
369 struct posix_acl *acl;
370 int rc;
371
372 if (name[0] != '\0')
373 return -EINVAL;
374
375 acl = jffs2_get_acl(dentry->d_inode, type);
376 if (IS_ERR(acl))
377 return PTR_ERR(acl);
378 if (!acl)
379 return -ENODATA;
380 rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
381 posix_acl_release(acl);
382
383 return rc;
384}
385
386static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
387 const void *value, size_t size, int flags, int type)
388{
389 struct posix_acl *acl;
390 int rc;
391
392 if (name[0] != '\0')
393 return -EINVAL;
394 if (!inode_owner_or_capable(dentry->d_inode))
395 return -EPERM;
396
397 if (value) {
398 acl = posix_acl_from_xattr(&init_user_ns, value, size);
399 if (IS_ERR(acl))
400 return PTR_ERR(acl);
401 if (acl) {
402 rc = posix_acl_valid(acl);
403 if (rc)
404 goto out;
405 }
406 } else {
407 acl = NULL;
408 }
409 rc = jffs2_set_acl(dentry->d_inode, type, acl);
410 out:
411 posix_acl_release(acl);
412 return rc;
413}
414
415const struct xattr_handler jffs2_acl_access_xattr_handler = {
416 .prefix = POSIX_ACL_XATTR_ACCESS,
417 .flags = ACL_TYPE_DEFAULT,
418 .list = jffs2_acl_access_listxattr,
419 .get = jffs2_acl_getxattr,
420 .set = jffs2_acl_setxattr,
421};
422
423const struct xattr_handler jffs2_acl_default_xattr_handler = {
424 .prefix = POSIX_ACL_XATTR_DEFAULT,
425 .flags = ACL_TYPE_DEFAULT,
426 .list = jffs2_acl_default_listxattr,
427 .get = jffs2_acl_getxattr,
428 .set = jffs2_acl_setxattr,
429};
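
With jffs2_set_acl() made public and the per-filesystem ACL xattr handlers deleted, the generic layer takes over: the shared posix_acl_{access,default}_xattr_handler entries translate system.posix_acl_* xattr calls into the inode's .get_acl/.set_acl methods. Below is a sketch of the wiring the jffs2 and hfsplus changes both converge on; kernel context is assumed and the myfs_* names are hypothetical stand-ins.

/*
 * Sketch of the v3.14 ACL wiring adopted above; kernel context
 * assumed, myfs_* are hypothetical. The generic xattr handlers route
 * system.posix_acl_{access,default} through .get_acl/.set_acl, so the
 * filesystem keeps no ACL xattr code of its own.
 */
struct posix_acl *myfs_get_acl(struct inode *inode, int type);
int myfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

const struct xattr_handler *myfs_xattr_handlers[] = {
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
	NULL,
};

const struct inode_operations myfs_file_inode_operations = {
	.get_acl	= myfs_get_acl,
	.set_acl	= myfs_set_acl,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.removexattr	= generic_removexattr,
};
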
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9b477246f2a6..2e2b5745c3b7 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -27,17 +27,14 @@ struct jffs2_acl_header {
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29struct posix_acl *jffs2_get_acl(struct inode *inode, int type); 29struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
30extern int jffs2_acl_chmod(struct inode *); 30int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36
37#else 34#else
38 35
39#define jffs2_get_acl (NULL) 36#define jffs2_get_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 37#define jffs2_set_acl (NULL)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 38#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 39#define jffs2_init_acl_post(inode) (0)
43 40
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index e3aac222472e..938556025d64 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations =
59 .mknod = jffs2_mknod, 59 .mknod = jffs2_mknod,
60 .rename = jffs2_rename, 60 .rename = jffs2_rename,
61 .get_acl = jffs2_get_acl, 61 .get_acl = jffs2_get_acl,
62 .set_acl = jffs2_set_acl,
62 .setattr = jffs2_setattr, 63 .setattr = jffs2_setattr,
63 .setxattr = jffs2_setxattr, 64 .setxattr = jffs2_setxattr,
64 .getxattr = jffs2_getxattr, 65 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 1506673c087e..256cd19a3b78 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations =
66const struct inode_operations jffs2_file_inode_operations = 66const struct inode_operations jffs2_file_inode_operations =
67{ 67{
68 .get_acl = jffs2_get_acl, 68 .get_acl = jffs2_get_acl,
69 .set_acl = jffs2_set_acl,
69 .setattr = jffs2_setattr, 70 .setattr = jffs2_setattr,
70 .setxattr = jffs2_setxattr, 71 .setxattr = jffs2_setxattr,
71 .getxattr = jffs2_getxattr, 72 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09b3ed455724..a69e426435dd 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
190 190
191int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) 191int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
192{ 192{
193 struct inode *inode = dentry->d_inode;
193 int rc; 194 int rc;
194 195
195 rc = inode_change_ok(dentry->d_inode, iattr); 196 rc = inode_change_ok(inode, iattr);
196 if (rc) 197 if (rc)
197 return rc; 198 return rc;
198 199
199 rc = jffs2_do_setattr(dentry->d_inode, iattr); 200 rc = jffs2_do_setattr(inode, iattr);
200 if (!rc && (iattr->ia_valid & ATTR_MODE)) 201 if (!rc && (iattr->ia_valid & ATTR_MODE))
201 rc = jffs2_acl_chmod(dentry->d_inode); 202 rc = posix_acl_chmod(inode, inode->i_mode);
202 203
203 return rc; 204 return rc;
204} 205}
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 4f47aa24b556..b8fd651307a4 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
288 struct jffs2_xattr_datum *xd; 288 struct jffs2_xattr_datum *xd;
289 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); 289 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
290 dbg_memalloc("%p\n", xd); 290 dbg_memalloc("%p\n", xd);
291 if (!xd)
292 return NULL;
291 293
292 xd->class = RAWNODE_CLASS_XATTR_DATUM; 294 xd->class = RAWNODE_CLASS_XATTR_DATUM;
293 xd->node = (void *)xd; 295 xd->node = (void *)xd;
@@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
306 struct jffs2_xattr_ref *ref; 308 struct jffs2_xattr_ref *ref;
307 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); 309 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
308 dbg_memalloc("%p\n", ref); 310 dbg_memalloc("%p\n", ref);
311 if (!ref)
312 return NULL;
309 313
310 ref->class = RAWNODE_CLASS_XATTR_REF; 314 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 315 ref->node = (void *)ref;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 975a1f562c10..9a5449bc3afb 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_
564 they're killed. */ 564 they're killed. */
565void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) 565void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
566{ 566{
567 struct jffs2_node_frag *frag; 567 struct jffs2_node_frag *frag, *next;
568 struct jffs2_node_frag *parent;
569
570 if (!root->rb_node)
571 return;
572 568
573 dbg_fragtree("killing\n"); 569 dbg_fragtree("killing\n");
574 570 rbtree_postorder_for_each_entry_safe(frag, next, root, rb) {
575 frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb));
576 while(frag) {
577 if (frag->rb.rb_left) {
578 frag = frag_left(frag);
579 continue;
580 }
581 if (frag->rb.rb_right) {
582 frag = frag_right(frag);
583 continue;
584 }
585
586 if (frag->node && !(--frag->node->frags)) { 571 if (frag->node && !(--frag->node->frags)) {
587 /* Not a hole, and it's the final remaining frag 572 /* Not a hole, and it's the final remaining frag
588 of this node. Free the node */ 573 of this node. Free the node */
@@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
591 576
592 jffs2_free_full_dnode(frag->node); 577 jffs2_free_full_dnode(frag->node);
593 } 578 }
594 parent = frag_parent(frag);
595 if (parent) {
596 if (frag_left(parent) == frag)
597 parent->rb.rb_left = NULL;
598 else
599 parent->rb.rb_right = NULL;
600 }
601 579
602 jffs2_free_node_frag(frag); 580 jffs2_free_node_frag(frag);
603 frag = parent;
604
605 cond_resched(); 581 cond_resched();
606 } 582 }
607} 583}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ae81b01e6fd7..386303dca382 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
543 543
544static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) 544static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
545{ 545{
546 struct rb_node *this; 546 struct jffs2_tmp_dnode_info *tn, *next;
547 struct jffs2_tmp_dnode_info *tn;
548
549 this = list->rb_node;
550 547
551 /* Now at bottom of tree */ 548 rbtree_postorder_for_each_entry_safe(tn, next, list, rb) {
552 while (this) {
553 if (this->rb_left)
554 this = this->rb_left;
555 else if (this->rb_right)
556 this = this->rb_right;
557 else {
558 tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb);
559 jffs2_free_full_dnode(tn->fn); 549 jffs2_free_full_dnode(tn->fn);
560 jffs2_free_tmp_dnode_info(tn); 550 jffs2_free_tmp_dnode_info(tn);
561
562 this = rb_parent(this);
563 if (!this)
564 break;
565
566 if (this->rb_left == &tn->rb)
567 this->rb_left = NULL;
568 else if (this->rb_right == &tn->rb)
569 this->rb_right = NULL;
570 else BUG();
571 }
572 } 551 }
552
573 *list = RB_ROOT; 553 *list = RB_ROOT;
574} 554}
575 555
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 6e563332bb24..c7c77b0dfccd 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations =
22{ 22{
23 .readlink = generic_readlink, 23 .readlink = generic_readlink,
24 .follow_link = jffs2_follow_link, 24 .follow_link = jffs2_follow_link,
25 .get_acl = jffs2_get_acl,
26 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
27 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
28 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3034e970eb9a..ad0f2e2a1700 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -22,6 +22,7 @@
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/jffs2.h> 23#include <linux/jffs2.h>
24#include <linux/xattr.h> 24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
25#include <linux/mtd/mtd.h> 26#include <linux/mtd/mtd.h>
26#include "nodelist.h" 27#include "nodelist.h"
27/* -------- xdatum related functions ---------------- 28/* -------- xdatum related functions ----------------
@@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = {
921 &jffs2_security_xattr_handler, 922 &jffs2_security_xattr_handler,
922#endif 923#endif
923#ifdef CONFIG_JFFS2_FS_POSIX_ACL 924#ifdef CONFIG_JFFS2_FS_POSIX_ACL
924 &jffs2_acl_access_xattr_handler, 925 &posix_acl_access_xattr_handler,
925 &jffs2_acl_default_xattr_handler, 926 &posix_acl_default_xattr_handler,
926#endif 927#endif
927 &jffs2_trusted_xattr_handler, 928 &jffs2_trusted_xattr_handler,
928 NULL 929 NULL
@@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) {
942#endif 943#endif
943#ifdef CONFIG_JFFS2_FS_POSIX_ACL 944#ifdef CONFIG_JFFS2_FS_POSIX_ACL
944 case JFFS2_XPREFIX_ACL_ACCESS: 945 case JFFS2_XPREFIX_ACL_ACCESS:
945 ret = &jffs2_acl_access_xattr_handler; 946 ret = &posix_acl_access_xattr_handler;
946 break; 947 break;
947 case JFFS2_XPREFIX_ACL_DEFAULT: 948 case JFFS2_XPREFIX_ACL_DEFAULT:
948 ret = &jffs2_acl_default_xattr_handler; 949 ret = &posix_acl_default_xattr_handler;
949 break; 950 break;
950#endif 951#endif
951 case JFFS2_XPREFIX_TRUSTED: 952 case JFFS2_XPREFIX_TRUSTED:
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d254d6d35995..5a8ea16eedbc 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
72 return acl; 72 return acl;
73} 73}
74 74
75static int jfs_set_acl(tid_t tid, struct inode *inode, int type, 75static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
76 struct posix_acl *acl) 76 struct posix_acl *acl)
77{ 77{
78 char *ea_name; 78 char *ea_name;
@@ -80,21 +80,24 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
80 int size = 0; 80 int size = 0;
81 char *value = NULL; 81 char *value = NULL;
82 82
83 if (S_ISLNK(inode->i_mode)) 83 switch (type) {
84 return -EOPNOTSUPP; 84 case ACL_TYPE_ACCESS:
85 85 ea_name = POSIX_ACL_XATTR_ACCESS;
86 switch(type) { 86 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
87 case ACL_TYPE_ACCESS: 87 if (rc < 0)
88 ea_name = POSIX_ACL_XATTR_ACCESS; 88 return rc;
89 break; 89 inode->i_ctime = CURRENT_TIME;
90 case ACL_TYPE_DEFAULT: 90 mark_inode_dirty(inode);
91 ea_name = POSIX_ACL_XATTR_DEFAULT; 91 if (rc == 0)
92 if (!S_ISDIR(inode->i_mode)) 92 acl = NULL;
93 return acl ? -EACCES : 0; 93 break;
94 break; 94 case ACL_TYPE_DEFAULT:
95 default: 95 ea_name = POSIX_ACL_XATTR_DEFAULT;
96 return -EINVAL; 96 break;
97 default:
98 return -EINVAL;
97 } 99 }
100
98 if (acl) { 101 if (acl) {
99 size = posix_acl_xattr_size(acl->a_count); 102 size = posix_acl_xattr_size(acl->a_count);
100 value = kmalloc(size, GFP_KERNEL); 103 value = kmalloc(size, GFP_KERNEL);
@@ -114,65 +117,43 @@ out:
114 return rc; 117 return rc;
115} 118}
116 119
120int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
121{
122 int rc;
123 tid_t tid;
124
125 tid = txBegin(inode->i_sb, 0);
126 mutex_lock(&JFS_IP(inode)->commit_mutex);
127 rc = __jfs_set_acl(tid, inode, type, acl);
128 if (!rc)
129 rc = txCommit(tid, 1, &inode, 0);
130 txEnd(tid);
131 mutex_unlock(&JFS_IP(inode)->commit_mutex);
132 return rc;
133}
134
117int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 135int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
118{ 136{
119 struct posix_acl *acl = NULL; 137 struct posix_acl *default_acl, *acl;
120 int rc = 0; 138 int rc = 0;
121 139
122 if (S_ISLNK(inode->i_mode)) 140 rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
123 return 0; 141 if (rc)
142 return rc;
124 143
125 acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); 144 if (default_acl) {
126 if (IS_ERR(acl)) 145 rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
127 return PTR_ERR(acl); 146 posix_acl_release(default_acl);
147 }
128 148
129 if (acl) { 149 if (acl) {
130 if (S_ISDIR(inode->i_mode)) { 150 if (!rc)
131 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); 151 rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
132 if (rc)
133 goto cleanup;
134 }
135 rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
136 if (rc < 0)
137 goto cleanup; /* posix_acl_release(NULL) is no-op */
138 if (rc > 0)
139 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
140cleanup:
141 posix_acl_release(acl); 152 posix_acl_release(acl);
142 } else 153 }
143 inode->i_mode &= ~current_umask();
144 154
145 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 155 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
146 inode->i_mode; 156 inode->i_mode;
147 157
148 return rc; 158 return rc;
149} 159}
150
151int jfs_acl_chmod(struct inode *inode)
152{
153 struct posix_acl *acl;
154 int rc;
155 tid_t tid;
156
157 if (S_ISLNK(inode->i_mode))
158 return -EOPNOTSUPP;
159
160 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
161 if (IS_ERR(acl) || !acl)
162 return PTR_ERR(acl);
163
164 rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
165 if (rc)
166 return rc;
167
168 tid = txBegin(inode->i_sb, 0);
169 mutex_lock(&JFS_IP(inode)->commit_mutex);
170 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
171 if (!rc)
172 rc = txCommit(tid, 1, &inode, 0);
173 txEnd(tid);
174 mutex_unlock(&JFS_IP(inode)->commit_mutex);
175
176 posix_acl_release(acl);
177 return rc;
178}
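
[Editor's note: one subtlety in the ACL_TYPE_ACCESS branch of __jfs_set_acl() above is the posix_acl_equiv_mode() convention, which folds the ACL into i_mode and reports whether anything is left over. In outline:]

	/* sketch of the ACCESS-type convention used above */
	rc = posix_acl_equiv_mode(acl, &inode->i_mode);
	if (rc < 0)
		return rc;	/* malformed ACL */
	if (rc == 0)
		acl = NULL;	/* fully expressible as mode bits: the xattr
				 * is deleted rather than stored */
	/* rc > 0: extra entries remain (e.g. named users), store the xattr */
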
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index dd7442c58358..794da944d5cd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/posix_acl.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include "jfs_incore.h" 24#include "jfs_incore.h"
24#include "jfs_inode.h" 25#include "jfs_inode.h"
@@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
131 mark_inode_dirty(inode); 132 mark_inode_dirty(inode);
132 133
133 if (iattr->ia_valid & ATTR_MODE) 134 if (iattr->ia_valid & ATTR_MODE)
134 rc = jfs_acl_chmod(inode); 135 rc = posix_acl_chmod(inode, inode->i_mode);
135 return rc; 136 return rc;
136} 137}
137 138
@@ -143,6 +144,7 @@ const struct inode_operations jfs_file_inode_operations = {
143 .setattr = jfs_setattr, 144 .setattr = jfs_setattr,
144#ifdef CONFIG_JFS_POSIX_ACL 145#ifdef CONFIG_JFS_POSIX_ACL
145 .get_acl = jfs_get_acl, 146 .get_acl = jfs_get_acl,
147 .set_acl = jfs_set_acl,
146#endif 148#endif
147}; 149};
148 150
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index ad84fe50ca9e..489f993b7b13 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -21,8 +21,8 @@
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23struct posix_acl *jfs_get_acl(struct inode *inode, int type); 23struct posix_acl *jfs_get_acl(struct inode *inode, int type);
24int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 25int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
40#endif 35#endif
41#endif /* _H_JFS_ACL */ 36#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 360d27c48887..8d811e02b4b9 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
1998 1998
1999 bio = bio_alloc(GFP_NOFS, 1); 1999 bio = bio_alloc(GFP_NOFS, 1);
2000 2000
2001 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); 2001 bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
2002 bio->bi_bdev = log->bdev; 2002 bio->bi_bdev = log->bdev;
2003 bio->bi_io_vec[0].bv_page = bp->l_page; 2003 bio->bi_io_vec[0].bv_page = bp->l_page;
2004 bio->bi_io_vec[0].bv_len = LOGPSIZE; 2004 bio->bi_io_vec[0].bv_len = LOGPSIZE;
2005 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2005 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2006 2006
2007 bio->bi_vcnt = 1; 2007 bio->bi_vcnt = 1;
2008 bio->bi_size = LOGPSIZE; 2008 bio->bi_iter.bi_size = LOGPSIZE;
2009 2009
2010 bio->bi_end_io = lbmIODone; 2010 bio->bi_end_io = lbmIODone;
2011 bio->bi_private = bp; 2011 bio->bi_private = bp;
2012 /*check if journaling to disk has been disabled*/ 2012 /*check if journaling to disk has been disabled*/
2013 if (log->no_integrity) { 2013 if (log->no_integrity) {
2014 bio->bi_size = 0; 2014 bio->bi_iter.bi_size = 0;
2015 lbmIODone(bio, 0); 2015 lbmIODone(bio, 0);
2016 } else { 2016 } else {
2017 submit_bio(READ_SYNC, bio); 2017 submit_bio(READ_SYNC, bio);
@@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp)
2144 jfs_info("lbmStartIO\n"); 2144 jfs_info("lbmStartIO\n");
2145 2145
2146 bio = bio_alloc(GFP_NOFS, 1); 2146 bio = bio_alloc(GFP_NOFS, 1);
2147 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); 2147 bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
2148 bio->bi_bdev = log->bdev; 2148 bio->bi_bdev = log->bdev;
2149 bio->bi_io_vec[0].bv_page = bp->l_page; 2149 bio->bi_io_vec[0].bv_page = bp->l_page;
2150 bio->bi_io_vec[0].bv_len = LOGPSIZE; 2150 bio->bi_io_vec[0].bv_len = LOGPSIZE;
2151 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2151 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2152 2152
2153 bio->bi_vcnt = 1; 2153 bio->bi_vcnt = 1;
2154 bio->bi_size = LOGPSIZE; 2154 bio->bi_iter.bi_size = LOGPSIZE;
2155 2155
2156 bio->bi_end_io = lbmIODone; 2156 bio->bi_end_io = lbmIODone;
2157 bio->bi_private = bp; 2157 bio->bi_private = bp;
2158 2158
2159 /* check if journaling to disk has been disabled */ 2159 /* check if journaling to disk has been disabled */
2160 if (log->no_integrity) { 2160 if (log->no_integrity) {
2161 bio->bi_size = 0; 2161 bio->bi_iter.bi_size = 0;
2162 lbmIODone(bio, 0); 2162 lbmIODone(bio, 0);
2163 } else { 2163 } else {
2164 submit_bio(WRITE_SYNC, bio); 2164 submit_bio(WRITE_SYNC, bio);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d165cde0c68d..49ba7ff1bbb9 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
416 * count from hitting zero before we're through 416 * count from hitting zero before we're through
417 */ 417 */
418 inc_io(page); 418 inc_io(page);
419 if (!bio->bi_size) 419 if (!bio->bi_iter.bi_size)
420 goto dump_bio; 420 goto dump_bio;
421 submit_bio(WRITE, bio); 421 submit_bio(WRITE, bio);
422 nr_underway++; 422 nr_underway++;
@@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
438 438
439 bio = bio_alloc(GFP_NOFS, 1); 439 bio = bio_alloc(GFP_NOFS, 1);
440 bio->bi_bdev = inode->i_sb->s_bdev; 440 bio->bi_bdev = inode->i_sb->s_bdev;
441 bio->bi_sector = pblock << (inode->i_blkbits - 9); 441 bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
442 bio->bi_end_io = metapage_write_end_io; 442 bio->bi_end_io = metapage_write_end_io;
443 bio->bi_private = page; 443 bio->bi_private = page;
444 444
@@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
452 if (bio) { 452 if (bio) {
453 if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) 453 if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes)
454 goto add_failed; 454 goto add_failed;
455 if (!bio->bi_size) 455 if (!bio->bi_iter.bi_size)
456 goto dump_bio; 456 goto dump_bio;
457 457
458 submit_bio(WRITE, bio); 458 submit_bio(WRITE, bio);
@@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page)
517 517
518 bio = bio_alloc(GFP_NOFS, 1); 518 bio = bio_alloc(GFP_NOFS, 1);
519 bio->bi_bdev = inode->i_sb->s_bdev; 519 bio->bi_bdev = inode->i_sb->s_bdev;
520 bio->bi_sector = pblock << (inode->i_blkbits - 9); 520 bio->bi_iter.bi_sector =
521 pblock << (inode->i_blkbits - 9);
521 bio->bi_end_io = metapage_read_end_io; 522 bio->bi_end_io = metapage_read_end_io;
522 bio->bi_private = page; 523 bio->bi_private = page;
523 len = xlen << inode->i_blkbits; 524 len = xlen << inode->i_blkbits;
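
[Editor's note: the mechanical renames in jfs_logmgr.c and jfs_metapage.c above come from the 3.14 immutable-biovec series: a bio's current sector and remaining byte count moved from top-level fields into the embedded struct bvec_iter at bio->bi_iter. A minimal sketch of new-style setup, with a hypothetical submit_one_page() helper for illustration only:]

	#include <linux/bio.h>

	static void submit_one_page(struct block_device *bdev, struct page *page,
				    sector_t sector, unsigned int len,
				    bio_end_io_t *end_io, void *private)
	{
		struct bio *bio = bio_alloc(GFP_NOFS, 1);

		/* pre-3.14 this was bio->bi_sector / bio->bi_size */
		bio->bi_iter.bi_sector = sector;
		bio->bi_bdev = bdev;
		bio_add_page(bio, page, len, 0);
		/* bio_add_page() grows bio->bi_iter.bi_size for us */
		bio->bi_end_io = end_io;
		bio->bi_private = private;
		submit_bio(WRITE, bio);
	}
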
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index e9e100fd7c09..e8d717dabca3 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
61extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); 61extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64extern const struct xattr_handler *jfs_xattr_handlers[];
65
64#ifdef CONFIG_JFS_SECURITY 66#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *, 67extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *); 68 const struct qstr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index aa8a3370631b..d59c7defb1ef 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1524 .setattr = jfs_setattr, 1524 .setattr = jfs_setattr,
1525#ifdef CONFIG_JFS_POSIX_ACL 1525#ifdef CONFIG_JFS_POSIX_ACL
1526 .get_acl = jfs_get_acl, 1526 .get_acl = jfs_get_acl,
1527 .set_acl = jfs_set_acl,
1527#endif 1528#endif
1528}; 1529};
1529 1530
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6669aa2042c3..e2b7483444fd 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -44,6 +44,7 @@
44#include "jfs_imap.h" 44#include "jfs_imap.h"
45#include "jfs_acl.h" 45#include "jfs_acl.h"
46#include "jfs_debug.h" 46#include "jfs_debug.h"
47#include "jfs_xattr.h"
47 48
48MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); 49MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
49MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); 50MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
@@ -522,6 +523,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
522 */ 523 */
523 sb->s_op = &jfs_super_operations; 524 sb->s_op = &jfs_super_operations;
524 sb->s_export_op = &jfs_export_operations; 525 sb->s_export_op = &jfs_export_operations;
526 sb->s_xattr = jfs_xattr_handlers;
525#ifdef CONFIG_QUOTA 527#ifdef CONFIG_QUOTA
526 sb->dq_op = &dquot_operations; 528 sb->dq_op = &dquot_operations;
527 sb->s_qcop = &dquot_quotactl_ops; 529 sb->s_qcop = &dquot_quotactl_ops;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index d3472f4cd530..46325d5c34fc 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
666} 666}
667 667
668/* 668/*
669 * can_set_system_xattr
670 *
671 * This code is specific to the system.* namespace. It contains policy
672 * which doesn't belong in the main xattr codepath.
673 */
674static int can_set_system_xattr(struct inode *inode, const char *name,
675 const void *value, size_t value_len)
676{
677#ifdef CONFIG_JFS_POSIX_ACL
678 struct posix_acl *acl;
679 int rc;
680
681 if (!inode_owner_or_capable(inode))
682 return -EPERM;
683
684 /*
685 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
686 */
687 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
688 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
689 if (IS_ERR(acl)) {
690 rc = PTR_ERR(acl);
691 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
692 rc);
693 return rc;
694 }
695 if (acl) {
696 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
697 posix_acl_release(acl);
698 if (rc < 0) {
699 printk(KERN_ERR
700 "posix_acl_equiv_mode returned %d\n",
701 rc);
702 return rc;
703 }
704 mark_inode_dirty(inode);
705 }
706 /*
707 * We're changing the ACL. Get rid of the cached one
708 */
709 forget_cached_acl(inode, ACL_TYPE_ACCESS);
710
711 return 0;
712 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
713 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
714 if (IS_ERR(acl)) {
715 rc = PTR_ERR(acl);
716 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
717 rc);
718 return rc;
719 }
720 posix_acl_release(acl);
721
722 /*
723 * We're changing the default ACL. Get rid of the cached one
724 */
725 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
726
727 return 0;
728 }
729#endif /* CONFIG_JFS_POSIX_ACL */
730 return -EOPNOTSUPP;
731}
732
733/*
734 * Most of the permission checking is done by xattr_permission in the vfs. 669 * Most of the permission checking is done by xattr_permission in the vfs.
735 * The local file system is responsible for handling the system.* namespace.
736 * We also need to verify that this is a namespace that we recognize. 670 * We also need to verify that this is a namespace that we recognize.
737 */ 671 */
738static int can_set_xattr(struct inode *inode, const char *name, 672static int can_set_xattr(struct inode *inode, const char *name,
739 const void *value, size_t value_len) 673 const void *value, size_t value_len)
740{ 674{
741 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
742 return can_set_system_xattr(inode, name, value, value_len);
743
744 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { 675 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
745 /* 676 /*
746 * This makes sure that we aren't trying to set an 677 * This makes sure that we aren't trying to set an
@@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name,
748 * with "os2." 679 * with "os2."
749 */ 680 */
750 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) 681 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
751 return -EOPNOTSUPP; 682 return -EOPNOTSUPP;
752 return 0; 683 return 0;
753 } 684 }
754 685
@@ -860,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
860 /* Completely new ea list */ 791 /* Completely new ea list */
861 xattr_size = sizeof (struct jfs_ea_list); 792 xattr_size = sizeof (struct jfs_ea_list);
862 793
794 /*
 795 * The size of an EA value is limited by the on-disk format to
 796 * a __le16, so there would be an overflow if the size were
 797 * equal to XATTR_SIZE_MAX (65536). To avoid this, pre-check
 798 * the value size against USHRT_MAX and return -E2BIG in
 799 * that case, which is consistent with the VFS setxattr
 800 * interface.
801 */
802 if (value_len >= USHRT_MAX) {
803 rc = -E2BIG;
804 goto release;
805 }
806
863 ea = (struct jfs_ea *) ((char *) ealist + xattr_size); 807 ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
864 ea->flag = 0; 808 ea->flag = 0;
865 ea->namelen = namelen; 809 ea->namelen = namelen;
@@ -874,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
874 /* DEBUG - If we did this right, these number match */ 818 /* DEBUG - If we did this right, these number match */
875 if (xattr_size != new_size) { 819 if (xattr_size != new_size) {
876 printk(KERN_ERR 820 printk(KERN_ERR
877 "jfs_xsetattr: xattr_size = %d, new_size = %d\n", 821 "__jfs_setxattr: xattr_size = %d, new_size = %d\n",
878 xattr_size, new_size); 822 xattr_size, new_size);
879 823
880 rc = -EINVAL; 824 rc = -EINVAL;
@@ -910,6 +854,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
910 int rc; 854 int rc;
911 tid_t tid; 855 tid_t tid;
912 856
857 /*
858 * If this is a request for a synthetic attribute in the system.*
859 * namespace use the generic infrastructure to resolve a handler
860 * for it via sb->s_xattr.
861 */
862 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
863 return generic_setxattr(dentry, name, value, value_len, flags);
864
913 if ((rc = can_set_xattr(inode, name, value, value_len))) 865 if ((rc = can_set_xattr(inode, name, value, value_len)))
914 return rc; 866 return rc;
915 867
@@ -986,6 +938,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
986{ 938{
987 int err; 939 int err;
988 940
941 /*
942 * If this is a request for a synthetic attribute in the system.*
943 * namespace use the generic infrastructure to resolve a handler
944 * for it via sb->s_xattr.
945 */
946 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
947 return generic_getxattr(dentry, name, data, buf_size);
948
989 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 949 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
990 /* 950 /*
991 * skip past "os2." prefix 951 * skip past "os2." prefix
@@ -1074,6 +1034,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1074 int rc; 1034 int rc;
1075 tid_t tid; 1035 tid_t tid;
1076 1036
1037 /*
1038 * If this is a request for a synthetic attribute in the system.*
1039 * namespace use the generic infrastructure to resolve a handler
1040 * for it via sb->s_xattr.
1041 */
1042 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
1043 return generic_removexattr(dentry, name);
1044
1077 if ((rc = can_set_xattr(inode, name, NULL, 0))) 1045 if ((rc = can_set_xattr(inode, name, NULL, 0)))
1078 return rc; 1046 return rc;
1079 1047
@@ -1088,6 +1056,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1088 return rc; 1056 return rc;
1089} 1057}
1090 1058
1059/*
1060 * List of handlers for synthetic system.* attributes. All real ondisk
1061 * attributes are handled directly.
1062 */
1063const struct xattr_handler *jfs_xattr_handlers[] = {
1064#ifdef CONFIG_JFS_POSIX_ACL
1065 &posix_acl_access_xattr_handler,
1066 &posix_acl_default_xattr_handler,
1067#endif
1068 NULL,
1069};
1070
1071
1091#ifdef CONFIG_JFS_SECURITY 1072#ifdef CONFIG_JFS_SECURITY
1092static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, 1073static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
1093 void *fs_info) 1074 void *fs_info)
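
[Editor's note: taken together, the jfs_setxattr/jfs_getxattr/jfs_removexattr hunks and the new jfs_xattr_handlers[] table implement one pattern, shared with the jffs2 changes earlier in this diff: system.* attributes are synthetic, so they are bounced to the generic_*xattr() helpers, which walk sb->s_xattr and land in the shared POSIX ACL handlers. A condensed sketch of the read-side dispatch, with the os2 prefix handling and on-disk path elided:]

	#include <linux/xattr.h>
	#include <linux/posix_acl_xattr.h>

	/* walked by generic_{get,set,remove}xattr() via sb->s_xattr;
	 * registered in jfs_fill_super(): sb->s_xattr = jfs_xattr_handlers; */
	const struct xattr_handler *jfs_xattr_handlers[] = {
	#ifdef CONFIG_JFS_POSIX_ACL
		&posix_acl_access_xattr_handler,	/* system.posix_acl_access */
		&posix_acl_default_xattr_handler,	/* system.posix_acl_default */
	#endif
		NULL,
	};

	ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
			     size_t buf_size)
	{
		/* system.* attrs are synthetic: let the VFS resolve a handler,
		 * which ends up calling ->get_acl/->set_acl underneath */
		if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
			return generic_getxattr(dentry, name, data, buf_size);

		return __jfs_getxattr(dentry->d_inode, name, data, buf_size);
	}
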
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the kernfs pseudo filesystem
3#
4
5obj-y := mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..bd6e18be6e1a
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1077 @@
1/*
2 * fs/kernfs/dir.c - kernfs directory implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/namei.h>
13#include <linux/idr.h>
14#include <linux/slab.h>
15#include <linux/security.h>
16#include <linux/hash.h>
17
18#include "kernfs-internal.h"
19
20DEFINE_MUTEX(kernfs_mutex);
21
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23
24/**
25 * kernfs_name_hash
26 * @name: Null terminated string to hash
27 * @ns: Namespace tag to hash
28 *
 29 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
30 */
31static unsigned int kernfs_name_hash(const char *name, const void *ns)
32{
33 unsigned long hash = init_name_hash();
34 unsigned int len = strlen(name);
35 while (len--)
36 hash = partial_name_hash(*name++, hash);
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1)
41 hash += 2;
42 if (hash >= INT_MAX)
43 hash = INT_MAX - 1;
44 return hash;
45}
46
47static int kernfs_name_compare(unsigned int hash, const char *name,
48 const void *ns, const struct kernfs_node *kn)
49{
50 if (hash != kn->hash)
51 return hash - kn->hash;
52 if (ns != kn->ns)
53 return ns - kn->ns;
54 return strcmp(name, kn->name);
55}
56
57static int kernfs_sd_compare(const struct kernfs_node *left,
58 const struct kernfs_node *right)
59{
60 return kernfs_name_compare(left->hash, left->name, left->ns, right);
61}
62
63/**
64 * kernfs_link_sibling - link kernfs_node into sibling rbtree
65 * @kn: kernfs_node of interest
66 *
67 * Link @kn into its sibling rbtree which starts from
68 * @kn->parent->dir.children.
69 *
70 * Locking:
71 * mutex_lock(kernfs_mutex)
72 *
73 * RETURNS:
 74 * 0 on success, -EEXIST on failure.
75 */
76static int kernfs_link_sibling(struct kernfs_node *kn)
77{
78 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL;
80
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) {
85 struct kernfs_node *pos;
86 int result;
87
88 pos = rb_to_kn(*node);
89 parent = *node;
90 result = kernfs_sd_compare(kn, pos);
91 if (result < 0)
92 node = &pos->rb.rb_left;
93 else if (result > 0)
94 node = &pos->rb.rb_right;
95 else
96 return -EEXIST;
97 }
98 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children);
101 return 0;
102}
103
104/**
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest
107 *
108 * Unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children.
110 *
111 * Locking:
112 * mutex_lock(kernfs_mutex)
113 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn)
115{
116 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--;
118
119 rb_erase(&kn->rb, &kn->parent->dir.children);
120}
121
122/**
123 * kernfs_get_active - get an active reference to kernfs_node
124 * @kn: kernfs_node to get an active reference to
125 *
 126 * Get an active reference to @kn. This function is a noop if @kn
127 * is NULL.
128 *
129 * RETURNS:
130 * Pointer to @kn on success, NULL on failure.
131 */
132struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
133{
134 if (unlikely(!kn))
135 return NULL;
136
137 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL;
139
140 if (kn->flags & KERNFS_LOCKDEP)
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn;
143}
144
145/**
146 * kernfs_put_active - put an active reference to kernfs_node
147 * @kn: kernfs_node to put an active reference to
148 *
 149 * Put an active reference to @kn. This function is a noop if @kn
150 * is NULL.
151 */
152void kernfs_put_active(struct kernfs_node *kn)
153{
154 int v;
155
156 if (unlikely(!kn))
157 return;
158
159 if (kn->flags & KERNFS_LOCKDEP)
160 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS))
163 return;
164
165 /*
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170}
171
172/**
173 * kernfs_deactivate - deactivate kernfs_node
174 * @kn: kernfs_node to deactivate
175 *
176 * Deny new active references and drain existing ones.
177 */
178static void kernfs_deactivate(struct kernfs_node *kn)
179{
180 DECLARE_COMPLETION_ONSTACK(wait);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187
188 kn->u.completion = (void *)&wait;
189
190 if (kn->flags & KERNFS_LOCKDEP)
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 }
202
203 if (kn->flags & KERNFS_LOCKDEP) {
204 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 }
207}
208
209/**
210 * kernfs_get - get a reference count on a kernfs_node
211 * @kn: the target kernfs_node
212 */
213void kernfs_get(struct kernfs_node *kn)
214{
215 if (kn) {
216 WARN_ON(!atomic_read(&kn->count));
217 atomic_inc(&kn->count);
218 }
219}
220EXPORT_SYMBOL_GPL(kernfs_get);
221
222/**
223 * kernfs_put - put a reference count on a kernfs_node
224 * @kn: the target kernfs_node
225 *
226 * Put a reference count of @kn and destroy it if it reached zero.
227 */
228void kernfs_put(struct kernfs_node *kn)
229{
230 struct kernfs_node *parent;
231 struct kernfs_root *root;
232
233 if (!kn || !atomic_dec_and_test(&kn->count))
234 return;
235 root = kernfs_root(kn);
236 repeat:
237 /* Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us.
239 */
240 parent = kn->parent;
241
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
243 parent ? parent->name : "", kn->name);
244
245 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn);
247 if (!(kn->flags & KERNFS_STATIC_NAME))
248 kfree(kn->name);
249 if (kn->iattr) {
250 if (kn->iattr->ia_secdata)
251 security_release_secctx(kn->iattr->ia_secdata,
252 kn->iattr->ia_secdata_len);
253 simple_xattrs_free(&kn->iattr->xattrs);
254 }
255 kfree(kn->iattr);
256 ida_simple_remove(&root->ino_ida, kn->ino);
257 kmem_cache_free(kernfs_node_cache, kn);
258
259 kn = parent;
260 if (kn) {
261 if (atomic_dec_and_test(&kn->count))
262 goto repeat;
263 } else {
264 /* just released the root kn, free @root too */
265 ida_destroy(&root->ino_ida);
266 kfree(root);
267 }
268}
269EXPORT_SYMBOL_GPL(kernfs_put);
270
271static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
272{
273 struct kernfs_node *kn;
274
275 if (flags & LOOKUP_RCU)
276 return -ECHILD;
277
278 /* Always perform fresh lookup for negatives */
279 if (!dentry->d_inode)
280 goto out_bad_unlocked;
281
282 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex);
284
285 /* The kernfs node has been deleted */
286 if (kn->flags & KERNFS_REMOVED)
287 goto out_bad;
288
289 /* The kernfs node has been moved? */
290 if (dentry->d_parent->d_fsdata != kn->parent)
291 goto out_bad;
292
293 /* The kernfs node has been renamed */
294 if (strcmp(dentry->d_name.name, kn->name) != 0)
295 goto out_bad;
296
297 /* The kernfs node has been moved to a different namespace */
298 if (kn->parent && kernfs_ns_enabled(kn->parent) &&
299 kernfs_info(dentry->d_sb)->ns != kn->ns)
300 goto out_bad;
301
302 mutex_unlock(&kernfs_mutex);
303out_valid:
304 return 1;
305out_bad:
306 mutex_unlock(&kernfs_mutex);
307out_bad_unlocked:
308 /*
309 * @dentry doesn't match the underlying kernfs node, drop the
310 * dentry and force lookup. If we have submounts we must allow the
311 * vfs caches to lie about the state of the filesystem to prevent
312 * leaks and other nasty things, so use check_submounts_and_drop()
313 * instead of d_drop().
314 */
315 if (check_submounts_and_drop(dentry) != 0)
316 goto out_valid;
317
318 return 0;
319}
320
321static void kernfs_dop_release(struct dentry *dentry)
322{
323 kernfs_put(dentry->d_fsdata);
324}
325
326const struct dentry_operations kernfs_dops = {
327 .d_revalidate = kernfs_dop_revalidate,
328 .d_release = kernfs_dop_release,
329};
330
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode,
333 unsigned flags)
334{
335 char *dup_name = NULL;
336 struct kernfs_node *kn;
337 int ret;
338
339 if (!(flags & KERNFS_STATIC_NAME)) {
340 name = dup_name = kstrdup(name, GFP_KERNEL);
341 if (!name)
342 return NULL;
343 }
344
345 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
346 if (!kn)
347 goto err_out1;
348
349 ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
350 if (ret < 0)
351 goto err_out2;
352 kn->ino = ret;
353
354 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0);
356
357 kn->name = name;
358 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED;
360
361 return kn;
362
363 err_out2:
364 kmem_cache_free(kernfs_node_cache, kn);
365 err_out1:
366 kfree(dup_name);
367 return NULL;
368}
369
370struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
371 const char *name, umode_t mode,
372 unsigned flags)
373{
374 struct kernfs_node *kn;
375
376 kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
377 if (kn) {
378 kernfs_get(parent);
379 kn->parent = parent;
380 }
381 return kn;
382}
383
384/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
 389 * a kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added
408 *
409 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a
 411 * directory and links @kn into the parent's children list.
412 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already
422 * exists.
423 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
425{
426 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr;
429 int ret;
430
431 if (has_ns != (bool)kn->ns) {
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
433 has_ns ? "required" : "invalid", parent->name, kn->name);
434 return -EINVAL;
435 }
436
437 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL;
439
440 if (parent->flags & KERNFS_REMOVED)
441 return -ENOENT;
442
443 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444
445 ret = kernfs_link_sibling(kn);
446 if (ret)
447 return ret;
448
449 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr;
451 if (ps_iattr) {
452 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 }
455
456 /* Mark the entry added into directory tree */
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481
482 /*
483 * Removal can be called multiple times on the same node. Only the
484 * first invocation is effective and puts the base ref.
485 */
486 if (kn->flags & KERNFS_REMOVED)
487 return;
488
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504
505/**
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex);
521
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532}
533
534/**
535 * kernfs_find_ns - find kernfs_node with the given name
536 * @parent: kernfs_node to search under
537 * @name: name to look for
538 * @ns: the namespace tag to use
539 *
540 * Look for kernfs_node with name @name under @parent. Returns pointer to
541 * the found kernfs_node on success, %NULL on failure.
542 */
543static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
544 const unsigned char *name,
545 const void *ns)
546{
547 struct rb_node *node = parent->dir.children.rb_node;
548 bool has_ns = kernfs_ns_enabled(parent);
549 unsigned int hash;
550
551 lockdep_assert_held(&kernfs_mutex);
552
553 if (has_ns != (bool)ns) {
554 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
555 has_ns ? "required" : "invalid", parent->name, name);
556 return NULL;
557 }
558
559 hash = kernfs_name_hash(name, ns);
560 while (node) {
561 struct kernfs_node *kn;
562 int result;
563
564 kn = rb_to_kn(node);
565 result = kernfs_name_compare(hash, name, ns, kn);
566 if (result < 0)
567 node = node->rb_left;
568 else if (result > 0)
569 node = node->rb_right;
570 else
571 return kn;
572 }
573 return NULL;
574}
575
576/**
577 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
578 * @parent: kernfs_node to search under
579 * @name: name to look for
580 * @ns: the namespace tag to use
581 *
582 * Look for kernfs_node with name @name under @parent and get a reference
583 * if found. This function may sleep and returns pointer to the found
584 * kernfs_node on success, %NULL on failure.
585 */
586struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
587 const char *name, const void *ns)
588{
589 struct kernfs_node *kn;
590
591 mutex_lock(&kernfs_mutex);
592 kn = kernfs_find_ns(parent, name, ns);
593 kernfs_get(kn);
594 mutex_unlock(&kernfs_mutex);
595
596 return kn;
597}
598EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599
600/**
601 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy
603 * @priv: opaque data associated with the new directory
604 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure.
607 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
609{
610 struct kernfs_root *root;
611 struct kernfs_node *kn;
612
613 root = kzalloc(sizeof(*root), GFP_KERNEL);
614 if (!root)
615 return ERR_PTR(-ENOMEM);
616
617 ida_init(&root->ino_ida);
618
619 kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
620 KERNFS_DIR);
621 if (!kn) {
622 ida_destroy(&root->ino_ida);
623 kfree(root);
624 return ERR_PTR(-ENOMEM);
625 }
626
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv;
629 kn->dir.root = root;
630
631 root->dir_ops = kdops;
632 root->kn = kn;
633
634 return root;
635}
636
637/**
638 * kernfs_destroy_root - destroy a kernfs hierarchy
639 * @root: root of the hierarchy to destroy
640 *
641 * Destroy the hierarchy anchored at @root by removing all existing
642 * directories and destroying @root.
643 */
644void kernfs_destroy_root(struct kernfs_root *root)
645{
646 kernfs_remove(root->kn); /* will also free @root */
647}
648
649/**
650 * kernfs_create_dir_ns - create a directory
651 * @parent: parent in which to create a new directory
652 * @name: name of the new directory
653 * @mode: mode of the new directory
654 * @priv: opaque data associated with the new directory
655 * @ns: optional namespace tag of the directory
656 *
657 * Returns the created node on success, ERR_PTR() value on failure.
658 */
659struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode,
661 void *priv, const void *ns)
662{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn;
665 int rc;
666
667 /* allocate */
668 kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
669 if (!kn)
670 return ERR_PTR(-ENOMEM);
671
672 kn->dir.root = parent->dir.root;
673 kn->ns = ns;
674 kn->priv = priv;
675
676 /* link in */
677 kernfs_addrm_start(&acxt);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc)
682 return kn;
683
684 kernfs_put(kn);
685 return ERR_PTR(rc);
686}
687
688static struct dentry *kernfs_iop_lookup(struct inode *dir,
689 struct dentry *dentry,
690 unsigned int flags)
691{
692 struct dentry *ret;
693 struct kernfs_node *parent = dentry->d_parent->d_fsdata;
694 struct kernfs_node *kn;
695 struct inode *inode;
696 const void *ns = NULL;
697
698 mutex_lock(&kernfs_mutex);
699
700 if (kernfs_ns_enabled(parent))
701 ns = kernfs_info(dir->i_sb)->ns;
702
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704
705 /* no such entry */
706 if (!kn) {
707 ret = NULL;
708 goto out_unlock;
709 }
710 kernfs_get(kn);
711 dentry->d_fsdata = kn;
712
713 /* attach dentry and inode */
714 inode = kernfs_get_inode(dir->i_sb, kn);
715 if (!inode) {
716 ret = ERR_PTR(-ENOMEM);
717 goto out_unlock;
718 }
719
720 /* instantiate and hash dentry */
721 ret = d_materialise_unique(dentry, inode);
722 out_unlock:
723 mutex_unlock(&kernfs_mutex);
724 return ret;
725}
726
727static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode)
729{
730 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops;
732
733 if (!kdops || !kdops->mkdir)
734 return -EPERM;
735
736 return kdops->mkdir(parent, dentry->d_name.name, mode);
737}
738
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{
741 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
743
744 if (!kdops || !kdops->rmdir)
745 return -EPERM;
746
747 return kdops->rmdir(kn);
748}
749
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
751 struct inode *new_dir, struct dentry *new_dentry)
752{
753 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
756
757 if (!kdops || !kdops->rename)
758 return -EPERM;
759
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name);
761}
762
763const struct inode_operations kernfs_dir_iops = {
764 .lookup = kernfs_iop_lookup,
765 .permission = kernfs_iop_permission,
766 .setattr = kernfs_iop_setattr,
767 .getattr = kernfs_iop_getattr,
768 .setxattr = kernfs_iop_setxattr,
769 .removexattr = kernfs_iop_removexattr,
770 .getxattr = kernfs_iop_getxattr,
771 .listxattr = kernfs_iop_listxattr,
772
773 .mkdir = kernfs_iop_mkdir,
774 .rmdir = kernfs_iop_rmdir,
775 .rename = kernfs_iop_rename,
776};
777
778static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
779{
780 struct kernfs_node *last;
781
782 while (true) {
783 struct rb_node *rbn;
784
785 last = pos;
786
787 if (kernfs_type(pos) != KERNFS_DIR)
788 break;
789
790 rbn = rb_first(&pos->dir.children);
791 if (!rbn)
792 break;
793
794 pos = rb_to_kn(rbn);
795 }
796
797 return last;
798}
799
800/**
801 * kernfs_next_descendant_post - find the next descendant for post-order walk
802 * @pos: the current position (%NULL to initiate traversal)
803 * @root: kernfs_node whose descendants to walk
804 *
805 * Find the next descendant to visit for post-order traversal of @root's
 806 * descendants. @root is included in the iteration and is the last node to be
807 * visited.
808 */
809static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
810 struct kernfs_node *root)
811{
812 struct rb_node *rbn;
813
814 lockdep_assert_held(&kernfs_mutex);
815
816 /* if first iteration, visit leftmost descendant which may be root */
817 if (!pos)
818 return kernfs_leftmost_descendant(root);
819
820 /* if we visited @root, we're done */
821 if (pos == root)
822 return NULL;
823
824 /* if there's an unvisited sibling, visit its leftmost descendant */
825 rbn = rb_next(&pos->rb);
826 if (rbn)
827 return kernfs_leftmost_descendant(rb_to_kn(rbn));
828
829 /* no sibling left, visit parent */
830 return pos->parent;
831}
832
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
834 struct kernfs_node *kn)
835{
836 struct kernfs_node *pos, *next;
837
838 if (!kn)
839 return;
840
841 pr_debug("kernfs %s: removing\n", kn->name);
842
843 next = NULL;
844 do {
845 pos = next;
846 next = kernfs_next_descendant_post(pos, kn);
847 if (pos)
848 kernfs_remove_one(acxt, pos);
849 } while (next);
850}
851
852/**
853 * kernfs_remove - remove a kernfs_node recursively
854 * @kn: the kernfs_node to remove
855 *
856 * Remove @kn along with all its subdirectories and files.
857 */
858void kernfs_remove(struct kernfs_node *kn)
859{
860 struct kernfs_addrm_cxt acxt;
861
862 kernfs_addrm_start(&acxt);
863 __kernfs_remove(&acxt, kn);
864 kernfs_addrm_finish(&acxt);
865}
866
867/**
868 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
869 * @parent: parent of the target
870 * @name: name of the kernfs_node to remove
871 * @ns: namespace tag of the kernfs_node to remove
872 *
873 * Look for the kernfs_node with @name and @ns under @parent and remove it.
874 * Returns 0 on success, -ENOENT if such entry doesn't exist.
875 */
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns)
878{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn;
881
882 if (!parent) {
 883 WARN(1, KERN_WARNING "kernfs: cannot remove '%s', no directory\n",
884 name);
885 return -ENOENT;
886 }
887
888 kernfs_addrm_start(&acxt);
889
890 kn = kernfs_find_ns(parent, name, ns);
891 if (kn)
892 __kernfs_remove(&acxt, kn);
893
894 kernfs_addrm_finish(&acxt);
895
896 if (kn)
897 return 0;
898 else
899 return -ENOENT;
900}
901
902/**
903 * kernfs_rename_ns - move and rename a kernfs_node
904 * @kn: target node
905 * @new_parent: new parent to put @sd under
906 * @new_name: new name
907 * @new_ns: new namespace tag
908 */
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns)
911{
912 int error;
913
914 mutex_lock(&kernfs_mutex);
915
916 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED)
918 goto out;
919
920 error = 0;
921 if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
922 (strcmp(kn->name, new_name) == 0))
923 goto out; /* nothing to rename */
924
925 error = -EEXIST;
926 if (kernfs_find_ns(new_parent, new_name, new_ns))
927 goto out;
928
929 /* rename kernfs_node */
930 if (strcmp(kn->name, new_name) != 0) {
931 error = -ENOMEM;
932 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name)
934 goto out;
935
936 if (kn->flags & KERNFS_STATIC_NAME)
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 }
943
944 /*
945 * Move to the appropriate place in the appropriate directories rbtree.
946 */
947 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent);
949 kernfs_put(kn->parent);
950 kn->ns = new_ns;
951 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn);
954
955 error = 0;
956 out:
957 mutex_unlock(&kernfs_mutex);
958 return error;
959}
960
961/* Relationship between s_mode and the DT_xxx types */
962static inline unsigned char dt_type(struct kernfs_node *kn)
963{
964 return (kn->mode >> 12) & 15;
965}
966
967static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
968{
969 kernfs_put(filp->private_data);
970 return 0;
971}
972
973static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{
976 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) &&
978 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos);
980 if (!valid)
981 pos = NULL;
982 }
983 if (!pos && (hash > 1) && (hash < INT_MAX)) {
984 struct rb_node *node = parent->dir.children.rb_node;
985 while (node) {
986 pos = rb_to_kn(node);
987
988 if (hash < pos->hash)
989 node = node->rb_left;
990 else if (hash > pos->hash)
991 node = node->rb_right;
992 else
993 break;
994 }
995 }
996 /* Skip over entries in the wrong namespace */
997 while (pos && pos->ns != ns) {
998 struct rb_node *node = rb_next(&pos->rb);
999 if (!node)
1000 pos = NULL;
1001 else
1002 pos = rb_to_kn(node);
1003 }
1004 return pos;
1005}
1006
1007static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos)
1012 do {
1013 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node)
1015 pos = NULL;
1016 else
1017 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns);
1019 return pos;
1020}
1021
1022static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1023{
1024 struct dentry *dentry = file->f_path.dentry;
1025 struct kernfs_node *parent = dentry->d_fsdata;
1026 struct kernfs_node *pos = file->private_data;
1027 const void *ns = NULL;
1028
1029 if (!dir_emit_dots(file, ctx))
1030 return 0;
1031 mutex_lock(&kernfs_mutex);
1032
1033 if (kernfs_ns_enabled(parent))
1034 ns = kernfs_info(dentry->d_sb)->ns;
1035
1036 for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1037 pos;
1038 pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1039 const char *name = pos->name;
1040 unsigned int type = dt_type(pos);
1041 int len = strlen(name);
1042 ino_t ino = pos->ino;
1043
1044 ctx->pos = pos->hash;
1045 file->private_data = pos;
1046 kernfs_get(pos);
1047
1048 mutex_unlock(&kernfs_mutex);
1049 if (!dir_emit(ctx, name, len, ino, type))
1050 return 0;
1051 mutex_lock(&kernfs_mutex);
1052 }
1053 mutex_unlock(&kernfs_mutex);
1054 file->private_data = NULL;
1055 ctx->pos = INT_MAX;
1056 return 0;
1057}
1058
1059static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
1060 int whence)
1061{
1062 struct inode *inode = file_inode(file);
1063 loff_t ret;
1064
1065 mutex_lock(&inode->i_mutex);
1066 ret = generic_file_llseek(file, offset, whence);
1067 mutex_unlock(&inode->i_mutex);
1068
1069 return ret;
1070}
1071
1072const struct file_operations kernfs_dir_fops = {
1073 .read = generic_read_dir,
1074 .iterate = kernfs_fop_readdir,
1075 .release = kernfs_dir_fop_release,
1076 .llseek = kernfs_dir_fop_llseek,
1077};
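
[Editor's note: kernfs_create_root(), kernfs_create_dir_ns() and kernfs_remove()/kernfs_destroy_root() above form the minimal lifecycle of a hierarchy. A hypothetical consumer (not part of this diff) would use the new API roughly like this:]

	#include <linux/kernfs.h>

	static struct kernfs_root *example_root;

	static int example_init(void)
	{
		struct kernfs_node *dir;

		/* no mkdir/rmdir/rename callbacks: directory syscalls get -EPERM */
		example_root = kernfs_create_root(NULL, NULL);
		if (IS_ERR(example_root))
			return PTR_ERR(example_root);

		dir = kernfs_create_dir_ns(example_root->kn, "example",
					   0755, NULL /* priv */,
					   NULL /* no namespace tag */);
		if (IS_ERR(dir)) {
			kernfs_destroy_root(example_root);
			return PTR_ERR(dir);
		}
		return 0;
	}

	static void example_exit(void)
	{
		/* removes every node under the root, then frees the root */
		kernfs_destroy_root(example_root);
	}
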
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..dbf397bfdff2
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,867 @@
1/*
2 * fs/kernfs/file.c - kernfs file implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/poll.h>
15#include <linux/pagemap.h>
16#include <linux/sched.h>
17
18#include "kernfs-internal.h"
19
20/*
21 * There's one kernfs_open_file for each open file and one kernfs_open_node
22 * for each kernfs_node with one or more open files.
23 *
24 * kernfs_node->attr.open points to kernfs_open_node. attr.open is
25 * protected by kernfs_open_node_lock.
26 *
27 * filp->private_data points to seq_file whose ->private points to
28 * kernfs_open_file. kernfs_open_files are chained at
29 * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
30 */
31static DEFINE_SPINLOCK(kernfs_open_node_lock);
32static DEFINE_MUTEX(kernfs_open_file_mutex);
33
34struct kernfs_open_node {
35 atomic_t refcnt;
36 atomic_t event;
37 wait_queue_head_t poll;
38 struct list_head files; /* goes through kernfs_open_file.list */
39};
40
41static struct kernfs_open_file *kernfs_of(struct file *file)
42{
43 return ((struct seq_file *)file->private_data)->private;
44}
45
46/*
47 * Determine the kernfs_ops for the given kernfs_node. This function must
48 * be called while holding an active reference.
49 */
50static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
51{
52 if (kn->flags & KERNFS_LOCKDEP)
53 lockdep_assert_held(kn);
54 return kn->attr.ops;
55}
56
57/*
58 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
59 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
60 * a seq_file iteration which is fully initialized with an active reference
61 * or an aborted kernfs_seq_start() due to get_active failure. The
62 * position pointer is the only context for each seq_file iteration and
63 * thus the stop condition should be encoded in it. As the return value is
64 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
65 * choice to indicate get_active failure.
66 *
67 * Unfortunately, this is complicated due to the optional custom seq_file
68 * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
69 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
70 * custom seq_file operations and thus can't decide whether put_active
71 * should be performed or not only on ERR_PTR(-ENODEV).
72 *
 73 * This is worked around by factoring out the custom seq_stop() and
 74 * put_active part into kernfs_seq_stop_active(): kernfs_seq_stop() skips
 75 * it when handed ERR_PTR(-ENODEV), while start/next invoke it directly
 76 * whenever a custom seq_file operation fails with ERR_PTR(-ENODEV). This
 77 * ensures kernfs_seq_stop_active() is skipped only after get_active failure.
78 */
79static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
80{
81 struct kernfs_open_file *of = sf->private;
82 const struct kernfs_ops *ops = kernfs_ops(of->kn);
83
84 if (ops->seq_stop)
85 ops->seq_stop(sf, v);
86 kernfs_put_active(of->kn);
87}
88
89static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
90{
91 struct kernfs_open_file *of = sf->private;
92 const struct kernfs_ops *ops;
93
94 /*
95 * @of->mutex nests outside active ref and is just to ensure that
96 * the ops aren't called concurrently for the same open file.
97 */
98 mutex_lock(&of->mutex);
99 if (!kernfs_get_active(of->kn))
100 return ERR_PTR(-ENODEV);
101
102 ops = kernfs_ops(of->kn);
103 if (ops->seq_start) {
104 void *next = ops->seq_start(sf, ppos);
105 /* see the comment above kernfs_seq_stop_active() */
106 if (next == ERR_PTR(-ENODEV))
107 kernfs_seq_stop_active(sf, next);
108 return next;
109 } else {
110 /*
111 * The same behavior and code as single_open(). Returns
112 * !NULL if pos is at the beginning; otherwise, NULL.
113 */
114 return NULL + !*ppos;
115 }
116}
117
118static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
119{
120 struct kernfs_open_file *of = sf->private;
121 const struct kernfs_ops *ops = kernfs_ops(of->kn);
122
123 if (ops->seq_next) {
124 void *next = ops->seq_next(sf, v, ppos);
125 /* see the comment above kernfs_seq_stop_active() */
126 if (next == ERR_PTR(-ENODEV))
127 kernfs_seq_stop_active(sf, next);
128 return next;
129 } else {
130 /*
 131 * The same behavior and code as single_open(): always
 132 * terminate after the initial read.
133 */
134 ++*ppos;
135 return NULL;
136 }
137}
138
139static void kernfs_seq_stop(struct seq_file *sf, void *v)
140{
141 struct kernfs_open_file *of = sf->private;
142
143 if (v != ERR_PTR(-ENODEV))
144 kernfs_seq_stop_active(sf, v);
145 mutex_unlock(&of->mutex);
146}
147
148static int kernfs_seq_show(struct seq_file *sf, void *v)
149{
150 struct kernfs_open_file *of = sf->private;
151
152 of->event = atomic_read(&of->kn->attr.open->event);
153
154 return of->kn->attr.ops->seq_show(sf, v);
155}
156
157static const struct seq_operations kernfs_seq_ops = {
158 .start = kernfs_seq_start,
159 .next = kernfs_seq_next,
160 .stop = kernfs_seq_stop,
161 .show = kernfs_seq_show,
162};
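/*
 * Editor's sketch, not part of this diff: a minimal kernfs user that
 * only implements ->seq_show. With seq_start/next/stop left NULL, the
 * defaults above emulate single_open(), so a single-record show
 * callback is all a simple attribute needs. example_show, example_ops
 * and the use of ->priv as a string are hypothetical.
 */
static int example_show(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;

	/* ->priv is whatever the creator passed at file creation time */
	seq_printf(sf, "%s\n", (const char *)of->kn->priv);
	return 0;
}

static const struct kernfs_ops example_ops = {
	.seq_show	= example_show,
};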
163
164/*
 165 * As reading a bin file can have side-effects, the exact offset and bytes
 166 * specified in the read(2) call should be passed to the read callback,
 167 * making it difficult to use seq_file. Implement simplistic custom
 168 * buffering for bin files.
169 */
170static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
171 char __user *user_buf, size_t count,
172 loff_t *ppos)
173{
174 ssize_t len = min_t(size_t, count, PAGE_SIZE);
175 const struct kernfs_ops *ops;
176 char *buf;
177
178 buf = kmalloc(len, GFP_KERNEL);
179 if (!buf)
180 return -ENOMEM;
181
182 /*
183 * @of->mutex nests outside active ref and is just to ensure that
184 * the ops aren't called concurrently for the same open file.
185 */
186 mutex_lock(&of->mutex);
187 if (!kernfs_get_active(of->kn)) {
188 len = -ENODEV;
189 mutex_unlock(&of->mutex);
190 goto out_free;
191 }
192
193 ops = kernfs_ops(of->kn);
194 if (ops->read)
195 len = ops->read(of, buf, len, *ppos);
196 else
197 len = -EINVAL;
198
199 kernfs_put_active(of->kn);
200 mutex_unlock(&of->mutex);
201
202 if (len < 0)
203 goto out_free;
204
205 if (copy_to_user(user_buf, buf, len)) {
206 len = -EFAULT;
207 goto out_free;
208 }
209
210 *ppos += len;
211
212 out_free:
213 kfree(buf);
214 return len;
215}
216
217/**
218 * kernfs_fop_read - kernfs vfs read callback
219 * @file: file pointer
 220 * @user_buf: buffer to copy the read data into
221 * @count: number of bytes
222 * @ppos: starting offset
223 */
224static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
225 size_t count, loff_t *ppos)
226{
227 struct kernfs_open_file *of = kernfs_of(file);
228
229 if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
230 return seq_read(file, user_buf, count, ppos);
231 else
232 return kernfs_file_direct_read(of, user_buf, count, ppos);
233}
234
235/**
236 * kernfs_fop_write - kernfs vfs write callback
237 * @file: file pointer
238 * @user_buf: data to write
239 * @count: number of bytes
240 * @ppos: starting offset
241 *
242 * Copy data in from userland and pass it to the matching kernfs write
243 * operation.
244 *
 245 * There is no easy way for us to know if userspace is only doing a partial
 246 * write, so we don't support partial writes. We expect the entire buffer
 247 * to come on the first write. Hint: if you're writing a value, first read
 248 * the file, modify only the value you're changing, then write the entire
 249 * buffer back.
250 */
251static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos)
253{
254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops;
257 char *buf;
258
259 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf)
261 return -ENOMEM;
262
263 if (copy_from_user(buf, user_buf, len)) {
264 len = -EFAULT;
265 goto out_free;
266 }
267 buf[len] = '\0'; /* guarantee string termination */
268
269 /*
270 * @of->mutex nests outside active ref and is just to ensure that
271 * the ops aren't called concurrently for the same open file.
272 */
273 mutex_lock(&of->mutex);
274 if (!kernfs_get_active(of->kn)) {
275 mutex_unlock(&of->mutex);
276 len = -ENODEV;
277 goto out_free;
278 }
279
280 ops = kernfs_ops(of->kn);
281 if (ops->write)
282 len = ops->write(of, buf, len, *ppos);
283 else
284 len = -EINVAL;
285
286 kernfs_put_active(of->kn);
287 mutex_unlock(&of->mutex);
288
289 if (len > 0)
290 *ppos += len;
291out_free:
292 kfree(buf);
293 return len;
294}
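/*
 * Editor's sketch, not part of this diff: the userspace side of the
 * full-buffer protocol described above - format the complete value and
 * submit it in a single write(2). The path and value are illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_attr(const char *path, const char *value)
{
	char buf[64];
	int fd, len;

	len = snprintf(buf, sizeof(buf), "%s\n", value);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* the entire buffer must arrive in one write(2) */
	if (write(fd, buf, len) != len) {
		close(fd);
		return -1;
	}
	return close(fd);
}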
295
296static void kernfs_vma_open(struct vm_area_struct *vma)
297{
298 struct file *file = vma->vm_file;
299 struct kernfs_open_file *of = kernfs_of(file);
300
301 if (!of->vm_ops)
302 return;
303
304 if (!kernfs_get_active(of->kn))
305 return;
306
307 if (of->vm_ops->open)
308 of->vm_ops->open(vma);
309
310 kernfs_put_active(of->kn);
311}
312
313static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
314{
315 struct file *file = vma->vm_file;
316 struct kernfs_open_file *of = kernfs_of(file);
317 int ret;
318
319 if (!of->vm_ops)
320 return VM_FAULT_SIGBUS;
321
322 if (!kernfs_get_active(of->kn))
323 return VM_FAULT_SIGBUS;
324
325 ret = VM_FAULT_SIGBUS;
326 if (of->vm_ops->fault)
327 ret = of->vm_ops->fault(vma, vmf);
328
329 kernfs_put_active(of->kn);
330 return ret;
331}
332
333static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
334 struct vm_fault *vmf)
335{
336 struct file *file = vma->vm_file;
337 struct kernfs_open_file *of = kernfs_of(file);
338 int ret;
339
340 if (!of->vm_ops)
341 return VM_FAULT_SIGBUS;
342
343 if (!kernfs_get_active(of->kn))
344 return VM_FAULT_SIGBUS;
345
346 ret = 0;
347 if (of->vm_ops->page_mkwrite)
348 ret = of->vm_ops->page_mkwrite(vma, vmf);
349 else
350 file_update_time(file);
351
352 kernfs_put_active(of->kn);
353 return ret;
354}
355
356static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
357 void *buf, int len, int write)
358{
359 struct file *file = vma->vm_file;
360 struct kernfs_open_file *of = kernfs_of(file);
361 int ret;
362
363 if (!of->vm_ops)
364 return -EINVAL;
365
366 if (!kernfs_get_active(of->kn))
367 return -EINVAL;
368
369 ret = -EINVAL;
370 if (of->vm_ops->access)
371 ret = of->vm_ops->access(vma, addr, buf, len, write);
372
373 kernfs_put_active(of->kn);
374 return ret;
375}
376
377#ifdef CONFIG_NUMA
378static int kernfs_vma_set_policy(struct vm_area_struct *vma,
379 struct mempolicy *new)
380{
381 struct file *file = vma->vm_file;
382 struct kernfs_open_file *of = kernfs_of(file);
383 int ret;
384
385 if (!of->vm_ops)
386 return 0;
387
388 if (!kernfs_get_active(of->kn))
389 return -EINVAL;
390
391 ret = 0;
392 if (of->vm_ops->set_policy)
393 ret = of->vm_ops->set_policy(vma, new);
394
395 kernfs_put_active(of->kn);
396 return ret;
397}
398
399static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
400 unsigned long addr)
401{
402 struct file *file = vma->vm_file;
403 struct kernfs_open_file *of = kernfs_of(file);
404 struct mempolicy *pol;
405
406 if (!of->vm_ops)
407 return vma->vm_policy;
408
409 if (!kernfs_get_active(of->kn))
410 return vma->vm_policy;
411
412 pol = vma->vm_policy;
413 if (of->vm_ops->get_policy)
414 pol = of->vm_ops->get_policy(vma, addr);
415
416 kernfs_put_active(of->kn);
417 return pol;
418}
419
420static int kernfs_vma_migrate(struct vm_area_struct *vma,
421 const nodemask_t *from, const nodemask_t *to,
422 unsigned long flags)
423{
424 struct file *file = vma->vm_file;
425 struct kernfs_open_file *of = kernfs_of(file);
426 int ret;
427
428 if (!of->vm_ops)
429 return 0;
430
431 if (!kernfs_get_active(of->kn))
432 return 0;
433
434 ret = 0;
435 if (of->vm_ops->migrate)
436 ret = of->vm_ops->migrate(vma, from, to, flags);
437
438 kernfs_put_active(of->kn);
439 return ret;
440}
441#endif
442
443static const struct vm_operations_struct kernfs_vm_ops = {
444 .open = kernfs_vma_open,
445 .fault = kernfs_vma_fault,
446 .page_mkwrite = kernfs_vma_page_mkwrite,
447 .access = kernfs_vma_access,
448#ifdef CONFIG_NUMA
449 .set_policy = kernfs_vma_set_policy,
450 .get_policy = kernfs_vma_get_policy,
451 .migrate = kernfs_vma_migrate,
452#endif
453};
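/*
 * Editor's note: each wrapper above follows the same shape - bail out
 * with a safe default when the file was never mmapped (!of->vm_ops),
 * take an active reference so the node can't be deactivated mid-call,
 * delegate to the wrapped vm_ops callback if it exists, then drop the
 * reference.
 */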
454
455static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
456{
457 struct kernfs_open_file *of = kernfs_of(file);
458 const struct kernfs_ops *ops;
459 int rc;
460
461 /*
462 * mmap path and of->mutex are prone to triggering spurious lockdep
463 * warnings and we don't want to add spurious locking dependency
464 * between the two. Check whether mmap is actually implemented
465 * without grabbing @of->mutex by testing HAS_MMAP flag. See the
466 * comment in kernfs_file_open() for more details.
467 */
468 if (!(of->kn->flags & KERNFS_HAS_MMAP))
469 return -ENODEV;
470
471 mutex_lock(&of->mutex);
472
473 rc = -ENODEV;
474 if (!kernfs_get_active(of->kn))
475 goto out_unlock;
476
477 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma);
479
480 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
482 * to satisfy versions of X which crash if the mmap fails: that
483 * substitutes a new vm_file, and we don't then want bin_vm_ops.
484 */
485 if (vma->vm_file != file)
486 goto out_put;
487
488 rc = -EINVAL;
489 if (of->mmapped && of->vm_ops != vma->vm_ops)
490 goto out_put;
491
492 /*
493 * It is not possible to successfully wrap close.
494 * So error if someone is trying to use close.
495 */
496 rc = -EINVAL;
497 if (vma->vm_ops && vma->vm_ops->close)
498 goto out_put;
499
500 rc = 0;
501 of->mmapped = 1;
502 of->vm_ops = vma->vm_ops;
503 vma->vm_ops = &kernfs_vm_ops;
504out_put:
505 kernfs_put_active(of->kn);
506out_unlock:
507 mutex_unlock(&of->mutex);
508
509 return rc;
510}
511
512/**
513 * kernfs_get_open_node - get or create kernfs_open_node
514 * @kn: target kernfs_node
515 * @of: kernfs_open_file for this instance of open
516 *
517 * If @kn->attr.open exists, increment its reference count; otherwise,
518 * create one. @of is chained to the files list.
519 *
520 * LOCKING:
521 * Kernel thread context (may sleep).
522 *
523 * RETURNS:
524 * 0 on success, -errno on failure.
525 */
526static int kernfs_get_open_node(struct kernfs_node *kn,
527 struct kernfs_open_file *of)
528{
529 struct kernfs_open_node *on, *new_on = NULL;
530
531 retry:
532 mutex_lock(&kernfs_open_file_mutex);
533 spin_lock_irq(&kernfs_open_node_lock);
534
535 if (!kn->attr.open && new_on) {
536 kn->attr.open = new_on;
537 new_on = NULL;
538 }
539
540 on = kn->attr.open;
541 if (on) {
542 atomic_inc(&on->refcnt);
543 list_add_tail(&of->list, &on->files);
544 }
545
546 spin_unlock_irq(&kernfs_open_node_lock);
547 mutex_unlock(&kernfs_open_file_mutex);
548
549 if (on) {
550 kfree(new_on);
551 return 0;
552 }
553
554 /* not there, initialize a new one and retry */
555 new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
556 if (!new_on)
557 return -ENOMEM;
558
559 atomic_set(&new_on->refcnt, 0);
560 atomic_set(&new_on->event, 1);
561 init_waitqueue_head(&new_on->poll);
562 INIT_LIST_HEAD(&new_on->files);
563 goto retry;
564}
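/*
 * Editor's note: this is the usual optimistic-allocation pattern. The
 * first pass merely observes that @kn has no kernfs_open_node; @new_on
 * is then allocated with GFP_KERNEL outside both locks (the allocation
 * may sleep), and the retry either installs it or, if another opener
 * won the race, releases it through the kfree(new_on) path above.
 */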
565
566/**
567 * kernfs_put_open_node - put kernfs_open_node
 568 * @kn: target kernfs_node
 569 * @of: associated kernfs_open_file
 570 *
 571 * Put @kn->attr.open and unlink @of from the files list. If the
 572 * reference count reaches zero, disassociate and free it.
573 *
574 * LOCKING:
575 * None.
576 */
577static void kernfs_put_open_node(struct kernfs_node *kn,
578 struct kernfs_open_file *of)
579{
580 struct kernfs_open_node *on = kn->attr.open;
581 unsigned long flags;
582
583 mutex_lock(&kernfs_open_file_mutex);
584 spin_lock_irqsave(&kernfs_open_node_lock, flags);
585
586 if (of)
587 list_del(&of->list);
588
589 if (atomic_dec_and_test(&on->refcnt))
590 kn->attr.open = NULL;
591 else
592 on = NULL;
593
594 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
595 mutex_unlock(&kernfs_open_file_mutex);
596
597 kfree(on);
598}
599
600static int kernfs_fop_open(struct inode *inode, struct file *file)
601{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
603 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap;
606 int error = -EACCES;
607
608 if (!kernfs_get_active(kn))
609 return -ENODEV;
610
611 ops = kernfs_ops(kn);
612
613 has_read = ops->seq_show || ops->read || ops->mmap;
614 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap;
616
617 /* check perms and supported operations */
618 if ((file->f_mode & FMODE_WRITE) &&
619 (!(inode->i_mode & S_IWUGO) || !has_write))
620 goto err_out;
621
622 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out;
625
626 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM;
628 of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
629 if (!of)
630 goto err_out;
631
632 /*
633 * The following is done to give a different lockdep key to
634 * @of->mutex for files which implement mmap. This is a rather
635 * crude way to avoid false positive lockdep warning around
636 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
637 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
638 * which mm->mmap_sem nests, while holding @of->mutex. As each
639 * open file has a separate mutex, it's okay as long as those don't
640 * happen on the same file. At this point, we can't easily give
641 * each file a separate locking class. Let's differentiate on
642 * whether the file has mmap or not for now.
643 *
644 * Both paths of the branch look the same. They're supposed to
645 * look that way and give @of->mutex different static lockdep keys.
646 */
647 if (has_mmap)
648 mutex_init(&of->mutex);
649 else
650 mutex_init(&of->mutex);
651
652 of->kn = kn;
653 of->file = file;
654
655 /*
656 * Always instantiate seq_file even if read access doesn't use
 657 * seq_file or is not requested. This unifies private data access,
 658 * and readable regular files are the vast majority anyway.
659 */
660 if (ops->seq_show)
661 error = seq_open(file, &kernfs_seq_ops);
662 else
663 error = seq_open(file, NULL);
664 if (error)
665 goto err_free;
666
667 ((struct seq_file *)file->private_data)->private = of;
668
669 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
670 if (file->f_mode & FMODE_WRITE)
671 file->f_mode |= FMODE_PWRITE;
672
673 /* make sure we have open node struct */
674 error = kernfs_get_open_node(kn, of);
675 if (error)
676 goto err_close;
677
678 /* open succeeded, put active references */
679 kernfs_put_active(kn);
680 return 0;
681
682err_close:
683 seq_release(inode, file);
684err_free:
685 kfree(of);
686err_out:
687 kernfs_put_active(kn);
688 return error;
689}
690
691static int kernfs_fop_release(struct inode *inode, struct file *filp)
692{
693 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
694 struct kernfs_open_file *of = kernfs_of(filp);
695
696 kernfs_put_open_node(kn, of);
697 seq_release(inode, filp);
698 kfree(of);
699
700 return 0;
701}
702
703void kernfs_unmap_bin_file(struct kernfs_node *kn)
704{
705 struct kernfs_open_node *on;
706 struct kernfs_open_file *of;
707
708 if (!(kn->flags & KERNFS_HAS_MMAP))
709 return;
710
711 spin_lock_irq(&kernfs_open_node_lock);
712 on = kn->attr.open;
713 if (on)
714 atomic_inc(&on->refcnt);
715 spin_unlock_irq(&kernfs_open_node_lock);
716 if (!on)
717 return;
718
719 mutex_lock(&kernfs_open_file_mutex);
720 list_for_each_entry(of, &on->files, list) {
721 struct inode *inode = file_inode(of->file);
722 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
723 }
724 mutex_unlock(&kernfs_open_file_mutex);
725
726 kernfs_put_open_node(kn, NULL);
727}
728
729/*
730 * Kernfs attribute files are pollable. The idea is that you read
731 * the content and then you use 'poll' or 'select' to wait for
732 * the content to change. When the content changes (assuming the
733 * manager for the kobject supports notification), poll will
734 * return POLLERR|POLLPRI, and select will return the fd whether
735 * it is waiting for read, write, or exceptions.
736 * Once poll/select indicates that the value has changed, you
737 * need to close and re-open the file, or seek to 0 and read again.
738 * Reminder: this only works for attributes which actively support
739 * it, and it is not possible to test an attribute from userspace
 740 * to see if it supports poll (neither 'poll' nor 'select' returns
 741 * an appropriate error code). When in doubt, set a suitable timeout value.
742 */
743static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
744{
745 struct kernfs_open_file *of = kernfs_of(filp);
746 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
747 struct kernfs_open_node *on = kn->attr.open;
748
 749	/* grab an active reference so the node can't go away under us */
750 if (!kernfs_get_active(kn))
751 goto trigger;
752
753 poll_wait(filp, &on->poll, wait);
754
755 kernfs_put_active(kn);
756
757 if (of->event != atomic_read(&on->event))
758 goto trigger;
759
760 return DEFAULT_POLLMASK;
761
762 trigger:
763 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
764}
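/*
 * Editor's sketch, not part of this diff: the userspace loop implied by
 * the comment above kernfs_fop_poll() - read the attribute, poll until
 * it is notified, then seek back to zero and read it again. The path
 * handling is illustrative only.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static void watch_attr(const char *path)
{
	char buf[4096];
	struct pollfd pfd = { .events = POLLERR | POLLPRI };

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0)
		return;
	for (;;) {
		read(pfd.fd, buf, sizeof(buf));	/* consume current value */
		poll(&pfd, 1, -1);		/* sleeps until kernfs_notify() */
		lseek(pfd.fd, 0, SEEK_SET);	/* rewind for the re-read */
	}
}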
765
766/**
767 * kernfs_notify - notify a kernfs file
768 * @kn: file to notify
769 *
770 * Notify @kn such that poll(2) on @kn wakes up.
771 */
772void kernfs_notify(struct kernfs_node *kn)
773{
774 struct kernfs_open_node *on;
775 unsigned long flags;
776
777 spin_lock_irqsave(&kernfs_open_node_lock, flags);
778
779 if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
780 on = kn->attr.open;
781 if (on) {
782 atomic_inc(&on->event);
783 wake_up_interruptible(&on->poll);
784 }
785 }
786
787 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
788}
789EXPORT_SYMBOL_GPL(kernfs_notify);
790
791const struct file_operations kernfs_file_fops = {
792 .read = kernfs_fop_read,
793 .write = kernfs_fop_write,
794 .llseek = generic_file_llseek,
795 .mmap = kernfs_fop_mmap,
796 .open = kernfs_fop_open,
797 .release = kernfs_fop_release,
798 .poll = kernfs_fop_poll,
799};
800
801/**
802 * __kernfs_create_file - kernfs internal function to create a file
803 * @parent: directory to create the file in
804 * @name: name of the file
805 * @mode: mode of the file
806 * @size: size of the file
807 * @ops: kernfs operations for the file
808 * @priv: private data for the file
809 * @ns: optional namespace tag of the file
 810 * @name_is_static: don't copy the file name; use the caller's copy
811 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
812 *
813 * Returns the created node on success, ERR_PTR() value on error.
814 */
815struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
816 const char *name,
817 umode_t mode, loff_t size,
818 const struct kernfs_ops *ops,
819 void *priv, const void *ns,
820 bool name_is_static,
821 struct lock_class_key *key)
822{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn;
825 unsigned flags;
826 int rc;
827
828 flags = KERNFS_FILE;
829 if (name_is_static)
830 flags |= KERNFS_STATIC_NAME;
831
832 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
833 if (!kn)
834 return ERR_PTR(-ENOMEM);
835
836 kn->attr.ops = ops;
837 kn->attr.size = size;
838 kn->ns = ns;
839 kn->priv = priv;
840
841#ifdef CONFIG_DEBUG_LOCK_ALLOC
842 if (key) {
843 lockdep_init_map(&kn->dep_map, "s_active", key, 0);
844 kn->flags |= KERNFS_LOCKDEP;
845 }
846#endif
847
848 /*
 849 * kn->attr.ops is accessible only while holding an active ref. We
850 * need to know whether some ops are implemented outside active
851 * ref. Cache their existence in flags.
852 */
853 if (ops->seq_show)
854 kn->flags |= KERNFS_HAS_SEQ_SHOW;
855 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP;
857
858 kernfs_addrm_start(&acxt);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) {
863 kernfs_put(kn);
864 return ERR_PTR(rc);
865 }
866 return kn;
867}
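/*
 * Editor's sketch, not part of this diff: a caller creating a read-only
 * file backed by the hypothetical example_ops shown earlier. The
 * kernfs_create_file() wrapper assumed here is one of the helpers
 * <linux/kernfs.h> layers on top of __kernfs_create_file(); it supplies
 * the lockdep key and a NULL namespace tag on the caller's behalf.
 */
static int example_add_file(struct kernfs_node *parent, void *priv)
{
	struct kernfs_node *kn;

	kn = kernfs_create_file(parent, "example", 0444, 0,
				&example_ops, priv);
	return PTR_ERR_OR_ZERO(kn);
}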
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
1/*
2 * fs/kernfs/inode.c - kernfs inode implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/pagemap.h>
12#include <linux/backing-dev.h>
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/xattr.h>
17#include <linux/security.h>
18
19#include "kernfs-internal.h"
20
21static const struct address_space_operations kernfs_aops = {
22 .readpage = simple_readpage,
23 .write_begin = simple_write_begin,
24 .write_end = simple_write_end,
25};
26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr,
36 .getattr = kernfs_iop_getattr,
37 .setxattr = kernfs_iop_setxattr,
38 .removexattr = kernfs_iop_removexattr,
39 .getxattr = kernfs_iop_getxattr,
40 .listxattr = kernfs_iop_listxattr,
41};
42
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{
51 struct iattr *iattrs;
52
53 if (kn->iattr)
54 return kn->iattr;
55
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr)
58 return NULL;
59 iattrs = &kn->iattr->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = kn->mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 simple_xattrs_init(&kn->iattr->xattrs);
68
69 return kn->iattr;
70}
71
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
73{
74 struct kernfs_iattrs *attrs;
75 struct iattr *iattrs;
76 unsigned int ia_valid = iattr->ia_valid;
77
78 attrs = kernfs_iattrs(kn);
79 if (!attrs)
80 return -ENOMEM;
81
82 iattrs = &attrs->ia_iattr;
83
84 if (ia_valid & ATTR_UID)
85 iattrs->ia_uid = iattr->ia_uid;
86 if (ia_valid & ATTR_GID)
87 iattrs->ia_gid = iattr->ia_gid;
88 if (ia_valid & ATTR_ATIME)
89 iattrs->ia_atime = iattr->ia_atime;
90 if (ia_valid & ATTR_MTIME)
91 iattrs->ia_mtime = iattr->ia_mtime;
92 if (ia_valid & ATTR_CTIME)
93 iattrs->ia_ctime = iattr->ia_ctime;
94 if (ia_valid & ATTR_MODE) {
95 umode_t mode = iattr->ia_mode;
96 iattrs->ia_mode = kn->mode = mode;
97 }
98 return 0;
99}
100
101/**
102 * kernfs_setattr - set iattr on a node
103 * @kn: target node
104 * @iattr: iattr to set
105 *
106 * Returns 0 on success, -errno on failure.
107 */
108int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
109{
110 int ret;
111
112 mutex_lock(&kernfs_mutex);
113 ret = __kernfs_setattr(kn, iattr);
114 mutex_unlock(&kernfs_mutex);
115 return ret;
116}
117
118int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
119{
120 struct inode *inode = dentry->d_inode;
121 struct kernfs_node *kn = dentry->d_fsdata;
122 int error;
123
124 if (!kn)
125 return -EINVAL;
126
127 mutex_lock(&kernfs_mutex);
128 error = inode_change_ok(inode, iattr);
129 if (error)
130 goto out;
131
132 error = __kernfs_setattr(kn, iattr);
133 if (error)
134 goto out;
135
136 /* this ignores size changes */
137 setattr_copy(inode, iattr);
138
139out:
140 mutex_unlock(&kernfs_mutex);
141 return error;
142}
143
144static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
145 u32 *secdata_len)
146{
147 struct kernfs_iattrs *attrs;
148 void *old_secdata;
149 size_t old_secdata_len;
150
151 attrs = kernfs_iattrs(kn);
152 if (!attrs)
153 return -ENOMEM;
154
155 old_secdata = attrs->ia_secdata;
156 old_secdata_len = attrs->ia_secdata_len;
157
158 attrs->ia_secdata = *secdata;
159 attrs->ia_secdata_len = *secdata_len;
160
161 *secdata = old_secdata;
162 *secdata_len = old_secdata_len;
163 return 0;
164}
165
166int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
167 const void *value, size_t size, int flags)
168{
169 struct kernfs_node *kn = dentry->d_fsdata;
170 struct kernfs_iattrs *attrs;
171 void *secdata;
172 int error;
173 u32 secdata_len = 0;
174
175 attrs = kernfs_iattrs(kn);
176 if (!attrs)
177 return -ENOMEM;
178
179 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
180 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
181 error = security_inode_setsecurity(dentry->d_inode, suffix,
182 value, size, flags);
183 if (error)
184 return error;
185 error = security_inode_getsecctx(dentry->d_inode,
186 &secdata, &secdata_len);
187 if (error)
188 return error;
189
190 mutex_lock(&kernfs_mutex);
191 error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
192 mutex_unlock(&kernfs_mutex);
193
194 if (secdata)
195 security_release_secctx(secdata, secdata_len);
196 return error;
197 } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
198 return simple_xattr_set(&attrs->xattrs, name, value, size,
199 flags);
200 }
201
202 return -EINVAL;
203}
204
205int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
206{
207 struct kernfs_node *kn = dentry->d_fsdata;
208 struct kernfs_iattrs *attrs;
209
210 attrs = kernfs_iattrs(kn);
211 if (!attrs)
212 return -ENOMEM;
213
214 return simple_xattr_remove(&attrs->xattrs, name);
215}
216
217ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
218 size_t size)
219{
220 struct kernfs_node *kn = dentry->d_fsdata;
221 struct kernfs_iattrs *attrs;
222
223 attrs = kernfs_iattrs(kn);
224 if (!attrs)
225 return -ENOMEM;
226
227 return simple_xattr_get(&attrs->xattrs, name, buf, size);
228}
229
230ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
231{
232 struct kernfs_node *kn = dentry->d_fsdata;
233 struct kernfs_iattrs *attrs;
234
235 attrs = kernfs_iattrs(kn);
236 if (!attrs)
237 return -ENOMEM;
238
239 return simple_xattr_list(&attrs->xattrs, buf, size);
240}
241
242static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
243{
244 inode->i_mode = mode;
245 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
246}
247
248static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
249{
250 inode->i_uid = iattr->ia_uid;
251 inode->i_gid = iattr->ia_gid;
252 inode->i_atime = iattr->ia_atime;
253 inode->i_mtime = iattr->ia_mtime;
254 inode->i_ctime = iattr->ia_ctime;
255}
256
257static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
258{
259 struct kernfs_iattrs *attrs = kn->iattr;
260
261 inode->i_mode = kn->mode;
262 if (attrs) {
263 /*
 264 * kernfs_node has non-default attributes; get them from the
 265 * persistent copy in kernfs_node.
266 */
267 set_inode_attr(inode, &attrs->ia_iattr);
268 security_inode_notifysecctx(inode, attrs->ia_secdata,
269 attrs->ia_secdata_len);
270 }
271
272 if (kernfs_type(kn) == KERNFS_DIR)
 273		set_nlink(inode, kn->dir.subdirs + 2); /* subdirs, ".", and the entry in the parent */
274}
275
276int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
277 struct kstat *stat)
278{
279 struct kernfs_node *kn = dentry->d_fsdata;
280 struct inode *inode = dentry->d_inode;
281
282 mutex_lock(&kernfs_mutex);
283 kernfs_refresh_inode(kn, inode);
284 mutex_unlock(&kernfs_mutex);
285
286 generic_fillattr(inode, stat);
287 return 0;
288}
289
290static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
291{
292 kernfs_get(kn);
293 inode->i_private = kn;
294 inode->i_mapping->a_ops = &kernfs_aops;
295 inode->i_mapping->backing_dev_info = &kernfs_bdi;
296 inode->i_op = &kernfs_iops;
297
298 set_default_inode_attr(inode, kn->mode);
299 kernfs_refresh_inode(kn, inode);
300
301 /* initialize inode according to type */
302 switch (kernfs_type(kn)) {
303 case KERNFS_DIR:
304 inode->i_op = &kernfs_dir_iops;
305 inode->i_fop = &kernfs_dir_fops;
306 break;
307 case KERNFS_FILE:
308 inode->i_size = kn->attr.size;
309 inode->i_fop = &kernfs_file_fops;
310 break;
311 case KERNFS_LINK:
312 inode->i_op = &kernfs_symlink_iops;
313 break;
314 default:
315 BUG();
316 }
317
318 unlock_new_inode(inode);
319}
320
321/**
322 * kernfs_get_inode - get inode for kernfs_node
323 * @sb: super block
324 * @kn: kernfs_node to allocate inode for
325 *
 326 * Get the inode for @kn. If no such inode exists, a new one is
 327 * allocated and its basics are initialized. A new inode is
 328 * returned locked.
329 *
330 * LOCKING:
331 * Kernel thread context (may sleep).
332 *
333 * RETURNS:
334 * Pointer to allocated inode on success, NULL on failure.
335 */
336struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
337{
338 struct inode *inode;
339
340 inode = iget_locked(sb, kn->ino);
341 if (inode && (inode->i_state & I_NEW))
342 kernfs_init_inode(kn, inode);
343
344 return inode;
345}
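/*
 * Editor's note: iget_locked() returns either the cached inode or a
 * freshly allocated one with I_NEW set; only the latter needs
 * kernfs_init_inode(), which also clears the new state via
 * unlock_new_inode() once initialization is complete.
 */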
346
347/*
348 * The kernfs_node serves as both an inode and a directory entry for
349 * kernfs. To prevent the kernfs inode numbers from being freed
350 * prematurely we take a reference to kernfs_node from the kernfs inode. A
351 * super_operations.evict_inode() implementation is needed to drop that
352 * reference upon inode destruction.
353 */
354void kernfs_evict_inode(struct inode *inode)
355{
356 struct kernfs_node *kn = inode->i_private;
357
358 truncate_inode_pages(&inode->i_data, 0);
359 clear_inode(inode);
360 kernfs_put(kn);
361}
362
363int kernfs_iop_permission(struct inode *inode, int mask)
364{
365 struct kernfs_node *kn;
366
367 if (mask & MAY_NOT_BLOCK)
368 return -ECHILD;
369
370 kn = inode->i_private;
371
372 mutex_lock(&kernfs_mutex);
373 kernfs_refresh_inode(kn, inode);
374 mutex_unlock(&kernfs_mutex);
375
376 return generic_permission(inode, mask);
377}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..eb536b76374a
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
1/*
2 * fs/kernfs/kernfs-internal.h - kernfs internal header file
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#ifndef __KERNFS_INTERNAL_H
12#define __KERNFS_INTERNAL_H
13
14#include <linux/lockdep.h>
15#include <linux/fs.h>
16#include <linux/mutex.h>
17#include <linux/xattr.h>
18
19#include <linux/kernfs.h>
20
21struct kernfs_iattrs {
22 struct iattr ia_iattr;
23 void *ia_secdata;
24 u32 ia_secdata_len;
25
26 struct simple_xattrs xattrs;
27};
28
29#define KN_DEACTIVATED_BIAS INT_MIN
30
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32
33/**
34 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
35 * @kn: kernfs_node of interest
36 *
37 * Return the kernfs_root @kn belongs to.
38 */
39static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
40{
 41	/* if parent exists, it's always a dir; otherwise, @kn is a dir */
42 if (kn->parent)
43 kn = kn->parent;
44 return kn->dir.root;
45}
46
47/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c
56 */
57struct kernfs_super_info {
58 /*
59 * The root associated with this super_block. Each super_block is
60 * identified by the root and ns it's associated with.
61 */
62 struct kernfs_root *root;
63
64 /*
65 * Each sb is associated with one namespace tag, currently the
66 * network namespace of the task which mounted this kernfs
67 * instance. If multiple tags become necessary, make the following
68 * an array and compare kernfs_node tag against every entry.
69 */
70 const void *ns;
71};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73
74extern struct kmem_cache *kernfs_node_cache;
75
76/*
77 * inode.c
78 */
79struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
80void kernfs_evict_inode(struct inode *inode);
81int kernfs_iop_permission(struct inode *inode, int mask);
82int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
83int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
84 struct kstat *stat);
85int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
86 size_t size, int flags);
87int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92
93/*
94 * dir.c
95 */
96extern struct mutex kernfs_mutex;
97extern const struct dentry_operations kernfs_dops;
98extern const struct file_operations kernfs_dir_fops;
99extern const struct inode_operations kernfs_dir_iops;
100
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode,
108 unsigned flags);
109
110/*
111 * file.c
112 */
113extern const struct file_operations kernfs_file_fops;
114
115void kernfs_unmap_bin_file(struct kernfs_node *kn);
116
117/*
118 * symlink.c
119 */
120extern const struct inode_operations kernfs_symlink_iops;
121
122#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0f4152defe7b
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,171 @@
1/*
2 * fs/kernfs/mount.c - kernfs mount implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/mount.h>
13#include <linux/init.h>
14#include <linux/magic.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h>
17
18#include "kernfs-internal.h"
19
20struct kmem_cache *kernfs_node_cache;
21
22static const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode,
26};
27
28static int kernfs_fill_super(struct super_block *sb)
29{
30 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode;
32 struct dentry *root;
33
34 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC;
37 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1;
39
40 /* get root inode, initialize and unlock it */
41 mutex_lock(&kernfs_mutex);
42 inode = kernfs_get_inode(sb, info->root->kn);
43 mutex_unlock(&kernfs_mutex);
44 if (!inode) {
45 pr_debug("kernfs: could not get root inode\n");
46 return -ENOMEM;
47 }
48
49 /* instantiate and link root dentry */
50 root = d_make_root(inode);
51 if (!root) {
52 pr_debug("%s: could not get root dentry!\n", __func__);
53 return -ENOMEM;
54 }
55 kernfs_get(info->root->kn);
56 root->d_fsdata = info->root->kn;
57 sb->s_root = root;
58 sb->s_d_op = &kernfs_dops;
59 return 0;
60}
61
62static int kernfs_test_super(struct super_block *sb, void *data)
63{
64 struct kernfs_super_info *sb_info = kernfs_info(sb);
65 struct kernfs_super_info *info = data;
66
67 return sb_info->root == info->root && sb_info->ns == info->ns;
68}
69
70static int kernfs_set_super(struct super_block *sb, void *data)
71{
72 int error;
73 error = set_anon_super(sb, data);
74 if (!error)
75 sb->s_fs_info = data;
76 return error;
77}
78
79/**
80 * kernfs_super_ns - determine the namespace tag of a kernfs super_block
81 * @sb: super_block of interest
82 *
83 * Return the namespace tag associated with kernfs super_block @sb.
84 */
85const void *kernfs_super_ns(struct super_block *sb)
86{
87 struct kernfs_super_info *info = kernfs_info(sb);
88
89 return info->ns;
90}
91
92/**
93 * kernfs_mount_ns - kernfs mount helper
94 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted
97 * @new_sb_created: tell the caller if we allocated a new superblock
98 * @ns: optional namespace tag of the mount
99 *
100 * This is to be called from each kernfs user's file_system_type->mount()
101 * implementation, which should pass through the specified @fs_type and
102 * @flags, and specify the hierarchy and namespace tag to mount via @root
103 * and @ns, respectively.
104 *
105 * The return value can be passed to the vfs layer verbatim.
106 */
107struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
108 struct kernfs_root *root, bool *new_sb_created,
109 const void *ns)
110{
111 struct super_block *sb;
112 struct kernfs_super_info *info;
113 int error;
114
115 info = kzalloc(sizeof(*info), GFP_KERNEL);
116 if (!info)
117 return ERR_PTR(-ENOMEM);
118
119 info->root = root;
120 info->ns = ns;
121
122 sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
123 if (IS_ERR(sb) || sb->s_fs_info != info)
124 kfree(info);
125 if (IS_ERR(sb))
126 return ERR_CAST(sb);
127
128 if (new_sb_created)
129 *new_sb_created = !sb->s_root;
130
131 if (!sb->s_root) {
132 error = kernfs_fill_super(sb);
133 if (error) {
134 deactivate_locked_super(sb);
135 return ERR_PTR(error);
136 }
137 sb->s_flags |= MS_ACTIVE;
138 }
139
140 return dget(sb->s_root);
141}
142
143/**
144 * kernfs_kill_sb - kill_sb for kernfs
145 * @sb: super_block being killed
146 *
147 * This can be used directly for file_system_type->kill_sb(). If a kernfs
148 * user needs extra cleanup, it can implement its own kill_sb() and call
149 * this function at the end.
150 */
151void kernfs_kill_sb(struct super_block *sb)
152{
153 struct kernfs_super_info *info = kernfs_info(sb);
154 struct kernfs_node *root_kn = sb->s_root->d_fsdata;
155
156 /*
157 * Remove the superblock from fs_supers/s_instances
158 * so we can't find it, before freeing kernfs_super_info.
159 */
160 kill_anon_super(sb);
161 kfree(info);
162 kernfs_put(root_kn);
163}
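/*
 * Editor's sketch, not part of this diff: a minimal kernfs user wiring
 * the two helpers above into a file_system_type, loosely modeled on how
 * sysfs uses them. example_root and "examplefs" are hypothetical.
 */
static struct kernfs_root *example_root;

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return kernfs_mount_ns(fs_type, flags, example_root, NULL, NULL);
}

static struct file_system_type examplefs_fs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kernfs_kill_sb,
};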
164
165void __init kernfs_init(void)
166{
167 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
168 sizeof(struct kernfs_node),
169 0, SLAB_PANIC, NULL);
170 kernfs_inode_init();
171}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..4d457055acb9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,151 @@
1/*
2 * fs/kernfs/symlink.c - kernfs symlink implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/gfp.h>
13#include <linux/namei.h>
14
15#include "kernfs-internal.h"
16
17/**
18 * kernfs_create_link - create a symlink
19 * @parent: directory to create the symlink in
20 * @name: name of the symlink
21 * @target: target node for the symlink to point to
22 *
23 * Returns the created node on success, ERR_PTR() value on error.
24 */
25struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
26 const char *name,
27 struct kernfs_node *target)
28{
29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error;
32
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
34 if (!kn)
35 return ERR_PTR(-ENOMEM);
36
37 if (kernfs_ns_enabled(parent))
38 kn->ns = target->ns;
39 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */
41
42 kernfs_addrm_start(&acxt);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error)
47 return kn;
48
49 kernfs_put(kn);
50 return ERR_PTR(error);
51}
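/*
 * Editor's sketch, not part of this diff: a caller creating the symlink
 * "origin" under @dir pointing at an existing node @target. The names
 * are hypothetical.
 */
static int example_add_link(struct kernfs_node *dir,
			    struct kernfs_node *target)
{
	struct kernfs_node *kn;

	kn = kernfs_create_link(dir, "origin", target);
	return PTR_ERR_OR_ZERO(kn);
}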
52
53static int kernfs_get_target_path(struct kernfs_node *parent,
54 struct kernfs_node *target, char *path)
55{
56 struct kernfs_node *base, *kn;
57 char *s = path;
58 int len = 0;
59
60 /* go up to the root, stop at the base */
61 base = parent;
62 while (base->parent) {
63 kn = target->parent;
64 while (kn->parent && base != kn)
65 kn = kn->parent;
66
67 if (base == kn)
68 break;
69
70 strcpy(s, "../");
71 s += 3;
72 base = base->parent;
73 }
74
75 /* determine end of target string for reverse fillup */
76 kn = target;
77 while (kn->parent && kn != base) {
78 len += strlen(kn->name) + 1;
79 kn = kn->parent;
80 }
81
82 /* check limits */
83 if (len < 2)
84 return -EINVAL;
85 len--;
86 if ((s - path) + len > PATH_MAX)
87 return -ENAMETOOLONG;
88
89 /* reverse fillup of target string from target to base */
90 kn = target;
91 while (kn->parent && kn != base) {
92 int slen = strlen(kn->name);
93
94 len -= slen;
95 strncpy(s + len, kn->name, slen);
96 if (len)
97 s[--len] = '/';
98
99 kn = kn->parent;
100 }
101
102 return 0;
103}
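/*
 * Editor's note, a worked example of the computation above: with parent
 * /foo/bar and target /foo/baz/qux, the first loop ascends from bar
 * until it finds the common ancestor foo, emitting one "../"; the
 * reverse fillup then writes "baz/qux", yielding "../baz/qux".
 */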
104
105static int kernfs_getlink(struct dentry *dentry, char *path)
106{
107 struct kernfs_node *kn = dentry->d_fsdata;
108 struct kernfs_node *parent = kn->parent;
109 struct kernfs_node *target = kn->symlink.target_kn;
110 int error;
111
112 mutex_lock(&kernfs_mutex);
113 error = kernfs_get_target_path(parent, target, path);
114 mutex_unlock(&kernfs_mutex);
115
116 return error;
117}
118
119static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
120{
121 int error = -ENOMEM;
122 unsigned long page = get_zeroed_page(GFP_KERNEL);
123 if (page) {
124 error = kernfs_getlink(dentry, (char *) page);
125 if (error < 0)
126 free_page((unsigned long)page);
127 }
128 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
129 return NULL;
130}
131
132static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
133 void *cookie)
134{
135 char *page = nd_get_link(nd);
136 if (!IS_ERR(page))
137 free_page((unsigned long)page);
138}
139
140const struct inode_operations kernfs_symlink_iops = {
141 .setxattr = kernfs_iop_setxattr,
142 .removexattr = kernfs_iop_removexattr,
143 .getxattr = kernfs_iop_getxattr,
144 .listxattr = kernfs_iop_listxattr,
145 .readlink = generic_readlink,
146 .follow_link = kernfs_iop_follow_link,
147 .put_link = kernfs_iop_put_link,
148 .setattr = kernfs_iop_setattr,
149 .getattr = kernfs_iop_getattr,
150 .permission = kernfs_iop_permission,
151};
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e066a3902973..ab798a88ec1d 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
779 struct nlm_file *file = block->b_file; 779 struct nlm_file *file = block->b_file;
780 struct nlm_lock *lock = &block->b_call->a_args.lock; 780 struct nlm_lock *lock = &block->b_call->a_args.lock;
781 int error; 781 int error;
782 loff_t fl_start, fl_end;
782 783
783 dprintk("lockd: grant blocked lock %p\n", block); 784 dprintk("lockd: grant blocked lock %p\n", block);
784 785
@@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block)
796 } 797 }
797 798
798 /* Try the lock operation again */ 799 /* Try the lock operation again */
800 /* vfs_lock_file() can mangle fl_start and fl_end, but we need
801 * them unchanged for the GRANT_MSG
802 */
799 lock->fl.fl_flags |= FL_SLEEP; 803 lock->fl.fl_flags |= FL_SLEEP;
804 fl_start = lock->fl.fl_start;
805 fl_end = lock->fl.fl_end;
800 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); 806 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
801 lock->fl.fl_flags &= ~FL_SLEEP; 807 lock->fl.fl_flags &= ~FL_SLEEP;
808 lock->fl.fl_start = fl_start;
809 lock->fl.fl_end = fl_end;
802 810
803 switch (error) { 811 switch (error) {
804 case 0: 812 case 0:
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 0f95f0d0b313..76279e11982d 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -26,9 +26,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
26 bio_vec.bv_len = PAGE_SIZE; 26 bio_vec.bv_len = PAGE_SIZE;
27 bio_vec.bv_offset = 0; 27 bio_vec.bv_offset = 0;
28 bio.bi_vcnt = 1; 28 bio.bi_vcnt = 1;
29 bio.bi_size = PAGE_SIZE;
30 bio.bi_bdev = bdev; 29 bio.bi_bdev = bdev;
31 bio.bi_sector = page->index * (PAGE_SIZE >> 9); 30 bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
31 bio.bi_iter.bi_size = PAGE_SIZE;
32 32
33 return submit_bio_wait(rw, &bio); 33 return submit_bio_wait(rw, &bio);
34} 34}
@@ -56,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq);
56static void writeseg_end_io(struct bio *bio, int err) 56static void writeseg_end_io(struct bio *bio, int err)
57{ 57{
58 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 58 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
59 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 59 struct bio_vec *bvec;
60 int i;
60 struct super_block *sb = bio->bi_private; 61 struct super_block *sb = bio->bi_private;
61 struct logfs_super *super = logfs_super(sb); 62 struct logfs_super *super = logfs_super(sb);
62 struct page *page;
63 63
64 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 64 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
65 BUG_ON(err); 65 BUG_ON(err);
66 BUG_ON(bio->bi_vcnt == 0); 66
67 do { 67 bio_for_each_segment_all(bvec, bio, i) {
68 page = bvec->bv_page; 68 end_page_writeback(bvec->bv_page);
69 if (--bvec >= bio->bi_io_vec) 69 page_cache_release(bvec->bv_page);
70 prefetchw(&bvec->bv_page->flags); 70 }
71
72 end_page_writeback(page);
73 page_cache_release(page);
74 } while (bvec >= bio->bi_io_vec);
75 bio_put(bio); 71 bio_put(bio);
76 if (atomic_dec_and_test(&super->s_pending_writes)) 72 if (atomic_dec_and_test(&super->s_pending_writes))
77 wake_up(&wq); 73 wake_up(&wq);
@@ -96,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
96 if (i >= max_pages) { 92 if (i >= max_pages) {
97 /* Block layer cannot split bios :( */ 93 /* Block layer cannot split bios :( */
98 bio->bi_vcnt = i; 94 bio->bi_vcnt = i;
99 bio->bi_size = i * PAGE_SIZE; 95 bio->bi_iter.bi_size = i * PAGE_SIZE;
100 bio->bi_bdev = super->s_bdev; 96 bio->bi_bdev = super->s_bdev;
101 bio->bi_sector = ofs >> 9; 97 bio->bi_iter.bi_sector = ofs >> 9;
102 bio->bi_private = sb; 98 bio->bi_private = sb;
103 bio->bi_end_io = writeseg_end_io; 99 bio->bi_end_io = writeseg_end_io;
104 atomic_inc(&super->s_pending_writes); 100 atomic_inc(&super->s_pending_writes);
@@ -123,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
123 unlock_page(page); 119 unlock_page(page);
124 } 120 }
125 bio->bi_vcnt = nr_pages; 121 bio->bi_vcnt = nr_pages;
126 bio->bi_size = nr_pages * PAGE_SIZE; 122 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
127 bio->bi_bdev = super->s_bdev; 123 bio->bi_bdev = super->s_bdev;
128 bio->bi_sector = ofs >> 9; 124 bio->bi_iter.bi_sector = ofs >> 9;
129 bio->bi_private = sb; 125 bio->bi_private = sb;
130 bio->bi_end_io = writeseg_end_io; 126 bio->bi_end_io = writeseg_end_io;
131 atomic_inc(&super->s_pending_writes); 127 atomic_inc(&super->s_pending_writes);
@@ -188,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
188 if (i >= max_pages) { 184 if (i >= max_pages) {
189 /* Block layer cannot split bios :( */ 185 /* Block layer cannot split bios :( */
190 bio->bi_vcnt = i; 186 bio->bi_vcnt = i;
191 bio->bi_size = i * PAGE_SIZE; 187 bio->bi_iter.bi_size = i * PAGE_SIZE;
192 bio->bi_bdev = super->s_bdev; 188 bio->bi_bdev = super->s_bdev;
193 bio->bi_sector = ofs >> 9; 189 bio->bi_iter.bi_sector = ofs >> 9;
194 bio->bi_private = sb; 190 bio->bi_private = sb;
195 bio->bi_end_io = erase_end_io; 191 bio->bi_end_io = erase_end_io;
196 atomic_inc(&super->s_pending_writes); 192 atomic_inc(&super->s_pending_writes);
@@ -209,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
209 bio->bi_io_vec[i].bv_offset = 0; 205 bio->bi_io_vec[i].bv_offset = 0;
210 } 206 }
211 bio->bi_vcnt = nr_pages; 207 bio->bi_vcnt = nr_pages;
212 bio->bi_size = nr_pages * PAGE_SIZE; 208 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
213 bio->bi_bdev = super->s_bdev; 209 bio->bi_bdev = super->s_bdev;
214 bio->bi_sector = ofs >> 9; 210 bio->bi_iter.bi_sector = ofs >> 9;
215 bio->bi_private = sb; 211 bio->bi_private = sb;
216 bio->bi_end_io = erase_end_io; 212 bio->bi_end_io = erase_end_io;
217 atomic_inc(&super->s_pending_writes); 213 atomic_inc(&super->s_pending_writes);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d448a777166b..7f9b096d8d57 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
62 page = read_cache_page(mapping, index, filler, sb); 62 page = read_cache_page(mapping, index, filler, sb);
63 else { 63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS); 64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 unlock_page(page); 65 if (page)
66 unlock_page(page);
66 } 67 }
67 return page; 68 return page;
68} 69}
diff --git a/fs/mount.h b/fs/mount.h
index d64c594be6c4..b29e42f05f34 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -19,13 +19,13 @@ struct mnt_pcp {
19}; 19};
20 20
21struct mountpoint { 21struct mountpoint {
22 struct list_head m_hash; 22 struct hlist_node m_hash;
23 struct dentry *m_dentry; 23 struct dentry *m_dentry;
24 int m_count; 24 int m_count;
25}; 25};
26 26
27struct mount { 27struct mount {
28 struct list_head mnt_hash; 28 struct hlist_node mnt_hash;
29 struct mount *mnt_parent; 29 struct mount *mnt_parent;
30 struct dentry *mnt_mountpoint; 30 struct dentry *mnt_mountpoint;
31 struct vfsmount mnt; 31 struct vfsmount mnt;
@@ -74,7 +74,7 @@ static inline int mnt_has_parent(struct mount *mnt)
74static inline int is_mounted(struct vfsmount *mnt) 74static inline int is_mounted(struct vfsmount *mnt)
75{ 75{
76 /* neither detached nor internal? */ 76 /* neither detached nor internal? */
77 return !IS_ERR_OR_NULL(real_mount(mnt)); 77 return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
78} 78}
79 79
80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c4d4c6..4979ffa60aaa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,16 +43,14 @@
43 */ 43 */
44static void mpage_end_io(struct bio *bio, int err) 44static void mpage_end_io(struct bio *bio, int err)
45{ 45{
46 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 46 struct bio_vec *bv;
47 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 47 int i;
48 48
49 do { 49 bio_for_each_segment_all(bv, bio, i) {
50 struct page *page = bvec->bv_page; 50 struct page *page = bv->bv_page;
51 51
52 if (--bvec >= bio->bi_io_vec)
53 prefetchw(&bvec->bv_page->flags);
54 if (bio_data_dir(bio) == READ) { 52 if (bio_data_dir(bio) == READ) {
55 if (uptodate) { 53 if (!err) {
56 SetPageUptodate(page); 54 SetPageUptodate(page);
57 } else { 55 } else {
58 ClearPageUptodate(page); 56 ClearPageUptodate(page);
@@ -60,14 +58,15 @@ static void mpage_end_io(struct bio *bio, int err)
60 } 58 }
61 unlock_page(page); 59 unlock_page(page);
62 } else { /* bio_data_dir(bio) == WRITE */ 60 } else { /* bio_data_dir(bio) == WRITE */
63 if (!uptodate) { 61 if (err) {
64 SetPageError(page); 62 SetPageError(page);
65 if (page->mapping) 63 if (page->mapping)
66 set_bit(AS_EIO, &page->mapping->flags); 64 set_bit(AS_EIO, &page->mapping->flags);
67 } 65 }
68 end_page_writeback(page); 66 end_page_writeback(page);
69 } 67 }
70 } while (bvec >= bio->bi_io_vec); 68 }
69
71 bio_put(bio); 70 bio_put(bio);
72} 71}
73 72
@@ -94,7 +93,7 @@ mpage_alloc(struct block_device *bdev,
94 93
95 if (bio) { 94 if (bio) {
96 bio->bi_bdev = bdev; 95 bio->bi_bdev = bdev;
97 bio->bi_sector = first_sector; 96 bio->bi_iter.bi_sector = first_sector;
98 } 97 }
99 return bio; 98 return bio;
100} 99}
diff --git a/fs/namei.c b/fs/namei.c
index 3531deebad30..4b491b431990 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -196,6 +196,7 @@ recopy:
 		goto error;
 
 	result->uptr = filename;
+	result->aname = NULL;
 	audit_getname(result);
 	return result;
 
@@ -209,7 +210,35 @@ getname(const char __user * filename)
 {
 	return getname_flags(filename, 0, NULL);
 }
-EXPORT_SYMBOL(getname);
+
+/*
+ * The "getname_kernel()" interface doesn't do pathnames longer
+ * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
+ */
+struct filename *
+getname_kernel(const char * filename)
+{
+	struct filename *result;
+	char *kname;
+	int len;
+
+	len = strlen(filename);
+	if (len >= EMBEDDED_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	result = __getname();
+	if (unlikely(!result))
+		return ERR_PTR(-ENOMEM);
+
+	kname = (char *)result + sizeof(*result);
+	result->name = kname;
+	result->uptr = NULL;
+	result->aname = NULL;
+	result->separate = false;
+
+	strlcpy(kname, filename, EMBEDDED_NAME_MAX);
+	return result;
+}
 
 #ifdef CONFIG_AUDITSYSCALL
 void putname(struct filename *name)
@@ -235,27 +264,9 @@ static int check_acl(struct inode *inode, int mask)
 		return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 	}
 
-	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
-
-	/*
-	 * A filesystem can force a ACL callback by just never filling the
-	 * ACL cache. But normally you'd fill the cache either at inode
-	 * instantiation time, or on the first ->get_acl call.
-	 *
-	 * If the filesystem doesn't have a get_acl() function at all, we'll
-	 * just create the negative cache entry.
-	 */
-	if (acl == ACL_NOT_CACHED) {
-		if (inode->i_op->get_acl) {
-			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		} else {
-			set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
-			return -EAGAIN;
-		}
-	}
-
+	acl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
 	if (acl) {
 		int error = posix_acl_permission(inode, acl, mask);
 		posix_acl_release(acl);
@@ -1098,7 +1109,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			return false;
 
 		if (!d_mountpoint(path->dentry))
-			break;
+			return true;
 
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
@@ -1114,20 +1125,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 */
 		*inode = path->dentry->d_inode;
 	}
-	return true;
-}
-
-static void follow_mount_rcu(struct nameidata *nd)
-{
-	while (d_mountpoint(nd->path.dentry)) {
-		struct mount *mounted;
-		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
-		if (!mounted)
-			break;
-		nd->path.mnt = &mounted->mnt;
-		nd->path.dentry = mounted->mnt.mnt_root;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-	}
+	return read_seqretry(&mount_lock, nd->m_seq);
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1155,7 +1153,17 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			break;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
-	follow_mount_rcu(nd);
+	while (d_mountpoint(nd->path.dentry)) {
+		struct mount *mounted;
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+		if (!mounted)
+			break;
+		nd->path.mnt = &mounted->mnt;
+		nd->path.dentry = mounted->mnt.mnt_root;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		if (!read_seqretry(&mount_lock, nd->m_seq))
+			goto failed;
+	}
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
@@ -1873,7 +1881,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 		nd->path = f.file->f_path;
 		if (flags & LOOKUP_RCU) {
-			if (f.need_put)
+			if (f.flags & FDPUT_FPUT)
 				*fp = f.file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			rcu_read_lock();
@@ -3945,10 +3953,13 @@ out_dput:
 	done_path_create(&new_path, new_dentry);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
-		if (!error)
+		if (!error) {
+			path_put(&old_path);
 			goto retry;
+		}
 	}
 	if (retry_estale(error, how)) {
+		path_put(&old_path);
 		how |= LOOKUP_REVAL;
 		goto retry;
 	}
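
Note: getname_kernel() above gives in-kernel callers a struct filename without
a __user pointer; the name must fit the inline EMBEDDED_NAME_MAX buffer. A
hypothetical caller might look like this (sketch only; the path and the
function name are made up):

	static int example_kernel_lookup(void)
	{
		struct filename *name;

		name = getname_kernel("/etc/fstab");
		if (IS_ERR(name))
			return PTR_ERR(name);
		/* ... hand "name" to the path-walk machinery ... */
		putname(name);
		return 0;
	}
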
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..2ffc5a2905d4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,11 +23,34 @@
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
+#include <linux/bootmem.h>
 #include "pnode.h"
 #include "internal.h"
 
-#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
-#define HASH_SIZE (1UL << HASH_SHIFT)
+static unsigned int m_hash_mask __read_mostly;
+static unsigned int m_hash_shift __read_mostly;
+static unsigned int mp_hash_mask __read_mostly;
+static unsigned int mp_hash_shift __read_mostly;
+
+static __initdata unsigned long mhash_entries;
+static int __init set_mhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mhash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mhash_entries=", set_mhash_entries);
+
+static __initdata unsigned long mphash_entries;
+static int __init set_mphash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mphash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mphash_entries=", set_mphash_entries);
 
 static int event;
 static DEFINE_IDA(mnt_id_ida);
@@ -36,8 +59,8 @@ static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
-static struct list_head *mount_hashtable __read_mostly;
-static struct list_head *mountpoint_hashtable __read_mostly;
+static struct hlist_head *mount_hashtable __read_mostly;
+static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static DECLARE_RWSEM(namespace_sem);
 
@@ -55,12 +78,19 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
+static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
 	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
-	tmp = tmp + (tmp >> HASH_SHIFT);
-	return tmp & (HASH_SIZE - 1);
+	tmp = tmp + (tmp >> m_hash_shift);
+	return &mount_hashtable[tmp & m_hash_mask];
+}
+
+static inline struct hlist_head *mp_hash(struct dentry *dentry)
+{
+	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
+	tmp = tmp + (tmp >> mp_hash_shift);
+	return &mountpoint_hashtable[tmp & mp_hash_mask];
 }
 
 /*
@@ -187,7 +217,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		mnt->mnt_writers = 0;
 #endif
 
-		INIT_LIST_HEAD(&mnt->mnt_hash);
+		INIT_HLIST_NODE(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
@@ -575,10 +605,10 @@ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct list_head *head = mount_hashtable + hash(mnt, dentry);
+	struct hlist_head *head = m_hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry_rcu(p, head, mnt_hash)
+	hlist_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -590,13 +620,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct list_head *head = mount_hashtable + hash(mnt, dentry);
-	struct mount *p;
-
-	list_for_each_entry_reverse(p, head, mnt_hash)
-		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
-			return p;
-	return NULL;
+	struct mount *p, *res;
+	res = p = __lookup_mnt(mnt, dentry);
+	if (!p)
+		goto out;
+	hlist_for_each_entry_continue(p, mnt_hash) {
+		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
+			break;
+		res = p;
+	}
+out:
+	return res;
 }
 
 /*
@@ -633,11 +667,11 @@ struct vfsmount *lookup_mnt(struct path *path)
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
 {
-	struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+	struct hlist_head *chain = mp_hash(dentry);
 	struct mountpoint *mp;
 	int ret;
 
-	list_for_each_entry(mp, chain, m_hash) {
+	hlist_for_each_entry(mp, chain, m_hash) {
 		if (mp->m_dentry == dentry) {
 			/* might be worth a WARN_ON() */
 			if (d_unlinked(dentry))
@@ -659,7 +693,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
 
 	mp->m_dentry = dentry;
 	mp->m_count = 1;
-	list_add(&mp->m_hash, chain);
+	hlist_add_head(&mp->m_hash, chain);
 	return mp;
 }
 
@@ -670,7 +704,7 @@ static void put_mountpoint(struct mountpoint *mp)
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_MOUNTED;
 		spin_unlock(&dentry->d_lock);
-		list_del(&mp->m_hash);
+		hlist_del(&mp->m_hash);
 		kfree(mp);
 	}
 }
@@ -712,7 +746,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
 	mnt->mnt_parent = mnt;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_hash);
+	hlist_del_init_rcu(&mnt->mnt_hash);
 	put_mountpoint(mnt->mnt_mp);
 	mnt->mnt_mp = NULL;
 }
@@ -739,15 +773,14 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	list_add_tail(&mnt->mnt_hash, mount_hashtable +
-			hash(&parent->mnt, mp->m_dentry));
+	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt)
+static void commit_tree(struct mount *mnt, struct mount *shadows)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -762,8 +795,11 @@ static void commit_tree(struct mount *mnt)
 
 	list_splice(&head, n->list.prev);
 
-	list_add_tail(&mnt->mnt_hash, mount_hashtable +
-		hash(&parent->mnt, mnt->mnt_mountpoint));
+	if (shadows)
+		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+	else
+		hlist_add_head_rcu(&mnt->mnt_hash,
+				m_hash(&parent->mnt, mnt->mnt_mountpoint));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 	touch_mnt_namespace(n);
 }
@@ -1153,26 +1189,28 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static LIST_HEAD(unmounted);	/* protected by namespace_sem */
+static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 
 static void namespace_unlock(void)
 {
 	struct mount *mnt;
-	LIST_HEAD(head);
+	struct hlist_head head = unmounted;
 
-	if (likely(list_empty(&unmounted))) {
+	if (likely(hlist_empty(&head))) {
 		up_write(&namespace_sem);
 		return;
 	}
 
-	list_splice_init(&unmounted, &head);
+	head.first->pprev = &head.first;
+	INIT_HLIST_HEAD(&unmounted);
+
 	up_write(&namespace_sem);
 
 	synchronize_rcu();
 
-	while (!list_empty(&head)) {
-		mnt = list_first_entry(&head, struct mount, mnt_hash);
-		list_del_init(&mnt->mnt_hash);
+	while (!hlist_empty(&head)) {
+		mnt = hlist_entry(head.first, struct mount, mnt_hash);
+		hlist_del_init(&mnt->mnt_hash);
 		if (mnt->mnt_ex_mountpoint.mnt)
 			path_put(&mnt->mnt_ex_mountpoint);
 		mntput(&mnt->mnt);
@@ -1193,16 +1231,19 @@ static inline void namespace_lock(void)
 */
 void umount_tree(struct mount *mnt, int how)
 {
-	LIST_HEAD(tmp_list);
+	HLIST_HEAD(tmp_list);
 	struct mount *p;
+	struct mount *last = NULL;
 
-	for (p = mnt; p; p = next_mnt(p, mnt))
-		list_move(&p->mnt_hash, &tmp_list);
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		hlist_del_init_rcu(&p->mnt_hash);
+		hlist_add_head(&p->mnt_hash, &tmp_list);
+	}
 
 	if (how)
 		propagate_umount(&tmp_list);
 
-	list_for_each_entry(p, &tmp_list, mnt_hash) {
+	hlist_for_each_entry(p, &tmp_list, mnt_hash) {
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
@@ -1220,8 +1261,13 @@ void umount_tree(struct mount *mnt, int how)
 			p->mnt_mp = NULL;
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
+		last = p;
+	}
+	if (last) {
+		last->mnt_hash.next = unmounted.first;
+		unmounted.first = tmp_list.first;
+		unmounted.first->pprev = &unmounted.first;
 	}
-	list_splice(&tmp_list, &unmounted);
 }
 
 static void shrink_submounts(struct mount *mnt);
@@ -1605,24 +1651,23 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 			struct mountpoint *dest_mp,
 			struct path *parent_path)
 {
-	LIST_HEAD(tree_list);
+	HLIST_HEAD(tree_list);
 	struct mount *child, *p;
+	struct hlist_node *n;
 	int err;
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		err = invent_group_ids(source_mnt, true);
 		if (err)
 			goto out;
-	}
-	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
-	if (err)
-		goto out_cleanup_ids;
-
-	lock_mount_hash();
-
-	if (IS_MNT_SHARED(dest_mnt)) {
+		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		if (err)
+			goto out_cleanup_ids;
+		lock_mount_hash();
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 			set_mnt_shared(p);
+	} else {
+		lock_mount_hash();
 	}
 	if (parent_path) {
 		detach_mnt(source_mnt, parent_path);
@@ -1630,20 +1675,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt);
+		commit_tree(source_mnt, NULL);
 	}
 
-	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
-		list_del_init(&child->mnt_hash);
-		commit_tree(child);
+	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
+		struct mount *q;
+		hlist_del_init(&child->mnt_hash);
+		q = __lookup_mnt_last(&child->mnt_parent->mnt,
+				      child->mnt_mountpoint);
+		commit_tree(child, q);
 	}
 	unlock_mount_hash();
 
 	return 0;
 
 out_cleanup_ids:
-	if (IS_MNT_SHARED(dest_mnt))
-		cleanup_group_ids(source_mnt, NULL);
+	cleanup_group_ids(source_mnt, NULL);
 out:
 	return err;
 }
@@ -2777,18 +2824,26 @@ void __init mnt_init(void)
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
-	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
-	mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+	mount_hashtable = alloc_large_system_hash("Mount-cache",
+				sizeof(struct hlist_head),
+				mhash_entries, 19,
+				0,
+				&m_hash_shift, &m_hash_mask, 0, 0);
+	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
+				sizeof(struct hlist_head),
+				mphash_entries, 19,
+				0,
+				&mp_hash_shift, &mp_hash_mask, 0, 0);
 
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
-	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
+	for (u = 0; u <= m_hash_mask; u++)
+		INIT_HLIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u <= mp_hash_mask; u++)
+		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
 
-	for (u = 0; u < HASH_SIZE; u++)
-		INIT_LIST_HEAD(&mount_hashtable[u]);
-	for (u = 0; u < HASH_SIZE; u++)
-		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
+	kernfs_init();
 
 	err = sysfs_init();
 	if (err)
@@ -2886,7 +2941,7 @@ bool fs_fully_visible(struct file_system_type *type)
 			struct inode *inode = child->mnt_mountpoint->d_inode;
 			if (!S_ISDIR(inode->i_mode))
 				goto next;
-			if (inode->i_nlink != 2)
+			if (inode->i_nlink > 2)
 				goto next;
 		}
 		visible = true;
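
Note: the hunks above replace the fixed one-page list_head tables with hlist
tables sized at boot by alloc_large_system_hash(), which hands back the shift
and mask used for indexing; mhash_entries= and mphash_entries= on the kernel
command line override the sizing. A sketch of the bucket-selection pattern
m_hash()/mp_hash() now follow (illustrative; the example_* names are made up):

	static struct hlist_head *example_table;
	static unsigned int example_shift, example_mask;

	static inline struct hlist_head *example_bucket(unsigned long key)
	{
		/* fold the high bits down, then mask to the table size */
		key = key + (key >> example_shift);
		return &example_table[key & example_mask];
	}
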
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf72972..56ff823ca82e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio)
 	if (bio) {
 		get_parallel(bio->bi_private);
 		dprintk("%s submitting %s bio %u@%llu\n", __func__,
-			rw == READ ? "read" : "write",
-			bio->bi_size, (unsigned long long)bio->bi_sector);
+			rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+			(unsigned long long)bio->bi_iter.bi_sector);
 		submit_bio(rw, bio);
 	}
 	return NULL;
@@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	}
 
 	if (bio) {
-		bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+		bio->bi_iter.bi_sector = isect - be->be_f_offset +
+			be->be_v_offset;
 		bio->bi_bdev = be->be_mdev;
 		bio->bi_end_io = end_io;
 		bio->bi_private = par;
@@ -201,18 +202,14 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
 static void bl_end_io_read(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec;
+	int i;
 
-	do {
-		struct page *page = bvec->bv_page;
+	if (!err)
+		bio_for_each_segment_all(bvec, bio, i)
+			SetPageUptodate(bvec->bv_page);
 
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-		if (uptodate)
-			SetPageUptodate(page);
-	} while (bvec >= bio->bi_io_vec);
-	if (!uptodate) {
+	if (err) {
 		struct nfs_read_data *rdata = par->data;
 		struct nfs_pgio_header *header = rdata->header;
 
@@ -383,20 +380,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 static void bl_end_io_write_zero(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
-	do {
-		struct page *page = bvec->bv_page;
+	struct bio_vec *bvec;
+	int i;
 
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
+	bio_for_each_segment_all(bvec, bio, i) {
 		/* This is the zeroing page we added */
-		end_page_writeback(page);
-		page_cache_release(page);
-	} while (bvec >= bio->bi_io_vec);
+		end_page_writeback(bvec->bv_page);
+		page_cache_release(bvec->bv_page);
+	}
 
-	if (unlikely(!uptodate)) {
+	if (unlikely(err)) {
 		struct nfs_write_data *data = par->data;
 		struct nfs_pgio_header *header = data->header;
 
@@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
 	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
 		(offset / SECTOR_SIZE);
 
-	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
 	bio->bi_bdev = be->be_mdev;
 	bio->bi_end_io = bl_read_single_end_io;
 
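
Note: the same immutable-biovec rename as in fs/mpage.c applies here: bi_sector
and bi_size now live in the bio's embedded struct bvec_iter. A small sketch of
the field mapping (illustrative only, not from this patch):

	static void example_setup_bio(struct bio *bio,
				      struct block_device *bdev,
				      sector_t sector)
	{
		bio->bi_bdev = bdev;
		bio->bi_iter.bi_sector = sector; /* was bio->bi_sector */
		/* bio->bi_iter.bi_size likewise replaces bio->bi_size */
	}
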
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ef792f29f831..5d8ccecf5f5c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL)
+		goto out_enoent;
 
-	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
-		rcu_read_unlock();
-		return -ENOENT;
-	}
+	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+		goto out_enoent;
 	nfs_mark_return_delegation(server, delegation);
 	rcu_read_unlock();
 
 	nfs_delegation_run_state_manager(clp);
 	return 0;
+out_enoent:
+	rcu_read_unlock();
+	return -ENOENT;
 }
 
 static struct inode *
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 812154aff981..4a48fe4b84b6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -274,6 +274,15 @@ out_eof:
 	return -EBADCOOKIE;
 }
 
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+	if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+		return false;
+	smp_rmb();
+	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
 static
 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
 {
@@ -287,8 +296,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 		struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 		new_pos = desc->current_index + i;
-		if (ctx->attr_gencount != nfsi->attr_gencount
-		    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+		if (ctx->attr_gencount != nfsi->attr_gencount ||
+		    !nfs_readdir_inode_mapping_valid(nfsi)) {
 			ctx->duped = 0;
 			ctx->attr_gencount = nfsi->attr_gencount;
 		} else if (new_pos < desc->ctx->pos) {
@@ -1404,7 +1413,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	/* Expect a negative dentry */
 	BUG_ON(dentry->d_inode);
 
-	dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	err = nfs_check_flags(open_flags);
@@ -1594,7 +1603,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
 	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
 	int error;
 
-	dfprintk(VFS, "NFS: create(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_mode = mode;
@@ -1621,7 +1630,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	struct iattr attr;
 	int status;
 
-	dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	if (!new_valid_dev(rdev))
@@ -1650,7 +1659,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct iattr attr;
 	int error;
 
-	dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_valid = ATTR_MODE;
@@ -1678,7 +1687,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 
-	dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	trace_nfs_rmdir_enter(dir, dentry);
@@ -1747,7 +1756,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 	int need_rehash = 0;
 
-	dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id,
+	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
 		 dir->i_ino, dentry);
 
 	trace_nfs_unlink_enter(dir, dentry);
@@ -1798,7 +1807,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	unsigned int pathlen = strlen(symname);
 	int error;
 
-	dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id,
+	dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
 		 dir->i_ino, dentry, symname);
 
 	if (pathlen > PAGE_SIZE)
@@ -1821,7 +1830,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
 	trace_nfs_symlink_exit(dir, dentry, error);
 	if (error != 0) {
-		dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n",
+		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
 			 dir->i_sb->s_id, dir->i_ino,
 			 dentry, symname, error);
 		d_drop(dentry);
@@ -1837,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 							GFP_KERNEL)) {
 		SetPageUptodate(page);
 		unlock_page(page);
+		/*
+		 * add_to_page_cache_lru() grabs an extra page refcount.
+		 * Drop it here to avoid leaking this page later.
+		 */
+		page_cache_release(page);
 	} else
 		__free_page(page);
 
@@ -2304,7 +2318,7 @@ out:
 	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
 		res = -EACCES;
 
-	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
+	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
 out_notsup:
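
Note: nfs_readdir_inode_mapping_valid() is the reader side of the new
NFS_INO_INVALIDATING protocol; its smp_rmb() pairs with the smp_wmb() in
nfs_revalidate_mapping() (fs/nfs/inode.c below), so a reader that sees
NFS_INO_INVALID_DATA already cleared also sees NFS_INO_INVALIDATING still set
while pages are being shot down. An annotated restatement of the helper
(sketch):

	static bool example_mapping_valid(struct nfs_inode *nfsi)
	{
		/* the writer clears these flags only after its smp_wmb() */
		if (nfsi->cache_validity &
		    (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA))
			return false;
		smp_rmb();	/* pairs with the writer's smp_wmb() */
		/* ... so this bit is still visible during invalidation */
		return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
	}

The remaining hunks here fix printf formats: i_ino is an unsigned long, so %lu
is correct where %ld could print large inode numbers as negative.
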
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d71d66c9e0a1..b8797ae6831f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -222,14 +222,31 @@ out:
  * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
  * the iocb is still valid here if this is a synchronous request.
  */
-static void nfs_direct_complete(struct nfs_direct_req *dreq)
+static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 {
+	struct inode *inode = dreq->inode;
+
+	if (dreq->iocb && write) {
+		loff_t pos = dreq->iocb->ki_pos + dreq->count;
+
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < pos)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (write)
+		nfs_zap_mapping(inode, inode->i_mapping);
+
+	inode_dio_done(inode);
+
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
 			res = (long) dreq->count;
 		aio_complete(dreq->iocb, res, 0);
 	}
+
 	complete_all(&dreq->completion);
 
 	nfs_direct_req_release(dreq);
@@ -237,9 +254,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 
 static void nfs_direct_readpage_release(struct nfs_page *req)
 {
-	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
+	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
 		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 		req->wb_bytes,
 		(long long)req_offset(req));
 	nfs_release_request(req);
@@ -272,7 +289,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	}
 out_put:
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	hdr->release(hdr);
 }
 
@@ -402,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      loff_t pos, bool uio)
 {
 	struct nfs_pageio_descriptor desc;
+	struct inode *inode = dreq->inode;
 	ssize_t result = -EINVAL;
 	size_t requested_bytes = 0;
 	unsigned long seg;
@@ -410,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
+	atomic_inc(&inode->i_dio_count);
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
@@ -429,26 +448,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
+		inode_dio_done(inode);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
 
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	return 0;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos, bool uio)
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
+	ssize_t result = -EINVAL;
+	size_t count;
 
+	count = iov_length(iov, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
+
+	result = 0;
+	if (!count)
+		goto out;
+
+	mutex_lock(&inode->i_mutex);
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	task_io_account_read(count);
+
+	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
-		goto out;
+		goto out_unlock;
 
 	dreq->inode = inode;
 	dreq->bytes_left = iov_length(iov, nr_segs);
@@ -464,20 +526,26 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 
 	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
 		result = nfs_direct_wait(dreq);
+		if (result > 0)
+			iocb->ki_pos = pos + result;
+	}
+
+	nfs_direct_req_release(dreq);
+	return result;
+
 out_release:
 	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
 	return result;
 }
 
-static void nfs_inode_dio_write_done(struct inode *inode)
-{
-	nfs_zap_mapping(inode, inode->i_mapping);
-	inode_dio_done(inode);
-}
-
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
@@ -593,8 +661,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_inode_dio_write_done(dreq->inode);
-			nfs_direct_complete(dreq);
+			nfs_direct_complete(dreq, true);
 	}
 }
 
@@ -610,8 +677,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_inode_dio_write_done(inode);
-	nfs_direct_complete(dreq);
+	nfs_direct_complete(dreq, true);
 }
 #endif
 
@@ -842,93 +908,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	return 0;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos,
-				size_t count, bool uio)
-{
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct nfs_direct_req *dreq;
-	struct nfs_lock_context *l_ctx;
-
-	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		goto out;
-
-	dreq->inode = inode;
-	dreq->bytes_left = count;
-	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
-	l_ctx = nfs_get_lock_context(dreq->ctx);
-	if (IS_ERR(l_ctx)) {
-		result = PTR_ERR(l_ctx);
-		goto out_release;
-	}
-	dreq->l_ctx = l_ctx;
-	if (!is_sync_kiocb(iocb))
-		dreq->iocb = iocb;
-
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
-		result = nfs_direct_wait(dreq);
-out_release:
-	nfs_direct_req_release(dreq);
-out:
-	return result;
-}
-
-/**
- * nfs_file_direct_read - file direct read operation for NFS files
- * @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
- * @pos: byte offset in file where reading starts
- *
- * We use this function for direct reads instead of calling
- * generic_file_aio_read() in order to avoid gfar's check to see if
- * the request starts before the end of the file. For that check
- * to work, we must generate a GETATTR before each direct read, and
- * even then there is a window between the GETATTR and the subsequent
- * READ where the file size could change. Our preference is simply
- * to do all reads the application wants, and the server will take
- * care of managing the end of file boundary.
- *
- * This function also eliminates unnecessarily updating the file's
- * atime locally, as the NFS server sets the file's atime, and this
- * client must read the updated atime from the server back into its
- * cache.
- */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos, bool uio)
-{
-	ssize_t retval = -EINVAL;
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	size_t count;
-
-	count = iov_length(iov, nr_segs);
-	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
-
-	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
-		file, count, (long long) pos);
-
-	retval = 0;
-	if (!count)
-		goto out;
-
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
-		goto out;
-
-	task_io_account_read(count);
-
-	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
-	if (retval > 0)
-		iocb->ki_pos = pos + retval;
-
-out:
-	return retval;
-}
-
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
@@ -954,46 +933,96 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t retval = -EINVAL;
+	ssize_t result = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nfs_direct_req *dreq;
+	struct nfs_lock_context *l_ctx;
+	loff_t end;
 	size_t count;
 
 	count = iov_length(iov, nr_segs);
+	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, count, (long long) pos);
 
-	retval = generic_write_checks(file, &pos, &count, 0);
-	if (retval)
+	result = generic_write_checks(file, &pos, &count, 0);
+	if (result)
 		goto out;
 
-	retval = -EINVAL;
+	result = -EINVAL;
 	if ((ssize_t) count < 0)
 		goto out;
-	retval = 0;
+	result = 0;
 	if (!count)
 		goto out;
 
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
-		goto out;
+	mutex_lock(&inode->i_mutex);
+
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	if (mapping->nrpages) {
+		result = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (result)
+			goto out_unlock;
+	}
 
 	task_io_account_write(count);
 
-	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
-	if (retval > 0) {
-		struct inode *inode = mapping->host;
+	result = -ENOMEM;
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		goto out_unlock;
 
-		iocb->ki_pos = pos + retval;
-		spin_lock(&inode->i_lock);
-		if (i_size_read(inode) < iocb->ki_pos)
-			i_size_write(inode, iocb->ki_pos);
-		spin_unlock(&inode->i_lock);
+	dreq->inode = inode;
+	dreq->bytes_left = count;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (IS_ERR(l_ctx)) {
+		result = PTR_ERR(l_ctx);
+		goto out_release;
+	}
+	dreq->l_ctx = l_ctx;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
 	}
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
+		result = nfs_direct_wait(dreq);
+		if (result > 0) {
+			struct inode *inode = mapping->host;
+
+			iocb->ki_pos = pos + result;
+			spin_lock(&inode->i_lock);
+			if (i_size_read(inode) < iocb->ki_pos)
+				i_size_write(inode, iocb->ki_pos);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	nfs_direct_req_release(dreq);
+	return result;
+
+out_release:
+	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
-	return retval;
+	return result;
 }
 
 /**
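
Note: this rewrite folds the old nfs_direct_read()/nfs_direct_write() helpers
into nfs_file_direct_read()/nfs_file_direct_write(), takes i_mutex across the
mapping sync, invalidation and request scheduling, and moves the post-write
i_size update and mapping zap into nfs_direct_complete(). A sketch of the
size-update idiom used on write completion (illustrative; the function name is
made up):

	static void example_extend_isize(struct inode *inode, loff_t pos)
	{
		/* i_size_write() needs exclusion; i_lock provides it here */
		spin_lock(&inode->i_lock);
		if (i_size_read(inode) < pos)
			i_size_write(inode, pos);
		spin_unlock(&inode->i_lock);
	}
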
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e2fcacf07de3..5bb790a69c71 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int once_thru = 0;
 
-	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n",
+	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
 		file, mapping->host->i_ino, len, (long long) pos);
 
 start:
@@ -395,7 +395,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	int status;
 
-	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n",
+	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
 		file, mapping->host->i_ino, len, (long long) pos);
 
 	/*
@@ -585,7 +585,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
-	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n",
+	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
 		filp, filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
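
Note: the three hunks above are the same %ld -> %lu format fix applied in
fs/nfs/dir.c: inode->i_ino is an unsigned long, so for example:

	printk(KERN_DEBUG "inode %lu\n", inode->i_ino);	/* %ld could go negative */
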
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00ad1c2b217d..360114ae8b82 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -164,17 +164,16 @@ static void nfs_zap_caches_locked(struct inode *inode)
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
 		nfs_fscache_invalidate(inode);
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_LABEL
 					| NFS_INO_INVALID_DATA
 					| NFS_INO_INVALID_ACCESS
 					| NFS_INO_INVALID_ACL
 					| NFS_INO_REVAL_PAGECACHE;
 	} else
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_LABEL
 					| NFS_INO_INVALID_ACCESS
 					| NFS_INO_INVALID_ACL
 					| NFS_INO_REVAL_PAGECACHE;
+	nfs_zap_label_cache_locked(nfsi);
 }
 
 void nfs_zap_caches(struct inode *inode)
@@ -266,6 +265,13 @@ nfs_init_locked(struct inode *inode, void *opaque)
 }
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL;
+	spin_unlock(&inode->i_lock);
+}
+
 void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 		struct nfs4_label *label)
 {
@@ -283,6 +289,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 			__func__,
 			(char *)label->label,
 			label->len, error);
+		nfs_clear_label_invalid(inode);
 	}
 }
 
@@ -458,9 +465,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		unlock_new_inode(inode);
 	} else
 		nfs_refresh_inode(inode, fattr);
-	dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
+	dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
+		(unsigned long long)NFS_FILEID(inode),
 		nfs_display_fhandle_hash(fh),
 		atomic_read(&inode->i_count));
 
@@ -870,8 +877,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	struct nfs_fattr *fattr = NULL;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
-		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+		inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
 
 	trace_nfs_revalidate_inode_enter(inode);
 
@@ -895,9 +902,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
 	if (status != 0) {
-		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
 			 inode->i_sb->s_id,
-			 (long long)NFS_FILEID(inode), status);
+			 (unsigned long long)NFS_FILEID(inode), status);
 		if (status == -ESTALE) {
 			nfs_zap_caches(inode);
 			if (!S_ISDIR(inode->i_mode))
@@ -908,9 +915,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	status = nfs_refresh_inode(inode, fattr);
 	if (status) {
-		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
 			 inode->i_sb->s_id,
-			 (long long)NFS_FILEID(inode), status);
+			 (unsigned long long)NFS_FILEID(inode), status);
 		goto err_out;
 	}
 
@@ -919,9 +926,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	nfs_setsecurity(inode, fattr, label);
 
-	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode));
+		(unsigned long long)NFS_FILEID(inode));
 
 err_out:
 	nfs4_label_free(label);
@@ -977,16 +984,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 		if (ret < 0)
 			return ret;
 	}
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-	spin_unlock(&inode->i_lock);
+		spin_unlock(&inode->i_lock);
+	}
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	nfs_fscache_wait_on_invalidate(inode);
 
-	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
-		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+		inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode));
 	return 0;
 }
 
@@ -1007,6 +1015,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
 	int ret = 0;
 
 	/* swapfiles are not supposed to be shared. */
@@ -1018,12 +1027,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 		if (ret < 0)
 			goto out;
 	}
-	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
-		trace_nfs_invalidate_mapping_enter(inode);
-		ret = nfs_invalidate_mapping(inode, mapping);
-		trace_nfs_invalidate_mapping_exit(inode, ret);
+
+	/*
+	 * We must clear NFS_INO_INVALID_DATA first to ensure that
+	 * invalidations that come in while we're shooting down the mappings
+	 * are respected. But, that leaves a race window where one revalidator
+	 * can clear the flag, and then another checks it before the mapping
+	 * gets invalidated. Fix that by serializing access to this part of
+	 * the function.
+	 *
+	 * At the same time, we need to allow other tasks to see whether we
+	 * might be in the middle of invalidating the pages, so we only set
+	 * the bit lock here if it looks like we're going to be doing that.
+	 */
+	for (;;) {
+		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
+				  nfs_wait_bit_killable, TASK_KILLABLE);
+		if (ret)
+			goto out;
+		spin_lock(&inode->i_lock);
+		if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+			break;
+		spin_unlock(&inode->i_lock);
+		goto out;
 	}
 
+	set_bit(NFS_INO_INVALIDATING, bitlock);
+	smp_wmb();
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	spin_unlock(&inode->i_lock);
+	trace_nfs_invalidate_mapping_enter(inode);
+	ret = nfs_invalidate_mapping(inode, mapping);
+	trace_nfs_invalidate_mapping_exit(inode, ret);
+
+	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
 out:
 	return ret;
 }
@@ -1282,12 +1325,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
+/*
+ * Don't trust the change_attribute, mtime, ctime or size if
+ * a pnfs LAYOUTCOMMIT is outstanding
+ */
+static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
+		struct nfs_fattr *fattr)
+{
+	if (pnfs_layoutcommit_outstanding(inode))
+		fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
+				NFS_ATTR_FATTR_MTIME |
+				NFS_ATTR_FATTR_CTIME |
+				NFS_ATTR_FATTR_SIZE);
+}
+
 static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
 	int ret;
 
 	trace_nfs_refresh_inode_enter(inode);
 
+	nfs_inode_attrs_handle_layoutcommit(inode, fattr);
+
 	if (nfs_inode_attrs_need_update(inode, fattr))
 		ret = nfs_update_inode(inode, fattr);
 	else
@@ -1434,7 +1493,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	unsigned long now = jiffies;
 	unsigned long save_cache_validity;
 
-	dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
+	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
@@ -1455,7 +1514,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		/*
 		 * Big trouble! The inode has become a different object.
 		 */
-		printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
+		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
 			__func__, inode->i_ino, inode->i_mode, fattr->mode);
 		goto out_err;
 	}
@@ -1517,8 +1576,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (new_isize != cur_isize) {
 			/* Do we perhaps have any outstanding writes, or has
 			 * the file grown beyond our last write? */
-			if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
-				new_isize > cur_isize) {
+			if ((nfsi->npages == 0) || new_isize > cur_isize) {
 				i_size_write(inode, new_isize);
 				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			}
@@ -1597,7 +1655,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
-	if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
+	if (invalid & NFS_INO_INVALID_ATTR) {
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
@@ -1610,7 +1668,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1610 } 1668 }
1611 } 1669 }
1612 invalid &= ~NFS_INO_INVALID_ATTR; 1670 invalid &= ~NFS_INO_INVALID_ATTR;
1613 invalid &= ~NFS_INO_INVALID_LABEL;
1614 /* Don't invalidate the data if we were to blame */ 1671 /* Don't invalidate the data if we were to blame */
1615 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1672 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1616 || S_ISLNK(inode->i_mode))) 1673 || S_ISLNK(inode->i_mode)))
@@ -1641,10 +1698,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1641 return NULL; 1698 return NULL;
1642 nfsi->flags = 0UL; 1699 nfsi->flags = 0UL;
1643 nfsi->cache_validity = 0UL; 1700 nfsi->cache_validity = 0UL;
1644#ifdef CONFIG_NFS_V3_ACL
1645 nfsi->acl_access = ERR_PTR(-EAGAIN);
1646 nfsi->acl_default = ERR_PTR(-EAGAIN);
1647#endif
1648#if IS_ENABLED(CONFIG_NFS_V4) 1701#if IS_ENABLED(CONFIG_NFS_V4)
1649 nfsi->nfs4_acl = NULL; 1702 nfsi->nfs4_acl = NULL;
1650#endif /* CONFIG_NFS_V4 */ 1703#endif /* CONFIG_NFS_V4 */
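With nfs3acl.c moving to the generic POSIX ACL cache (see the nfs3acl.c hunks below), the per-NFS-inode acl_access/acl_default pointers removed above become redundant: the equivalent state lives in the common inode. A sketch of the relevant VFS side in this kernel era, simplified:

	struct inode {
		/* ... */
	#ifdef CONFIG_FS_POSIX_ACL
		struct posix_acl	*i_acl;		/* ACL_NOT_CACHED when unknown */
		struct posix_acl	*i_default_acl;
	#endif
		/* ... */
	};

Both fields are managed through get_cached_acl()/set_cached_acl() and dropped wholesale by forget_all_cached_acls(), which is why .clear_acl_cache in nfs_v3_clientops can later point at the generic helper.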
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8b5cc04a8611..b46cf5a67329 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -176,7 +176,8 @@ extern struct nfs_server *nfs4_create_server(
176extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, 176extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
177 struct nfs_fh *); 177 struct nfs_fh *);
178extern int nfs4_update_server(struct nfs_server *server, const char *hostname, 178extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
179 struct sockaddr *sap, size_t salen); 179 struct sockaddr *sap, size_t salen,
180 struct net *net);
180extern void nfs_free_server(struct nfs_server *server); 181extern void nfs_free_server(struct nfs_server *server);
181extern struct nfs_server *nfs_clone_server(struct nfs_server *, 182extern struct nfs_server *nfs_clone_server(struct nfs_server *,
182 struct nfs_fh *, 183 struct nfs_fh *,
@@ -279,9 +280,18 @@ static inline void nfs4_label_free(struct nfs4_label *label)
279 } 280 }
280 return; 281 return;
281} 282}
283
284static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
285{
286 if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
287 nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
288}
282#else 289#else
283static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } 290static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
284static inline void nfs4_label_free(void *label) {} 291static inline void nfs4_label_free(void *label) {}
292static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
293{
294}
285#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ 295#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
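nfs_zap_label_cache_locked() is meant to run under inode->i_lock (hence the _locked suffix), and the empty stub keeps call sites unconditional when security labels are compiled out. A hedged sketch of the intended call pattern:

	spin_lock(&inode->i_lock);
	/* Sets NFS_INO_INVALID_LABEL only when the server advertises
	 * NFS_CAP_SECURITY_LABEL; a no-op otherwise. */
	nfs_zap_label_cache_locked(NFS_I(inode));
	spin_unlock(&inode->i_lock);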
286 296
287/* proc.c */ 297/* proc.c */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 4a1aafba6a20..871d6eda8dba 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -10,179 +10,7 @@
10 10
11#define NFSDBG_FACILITY NFSDBG_PROC 11#define NFSDBG_FACILITY NFSDBG_PROC
12 12
13ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) 13struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
14{
15 struct inode *inode = dentry->d_inode;
16 struct posix_acl *acl;
17 int pos=0, len=0;
18
19# define output(s) do { \
20 if (pos + sizeof(s) <= size) { \
21 memcpy(buffer + pos, s, sizeof(s)); \
22 pos += sizeof(s); \
23 } \
24 len += sizeof(s); \
25 } while(0)
26
27 acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
28 if (IS_ERR(acl))
29 return PTR_ERR(acl);
30 if (acl) {
31 output("system.posix_acl_access");
32 posix_acl_release(acl);
33 }
34
35 if (S_ISDIR(inode->i_mode)) {
36 acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
37 if (IS_ERR(acl))
38 return PTR_ERR(acl);
39 if (acl) {
40 output("system.posix_acl_default");
41 posix_acl_release(acl);
42 }
43 }
44
45# undef output
46
47 if (!buffer || len <= size)
48 return len;
49 return -ERANGE;
50}
51
52ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
53 void *buffer, size_t size)
54{
55 struct inode *inode = dentry->d_inode;
56 struct posix_acl *acl;
57 int type, error = 0;
58
59 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
60 type = ACL_TYPE_ACCESS;
61 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
62 type = ACL_TYPE_DEFAULT;
63 else
64 return -EOPNOTSUPP;
65
66 acl = nfs3_proc_getacl(inode, type);
67 if (IS_ERR(acl))
68 return PTR_ERR(acl);
69 else if (acl) {
70 if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
71 error = -ENODATA;
72 else
73 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
74 posix_acl_release(acl);
75 } else
76 error = -ENODATA;
77
78 return error;
79}
80
81int nfs3_setxattr(struct dentry *dentry, const char *name,
82 const void *value, size_t size, int flags)
83{
84 struct inode *inode = dentry->d_inode;
85 struct posix_acl *acl;
86 int type, error;
87
88 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
89 type = ACL_TYPE_ACCESS;
90 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
91 type = ACL_TYPE_DEFAULT;
92 else
93 return -EOPNOTSUPP;
94
95 acl = posix_acl_from_xattr(&init_user_ns, value, size);
96 if (IS_ERR(acl))
97 return PTR_ERR(acl);
98 error = nfs3_proc_setacl(inode, type, acl);
99 posix_acl_release(acl);
100
101 return error;
102}
103
104int nfs3_removexattr(struct dentry *dentry, const char *name)
105{
106 struct inode *inode = dentry->d_inode;
107 int type;
108
109 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
110 type = ACL_TYPE_ACCESS;
111 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
112 type = ACL_TYPE_DEFAULT;
113 else
114 return -EOPNOTSUPP;
115
116 return nfs3_proc_setacl(inode, type, NULL);
117}
118
119static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi)
120{
121 if (!IS_ERR(nfsi->acl_access)) {
122 posix_acl_release(nfsi->acl_access);
123 nfsi->acl_access = ERR_PTR(-EAGAIN);
124 }
125 if (!IS_ERR(nfsi->acl_default)) {
126 posix_acl_release(nfsi->acl_default);
127 nfsi->acl_default = ERR_PTR(-EAGAIN);
128 }
129}
130
131void nfs3_forget_cached_acls(struct inode *inode)
132{
133 dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id,
134 inode->i_ino);
135 spin_lock(&inode->i_lock);
136 __nfs3_forget_cached_acls(NFS_I(inode));
137 spin_unlock(&inode->i_lock);
138}
139
140static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type)
141{
142 struct nfs_inode *nfsi = NFS_I(inode);
143 struct posix_acl *acl = ERR_PTR(-EINVAL);
144
145 spin_lock(&inode->i_lock);
146 switch(type) {
147 case ACL_TYPE_ACCESS:
148 acl = nfsi->acl_access;
149 break;
150
151 case ACL_TYPE_DEFAULT:
152 acl = nfsi->acl_default;
153 break;
154
155 default:
156 goto out;
157 }
158 if (IS_ERR(acl))
159 acl = ERR_PTR(-EAGAIN);
160 else
161 acl = posix_acl_dup(acl);
162out:
163 spin_unlock(&inode->i_lock);
164 dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id,
165 inode->i_ino, type, acl);
166 return acl;
167}
168
169static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
170 struct posix_acl *dfacl)
171{
172 struct nfs_inode *nfsi = NFS_I(inode);
173
174 dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id,
175 inode->i_ino, acl, dfacl);
176 spin_lock(&inode->i_lock);
177 __nfs3_forget_cached_acls(NFS_I(inode));
178 if (!IS_ERR(acl))
179 nfsi->acl_access = posix_acl_dup(acl);
180 if (!IS_ERR(dfacl))
181 nfsi->acl_default = posix_acl_dup(dfacl);
182 spin_unlock(&inode->i_lock);
183}
184
185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
186{ 14{
187 struct nfs_server *server = NFS_SERVER(inode); 15 struct nfs_server *server = NFS_SERVER(inode);
188 struct page *pages[NFSACL_MAXPAGES] = { }; 16 struct page *pages[NFSACL_MAXPAGES] = { };
@@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
198 .rpc_argp = &args, 26 .rpc_argp = &args,
199 .rpc_resp = &res, 27 .rpc_resp = &res,
200 }; 28 };
201 struct posix_acl *acl;
202 int status, count; 29 int status, count;
203 30
204 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 31 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
207 status = nfs_revalidate_inode(server, inode); 34 status = nfs_revalidate_inode(server, inode);
208 if (status < 0) 35 if (status < 0)
209 return ERR_PTR(status); 36 return ERR_PTR(status);
210 acl = nfs3_get_cached_acl(inode, type);
211 if (acl != ERR_PTR(-EAGAIN))
212 return acl;
213 acl = NULL;
214 37
215 /* 38 /*
216 * Only get the access acl when explicitly requested: We don't 39 * Only get the access acl when explicitly requested: We don't
@@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
257 } 80 }
258 81
259 if (res.acl_access != NULL) { 82 if (res.acl_access != NULL) {
260 if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { 83 if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
84 res.acl_access->a_count == 0) {
261 posix_acl_release(res.acl_access); 85 posix_acl_release(res.acl_access);
262 res.acl_access = NULL; 86 res.acl_access = NULL;
263 } 87 }
264 } 88 }
265 nfs3_cache_acls(inode,
266 (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
267 (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
268 89
269 switch(type) { 90 if (res.mask & NFS_ACL)
270 case ACL_TYPE_ACCESS: 91 set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
271 acl = res.acl_access; 92 else
272 res.acl_access = NULL; 93 forget_cached_acl(inode, ACL_TYPE_ACCESS);
273 break;
274 94
275 case ACL_TYPE_DEFAULT: 95 if (res.mask & NFS_DFACL)
276 acl = res.acl_default; 96 set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
277 res.acl_default = NULL; 97 else
98 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
99
100 nfs_free_fattr(res.fattr);
101 if (type == ACL_TYPE_ACCESS) {
102 posix_acl_release(res.acl_default);
103 return res.acl_access;
104 } else {
105 posix_acl_release(res.acl_access);
106 return res.acl_default;
278 } 107 }
279 108
280getout: 109getout:
281 posix_acl_release(res.acl_access); 110 posix_acl_release(res.acl_access);
282 posix_acl_release(res.acl_default); 111 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr); 112 nfs_free_fattr(res.fattr);
284 113 return ERR_PTR(status);
285 if (status != 0) {
286 posix_acl_release(acl);
287 acl = ERR_PTR(status);
288 }
289 return acl;
290} 114}
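The cache lookup deleted above is not lost: callers now reach nfs3_get_acl() through the VFS get_acl() helper, which consults the generic cache first. A simplified sketch of that helper (paraphrased from fs/posix_acl.c of this era, not verbatim):

	struct posix_acl *get_acl(struct inode *inode, int type)
	{
		struct posix_acl *acl = get_cached_acl(inode, type);

		if (acl != ACL_NOT_CACHED)	/* hit; may legitimately be NULL */
			return acl;
		if (!inode->i_op->get_acl)
			return NULL;
		/* Miss: ask the filesystem (here nfs3_get_acl()), which
		 * refills the cache via set_cached_acl(). */
		return inode->i_op->get_acl(inode, type);
	}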
291 115
292static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, 116static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
293 struct posix_acl *dfacl) 117 struct posix_acl *dfacl)
294{ 118{
295 struct nfs_server *server = NFS_SERVER(inode); 119 struct nfs_server *server = NFS_SERVER(inode);
296 struct nfs_fattr *fattr; 120 struct nfs_fattr *fattr;
@@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
353 switch (status) { 177 switch (status) {
354 case 0: 178 case 0:
355 status = nfs_refresh_inode(inode, fattr); 179 status = nfs_refresh_inode(inode, fattr);
356 nfs3_cache_acls(inode, acl, dfacl); 180 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
181 set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
357 break; 182 break;
358 case -EPFNOSUPPORT: 183 case -EPFNOSUPPORT:
359 case -EPROTONOSUPPORT: 184 case -EPROTONOSUPPORT:
@@ -373,40 +198,43 @@ out:
373 return status; 198 return status;
374} 199}
375 200
376int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) 201int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
202 struct posix_acl *dfacl)
203{
204 int ret;
205 ret = __nfs3_proc_setacls(inode, acl, dfacl);
206 return (ret == -EOPNOTSUPP) ? 0 : ret;
207
208}
209
210int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
377{ 211{
378 struct posix_acl *alloc = NULL, *dfacl = NULL; 212 struct posix_acl *alloc = NULL, *dfacl = NULL;
379 int status; 213 int status;
380 214
381 if (S_ISDIR(inode->i_mode)) { 215 if (S_ISDIR(inode->i_mode)) {
382 switch(type) { 216 switch(type) {
383 case ACL_TYPE_ACCESS: 217 case ACL_TYPE_ACCESS:
384 alloc = dfacl = nfs3_proc_getacl(inode, 218 alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
385 ACL_TYPE_DEFAULT); 219 if (IS_ERR(alloc))
386 if (IS_ERR(alloc)) 220 goto fail;
387 goto fail; 221 break;
388 break;
389
390 case ACL_TYPE_DEFAULT:
391 dfacl = acl;
392 alloc = acl = nfs3_proc_getacl(inode,
393 ACL_TYPE_ACCESS);
394 if (IS_ERR(alloc))
395 goto fail;
396 break;
397 222
398 default: 223 case ACL_TYPE_DEFAULT:
399 return -EINVAL; 224 dfacl = acl;
225 alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
226 if (IS_ERR(alloc))
227 goto fail;
228 break;
400 } 229 }
401 } else if (type != ACL_TYPE_ACCESS) 230 }
402 return -EINVAL;
403 231
404 if (acl == NULL) { 232 if (acl == NULL) {
405 alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 233 alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
406 if (IS_ERR(alloc)) 234 if (IS_ERR(alloc))
407 goto fail; 235 goto fail;
408 } 236 }
409 status = nfs3_proc_setacls(inode, acl, dfacl); 237 status = __nfs3_proc_setacls(inode, acl, dfacl);
410 posix_acl_release(alloc); 238 posix_acl_release(alloc);
411 return status; 239 return status;
412 240
@@ -414,27 +242,8 @@ fail:
414 return PTR_ERR(alloc); 242 return PTR_ERR(alloc);
415} 243}
416 244
417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, 245const struct xattr_handler *nfs3_xattr_handlers[] = {
418 umode_t mode) 246 &posix_acl_access_xattr_handler,
419{ 247 &posix_acl_default_xattr_handler,
420 struct posix_acl *dfacl, *acl; 248 NULL,
421 int error = 0; 249};
422
423 dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT);
424 if (IS_ERR(dfacl)) {
425 error = PTR_ERR(dfacl);
426 return (error == -EOPNOTSUPP) ? 0 : error;
427 }
428 if (!dfacl)
429 return 0;
430 acl = posix_acl_dup(dfacl);
431 error = posix_acl_create(&acl, GFP_KERNEL, &mode);
432 if (error < 0)
433 goto out_release_dfacl;
434 error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ?
435 dfacl : NULL);
436 posix_acl_release(acl);
437out_release_dfacl:
438 posix_acl_release(dfacl);
439 return error;
440}
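With nfs3_xattr_handlers registered, the generic xattr entry points take over the system.posix_acl_* name matching and encode/decode that the deleted nfs3_getxattr()/nfs3_setxattr()/nfs3_removexattr() used to do by hand. Roughly, on the read side (a simplified sketch of the generic handler, not verbatim):

	static int posix_acl_xattr_get(struct dentry *dentry, const char *name,
				       void *value, size_t size, int type)
	{
		struct posix_acl *acl;
		int error;

		acl = get_acl(dentry->d_inode, type);	/* -> nfs3_get_acl() */
		if (IS_ERR(acl))
			return PTR_ERR(acl);
		if (acl == NULL)
			return -ENODATA;
		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		posix_acl_release(acl);
		return error;
	}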
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 01b6f6a49d16..a462ef0fb5d6 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/lockd/bind.h> 18#include <linux/lockd/bind.h>
19#include <linux/nfs_mount.h> 19#include <linux/nfs_mount.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/xattr.h>
21 22
22#include "iostat.h" 23#include "iostat.h"
23#include "internal.h" 24#include "internal.h"
@@ -317,8 +318,8 @@ static int
317nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 318nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
318 int flags) 319 int flags)
319{ 320{
321 struct posix_acl *default_acl, *acl;
320 struct nfs3_createdata *data; 322 struct nfs3_createdata *data;
321 umode_t mode = sattr->ia_mode;
322 int status = -ENOMEM; 323 int status = -ENOMEM;
323 324
324 dprintk("NFS call create %pd\n", dentry); 325 dprintk("NFS call create %pd\n", dentry);
@@ -340,7 +341,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
340 data->arg.create.verifier[1] = cpu_to_be32(current->pid); 341 data->arg.create.verifier[1] = cpu_to_be32(current->pid);
341 } 342 }
342 343
343 sattr->ia_mode &= ~current_umask(); 344 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
345 if (status)
346 goto out;
344 347
345 for (;;) { 348 for (;;) {
346 status = nfs3_do_create(dir, dentry, data); 349 status = nfs3_do_create(dir, dentry, data);
@@ -366,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
366 } 369 }
367 370
368 if (status != 0) 371 if (status != 0)
369 goto out; 372 goto out_release_acls;
370 373
371 /* When we created the file with exclusive semantics, make 374 /* When we created the file with exclusive semantics, make
372 * sure we set the attributes afterwards. */ 375 * sure we set the attributes afterwards. */
@@ -385,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
385 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); 388 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
386 dprintk("NFS reply setattr (post-create): %d\n", status); 389 dprintk("NFS reply setattr (post-create): %d\n", status);
387 if (status != 0) 390 if (status != 0)
388 goto out; 391 goto out_release_acls;
389 } 392 }
390 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 393
394 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
395
396out_release_acls:
397 posix_acl_release(acl);
398 posix_acl_release(default_acl);
391out: 399out:
392 nfs3_free_createdata(data); 400 nfs3_free_createdata(data);
393 dprintk("NFS reply create: %d\n", status); 401 dprintk("NFS reply create: %d\n", status);
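The posix_acl_create() call that replaces the explicit current_umask() masking both computes the inherited ACLs and rewrites ia_mode in place; when the parent directory has no default ACL it falls back to applying the umask, so no separate masking line is needed. The calling convention, sketched against the v3.14-era helper:

	struct posix_acl *default_acl, *acl;
	int error;

	/* Applies dir's default ACL (or the umask when none) to ia_mode
	 * and hands back references the caller owns. */
	error = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
	if (error)
		return error;
	/* ... create the object with the adjusted mode, then push
	 * acl/default_acl to the server ... */
	posix_acl_release(acl);		/* NULL-safe */
	posix_acl_release(default_acl);

This is why create, mkdir and mknod below all gain the same out_release_acls unwind.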
@@ -572,18 +580,20 @@ out:
572static int 580static int
573nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 581nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
574{ 582{
583 struct posix_acl *default_acl, *acl;
575 struct nfs3_createdata *data; 584 struct nfs3_createdata *data;
576 umode_t mode = sattr->ia_mode;
577 int status = -ENOMEM; 585 int status = -ENOMEM;
578 586
579 dprintk("NFS call mkdir %pd\n", dentry); 587 dprintk("NFS call mkdir %pd\n", dentry);
580 588
581 sattr->ia_mode &= ~current_umask();
582
583 data = nfs3_alloc_createdata(); 589 data = nfs3_alloc_createdata();
584 if (data == NULL) 590 if (data == NULL)
585 goto out; 591 goto out;
586 592
593 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
594 if (status)
595 goto out;
596
587 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; 597 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
588 data->arg.mkdir.fh = NFS_FH(dir); 598 data->arg.mkdir.fh = NFS_FH(dir);
589 data->arg.mkdir.name = dentry->d_name.name; 599 data->arg.mkdir.name = dentry->d_name.name;
@@ -592,9 +602,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
592 602
593 status = nfs3_do_create(dir, dentry, data); 603 status = nfs3_do_create(dir, dentry, data);
594 if (status != 0) 604 if (status != 0)
595 goto out; 605 goto out_release_acls;
596 606
597 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 607 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
608
609out_release_acls:
610 posix_acl_release(acl);
611 posix_acl_release(default_acl);
598out: 612out:
599 nfs3_free_createdata(data); 613 nfs3_free_createdata(data);
600 dprintk("NFS reply mkdir: %d\n", status); 614 dprintk("NFS reply mkdir: %d\n", status);
@@ -691,19 +705,21 @@ static int
691nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 705nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
692 dev_t rdev) 706 dev_t rdev)
693{ 707{
708 struct posix_acl *default_acl, *acl;
694 struct nfs3_createdata *data; 709 struct nfs3_createdata *data;
695 umode_t mode = sattr->ia_mode;
696 int status = -ENOMEM; 710 int status = -ENOMEM;
697 711
698 dprintk("NFS call mknod %pd %u:%u\n", dentry, 712 dprintk("NFS call mknod %pd %u:%u\n", dentry,
699 MAJOR(rdev), MINOR(rdev)); 713 MAJOR(rdev), MINOR(rdev));
700 714
701 sattr->ia_mode &= ~current_umask();
702
703 data = nfs3_alloc_createdata(); 715 data = nfs3_alloc_createdata();
704 if (data == NULL) 716 if (data == NULL)
705 goto out; 717 goto out;
706 718
719 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
720 if (status)
721 goto out;
722
707 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; 723 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
708 data->arg.mknod.fh = NFS_FH(dir); 724 data->arg.mknod.fh = NFS_FH(dir);
709 data->arg.mknod.name = dentry->d_name.name; 725 data->arg.mknod.name = dentry->d_name.name;
@@ -731,8 +747,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
731 747
732 status = nfs3_do_create(dir, dentry, data); 748 status = nfs3_do_create(dir, dentry, data);
733 if (status != 0) 749 if (status != 0)
734 goto out; 750 goto out_release_acls;
735 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 751
752 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
753
754out_release_acls:
755 posix_acl_release(acl);
756 posix_acl_release(default_acl);
736out: 757out:
737 nfs3_free_createdata(data); 758 nfs3_free_createdata(data);
738 dprintk("NFS reply mknod: %d\n", status); 759 dprintk("NFS reply mknod: %d\n", status);
@@ -904,20 +925,28 @@ static const struct inode_operations nfs3_dir_inode_operations = {
904 .permission = nfs_permission, 925 .permission = nfs_permission,
905 .getattr = nfs_getattr, 926 .getattr = nfs_getattr,
906 .setattr = nfs_setattr, 927 .setattr = nfs_setattr,
907 .listxattr = nfs3_listxattr, 928#ifdef CONFIG_NFS_V3_ACL
908 .getxattr = nfs3_getxattr, 929 .listxattr = generic_listxattr,
909 .setxattr = nfs3_setxattr, 930 .getxattr = generic_getxattr,
910 .removexattr = nfs3_removexattr, 931 .setxattr = generic_setxattr,
932 .removexattr = generic_removexattr,
933 .get_acl = nfs3_get_acl,
934 .set_acl = nfs3_set_acl,
935#endif
911}; 936};
912 937
913static const struct inode_operations nfs3_file_inode_operations = { 938static const struct inode_operations nfs3_file_inode_operations = {
914 .permission = nfs_permission, 939 .permission = nfs_permission,
915 .getattr = nfs_getattr, 940 .getattr = nfs_getattr,
916 .setattr = nfs_setattr, 941 .setattr = nfs_setattr,
917 .listxattr = nfs3_listxattr, 942#ifdef CONFIG_NFS_V3_ACL
918 .getxattr = nfs3_getxattr, 943 .listxattr = generic_listxattr,
919 .setxattr = nfs3_setxattr, 944 .getxattr = generic_getxattr,
920 .removexattr = nfs3_removexattr, 945 .setxattr = generic_setxattr,
946 .removexattr = generic_removexattr,
947 .get_acl = nfs3_get_acl,
948 .set_acl = nfs3_set_acl,
949#endif
921}; 950};
922 951
923const struct nfs_rpc_ops nfs_v3_clientops = { 952const struct nfs_rpc_ops nfs_v3_clientops = {
@@ -965,7 +994,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
965 .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, 994 .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
966 .commit_done = nfs3_commit_done, 995 .commit_done = nfs3_commit_done,
967 .lock = nfs3_proc_lock, 996 .lock = nfs3_proc_lock,
968 .clear_acl_cache = nfs3_forget_cached_acls, 997 .clear_acl_cache = forget_all_cached_acls,
969 .close_context = nfs_close_context, 998 .close_context = nfs_close_context,
970 .have_delegation = nfs3_have_delegation, 999 .have_delegation = nfs3_have_delegation,
971 .return_delegation = nfs3_return_delegation, 1000 .return_delegation = nfs3_return_delegation,
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index cc471c725230..d6a98949af19 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = {
12 .rpc_vers = &nfs_version3, 12 .rpc_vers = &nfs_version3,
13 .rpc_ops = &nfs_v3_clientops, 13 .rpc_ops = &nfs_v3_clientops,
14 .sops = &nfs_sops, 14 .sops = &nfs_sops,
15#ifdef CONFIG_NFS_V3_ACL
16 .xattr = nfs3_xattr_handlers,
17#endif
15}; 18};
16 19
17static int __init init_nfs_v3(void) 20static int __init init_nfs_v3(void)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5609edc742a0..a5b27c2d9689 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -270,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
270extern int nfs41_setup_sequence(struct nfs4_session *session, 270extern int nfs41_setup_sequence(struct nfs4_session *session,
271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
272 struct rpc_task *task); 272 struct rpc_task *task);
273extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
273extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); 274extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
274extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); 275extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
275extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 276extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index b4a160a405ce..0e46d3d1b6cc 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -10,6 +10,7 @@
10#include <linux/sunrpc/auth.h> 10#include <linux/sunrpc/auth.h>
11#include <linux/sunrpc/xprt.h> 11#include <linux/sunrpc/xprt.h>
12#include <linux/sunrpc/bc_xprt.h> 12#include <linux/sunrpc/bc_xprt.h>
13#include <linux/sunrpc/rpc_pipe_fs.h>
13#include "internal.h" 14#include "internal.h"
14#include "callback.h" 15#include "callback.h"
15#include "delegation.h" 16#include "delegation.h"
@@ -169,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp)
169void nfs40_shutdown_client(struct nfs_client *clp) 170void nfs40_shutdown_client(struct nfs_client *clp)
170{ 171{
171 if (clp->cl_slot_tbl) { 172 if (clp->cl_slot_tbl) {
172 nfs4_release_slot_table(clp->cl_slot_tbl); 173 nfs4_shutdown_slot_table(clp->cl_slot_tbl);
173 kfree(clp->cl_slot_tbl); 174 kfree(clp->cl_slot_tbl);
174 } 175 }
175} 176}
@@ -370,6 +371,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
370 __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); 371 __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
371 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); 372 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
372 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); 373 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
374
373 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); 375 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
374 if (error == -EINVAL) 376 if (error == -EINVAL)
375 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 377 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
@@ -409,13 +411,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
409 error = nfs4_discover_server_trunking(clp, &old); 411 error = nfs4_discover_server_trunking(clp, &old);
410 if (error < 0) 412 if (error < 0)
411 goto error; 413 goto error;
412 nfs_put_client(clp);
413 if (clp != old) {
414 clp->cl_preserve_clid = true;
415 clp = old;
416 }
417 414
418 return clp; 415 if (clp != old)
416 clp->cl_preserve_clid = true;
417 nfs_put_client(clp);
418 return old;
419 419
420error: 420error:
421 nfs_mark_client_ready(clp, error); 421 nfs_mark_client_ready(clp, error);
@@ -493,9 +493,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
493 prev = pos; 493 prev = pos;
494 494
495 status = nfs_wait_client_init_complete(pos); 495 status = nfs_wait_client_init_complete(pos);
496 spin_lock(&nn->nfs_client_lock);
497 if (status < 0) 496 if (status < 0)
498 continue; 497 goto out;
498 status = -NFS4ERR_STALE_CLIENTID;
499 spin_lock(&nn->nfs_client_lock);
499 } 500 }
500 if (pos->cl_cons_state != NFS_CS_READY) 501 if (pos->cl_cons_state != NFS_CS_READY)
501 continue; 502 continue;
@@ -633,7 +634,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
633 } 634 }
634 spin_lock(&nn->nfs_client_lock); 635 spin_lock(&nn->nfs_client_lock);
635 if (status < 0) 636 if (status < 0)
636 continue; 637 break;
638 status = -NFS4ERR_STALE_CLIENTID;
637 } 639 }
638 if (pos->cl_cons_state != NFS_CS_READY) 640 if (pos->cl_cons_state != NFS_CS_READY)
639 continue; 641 continue;
@@ -1133,6 +1135,7 @@ static int nfs_probe_destination(struct nfs_server *server)
1133 * @hostname: new end-point's hostname 1135 * @hostname: new end-point's hostname
1134 * @sap: new end-point's socket address 1136 * @sap: new end-point's socket address
1135 * @salen: size of "sap" 1137 * @salen: size of "sap"
1138 * @net: net namespace
1136 * 1139 *
1137 * The nfs_server must be quiescent before this function is invoked. 1140 * The nfs_server must be quiescent before this function is invoked.
1138 * Either its session is drained (NFSv4.1+), or its transport is 1141 * Either its session is drained (NFSv4.1+), or its transport is
@@ -1141,13 +1144,13 @@ static int nfs_probe_destination(struct nfs_server *server)
1141 * Returns zero on success, or a negative errno value. 1144 * Returns zero on success, or a negative errno value.
1142 */ 1145 */
1143int nfs4_update_server(struct nfs_server *server, const char *hostname, 1146int nfs4_update_server(struct nfs_server *server, const char *hostname,
1144 struct sockaddr *sap, size_t salen) 1147 struct sockaddr *sap, size_t salen, struct net *net)
1145{ 1148{
1146 struct nfs_client *clp = server->nfs_client; 1149 struct nfs_client *clp = server->nfs_client;
1147 struct rpc_clnt *clnt = server->client; 1150 struct rpc_clnt *clnt = server->client;
1148 struct xprt_create xargs = { 1151 struct xprt_create xargs = {
1149 .ident = clp->cl_proto, 1152 .ident = clp->cl_proto,
1150 .net = &init_net, 1153 .net = net,
1151 .dstaddr = sap, 1154 .dstaddr = sap,
1152 .addrlen = salen, 1155 .addrlen = salen,
1153 .servername = hostname, 1156 .servername = hostname,
@@ -1187,7 +1190,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
1187 error = nfs4_set_client(server, hostname, sap, salen, buf, 1190 error = nfs4_set_client(server, hostname, sap, salen, buf,
1188 clp->cl_rpcclient->cl_auth->au_flavor, 1191 clp->cl_rpcclient->cl_auth->au_flavor,
1189 clp->cl_proto, clnt->cl_timeout, 1192 clp->cl_proto, clnt->cl_timeout,
1190 clp->cl_minorversion, clp->cl_net); 1193 clp->cl_minorversion, net);
1191 nfs_put_client(clp); 1194 nfs_put_client(clp);
1192 if (error != 0) { 1195 if (error != 0) {
1193 nfs_server_insert_lists(server); 1196 nfs_server_insert_lists(server);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index b86464ba25e1..b9a35c05b60f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -91,10 +91,10 @@ static void filelayout_reset_write(struct nfs_write_data *data)
91 91
92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
93 dprintk("%s Reset task %5u for i/o through MDS " 93 dprintk("%s Reset task %5u for i/o through MDS "
94 "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, 94 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
95 data->task.tk_pid, 95 data->task.tk_pid,
96 hdr->inode->i_sb->s_id, 96 hdr->inode->i_sb->s_id,
97 (long long)NFS_FILEID(hdr->inode), 97 (unsigned long long)NFS_FILEID(hdr->inode),
98 data->args.count, 98 data->args.count,
99 (unsigned long long)data->args.offset); 99 (unsigned long long)data->args.offset);
100 100
@@ -112,10 +112,10 @@ static void filelayout_reset_read(struct nfs_read_data *data)
112 112
113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
114 dprintk("%s Reset task %5u for i/o through MDS " 114 dprintk("%s Reset task %5u for i/o through MDS "
115 "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, 115 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
116 data->task.tk_pid, 116 data->task.tk_pid,
117 hdr->inode->i_sb->s_id, 117 hdr->inode->i_sb->s_id,
118 (long long)NFS_FILEID(hdr->inode), 118 (unsigned long long)NFS_FILEID(hdr->inode),
119 data->args.count, 119 data->args.count,
120 (unsigned long long)data->args.offset); 120 (unsigned long long)data->args.offset);
121 121
@@ -324,8 +324,9 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
324 &rdata->res.seq_res, 324 &rdata->res.seq_res,
325 task)) 325 task))
326 return; 326 return;
327 nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, 327 if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
328 rdata->args.lock_context, FMODE_READ); 328 rdata->args.lock_context, FMODE_READ) == -EIO)
329 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
329} 330}
330 331
331static void filelayout_read_call_done(struct rpc_task *task, void *data) 332static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -335,8 +336,10 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
335 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); 336 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
336 337
337 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && 338 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
338 task->tk_status == 0) 339 task->tk_status == 0) {
340 nfs41_sequence_done(task, &rdata->res.seq_res);
339 return; 341 return;
342 }
340 343
341 /* Note this may cause RPC to be resent */ 344 /* Note this may cause RPC to be resent */
342 rdata->header->mds_ops->rpc_call_done(task, data); 345 rdata->header->mds_ops->rpc_call_done(task, data);
@@ -433,8 +436,9 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
433 &wdata->res.seq_res, 436 &wdata->res.seq_res,
434 task)) 437 task))
435 return; 438 return;
436 nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, 439 if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
437 wdata->args.lock_context, FMODE_WRITE); 440 wdata->args.lock_context, FMODE_WRITE) == -EIO)
441 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
438} 442}
439 443
440static void filelayout_write_call_done(struct rpc_task *task, void *data) 444static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -442,8 +446,10 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
442 struct nfs_write_data *wdata = data; 446 struct nfs_write_data *wdata = data;
443 447
444 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && 448 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
445 task->tk_status == 0) 449 task->tk_status == 0) {
450 nfs41_sequence_done(task, &wdata->res.seq_res);
446 return; 451 return;
452 }
447 453
448 /* Note this may cause RPC to be resent */ 454 /* Note this may cause RPC to be resent */
449 wdata->header->mds_ops->rpc_call_done(task, data); 455 wdata->header->mds_ops->rpc_call_done(task, data);
@@ -1216,17 +1222,17 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1216 struct pnfs_commit_bucket *b; 1222 struct pnfs_commit_bucket *b;
1217 int i; 1223 int i;
1218 1224
1219 /* NOTE cinfo->lock is NOT held, relying on fact that this is 1225 spin_lock(cinfo->lock);
1220 * only called on single thread per dreq.
1221 * Can't take the lock because need to do pnfs_put_lseg
1222 */
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1226 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1227 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1228 spin_unlock(cinfo->lock);
1225 pnfs_put_lseg(b->wlseg); 1229 pnfs_put_lseg(b->wlseg);
1226 b->wlseg = NULL; 1230 b->wlseg = NULL;
1231 spin_lock(cinfo->lock);
1227 } 1232 }
1228 } 1233 }
1229 cinfo->ds->nwritten = 0; 1234 cinfo->ds->nwritten = 0;
1235 spin_unlock(cinfo->lock);
1230} 1236}
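The deleted comment relied on single-threaded callers to justify skipping cinfo->lock entirely; the new code holds the lock while scanning the buckets and drops it only around pnfs_put_lseg(), which may release the last reference. The general shape of that idiom, sketched with illustrative names:

	spin_lock(lock);
	for (i = 0; i < nbuckets; i++) {
		if (bucket_drained(&buckets[i])) {
			spin_unlock(lock);
			put_segment(buckets[i].seg);	/* may sleep or free */
			buckets[i].seg = NULL;
			spin_lock(lock);
		}
	}
	spin_unlock(lock);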
1231 1237
1232static unsigned int 1238static unsigned int
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index c7c295e556ed..efac602edb37 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
95 b6 = (struct sockaddr_in6 *)addr2; 95 b6 = (struct sockaddr_in6 *)addr2;
96 96
97 /* LINKLOCAL addresses must have matching scope_id */ 97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_scope(&a6->sin6_addr) == 98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL && 99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id) 100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false; 101 return false;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4e7f05d3e9db..3d5dbf80d46a 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry,
121} 121}
122 122
123static size_t nfs_parse_server_name(char *string, size_t len, 123static size_t nfs_parse_server_name(char *string, size_t len,
124 struct sockaddr *sa, size_t salen, struct nfs_server *server) 124 struct sockaddr *sa, size_t salen, struct net *net)
125{ 125{
126 struct net *net = rpc_net_ns(server->client);
127 ssize_t ret; 126 ssize_t ret;
128 127
129 ret = rpc_pton(net, string, len, sa, salen); 128 ret = rpc_pton(net, string, len, sa, salen);
@@ -223,6 +222,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
223 const struct nfs4_fs_location *location) 222 const struct nfs4_fs_location *location)
224{ 223{
225 const size_t addr_bufsize = sizeof(struct sockaddr_storage); 224 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
225 struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);
226 struct vfsmount *mnt = ERR_PTR(-ENOENT); 226 struct vfsmount *mnt = ERR_PTR(-ENOENT);
227 char *mnt_path; 227 char *mnt_path;
228 unsigned int maxbuflen; 228 unsigned int maxbuflen;
@@ -248,8 +248,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
248 continue; 248 continue;
249 249
250 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 250 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
251 mountdata->addr, addr_bufsize, 251 mountdata->addr, addr_bufsize, net);
252 NFS_SB(mountdata->sb));
253 if (mountdata->addrlen == 0) 252 if (mountdata->addrlen == 0)
254 continue; 253 continue;
255 254
@@ -419,6 +418,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
419 const struct nfs4_fs_location *location) 418 const struct nfs4_fs_location *location)
420{ 419{
421 const size_t addr_bufsize = sizeof(struct sockaddr_storage); 420 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
421 struct net *net = rpc_net_ns(server->client);
422 struct sockaddr *sap; 422 struct sockaddr *sap;
423 unsigned int s; 423 unsigned int s;
424 size_t salen; 424 size_t salen;
@@ -440,7 +440,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
440 continue; 440 continue;
441 441
442 salen = nfs_parse_server_name(buf->data, buf->len, 442 salen = nfs_parse_server_name(buf->data, buf->len,
443 sap, addr_bufsize, server); 443 sap, addr_bufsize, net);
444 if (salen == 0) 444 if (salen == 0)
445 continue; 445 continue;
446 rpc_set_port(sap, NFS_PORT); 446 rpc_set_port(sap, NFS_PORT);
@@ -450,7 +450,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
450 if (hostname == NULL) 450 if (hostname == NULL)
451 break; 451 break;
452 452
453 error = nfs4_update_server(server, hostname, sap, salen); 453 error = nfs4_update_server(server, hostname, sap, salen, net);
454 kfree(hostname); 454 kfree(hostname);
455 if (error == 0) 455 if (error == 0)
456 break; 456 break;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15052b81df42..450bfedbe2f4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -539,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task,
539 struct nfs4_slot *slot = res->sr_slot; 539 struct nfs4_slot *slot = res->sr_slot;
540 struct nfs4_slot_table *tbl; 540 struct nfs4_slot_table *tbl;
541 541
542 if (!RPC_WAS_SENT(task)) 542 if (slot == NULL)
543 goto out; 543 goto out;
544 544
545 tbl = slot->table; 545 tbl = slot->table;
@@ -559,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
559{ 559{
560 struct nfs4_session *session; 560 struct nfs4_session *session;
561 struct nfs4_slot_table *tbl; 561 struct nfs4_slot_table *tbl;
562 struct nfs4_slot *slot = res->sr_slot;
562 bool send_new_highest_used_slotid = false; 563 bool send_new_highest_used_slotid = false;
563 564
564 if (!res->sr_slot) { 565 tbl = slot->table;
565 /* just wake up the next guy waiting since
566 * we may have not consumed a slot after all */
567 dprintk("%s: No slot\n", __func__);
568 return;
569 }
570 tbl = res->sr_slot->table;
571 session = tbl->session; 566 session = tbl->session;
572 567
573 spin_lock(&tbl->slot_tbl_lock); 568 spin_lock(&tbl->slot_tbl_lock);
@@ -577,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
577 if (tbl->highest_used_slotid > tbl->target_highest_slotid) 572 if (tbl->highest_used_slotid > tbl->target_highest_slotid)
578 send_new_highest_used_slotid = true; 573 send_new_highest_used_slotid = true;
579 574
580 if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { 575 if (nfs41_wake_and_assign_slot(tbl, slot)) {
581 send_new_highest_used_slotid = false; 576 send_new_highest_used_slotid = false;
582 goto out_unlock; 577 goto out_unlock;
583 } 578 }
584 nfs4_free_slot(tbl, res->sr_slot); 579 nfs4_free_slot(tbl, slot);
585 580
586 if (tbl->highest_used_slotid != NFS4_NO_SLOT) 581 if (tbl->highest_used_slotid != NFS4_NO_SLOT)
587 send_new_highest_used_slotid = false; 582 send_new_highest_used_slotid = false;
@@ -592,19 +587,20 @@ out_unlock:
592 nfs41_server_notify_highest_slotid_update(session->clp); 587 nfs41_server_notify_highest_slotid_update(session->clp);
593} 588}
594 589
595static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 590int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
596{ 591{
597 struct nfs4_session *session; 592 struct nfs4_session *session;
598 struct nfs4_slot *slot; 593 struct nfs4_slot *slot = res->sr_slot;
599 struct nfs_client *clp; 594 struct nfs_client *clp;
600 bool interrupted = false; 595 bool interrupted = false;
601 int ret = 1; 596 int ret = 1;
602 597
598 if (slot == NULL)
599 goto out_noaction;
603 /* don't increment the sequence number if the task wasn't sent */ 600 /* don't increment the sequence number if the task wasn't sent */
604 if (!RPC_WAS_SENT(task)) 601 if (!RPC_WAS_SENT(task))
605 goto out; 602 goto out;
606 603
607 slot = res->sr_slot;
608 session = slot->table->session; 604 session = slot->table->session;
609 605
610 if (slot->interrupted) { 606 if (slot->interrupted) {
@@ -679,6 +675,7 @@ out:
679 /* The session may be reset by one of the error handlers. */ 675 /* The session may be reset by one of the error handlers. */
680 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 676 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
681 nfs41_sequence_free_slot(res); 677 nfs41_sequence_free_slot(res);
678out_noaction:
682 return ret; 679 return ret;
683retry_nowait: 680retry_nowait:
684 if (rpc_restart_call_prepare(task)) { 681 if (rpc_restart_call_prepare(task)) {
@@ -692,6 +689,7 @@ out_retry:
692 rpc_delay(task, NFS4_POLL_RETRY_MAX); 689 rpc_delay(task, NFS4_POLL_RETRY_MAX);
693 return 0; 690 return 0;
694} 691}
692EXPORT_SYMBOL_GPL(nfs41_sequence_done);
695 693
696static int nfs4_sequence_done(struct rpc_task *task, 694static int nfs4_sequence_done(struct rpc_task *task,
697 struct nfs4_sequence_res *res) 695 struct nfs4_sequence_res *res)
@@ -1622,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1622{ 1620{
1623 struct nfs4_opendata *data = calldata; 1621 struct nfs4_opendata *data = calldata;
1624 1622
1625 nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, 1623 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args,
1626 &data->o_res.seq_res, task); 1624 &data->c_res.seq_res, task);
1627} 1625}
1628 1626
1629static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1627static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1630{ 1628{
1631 struct nfs4_opendata *data = calldata; 1629 struct nfs4_opendata *data = calldata;
1632 1630
1633 nfs40_sequence_done(task, &data->o_res.seq_res); 1631 nfs40_sequence_done(task, &data->c_res.seq_res);
1634 1632
1635 data->rpc_status = task->tk_status; 1633 data->rpc_status = task->tk_status;
1636 if (data->rpc_status == 0) { 1634 if (data->rpc_status == 0) {
@@ -1688,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
1688 }; 1686 };
1689 int status; 1687 int status;
1690 1688
1691 nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); 1689 nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);
1692 kref_get(&data->kref); 1690 kref_get(&data->kref);
1693 data->rpc_done = 0; 1691 data->rpc_done = 0;
1694 data->rpc_status = 0; 1692 data->rpc_status = 0;
@@ -2400,13 +2398,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2400 2398
2401 if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { 2399 if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
2402 /* Use that stateid */ 2400 /* Use that stateid */
2403 } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { 2401 } else if (truncate && state != NULL) {
2404 struct nfs_lockowner lockowner = { 2402 struct nfs_lockowner lockowner = {
2405 .l_owner = current->files, 2403 .l_owner = current->files,
2406 .l_pid = current->tgid, 2404 .l_pid = current->tgid,
2407 }; 2405 };
2408 nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, 2406 if (!nfs4_valid_open_stateid(state))
2409 &lockowner); 2407 return -EBADF;
2408 if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
2409 &lockowner) == -EIO)
2410 return -EBADF;
2410 } else 2411 } else
2411 nfs4_stateid_copy(&arg.stateid, &zero_stateid); 2412 nfs4_stateid_copy(&arg.stateid, &zero_stateid);
2412 2413
@@ -2744,7 +2745,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2744 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| 2745 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
2745 NFS_CAP_CTIME|NFS_CAP_MTIME| 2746 NFS_CAP_CTIME|NFS_CAP_MTIME|
2746 NFS_CAP_SECURITY_LABEL); 2747 NFS_CAP_SECURITY_LABEL);
2747 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2748 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
2749 res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
2748 server->caps |= NFS_CAP_ACLS; 2750 server->caps |= NFS_CAP_ACLS;
2749 if (res.has_links != 0) 2751 if (res.has_links != 0)
2750 server->caps |= NFS_CAP_HARDLINKS; 2752 server->caps |= NFS_CAP_HARDLINKS;
@@ -4012,8 +4014,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
4012{ 4014{
4013 nfs4_stateid current_stateid; 4015 nfs4_stateid current_stateid;
4014 4016
4015 if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) 4017 /* If the current stateid represents a lost lock, then exit */
4016 return false; 4018 if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO)
4019 return true;
4017 return nfs4_stateid_match(stateid, &current_stateid); 4020 return nfs4_stateid_match(stateid, &current_stateid);
4018} 4021}
4019 4022
@@ -4321,9 +4324,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
4321 4324
4322static inline int nfs4_server_supports_acls(struct nfs_server *server) 4325static inline int nfs4_server_supports_acls(struct nfs_server *server)
4323{ 4326{
4324 return (server->caps & NFS_CAP_ACLS) 4327 return server->caps & NFS_CAP_ACLS;
4325 && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
4326 && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
4327} 4328}
4328 4329
4329/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that 4330/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
@@ -5831,8 +5832,7 @@ struct nfs_release_lockowner_data {
5831 struct nfs4_lock_state *lsp; 5832 struct nfs4_lock_state *lsp;
5832 struct nfs_server *server; 5833 struct nfs_server *server;
5833 struct nfs_release_lockowner_args args; 5834 struct nfs_release_lockowner_args args;
5834 struct nfs4_sequence_args seq_args; 5835 struct nfs_release_lockowner_res res;
5835 struct nfs4_sequence_res seq_res;
5836 unsigned long timestamp; 5836 unsigned long timestamp;
5837}; 5837};
5838 5838
@@ -5840,7 +5840,7 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
5840{ 5840{
5841 struct nfs_release_lockowner_data *data = calldata; 5841 struct nfs_release_lockowner_data *data = calldata;
5842 nfs40_setup_sequence(data->server, 5842 nfs40_setup_sequence(data->server,
5843 &data->seq_args, &data->seq_res, task); 5843 &data->args.seq_args, &data->res.seq_res, task);
5844 data->timestamp = jiffies; 5844 data->timestamp = jiffies;
5845} 5845}
5846 5846
@@ -5849,7 +5849,7 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5849 struct nfs_release_lockowner_data *data = calldata; 5849 struct nfs_release_lockowner_data *data = calldata;
5850 struct nfs_server *server = data->server; 5850 struct nfs_server *server = data->server;
5851 5851
5852 nfs40_sequence_done(task, &data->seq_res); 5852 nfs40_sequence_done(task, &data->res.seq_res);
5853 5853
5854 switch (task->tk_status) { 5854 switch (task->tk_status) {
5855 case 0: 5855 case 0:
@@ -5890,7 +5890,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5890 data = kmalloc(sizeof(*data), GFP_NOFS); 5890 data = kmalloc(sizeof(*data), GFP_NOFS);
5891 if (!data) 5891 if (!data)
5892 return -ENOMEM; 5892 return -ENOMEM;
5893 nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
5894 data->lsp = lsp; 5893 data->lsp = lsp;
5895 data->server = server; 5894 data->server = server;
5896 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 5895 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5898,6 +5897,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5898 data->args.lock_owner.s_dev = server->s_dev; 5897 data->args.lock_owner.s_dev = server->s_dev;
5899 5898
5900 msg.rpc_argp = &data->args; 5899 msg.rpc_argp = &data->args;
5900 msg.rpc_resp = &data->res;
5901 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
5901 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 5902 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5902 return 0; 5903 return 0;
5903} 5904}
@@ -7409,9 +7410,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7409 struct nfs_server *server = NFS_SERVER(inode); 7410 struct nfs_server *server = NFS_SERVER(inode);
7410 struct pnfs_layout_hdr *lo; 7411 struct pnfs_layout_hdr *lo;
7411 struct nfs4_state *state = NULL; 7412 struct nfs4_state *state = NULL;
7412 unsigned long timeo, giveup; 7413 unsigned long timeo, now, giveup;
7413 7414
7414 dprintk("--> %s\n", __func__); 7415 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
7415 7416
7416 if (!nfs41_sequence_done(task, &lgp->res.seq_res)) 7417 if (!nfs41_sequence_done(task, &lgp->res.seq_res))
7417 goto out; 7418 goto out;
@@ -7419,12 +7420,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7419 switch (task->tk_status) { 7420 switch (task->tk_status) {
7420 case 0: 7421 case 0:
7421 goto out; 7422 goto out;
7423 /*
7424 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
7425 * (or clients) writing to the same RAID stripe
7426 */
7422 case -NFS4ERR_LAYOUTTRYLATER: 7427 case -NFS4ERR_LAYOUTTRYLATER:
7428 /*
 7429 * NFS4ERR_RECALLCONFLICT is a conflict with ourselves (we must recall
 7430 * an existing layout before getting a new one).
7431 */
7423 case -NFS4ERR_RECALLCONFLICT: 7432 case -NFS4ERR_RECALLCONFLICT:
7424 timeo = rpc_get_timeout(task->tk_client); 7433 timeo = rpc_get_timeout(task->tk_client);
7425 giveup = lgp->args.timestamp + timeo; 7434 giveup = lgp->args.timestamp + timeo;
7426 if (time_after(giveup, jiffies)) 7435 now = jiffies;
7427 task->tk_status = -NFS4ERR_DELAY; 7436 if (time_after(giveup, now)) {
7437 unsigned long delay;
7438
7439 /* Delay for:
 7440 * - not less than NFS4_POLL_RETRY_MIN,
 7441 * - at most until one jiffy before we give up,
 7442 * - otherwise exponential backoff (time_now minus start_attempt).
7443 */
7444 delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
7445 min((giveup - now - 1),
7446 now - lgp->args.timestamp));
7447
7448 dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
7449 __func__, delay);
7450 rpc_delay(task, delay);
7451 task->tk_status = 0;
7452 rpc_restart_call_prepare(task);
7453 goto out; /* Do not call nfs4_async_handle_error() */
7454 }
7428 break; 7455 break;
7429 case -NFS4ERR_EXPIRED: 7456 case -NFS4ERR_EXPIRED:
7430 case -NFS4ERR_BAD_STATEID: 7457 case -NFS4ERR_BAD_STATEID:
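To make the clamped backoff concrete, a worked example (assumptions: HZ = 1000, so NFS4_POLL_RETRY_MIN = HZ/10 = 100 jiffies; rpc_get_timeout() returns 90 * HZ; T = lgp->args.timestamp, so giveup = T + 90000):

	/* delay = max(100, min(giveup - now - 1, now - T)) */
	now = T +    50:  min(89949,    50) =    50  ->  delay =   100 (floor)
	now = T +  4000:  min(85999,  4000) =  4000  ->  delay =  4000
	now = T + 40000:  min(49999, 40000) = 40000  ->  delay = 40000
	now = T + 89950:  min(   49, 89950) =    49  ->  delay =   100; that
			  wakeup lands past giveup, time_after() fails, and
			  the error is handled instead of retried.

Because each retry sleeps roughly as long as the attempt has already run, the interval doubles per round until the giveup cap bites.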
@@ -7780,10 +7807,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7780 case -NFS4ERR_BADLAYOUT: /* no layout */ 7807 case -NFS4ERR_BADLAYOUT: /* no layout */
7781 case -NFS4ERR_GRACE: /* loca_reclaim always false */ 7808 case -NFS4ERR_GRACE: /* loca_reclaim always false */
7782 task->tk_status = 0; 7809 task->tk_status = 0;
7783 break;
7784 case 0: 7810 case 0:
7785 nfs_post_op_update_inode_force_wcc(data->args.inode,
7786 data->res.fattr);
7787 break; 7811 break;
7788 default: 7812 default:
7789 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7813 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
@@ -7798,6 +7822,8 @@ static void nfs4_layoutcommit_release(void *calldata)
 	struct nfs4_layoutcommit_data *data = calldata;
 
 	pnfs_cleanup_layoutcommit(data);
+	nfs_post_op_update_inode_force_wcc(data->args.inode,
+					   data->res.fattr);
 	put_rpccred(data->cred);
 	kfree(data);
 }
@@ -7920,7 +7946,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 	switch (err) {
 	case 0:
 	case -NFS4ERR_WRONGSEC:
-	case -NFS4ERR_NOTSUPP:
+	case -ENOTSUPP:
 		goto out;
 	default:
 		err = nfs4_handle_exception(server, err, &exception);
@@ -7954,7 +7980,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	 * Fall back on "guess and check" method if
 	 * the server doesn't support SECINFO_NO_NAME
 	 */
-	if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
+	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
 		err = nfs4_find_root_sec(server, fhandle, info);
 		goto out_freepage;
 	}
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index cf883c7ae053..e799dc3c3b1d 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -231,14 +231,23 @@ out:
 	return ret;
 }
 
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+	nfs4_shrink_slot_table(tbl, 0);
+}
+
 /**
- * nfs4_release_slot_table - release resources attached to a slot table
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
  * @tbl: slot table to shut down
  *
  */
-void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
 {
-	nfs4_shrink_slot_table(tbl, 0);
+	nfs4_release_slot_table(tbl);
+	rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
 }
 
 /**
@@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
 	spin_unlock(&tbl->slot_tbl_lock);
 }
 
-static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
 {
 	nfs4_release_slot_table(&session->fc_slot_table);
 	nfs4_release_slot_table(&session->bc_slot_table);
@@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	if (status && tbl->slots == NULL)
 		/* Fore and back channel share a connection so get
 		 * both slot tables or neither */
-		nfs4_destroy_session_slot_tables(ses);
+		nfs4_release_session_slot_tables(ses);
 	return status;
 }
 
@@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	return session;
 }
 
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+	nfs4_shutdown_slot_table(&session->fc_slot_table);
+	nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
 void nfs4_destroy_session(struct nfs4_session *session)
 {
 	struct rpc_xprt *xprt;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 232306100651..b34ada9bc6a2 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -74,7 +74,7 @@ enum nfs4_session_state {
 
 extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
 		unsigned int max_reqs, const char *queue);
-extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
 extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 059c01b67a71..0deb32105ccf 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
 		nfs4_stateid_copy(dst, &lsp->ls_stateid);
 		ret = 0;
-		smp_rmb();
-		if (!list_empty(&lsp->ls_seqid.list))
-			ret = -EWOULDBLOCK;
 	}
 	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
@@ -984,10 +981,9 @@ out:
 	return ret;
 }
 
-static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
 	const nfs4_stateid *src;
-	int ret;
 	int seq;
 
 	do {
@@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 		if (test_bit(NFS_OPEN_STATE, &state->flags))
 			src = &state->open_stateid;
 		nfs4_stateid_copy(dst, src);
-		ret = 0;
-		smp_rmb();
-		if (!list_empty(&state->owner->so_seqid.list))
-			ret = -EWOULDBLOCK;
 	} while (read_seqretry(&state->seqlock, seq));
-	return ret;
 }
 
 /*
@@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
 	if (ret == -EIO)
 		/* A lost lock - don't even consider delegations */
 		goto out;
-	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
+	/* returns true if delegation stateid found and copied */
+	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+		ret = 0;
 		goto out;
+	}
 	if (ret != -ENOENT)
 		/* nfs4_copy_delegation_stateid() didn't over-write
 		 * dst, so it still has the lock stateid which we now
 		 * choose to use.
 		 */
 		goto out;
-	ret = nfs4_copy_open_stateid(dst, state);
+	nfs4_copy_open_stateid(dst, state);
+	ret = 0;
 out:
 	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
 		dst->seqid = 0;
@@ -1071,7 +1066,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
 /*
  * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
  * failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
  */
 static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 {
@@ -1116,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
 /*
  * Increment the seqid if the LOCK/LOCKU succeeded, or
  * failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
  */
 void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
 {
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 65ab0a0ca1c4..808f29574412 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret = nfs_write_inode(inode, wbc);
 
-	if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
-		int status;
-		bool sync = true;
-
-		if (wbc->sync_mode == WB_SYNC_NONE)
-			sync = false;
-
-		status = pnfs_layoutcommit_inode(inode, sync);
-		if (status < 0)
-			return status;
-	}
+	if (ret == 0)
+		ret = pnfs_layoutcommit_inode(inode,
+				wbc->sync_mode == WB_SYNC_ALL);
 	return ret;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5be2868c02f1..72f3bf1754ef 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3097,7 +3097,8 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+		int *nfs_retval)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -3107,19 +3108,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (unlikely(!p))
 		goto out_overflow;
 	opnum = be32_to_cpup(p++);
-	if (opnum != expected) {
-		dprintk("nfs: Server returned operation"
-			" %d but we issued a request for %d\n",
-				opnum, expected);
-		return -EIO;
-	}
+	if (unlikely(opnum != expected))
+		goto out_bad_operation;
 	nfserr = be32_to_cpup(p);
-	if (nfserr != NFS_OK)
-		return nfs4_stat_to_errno(nfserr);
-	return 0;
+	if (nfserr == NFS_OK)
+		*nfs_retval = 0;
+	else
+		*nfs_retval = nfs4_stat_to_errno(nfserr);
+	return true;
+out_bad_operation:
+	dprintk("nfs: Server returned operation"
+		" %d but we issued a request for %d\n",
+			opnum, expected);
+	*nfs_retval = -EREMOTEIO;
+	return false;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	*nfs_retval = -EIO;
+	return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+	int retval;
+
+	__decode_op_hdr(xdr, expected, &retval);
+	return retval;
 }
 
 /* Dummy routine */
@@ -3435,7 +3449,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 {
 	__be32 *p;
 
-	*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
+	*res = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
@@ -5001,11 +5015,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t savewords, bmlen, i;
 	int status;
 
-	status = decode_op_hdr(xdr, OP_OPEN);
-	if (status != -EIO)
-		nfs_increment_open_seqid(status, res->seqid);
-	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+	if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+		return status;
+	nfs_increment_open_seqid(status, res->seqid);
+	if (status)
+		return status;
+	status = decode_stateid(xdr, &res->stateid);
 	if (unlikely(status))
 		return status;
 
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741e58b1..59f838cdc009 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,6 +36,7 @@
 	__print_flags(v, "|", \
 			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
 			{ 1 << NFS_INO_STALE, "STALE" }, \
+			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
 			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
 			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
 			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d75d938d36cb..4755858e37a0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+	unsigned long *bitlock = &NFS_I(inode)->flags;
+
+	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
 /*
  * There can be multiple RW segments.
  */
@@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
-	unsigned long *bitlock = &NFS_I(inode)->flags;
 
 	/* Matched by references in pnfs_set_layoutcommit */
 	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
@@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis
 		pnfs_put_lseg(lseg);
 	}
 
-	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
-	smp_mb__after_clear_bit();
-	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+	pnfs_clear_layoutcommitting(inode);
 }
 
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
@@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
-	int status = 0;
+	int status;
 
-	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
-
-	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+	if (!pnfs_layoutcommit_outstanding(inode))
 		return 0;
 
-	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
-	data = kzalloc(sizeof(*data), GFP_NOFS);
-	if (!data) {
-		status = -ENOMEM;
-		goto out;
-	}
-
-	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
-		goto out_free;
+	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
 
+	status = -EAGAIN;
 	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
-		if (!sync) {
-			status = -EAGAIN;
-			goto out_free;
-		}
-		status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
-					nfs_wait_bit_killable, TASK_KILLABLE);
+		if (!sync)
+			goto out;
+		status = wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_LAYOUTCOMMITTING,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
 		if (status)
-			goto out_free;
+			goto out;
 	}
 
-	INIT_LIST_HEAD(&data->lseg_list);
+	status = -ENOMEM;
+	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+	data = kzalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		goto clear_layoutcommitting;
+
+	status = 0;
 	spin_lock(&inode->i_lock);
-	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-		clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
-		spin_unlock(&inode->i_lock);
-		wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
-		goto out_free;
-	}
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		goto out_unlock;
 
+	INIT_LIST_HEAD(&data->lseg_list);
 	pnfs_list_write_lseg(inode, &data->lseg_list);
 
 	end_pos = nfsi->layout->plh_lwb;
@@ -1940,8 +1940,11 @@ out:
 	mark_inode_dirty_sync(inode);
 	dprintk("<-- %s status %d\n", __func__, status);
 	return status;
-out_free:
+out_unlock:
+	spin_unlock(&inode->i_lock);
 	kfree(data);
+clear_layoutcommitting:
+	pnfs_clear_layoutcommitting(inode);
 	goto out;
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a4f41810a7f4..023793909778 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 		PNFS_LAYOUTRET_ON_SETATTR;
 }
 
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+		test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
@@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 	return false;
 }
 
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	return false;
+}
+
+
 static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	return NULL;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 31db5c366b81..411aedda14bb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -163,9 +163,9 @@ static void nfs_readpage_release(struct nfs_page *req)
 
 	unlock_page(req->wb_page);
 
-	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
+	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
 		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 		req->wb_bytes,
 		(long long)req_offset(req));
 	nfs_release_request(req);
@@ -228,11 +228,11 @@ int nfs_initiate_read(struct rpc_clnt *clnt,
 	/* Set up the initial task struct. */
 	NFS_PROTO(inode)->read_setup(data, &msg);
 
-	dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+	dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ "
 			"offset %llu)\n",
 			data->task.tk_pid,
 			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			(unsigned long long)NFS_FILEID(inode),
 			data->args.count,
 			(unsigned long long)data->args.offset);
 
@@ -630,9 +630,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	unsigned long npages;
 	int ret = -ESTALE;
 
-	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
+	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
 			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			(unsigned long long)NFS_FILEID(inode),
 			nr_pages);
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c1d548211c31..9a3b6a4cd6b9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -909,9 +909,14 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
  */
 static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
+
 	if (nfs_have_delegated_attributes(inode))
 		goto out;
-	if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+	if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+		return false;
+	smp_rmb();
+	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
 		return false;
 out:
 	return PageUptodate(page) != 0;
@@ -922,19 +927,20 @@ out:
  * extend the write to cover the entire page in order to avoid fragmentation
  * inefficiencies.
  *
- * If the file is opened for synchronous writes or if we have a write delegation
- * from the server then we can just skip the rest of the checks.
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
  */
 static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
 {
 	if (file->f_flags & O_DSYNC)
 		return 0;
+	if (!nfs_write_pageuptodate(page, inode))
+		return 0;
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
 		return 1;
-	if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
-		(inode->i_flock->fl_start == 0 &&
+	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
 		inode->i_flock->fl_end == OFFSET_MAX &&
-		inode->i_flock->fl_type != F_RDLCK)))
+		inode->i_flock->fl_type != F_RDLCK))
 		return 1;
 	return 0;
 }
@@ -1013,10 +1019,10 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
 	NFS_PROTO(inode)->write_setup(data, &msg);
 
 	dprintk("NFS: %5u initiated write call "
-		"(req %s/%lld, %u bytes @ offset %llu)\n",
+		"(req %s/%llu, %u bytes @ offset %llu)\n",
 		data->task.tk_pid,
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
+		(unsigned long long)NFS_FILEID(inode),
 		data->args.count,
 		(unsigned long long)data->args.offset);
 
@@ -1606,9 +1612,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		nfs_list_remove_request(req);
 		nfs_clear_page_commit(req->wb_page);
 
-		dprintk("NFS: commit (%s/%lld %d@%lld)",
+		dprintk("NFS: commit (%s/%llu %d@%lld)",
 			req->wb_context->dentry->d_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (status < 0) {
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 8b186a4955cc..a812fd1b92a4 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -35,7 +35,9 @@
 #ifndef LINUX_NFS4_ACL_H
 #define LINUX_NFS4_ACL_H
 
-#include <linux/posix_acl.h>
+struct nfs4_acl;
+struct svc_fh;
+struct svc_rqst;
 
 /* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
  * fit in a page: */
@@ -43,15 +45,11 @@
 
 struct nfs4_acl *nfs4_acl_new(int);
 int nfs4_acl_get_whotype(char *, u32);
-int nfs4_acl_write_who(int who, char *p);
+__be32 nfs4_acl_write_who(int who, __be32 **p, int *len);
 
-#define NFS4_ACL_TYPE_DEFAULT	0x01
-#define NFS4_ACL_DIR		0x02
-#define NFS4_ACL_OWNER		0x04
-
-struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
-				struct posix_acl *, unsigned int flags);
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
-				struct posix_acl **, unsigned int flags);
+int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl);
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl);
 
 #endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index d5c5b3e00266..b582f9ab6b2a 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
 int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
 
-#ifdef CONFIG_NFSD_V4
-void	nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
-#else  /* CONFIG_NFSD_V4 */
-static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
-}
-#endif /* CONFIG_NFSD_V4 */
-
 #endif /* NFSCACHE_H */
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index bf95f6b817a4..66e58db01936 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net)
 
 __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
 __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
-int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *);
-int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);
+__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *);
+__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *);
 
 #endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 849a7c3ced22..d32b3aa6600d 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -95,6 +95,7 @@ struct nfsd_net {
 	time_t nfsd4_grace;
 
 	bool nfsd_net_up;
+	bool lockd_up;
 
 	/*
 	 * Time of server startup
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 95d76dc6c5da..11c1fba29312 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
-	svc_fh *fh;
 	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
 	__be32 nfserr = 0;
 
 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
@@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
+	inode = fh->fh_dentry->d_inode;
+
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
 	resp->mask = argp->mask;
@@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		goto fail;
 
 	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		if (acl == NULL) {
 			/* Solaris returns the inode's minimum ACL. */
-
-			struct inode *inode = fh->fh_dentry->d_inode;
 			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
 		}
 		resp->acl_access = acl;
@@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
 		/* Check how Solaris handles requests for the Default ACL
 		   of a non-directory! */
-
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		resp->acl_default = acl;
 	}
@@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd_attrstat *resp)
 {
+	struct inode *inode;
 	svc_fh *fh;
 	__be32 nfserr = 0;
+	int error;
 
 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
 
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_ACCESS, argp->acl_access) );
-	}
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
-	}
-	if (!nfserr) {
-		nfserr = fh_getattr(fh, &resp->stat);
+	inode = fh->fh_dentry->d_inode;
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
 	}
 
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+	if (error)
+		goto out_drop_write;
+
+	fh_drop_write(fh);
+
+	nfserr = fh_getattr(fh, &resp->stat);
+
+out:
 	/* argp->acl_{access,default} may have been allocated in
 	   nfssvc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
 	posix_acl_release(argp->acl_default);
 	return nfserr;
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+	goto out;
 }
 
 /*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9cbc1a841f87..adc5f1b1dc26 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
-	svc_fh *fh;
 	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
 	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
@@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
+	inode = fh->fh_dentry->d_inode;
+
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
 	resp->mask = argp->mask;
 
 	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		if (acl == NULL) {
 			/* Solaris returns the inode's minimum ACL. */
-
-			struct inode *inode = fh->fh_dentry->d_inode;
 			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
 		}
 		resp->acl_access = acl;
@@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
 		/* Check how Solaris handles requests for the Default ACL
 		   of a non-directory! */
-
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		resp->acl_default = acl;
 	}
@@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd3_attrstat *resp)
 {
+	struct inode *inode;
 	svc_fh *fh;
 	__be32 nfserr = 0;
+	int error;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
 
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_ACCESS, argp->acl_access) );
-	}
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
+	inode = fh->fh_dentry->d_inode;
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
 	}
 
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+out:
 	/* argp->acl_{access,default} may have been allocated in
 	   nfs3svc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 14d9ecb96cff..de6e39e12cb3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 		 struct kstat *stat)
 {
 	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
-	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
 	*p++ = htonl((u32) stat->nlink);
 	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
 	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
@@ -842,21 +842,21 @@ out:
 
 static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
 {
-	struct svc_fh	fh;
+	struct svc_fh	*fh = &cd->scratch;
 	__be32 err;
 
-	fh_init(&fh, NFS3_FHSIZE);
-	err = compose_entry_fh(cd, &fh, name, namlen);
+	fh_init(fh, NFS3_FHSIZE);
+	err = compose_entry_fh(cd, fh, name, namlen);
 	if (err) {
 		*p++ = 0;
 		*p++ = 0;
 		goto out;
 	}
-	p = encode_post_op_attr(cd->rqstp, p, &fh);
+	p = encode_post_op_attr(cd->rqstp, p, fh);
 	*p++ = xdr_one;			/* yes, a file handle follows */
-	p = encode_fh(p, &fh);
+	p = encode_fh(p, fh);
 out:
-	fh_put(&fh);
+	fh_put(fh);
 	return p;
 }
 
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 8a50b3c18093..d190e33d0ec2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -37,8 +37,14 @@
 #include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/export.h>
+#include "nfsfh.h"
+#include "nfsd.h"
 #include "acl.h"
+#include "vfs.h"
 
+#define NFS4_ACL_TYPE_DEFAULT	0x01
+#define NFS4_ACL_DIR		0x02
+#define NFS4_ACL_OWNER		0x04
 
 /* mode bit translations: */
 #define NFS4_READ_MODE (NFS4_ACE_READ_DATA)
@@ -130,36 +136,47 @@ static short ace2type(struct nfs4_ace *);
 static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
 				unsigned int);
 
-struct nfs4_acl *
-nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
-			unsigned int flags)
+int
+nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl)
 {
-	struct nfs4_acl *acl;
+	struct inode *inode = dentry->d_inode;
+	int error = 0;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
 	int size = 0;
 
-	if (pacl) {
-		if (posix_acl_valid(pacl) < 0)
-			return ERR_PTR(-EINVAL);
-		size += 2*pacl->a_count;
+	pacl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (!pacl) {
+		pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(pacl))
+			return PTR_ERR(pacl);
 	}
-	if (dpacl) {
-		if (posix_acl_valid(dpacl) < 0)
-			return ERR_PTR(-EINVAL);
-		size += 2*dpacl->a_count;
+	/* allocate for worst case: one (deny, allow) pair each: */
+	size += 2 * pacl->a_count;
+
+	if (S_ISDIR(inode->i_mode)) {
+		flags = NFS4_ACL_DIR;
+		dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (dpacl)
+			size += 2 * dpacl->a_count;
 	}
 
-	/* Allocate for worst case: one (deny, allow) pair each: */
-	acl = nfs4_acl_new(size);
-	if (acl == NULL)
-		return ERR_PTR(-ENOMEM);
+	*acl = nfs4_acl_new(size);
+	if (*acl == NULL) {
+		error = -ENOMEM;
+		goto out;
+	}
 
-	if (pacl)
-		_posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
+	_posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
 
 	if (dpacl)
-		_posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT);
+		_posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
 
-	return acl;
+ out:
+	posix_acl_release(pacl);
+	posix_acl_release(dpacl);
+	return error;
 }
 
 struct posix_acl_summary {
@@ -719,8 +736,9 @@ static void process_one_v4_ace(struct posix_acl_state *state,
 	}
 }
 
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
-		struct posix_acl **dpacl, unsigned int flags)
+static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+		struct posix_acl **pacl, struct posix_acl **dpacl,
+		unsigned int flags)
 {
 	struct posix_acl_state effective_acl_state, default_acl_state;
 	struct nfs4_ace *ace;
@@ -780,6 +798,57 @@ out_estate:
 	return ret;
 }
 
+__be32
+nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
+
+	/* Get inode */
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+	inode = dentry->d_inode;
+
+	if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
+		return nfserr_attrnotsupp;
+
+	if (S_ISDIR(inode->i_mode))
+		flags = NFS4_ACL_DIR;
+
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL)
+		return nfserr_attrnotsupp;
+	if (host_error < 0)
+		goto out_nfserr;
+
+	host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS);
+	if (host_error < 0)
+		goto out_release;
+
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = inode->i_op->set_acl(inode, dpacl,
+						  ACL_TYPE_DEFAULT);
+	}
+
+out_release:
+	posix_acl_release(pacl);
+	posix_acl_release(dpacl);
+out_nfserr:
+	if (host_error == -EOPNOTSUPP)
+		return nfserr_attrnotsupp;
+	else
+		return nfserrno(host_error);
+}
+
+
 static short
 ace2type(struct nfs4_ace *ace)
 {
@@ -798,9 +867,6 @@ ace2type(struct nfs4_ace *ace)
 	return -1;
 }
 
-EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
-EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
-
 struct nfs4_acl *
 nfs4_acl_new(int n)
 {
@@ -848,21 +914,22 @@ nfs4_acl_get_whotype(char *p, u32 len)
 	return NFS4_ACL_WHO_NAMED;
 }
 
-int
-nfs4_acl_write_who(int who, char *p)
+__be32 nfs4_acl_write_who(int who, __be32 **p, int *len)
 {
 	int i;
+	int bytes;
 
 	for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
-		if (s2t_map[i].type == who) {
-			memcpy(p, s2t_map[i].string, s2t_map[i].stringlen);
-			return s2t_map[i].stringlen;
-		}
+		if (s2t_map[i].type != who)
+			continue;
+		bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2);
+		if (bytes > *len)
+			return nfserr_resource;
+		*p = xdr_encode_opaque(*p, s2t_map[i].string,
+					s2t_map[i].stringlen);
+		*len -= bytes;
+		return 0;
 	}
-	BUG();
+	WARN_ON_ONCE(1);
 	return -1;
 }
-
-EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_get_whotype);
-EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4832fd819f88..c0dfde68742e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -551,27 +551,46 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 	return 0;
 }
 
-static int
-idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen)
+{
+	char buf[11];
+	int len;
+	int bytes;
+
+	len = sprintf(buf, "%u", id);
+	bytes = 4 + (XDR_QUADLEN(len) << 2);
+	if (bytes > *buflen)
+		return nfserr_resource;
+	*p = xdr_encode_opaque(*p, buf, len);
+	*buflen -= bytes;
+	return 0;
+}
+
+static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
 {
 	struct ent *item, key = {
 		.id = id,
 		.type = type,
 	};
 	int ret;
+	int bytes;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
 	if (ret == -ENOENT)
-		return sprintf(name, "%u", id);
+		return encode_ascii_id(id, p, buflen);
 	if (ret)
-		return ret;
+		return nfserrno(ret);
 	ret = strlen(item->name);
-	BUG_ON(ret > IDMAP_NAMESZ);
-	memcpy(name, item->name, ret);
+	WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+	bytes = 4 + (XDR_QUADLEN(ret) << 2);
+	if (bytes > *buflen)
+		return nfserr_resource;
+	*p = xdr_encode_opaque(*p, item->name, ret);
+	*buflen -= bytes;
 	cache_put(&item->h, nn->idtoname_cache);
-	return ret;
+	return 0;
 }
 
 static bool
@@ -603,12 +622,11 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
 	return idmap_name_to_id(rqstp, type, name, namelen, id);
 }
 
-static int
-do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
 {
 	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
-		return sprintf(name, "%u", id);
-	return idmap_id_to_name(rqstp, type, id, name);
+		return encode_ascii_id(id, p, buflen);
+	return idmap_id_to_name(rqstp, type, id, p, buflen);
 }
 
 __be32
@@ -637,16 +655,14 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 	return status;
 }
 
-int
-nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)
+__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen)
 {
 	u32 id = from_kuid(&init_user_ns, uid);
-	return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
+	return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen);
 }
 
-int
-nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)
+__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen)
 {
 	u32 id = from_kgid(&init_user_ns, gid);
-	return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
+	return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen);
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 419572f33b72..82189b208af3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -41,6 +41,7 @@
 #include "vfs.h"
 #include "current_stateid.h"
 #include "netns.h"
+#include "acl.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -230,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
 }
 
 static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
 {
 	struct svc_fh *current_fh = &cstate->current_fh;
-	struct svc_fh *resfh;
 	int accmode;
 	__be32 status;
 
-	resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
-	if (!resfh)
+	*resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+	if (!*resfh)
 		return nfserr_jukebox;
-	fh_init(resfh, NFS4_FHSIZE);
+	fh_init(*resfh, NFS4_FHSIZE);
 	open->op_truncate = 0;
 
 	if (open->op_create) {
@@ -265,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		 */
 		status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
 					open->op_fname.len, &open->op_iattr,
-					resfh, open->op_createmode,
+					*resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
 					&open->op_truncate, &open->op_created);
 
 		if (!status && open->op_label.len)
-			nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
 
 		/*
 		 * Following rfc 3530 14.2.16, use the returned bitmask
@@ -280,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
 			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
-	} else {
+	} else
+		/*
+		 * Note this may exit with the parent still locked.
+		 * We will hold the lock until nfsd4_open's final
+		 * lookup, to prevent renames or unlinks until we've had
+		 * a chance to an acquire a delegation if appropriate.
+		 */
 		status = nfsd_lookup(rqstp, current_fh,
-				     open->op_fname.data, open->op_fname.len, resfh);
-		fh_unlock(current_fh);
-	}
+				     open->op_fname.data, open->op_fname.len, *resfh);
 	if (status)
 		goto out;
-	status = nfsd_check_obj_isreg(resfh);
+	status = nfsd_check_obj_isreg(*resfh);
 	if (status)
 		goto out;
 
 	if (is_create_with_attrs(open) && open->op_acl != NULL)
-		do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
+		do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
 
-	nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
+	nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
 	accmode = NFSD_MAY_NOP;
 	if (open->op_created ||
 			open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
 		accmode |= NFSD_MAY_OWNER_OVERRIDE;
-	status = do_open_permission(rqstp, resfh, open, accmode);
+	status = do_open_permission(rqstp, *resfh, open, accmode);
 	set_change_info(&open->op_cinfo, current_fh);
-	fh_dup2(current_fh, resfh);
 out:
-	fh_put(resfh);
-	kfree(resfh);
 	return status;
 }
 
@@ -357,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_open *open)
 {
 	__be32 status;
+	struct svc_fh *resfh = NULL;
 	struct nfsd4_compoundres *resp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -423,7 +425,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 	case NFS4_OPEN_CLAIM_NULL:
-		status = do_open_lookup(rqstp, cstate, open);
+		status = do_open_lookup(rqstp, cstate, open, &resfh);
 		if (status)
 			goto out;
 		break;
@@ -439,6 +441,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = do_open_fhandle(rqstp, cstate, open);
 		if (status)
 			goto out;
+		resfh = &cstate->current_fh;
 		break;
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
@@ -458,9 +461,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * successful, it (1) truncates the file if open->op_truncate was
 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
 	 */
-	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+	status = nfsd4_process_open2(rqstp, resfh, open);
 	WARN_ON(status && open->op_created);
 out:
+	if (resfh && resfh != &cstate->current_fh) {
+		fh_dup2(&cstate->current_fh, resfh);
+		fh_put(resfh);
+		kfree(resfh);
+	}
 	nfsd4_cleanup_open_state(open, status);
 	if (open->op_openowner && !nfsd4_has_session(cstate))
 		cstate->replay_owner = &open->op_openowner->oo_owner;
@@ -1069,8 +1077,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				    cstate->current_fh.fh_dentry, &p,
 				    count, verify->ve_bmval,
 				    rqstp, 0);
-
-	/* this means that nfsd4_encode_fattr() ran out of space */
+	/*
+	 * If nfsd4_encode_fattr() ran out of space, assume that's because
+	 * the attributes are longer (hence different) than those given:
+	 */
 	if (status == nfserr_resource)
 		status = nfserr_not_same;
 	if (status)
@@ -1524,7 +1534,8 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1524static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1534static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1525{ 1535{
1526 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1536 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1527 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ 1537 1 + 1 + /* eir_flags, spr_how */\
1538 4 + /* spo_must_enforce & _allow with bitmap */\
1528 2 + /*eir_server_owner.so_minor_id */\ 1539 2 + /*eir_server_owner.so_minor_id */\
1529 /* eir_server_owner.so_major_id<> */\ 1540 /* eir_server_owner.so_major_id<> */\
1530 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ 1541 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
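
The reply-size estimate changes because the SSV-protection fields are bitmaps, each encoded as a length word followed by that many data words: spo_must_enforce carries two words and spo_must_allow is empty, giving (1 + 2) + (1 + 0) = 4 words rather than the flat 2 the old line budgeted. A throwaway check of that arithmetic:

#include <stdio.h>

int main(void)
{
	int enforce_words = 2;	/* nfs4_minimal_spo_must_enforce[] */
	int allow_words = 0;	/* empty spo_must_allow bitmap */
	int total = (1 + enforce_words) + (1 + allow_words);

	printf("spo bitmap words: %d\n", total);	/* prints 4 */
	return 0;
}
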
@@ -1881,6 +1892,7 @@ struct svc_version nfsd_version4 = {
1881 .vs_proc = nfsd_procedures4, 1892 .vs_proc = nfsd_procedures4,
1882 .vs_dispatch = nfsd_dispatch, 1893 .vs_dispatch = nfsd_dispatch,
1883 .vs_xdrsize = NFS4_SVC_XDRSIZE, 1894 .vs_xdrsize = NFS4_SVC_XDRSIZE,
1895 .vs_rpcb_optnl = 1,
1884}; 1896};
1885 1897
1886/* 1898/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 105d6fa7c514..d5d070fbeb35 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -832,10 +832,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
832 spin_unlock(&nfsd_drc_lock); 832 spin_unlock(&nfsd_drc_lock);
833} 833}
834 834
835static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) 835static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
836 struct nfsd4_channel_attrs *battrs)
836{ 837{
837 int numslots = attrs->maxreqs; 838 int numslots = fattrs->maxreqs;
838 int slotsize = slot_bytes(attrs); 839 int slotsize = slot_bytes(fattrs);
839 struct nfsd4_session *new; 840 struct nfsd4_session *new;
840 int mem, i; 841 int mem, i;
841 842
@@ -852,6 +853,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
852 if (!new->se_slots[i]) 853 if (!new->se_slots[i])
853 goto out_free; 854 goto out_free;
854 } 855 }
856
857 memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
858 memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs));
859
855 return new; 860 return new;
856out_free: 861out_free:
857 while (i--) 862 while (i--)
@@ -997,8 +1002,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
997 list_add(&new->se_perclnt, &clp->cl_sessions); 1002 list_add(&new->se_perclnt, &clp->cl_sessions);
998 spin_unlock(&clp->cl_lock); 1003 spin_unlock(&clp->cl_lock);
999 spin_unlock(&nn->client_lock); 1004 spin_unlock(&nn->client_lock);
1000 memcpy(&new->se_fchannel, &cses->fore_channel, 1005
1001 sizeof(struct nfsd4_channel_attrs));
1002 if (cses->flags & SESSION4_BACK_CHAN) { 1006 if (cses->flags & SESSION4_BACK_CHAN) {
1003 struct sockaddr *sa = svc_addr(rqstp); 1007 struct sockaddr *sa = svc_addr(rqstp);
1004 /* 1008 /*
@@ -1851,6 +1855,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
1851 return nfs_ok; 1855 return nfs_ok;
1852} 1856}
1853 1857
1858#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
1859 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
1860#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
1861 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
1862
1854static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) 1863static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1855{ 1864{
1856 ca->headerpadsz = 0; 1865 ca->headerpadsz = 0;
@@ -1861,9 +1870,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1861 * less than 1k. Tighten up this estimate in the unlikely event 1870 * less than 1k. Tighten up this estimate in the unlikely event
1862 * it turns out to be a problem for some client: 1871 * it turns out to be a problem for some client:
1863 */ 1872 */
1864 if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) 1873 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
1865 return nfserr_toosmall; 1874 return nfserr_toosmall;
1866 if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) 1875 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
1867 return nfserr_toosmall; 1876 return nfserr_toosmall;
1868 ca->maxresp_cached = 0; 1877 ca->maxresp_cached = 0;
1869 if (ca->maxops < 2) 1878 if (ca->maxops < 2)
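
The point of the new macros is a units fix: NFS4_enc_cb_recall_sz and the RPC header constants count 32-bit XDR words, while ca->maxreq_sz and ca->maxresp_sz are byte counts, so the old comparisons under-required by a factor of four. A standalone sketch of the corrected bound (the word counts below are made up, not the real constants):

#include <stdint.h>
#include <stdio.h>

#define CB_RECALL_WORDS	56	/* hypothetical XDR word count */
#define RPC_HDR_WORDS	25	/* hypothetical header word count */
#define CB_MAX_REQ_SZ	((CB_RECALL_WORDS + RPC_HDR_WORDS) * sizeof(uint32_t))

int main(void)
{
	uint32_t maxreq_sz = 300;	/* bytes offered by the client */

	printf("old word bound: %d\n", CB_RECALL_WORDS + RPC_HDR_WORDS);
	printf("byte bound: %zu, ok: %d\n", (size_t)CB_MAX_REQ_SZ,
	       maxreq_sz >= CB_MAX_REQ_SZ);
	return 0;
}
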
@@ -1913,9 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1913 return status; 1922 return status;
1914 status = check_backchannel_attrs(&cr_ses->back_channel); 1923 status = check_backchannel_attrs(&cr_ses->back_channel);
1915 if (status) 1924 if (status)
1916 return status; 1925 goto out_release_drc_mem;
1917 status = nfserr_jukebox; 1926 status = nfserr_jukebox;
1918 new = alloc_session(&cr_ses->fore_channel); 1927 new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
1919 if (!new) 1928 if (!new)
1920 goto out_release_drc_mem; 1929 goto out_release_drc_mem;
1921 conn = alloc_conn_from_crses(rqstp, cr_ses); 1930 conn = alloc_conn_from_crses(rqstp, cr_ses);
@@ -3034,18 +3043,18 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3034 if (!fl) 3043 if (!fl)
3035 return -ENOMEM; 3044 return -ENOMEM;
3036 fl->fl_file = find_readable_file(fp); 3045 fl->fl_file = find_readable_file(fp);
3037 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
3038 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); 3046 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
3039 if (status) { 3047 if (status)
3040 list_del_init(&dp->dl_perclnt); 3048 goto out_free;
3041 locks_free_lock(fl); 3049 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
3042 return status;
3043 }
3044 fp->fi_lease = fl; 3050 fp->fi_lease = fl;
3045 fp->fi_deleg_file = get_file(fl->fl_file); 3051 fp->fi_deleg_file = get_file(fl->fl_file);
3046 atomic_set(&fp->fi_delegees, 1); 3052 atomic_set(&fp->fi_delegees, 1);
3047 list_add(&dp->dl_perfile, &fp->fi_delegations); 3053 list_add(&dp->dl_perfile, &fp->fi_delegations);
3048 return 0; 3054 return 0;
3055out_free:
3056 locks_free_lock(fl);
3057 return status;
3049} 3058}
3050 3059
3051static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) 3060static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
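
The reordering in nfs4_setlease() means the delegation is published on the client's cl_delegations list only after vfs_setlease() succeeds, so the failure path is a plain free with no unlinking. A minimal sketch of that publish-after-success shape, with illustrative types in place of the nfsd structures:

struct node { struct node *next; };

static void list_add_head(struct node *n, struct node **head)
{
	n->next = *head;
	*head = n;
}

static int set_lease_sketch(struct node *dp, struct node **cl_delegations,
			    int (*try_lease)(void))
{
	int status = try_lease();

	if (status)
		return status;	/* nothing published, nothing to undo */
	list_add_head(dp, cl_delegations);
	return 0;
}
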
@@ -3125,6 +3134,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3125 goto out_no_deleg; 3134 goto out_no_deleg;
3126 break; 3135 break;
3127 case NFS4_OPEN_CLAIM_NULL: 3136 case NFS4_OPEN_CLAIM_NULL:
3137 case NFS4_OPEN_CLAIM_FH:
3128 /* 3138 /*
3129 * Let's not give out any delegations till everyone's 3139 * Let's not give out any delegations till everyone's
3130 * had the chance to reclaim theirs.... 3140 * had the chance to reclaim theirs....
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ee7237f99f54..63f2395c57ed 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -103,11 +103,6 @@ xdr_error: \
103 (x) = (u64)ntohl(*p++) << 32; \ 103 (x) = (u64)ntohl(*p++) << 32; \
104 (x) |= ntohl(*p++); \ 104 (x) |= ntohl(*p++); \
105} while (0) 105} while (0)
106#define READTIME(x) do { \
107 p++; \
108 (x) = ntohl(*p++); \
109 p++; \
110} while (0)
111#define READMEM(x,nbytes) do { \ 106#define READMEM(x,nbytes) do { \
112 x = (char *)p; \ 107 x = (char *)p; \
113 p += XDR_QUADLEN(nbytes); \ 108 p += XDR_QUADLEN(nbytes); \
@@ -190,6 +185,15 @@ static int zero_clientid(clientid_t *clid)
190 return (clid->cl_boot == 0) && (clid->cl_id == 0); 185 return (clid->cl_boot == 0) && (clid->cl_id == 0);
191} 186}
192 187
188/**
189 * defer_free - mark an allocation as deferred freed
190 * @argp: NFSv4 compound argument structure that @p is freed with
191 * @release: release callback to free @p, typically kfree()
192 * @p: pointer to be freed
193 *
194 * Marks @p to be freed when processing the compound operation
195 * described in @argp finishes.
196 */
193static int 197static int
194defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
195 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -206,6 +210,16 @@ defer_free(struct nfsd4_compoundargs *argp,
206 return 0; 210 return 0;
207} 211}
208 212
213/**
214 * savemem - duplicate a chunk of memory for later processing
215 * @argp: NFSv4 compound argument structure that the copy is freed with
216 * @p: pointer to be duplicated
217 * @nbytes: length to be duplicated
218 *
219 * Returns a pointer to a copy of @nbytes bytes of memory at @p
220 * that are preserved until processing of the NFSv4 compound
221 * operation described by @argp finishes.
222 */
209static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 223static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
210{ 224{
211 if (p == argp->tmp) { 225 if (p == argp->tmp) {
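
For reference, the mechanism the two new kernel-doc blocks describe boils down to a per-compound list of (release, pointer) pairs that is drained when the request completes. A self-contained sketch of that idea (the tmpbuf layout here is illustrative, not the exact nfsd4_compoundargs fields):

#include <stdlib.h>

struct tmpbuf {
	struct tmpbuf *next;
	void (*release)(const void *);
	void *buf;
};

static int defer_free_sketch(struct tmpbuf **to_free,
			     void (*release)(const void *), void *p)
{
	struct tmpbuf *tb = malloc(sizeof(*tb));

	if (!tb)
		return -1;
	tb->buf = p;
	tb->release = release;
	tb->next = *to_free;
	*to_free = tb;
	return 0;
}

static void free_deferred(struct tmpbuf **to_free)
{
	while (*to_free) {
		struct tmpbuf *tb = *to_free;

		*to_free = tb->next;
		tb->release(tb->buf);
		free(tb);
	}
}
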
@@ -257,7 +271,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
257 int expected_len, len = 0; 271 int expected_len, len = 0;
258 u32 dummy32; 272 u32 dummy32;
259 char *buf; 273 char *buf;
260 int host_err;
261 274
262 DECODE_HEAD; 275 DECODE_HEAD;
263 iattr->ia_valid = 0; 276 iattr->ia_valid = 0;
@@ -284,10 +297,9 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
284 return nfserr_resource; 297 return nfserr_resource;
285 298
286 *acl = nfs4_acl_new(nace); 299 *acl = nfs4_acl_new(nace);
287 if (*acl == NULL) { 300 if (*acl == NULL)
288 host_err = -ENOMEM; 301 return nfserr_jukebox;
289 goto out_nfserr; 302
290 }
291 defer_free(argp, kfree, *acl); 303 defer_free(argp, kfree, *acl);
292 304
293 (*acl)->naces = nace; 305 (*acl)->naces = nace;
@@ -425,10 +437,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
425 goto xdr_error; 437 goto xdr_error;
426 438
427 DECODE_TAIL; 439 DECODE_TAIL;
428
429out_nfserr:
430 status = nfserrno(host_err);
431 goto out;
432} 440}
433 441
434static __be32 442static __be32
@@ -1957,56 +1965,16 @@ static u32 nfs4_file_type(umode_t mode)
1957 }; 1965 };
1958} 1966}
1959 1967
1960static __be32
1961nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,
1962 __be32 **p, int *buflen)
1963{
1964 int status;
1965
1966 if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4)
1967 return nfserr_resource;
1968 if (whotype != NFS4_ACL_WHO_NAMED)
1969 status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
1970 else if (gid_valid(gid))
1971 status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));
1972 else
1973 status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));
1974 if (status < 0)
1975 return nfserrno(status);
1976 *p = xdr_encode_opaque(*p, NULL, status);
1977 *buflen -= (XDR_QUADLEN(status) << 2) + 4;
1978 BUG_ON(*buflen < 0);
1979 return 0;
1980}
1981
1982static inline __be32
1983nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)
1984{
1985 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID,
1986 p, buflen);
1987}
1988
1989static inline __be32
1990nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)
1991{
1992 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group,
1993 p, buflen);
1994}
1995
1996static inline __be32 1968static inline __be32
1997nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, 1969nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
1998 __be32 **p, int *buflen) 1970 __be32 **p, int *buflen)
1999{ 1971{
2000 kuid_t uid = INVALID_UID; 1972 if (ace->whotype != NFS4_ACL_WHO_NAMED)
2001 kgid_t gid = INVALID_GID; 1973 return nfs4_acl_write_who(ace->whotype, p, buflen);
2002 1974 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
2003 if (ace->whotype == NFS4_ACL_WHO_NAMED) { 1975 return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen);
2004 if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 1976 else
2005 gid = ace->who_gid; 1977 return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen);
2006 else
2007 uid = ace->who_uid;
2008 }
2009 return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);
2010} 1978}
2011 1979
2012#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ 1980#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2090,7 +2058,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2090 u32 bmval1 = bmval[1]; 2058 u32 bmval1 = bmval[1];
2091 u32 bmval2 = bmval[2]; 2059 u32 bmval2 = bmval[2];
2092 struct kstat stat; 2060 struct kstat stat;
2093 struct svc_fh tempfh; 2061 struct svc_fh *tempfh = NULL;
2094 struct kstatfs statfs; 2062 struct kstatfs statfs;
2095 int buflen = count << 2; 2063 int buflen = count << 2;
2096 __be32 *attrlenp; 2064 __be32 *attrlenp;
@@ -2137,11 +2105,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2137 goto out_nfserr; 2105 goto out_nfserr;
2138 } 2106 }
2139 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { 2107 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
2140 fh_init(&tempfh, NFS4_FHSIZE); 2108 tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
2141 status = fh_compose(&tempfh, exp, dentry, NULL); 2109 status = nfserr_jukebox;
2110 if (!tempfh)
2111 goto out;
2112 fh_init(tempfh, NFS4_FHSIZE);
2113 status = fh_compose(tempfh, exp, dentry, NULL);
2142 if (status) 2114 if (status)
2143 goto out; 2115 goto out;
2144 fhp = &tempfh; 2116 fhp = tempfh;
2145 } 2117 }
2146 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT 2118 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
2147 | FATTR4_WORD0_SUPPORTED_ATTRS)) { 2119 | FATTR4_WORD0_SUPPORTED_ATTRS)) {
@@ -2222,8 +2194,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2222 if ((buflen -= 4) < 0) 2194 if ((buflen -= 4) < 0)
2223 goto out_resource; 2195 goto out_resource;
2224 dummy = nfs4_file_type(stat.mode); 2196 dummy = nfs4_file_type(stat.mode);
2225 if (dummy == NF4BAD) 2197 if (dummy == NF4BAD) {
2226 goto out_serverfault; 2198 status = nfserr_serverfault;
2199 goto out;
2200 }
2227 WRITE32(dummy); 2201 WRITE32(dummy);
2228 } 2202 }
2229 if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { 2203 if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
@@ -2317,8 +2291,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2317 WRITE32(ace->flag); 2291 WRITE32(ace->flag);
2318 WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); 2292 WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
2319 status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); 2293 status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
2320 if (status == nfserr_resource)
2321 goto out_resource;
2322 if (status) 2294 if (status)
2323 goto out; 2295 goto out;
2324 } 2296 }
@@ -2379,8 +2351,6 @@ out_acl:
2379 } 2351 }
2380 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { 2352 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
2381 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); 2353 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
2382 if (status == nfserr_resource)
2383 goto out_resource;
2384 if (status) 2354 if (status)
2385 goto out; 2355 goto out;
2386 } 2356 }
@@ -2431,15 +2401,11 @@ out_acl:
2431 } 2401 }
2432 if (bmval1 & FATTR4_WORD1_OWNER) { 2402 if (bmval1 & FATTR4_WORD1_OWNER) {
2433 status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); 2403 status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen);
2434 if (status == nfserr_resource)
2435 goto out_resource;
2436 if (status) 2404 if (status)
2437 goto out; 2405 goto out;
2438 } 2406 }
2439 if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { 2407 if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
2440 status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); 2408 status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen);
2441 if (status == nfserr_resource)
2442 goto out_resource;
2443 if (status) 2409 if (status)
2444 goto out; 2410 goto out;
2445 } 2411 }
@@ -2533,8 +2499,8 @@ out:
2533 security_release_secctx(context, contextlen); 2499 security_release_secctx(context, contextlen);
2534#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ 2500#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2535 kfree(acl); 2501 kfree(acl);
2536 if (fhp == &tempfh) 2502 if (tempfh)
2537 fh_put(&tempfh); 2503 fh_put(tempfh);
2538 return status; 2504 return status;
2539out_nfserr: 2505out_nfserr:
2540 status = nfserrno(err); 2506 status = nfserrno(err);
@@ -2542,9 +2508,6 @@ out_nfserr:
2542out_resource: 2508out_resource:
2543 status = nfserr_resource; 2509 status = nfserr_resource;
2544 goto out; 2510 goto out;
2545out_serverfault:
2546 status = nfserr_serverfault;
2547 goto out;
2548} 2511}
2549 2512
2550static inline int attributes_need_mount(u32 *bmval) 2513static inline int attributes_need_mount(u32 *bmval)
@@ -2621,17 +2584,14 @@ out_put:
2621static __be32 * 2584static __be32 *
2622nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) 2585nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
2623{ 2586{
2624 __be32 *attrlenp;
2625
2626 if (buflen < 6) 2587 if (buflen < 6)
2627 return NULL; 2588 return NULL;
2628 *p++ = htonl(2); 2589 *p++ = htonl(2);
2629 *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ 2590 *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
2630 *p++ = htonl(0); /* bmval1 */ 2591 *p++ = htonl(0); /* bmval1 */
2631 2592
2632 attrlenp = p++; 2593 *p++ = htonl(4); /* attribute length */
2633 *p++ = nfserr; /* no htonl */ 2594 *p++ = nfserr; /* no htonl */
2634 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
2635 return p; 2595 return p;
2636} 2596}
2637 2597
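
The simplification above works because the RDATTR_ERROR attribute body is always exactly one 4-byte status word, so the length can be written as a constant instead of being back-patched through attrlenp. A standalone sketch of the resulting layout:

#include <arpa/inet.h>
#include <stdint.h>

static uint32_t *encode_rdattr_error_sketch(uint32_t *p, uint32_t nfserr_be)
{
	*p++ = htonl(2);	/* bitmap length: two words */
	*p++ = htonl(1 << 11);	/* bmval0: FATTR4_WORD0_RDATTR_ERROR (attr 11) */
	*p++ = htonl(0);	/* bmval1 */
	*p++ = htonl(4);	/* attribute length: one 4-byte word */
	*p++ = nfserr_be;	/* already big-endian; no htonl */
	return p;
}
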
@@ -3244,7 +3204,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
3244 3204
3245 if (rpcauth_get_gssinfo(pf, &info) == 0) { 3205 if (rpcauth_get_gssinfo(pf, &info) == 0) {
3246 supported++; 3206 supported++;
3247 RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); 3207 RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4);
3248 WRITE32(RPC_AUTH_GSS); 3208 WRITE32(RPC_AUTH_GSS);
3249 WRITE32(info.oid.len); 3209 WRITE32(info.oid.len);
3250 WRITEMEM(info.oid.data, info.oid.len); 3210 WRITEMEM(info.oid.data, info.oid.len);
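
The XDR_LEN() change matters because XDR opaques are padded to a 4-byte boundary on the wire; reserving the raw oid.len could under-reserve by up to three bytes. A quick standalone check of the rounding (XDR_LEN here mirrors the kernel macro's round-up-to-quad behavior):

#include <stdio.h>

#define XDR_LEN(n) (((n) + 3) & ~3u)

int main(void)
{
	unsigned int oid_len = 9;	/* e.g. a 9-byte GSS OID */

	/* flavor + length + padded body + qop + service */
	printf("reserve %u bytes (body padded %u -> %u)\n",
	       4 + 4 + XDR_LEN(oid_len) + 4 + 4, oid_len, XDR_LEN(oid_len));
	return 0;
}
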
@@ -3379,35 +3339,43 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3379 8 /* eir_clientid */ + 3339 8 /* eir_clientid */ +
3380 4 /* eir_sequenceid */ + 3340 4 /* eir_sequenceid */ +
3381 4 /* eir_flags */ + 3341 4 /* eir_flags */ +
3382 4 /* spr_how */ + 3342 4 /* spr_how */);
3383 8 /* spo_must_enforce, spo_must_allow */ +
3384 8 /* so_minor_id */ +
3385 4 /* so_major_id.len */ +
3386 (XDR_QUADLEN(major_id_sz) * 4) +
3387 4 /* eir_server_scope.len */ +
3388 (XDR_QUADLEN(server_scope_sz) * 4) +
3389 4 /* eir_server_impl_id.count (0) */);
3390 3343
3391 WRITEMEM(&exid->clientid, 8); 3344 WRITEMEM(&exid->clientid, 8);
3392 WRITE32(exid->seqid); 3345 WRITE32(exid->seqid);
3393 WRITE32(exid->flags); 3346 WRITE32(exid->flags);
3394 3347
3395 WRITE32(exid->spa_how); 3348 WRITE32(exid->spa_how);
3349 ADJUST_ARGS();
3350
3396 switch (exid->spa_how) { 3351 switch (exid->spa_how) {
3397 case SP4_NONE: 3352 case SP4_NONE:
3398 break; 3353 break;
3399 case SP4_MACH_CRED: 3354 case SP4_MACH_CRED:
3355 /* spo_must_enforce, spo_must_allow */
3356 RESERVE_SPACE(16);
3357
3400 /* spo_must_enforce bitmap: */ 3358 /* spo_must_enforce bitmap: */
3401 WRITE32(2); 3359 WRITE32(2);
3402 WRITE32(nfs4_minimal_spo_must_enforce[0]); 3360 WRITE32(nfs4_minimal_spo_must_enforce[0]);
3403 WRITE32(nfs4_minimal_spo_must_enforce[1]); 3361 WRITE32(nfs4_minimal_spo_must_enforce[1]);
3404 /* empty spo_must_allow bitmap: */ 3362 /* empty spo_must_allow bitmap: */
3405 WRITE32(0); 3363 WRITE32(0);
3364
3365 ADJUST_ARGS();
3406 break; 3366 break;
3407 default: 3367 default:
3408 WARN_ON_ONCE(1); 3368 WARN_ON_ONCE(1);
3409 } 3369 }
3410 3370
3371 RESERVE_SPACE(
3372 8 /* so_minor_id */ +
3373 4 /* so_major_id.len */ +
3374 (XDR_QUADLEN(major_id_sz) * 4) +
3375 4 /* eir_server_scope.len */ +
3376 (XDR_QUADLEN(server_scope_sz) * 4) +
3377 4 /* eir_server_impl_id.count (0) */);
3378
3411 /* The server_owner struct */ 3379 /* The server_owner struct */
3412 WRITE64(minor_id); /* Minor id */ 3380 WRITE64(minor_id); /* Minor id */
3413 /* major id */ 3381 /* major id */
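
The EXCHANGE_ID encoder now reserves buffer space in smaller pieces, committing each with ADJUST_ARGS() before the next reservation, so the SP4_MACH_CRED branch only pays for the bitmap words it actually writes. A minimal sketch of that reserve/write/commit discipline, with buf_t and the helpers as illustrative stand-ins for the RESERVE_SPACE/WRITE32/ADJUST_ARGS macros:

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint32_t *p;	/* committed write cursor */
	uint32_t *end;
} buf_t;

static uint32_t *reserve(buf_t *b, unsigned int nbytes)
{
	assert(b->p + nbytes / 4 <= b->end);	/* the kernel errors out here */
	return b->p;
}

static void commit(buf_t *b, uint32_t *p)
{
	b->p = p;	/* like ADJUST_ARGS(): advance past the writes */
}

static void encode_sketch(buf_t *b, int mach_cred)
{
	uint32_t *p = reserve(b, 8);

	*p++ = 1; *p++ = 2;	/* fixed-size leading fields */
	commit(b, p);
	if (mach_cred) {	/* optional part gets its own reservation */
		p = reserve(b, 16);
		*p++ = 2; *p++ = 0; *p++ = 0; *p++ = 0;
		commit(b, p);
	}
}
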
@@ -3474,28 +3442,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3474} 3442}
3475 3443
3476static __be32 3444static __be32
3477nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3478 struct nfsd4_destroy_session *destroy_session)
3479{
3480 return nfserr;
3481}
3482
3483static __be32
3484nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3485 struct nfsd4_free_stateid *free_stateid)
3486{
3487 __be32 *p;
3488
3489 if (nfserr)
3490 return nfserr;
3491
3492 RESERVE_SPACE(4);
3493 *p++ = nfserr;
3494 ADJUST_ARGS();
3495 return nfserr;
3496}
3497
3498static __be32
3499nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, 3445nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
3500 struct nfsd4_sequence *seq) 3446 struct nfsd4_sequence *seq)
3501{ 3447{
@@ -3593,8 +3539,8 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3593 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, 3539 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3594 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3540 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3595 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3541 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3596 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3542 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3597 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, 3543 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3598 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3544 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3599 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 3545 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3600 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 3546 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index b6af150c96b8..f8f060ffbf4f 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -132,13 +132,6 @@ nfsd_reply_cache_alloc(void)
132} 132}
133 133
134static void 134static void
135nfsd_reply_cache_unhash(struct svc_cacherep *rp)
136{
137 hlist_del_init(&rp->c_hash);
138 list_del_init(&rp->c_lru);
139}
140
141static void
142nfsd_reply_cache_free_locked(struct svc_cacherep *rp) 135nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
143{ 136{
144 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { 137 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
@@ -416,22 +409,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
416 409
417 /* 410 /*
418 * Since the common case is a cache miss followed by an insert, 411 * Since the common case is a cache miss followed by an insert,
419 * preallocate an entry. First, try to reuse the first entry on the LRU 412 * preallocate an entry.
420 * if it works, then go ahead and prune the LRU list.
421 */ 413 */
422 spin_lock(&cache_lock);
423 if (!list_empty(&lru_head)) {
424 rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
425 if (nfsd_cache_entry_expired(rp) ||
426 num_drc_entries >= max_drc_entries) {
427 nfsd_reply_cache_unhash(rp);
428 prune_cache_entries();
429 goto search_cache;
430 }
431 }
432
433 /* No expired ones available, allocate a new one. */
434 spin_unlock(&cache_lock);
435 rp = nfsd_reply_cache_alloc(); 414 rp = nfsd_reply_cache_alloc();
436 spin_lock(&cache_lock); 415 spin_lock(&cache_lock);
437 if (likely(rp)) { 416 if (likely(rp)) {
@@ -439,7 +418,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
439 drc_mem_usage += sizeof(*rp); 418 drc_mem_usage += sizeof(*rp);
440 } 419 }
441 420
442search_cache: 421 /* go ahead and prune the cache */
422 prune_cache_entries();
423
443 found = nfsd_cache_search(rqstp, csum); 424 found = nfsd_cache_search(rqstp, csum);
444 if (found) { 425 if (found) {
445 if (likely(rp)) 426 if (likely(rp))
@@ -453,15 +434,6 @@ search_cache:
453 goto out; 434 goto out;
454 } 435 }
455 436
456 /*
457 * We're keeping the one we just allocated. Are we now over the
458 * limit? Prune one off the tip of the LRU in trade for the one we
459 * just allocated if so.
460 */
461 if (num_drc_entries >= max_drc_entries)
462 nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
463 struct svc_cacherep, c_lru));
464
465 nfsdstats.rcmisses++; 437 nfsdstats.rcmisses++;
466 rqstp->rq_cacherep = rp; 438 rqstp->rq_cacherep = rp;
467 rp->c_state = RC_INPROG; 439 rp->c_state = RC_INPROG;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 760c85a6f534..9a4a5f9e7468 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void)
241 nfsd_racache_shutdown(); 241 nfsd_racache_shutdown();
242} 242}
243 243
244static bool nfsd_needs_lockd(void)
245{
246#if defined(CONFIG_NFSD_V3)
247 return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL);
248#else
249 return (nfsd_versions[2] != NULL);
250#endif
251}
252
244static int nfsd_startup_net(int nrservs, struct net *net) 253static int nfsd_startup_net(int nrservs, struct net *net)
245{ 254{
246 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 255 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net)
255 ret = nfsd_init_socks(net); 264 ret = nfsd_init_socks(net);
256 if (ret) 265 if (ret)
257 goto out_socks; 266 goto out_socks;
258 ret = lockd_up(net); 267
259 if (ret) 268 if (nfsd_needs_lockd() && !nn->lockd_up) {
260 goto out_socks; 269 ret = lockd_up(net);
270 if (ret)
271 goto out_socks;
272 nn->lockd_up = 1;
273 }
274
261 ret = nfs4_state_start_net(net); 275 ret = nfs4_state_start_net(net);
262 if (ret) 276 if (ret)
263 goto out_lockd; 277 goto out_lockd;
@@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net)
266 return 0; 280 return 0;
267 281
268out_lockd: 282out_lockd:
269 lockd_down(net); 283 if (nn->lockd_up) {
284 lockd_down(net);
285 nn->lockd_up = 0;
286 }
270out_socks: 287out_socks:
271 nfsd_shutdown_generic(); 288 nfsd_shutdown_generic();
272 return ret; 289 return ret;
@@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net)
277 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 294 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
278 295
279 nfs4_state_shutdown_net(net); 296 nfs4_state_shutdown_net(net);
280 lockd_down(net); 297 if (nn->lockd_up) {
298 lockd_down(net);
299 nn->lockd_up = 0;
300 }
281 nn->nfsd_net_up = false; 301 nn->nfsd_net_up = false;
282 nfsd_shutdown_generic(); 302 nfsd_shutdown_generic();
283} 303}
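
The nfssvc.c hunks make lockd startup conditional and tracked per net namespace: lockd is only needed when v2/v3 are served, and the nn->lockd_up flag keeps lockd_up()/lockd_down() calls balanced across startup failure and shutdown. A compilable sketch of the guard, with nn_t and the stubs standing in for struct nfsd_net and the real lockd calls:

typedef struct { int lockd_up; } nn_t;	/* stand-in for struct nfsd_net */

static int lockd_up_stub(void) { return 0; }
static void lockd_down_stub(void) { }

static int start_lockd_if_needed(nn_t *nn, int needs_lockd)
{
	int ret;

	if (!needs_lockd || nn->lockd_up)
		return 0;	/* NFSv4-only server, or already running */
	ret = lockd_up_stub();
	if (ret)
		return ret;
	nn->lockd_up = 1;
	return 0;
}

static void stop_lockd_if_started(nn_t *nn)
{
	if (nn->lockd_up) {
		lockd_down_stub();
		nn->lockd_up = 0;
	}
}
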
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 9c769a47ac5a..b17d93214d01 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
152 type = (stat->mode & S_IFMT); 152 type = (stat->mode & S_IFMT);
153 153
154 *p++ = htonl(nfs_ftypes[type >> 12]); 154 *p++ = htonl(nfs_ftypes[type >> 12]);
155 *p++ = htonl((u32) stat->mode); 155 *p++ = htonl((u32) (stat->mode & S_IALLUGO));
156 *p++ = htonl((u32) stat->nlink); 156 *p++ = htonl((u32) stat->nlink);
157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); 157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); 158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
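
The masking is safe because the NFSv2 fattr already carries the file type in the preceding ftype word; the mode word now reports only the permission and set-id/sticky bits (S_IALLUGO, i.e. 07777). A small demonstration of the split:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t mode = S_IFREG | S_ISUID | 0644;	/* 0104644 */

	printf("ftype bits: %o\n", (unsigned)(mode & S_IFMT));	/* 100000 */
	printf("mode word:  %o\n", (unsigned)(mode & 07777));	/* 4644 */
	return 0;
}
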
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7eea63cada1d..6d7be3f80356 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
207 goto out_nfserr; 207 goto out_nfserr;
208 } 208 }
209 } else { 209 } else {
210 fh_lock(fhp); 210 /*
211 * In the nfsd4_open() case, this may be held across
212 * subsequent open and delegation acquisition which may
213 * need to take the child's i_mutex:
214 */
215 fh_lock_nested(fhp, I_MUTEX_PARENT);
211 dentry = lookup_one_len(name, dparent, len); 216 dentry = lookup_one_len(name, dparent, len);
212 host_err = PTR_ERR(dentry); 217 host_err = PTR_ERR(dentry);
213 if (IS_ERR(dentry)) 218 if (IS_ERR(dentry))
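
fh_lock_nested() with I_MUTEX_PARENT is a lockdep annotation rather than a behavioral change: it tells the lock validator that the parent directory's i_mutex is taken in the parent subclass, so a child inode's mutex may legitimately nest inside it later in the open path. The underlying primitive is roughly the following (kernel context assumed, not a standalone program):

#include <linux/fs.h>
#include <linux/mutex.h>

static void lock_dir_as_parent(struct inode *dir)
{
	/* I_MUTEX_PARENT marks this acquisition as the outer level of a
	 * parent->child i_mutex nesting, silencing false lockdep reports. */
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
}
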
@@ -273,13 +278,6 @@ out:
273 return err; 278 return err;
274} 279}
275 280
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
283/* 281/*
284 * Commit metadata changes to stable storage. 282 * Commit metadata changes to stable storage.
285 */ 283 */
@@ -348,8 +346,7 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
348 346
349 /* Revoke setuid/setgid on chown */ 347 /* Revoke setuid/setgid on chown */
350 if (!S_ISDIR(inode->i_mode) && 348 if (!S_ISDIR(inode->i_mode) &&
351 (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || 349 ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) {
352 ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {
353 iap->ia_valid |= ATTR_KILL_PRIV; 350 iap->ia_valid |= ATTR_KILL_PRIV;
354 if (iap->ia_valid & ATTR_MODE) { 351 if (iap->ia_valid & ATTR_MODE) {
355 /* we're setting mode too, just clear the s*id bits */ 352 /* we're setting mode too, just clear the s*id bits */
@@ -449,16 +446,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
449 goto out_put_write_access; 446 goto out_put_write_access;
450 } 447 }
451 448
452 host_err = nfsd_break_lease(inode);
453 if (host_err)
454 goto out_put_write_access_nfserror;
455
456 fh_lock(fhp); 449 fh_lock(fhp);
457 host_err = notify_change(dentry, iap, NULL); 450 host_err = notify_change(dentry, iap, NULL);
458 fh_unlock(fhp); 451 fh_unlock(fhp);
459
460out_put_write_access_nfserror:
461 err = nfserrno(host_err); 452 err = nfserrno(host_err);
453
462out_put_write_access: 454out_put_write_access:
463 if (size_change) 455 if (size_change)
464 put_write_access(inode); 456 put_write_access(inode);
@@ -468,158 +460,7 @@ out:
468 return err; 460 return err;
469} 461}
470 462
471#if defined(CONFIG_NFSD_V2_ACL) || \
472 defined(CONFIG_NFSD_V3_ACL) || \
473 defined(CONFIG_NFSD_V4)
474static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
475{
476 ssize_t buflen;
477 ssize_t ret;
478
479 buflen = vfs_getxattr(dentry, key, NULL, 0);
480 if (buflen <= 0)
481 return buflen;
482
483 *buf = kmalloc(buflen, GFP_KERNEL);
484 if (!*buf)
485 return -ENOMEM;
486
487 ret = vfs_getxattr(dentry, key, *buf, buflen);
488 if (ret < 0)
489 kfree(*buf);
490 return ret;
491}
492#endif
493
494#if defined(CONFIG_NFSD_V4) 463#if defined(CONFIG_NFSD_V4)
495static int
496set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
497{
498 int len;
499 size_t buflen;
500 char *buf = NULL;
501 int error = 0;
502
503 buflen = posix_acl_xattr_size(pacl->a_count);
504 buf = kmalloc(buflen, GFP_KERNEL);
505 error = -ENOMEM;
506 if (buf == NULL)
507 goto out;
508
509 len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen);
510 if (len < 0) {
511 error = len;
512 goto out;
513 }
514
515 error = vfs_setxattr(dentry, key, buf, len, 0);
516out:
517 kfree(buf);
518 return error;
519}
520
521__be32
522nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
523 struct nfs4_acl *acl)
524{
525 __be32 error;
526 int host_error;
527 struct dentry *dentry;
528 struct inode *inode;
529 struct posix_acl *pacl = NULL, *dpacl = NULL;
530 unsigned int flags = 0;
531
532 /* Get inode */
533 error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
534 if (error)
535 return error;
536
537 dentry = fhp->fh_dentry;
538 inode = dentry->d_inode;
539 if (S_ISDIR(inode->i_mode))
540 flags = NFS4_ACL_DIR;
541
542 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
543 if (host_error == -EINVAL) {
544 return nfserr_attrnotsupp;
545 } else if (host_error < 0)
546 goto out_nfserr;
547
548 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
549 if (host_error < 0)
550 goto out_release;
551
552 if (S_ISDIR(inode->i_mode))
553 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
554
555out_release:
556 posix_acl_release(pacl);
557 posix_acl_release(dpacl);
558out_nfserr:
559 if (host_error == -EOPNOTSUPP)
560 return nfserr_attrnotsupp;
561 else
562 return nfserrno(host_error);
563}
564
565static struct posix_acl *
566_get_posix_acl(struct dentry *dentry, char *key)
567{
568 void *buf = NULL;
569 struct posix_acl *pacl = NULL;
570 int buflen;
571
572 buflen = nfsd_getxattr(dentry, key, &buf);
573 if (!buflen)
574 buflen = -ENODATA;
575 if (buflen <= 0)
576 return ERR_PTR(buflen);
577
578 pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen);
579 kfree(buf);
580 return pacl;
581}
582
583int
584nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
585{
586 struct inode *inode = dentry->d_inode;
587 int error = 0;
588 struct posix_acl *pacl = NULL, *dpacl = NULL;
589 unsigned int flags = 0;
590
591 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
592 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
593 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
594 if (IS_ERR(pacl)) {
595 error = PTR_ERR(pacl);
596 pacl = NULL;
597 goto out;
598 }
599
600 if (S_ISDIR(inode->i_mode)) {
601 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
602 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
603 dpacl = NULL;
604 else if (IS_ERR(dpacl)) {
605 error = PTR_ERR(dpacl);
606 dpacl = NULL;
607 goto out;
608 }
609 flags = NFS4_ACL_DIR;
610 }
611
612 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
613 if (IS_ERR(*acl)) {
614 error = PTR_ERR(*acl);
615 *acl = NULL;
616 }
617 out:
618 posix_acl_release(pacl);
619 posix_acl_release(dpacl);
620 return error;
621}
622
623/* 464/*
624 * NFS junction information is stored in an extended attribute. 465 * NFS junction information is stored in an extended attribute.
625 */ 466 */
@@ -1760,11 +1601,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1760 err = nfserr_noent; 1601 err = nfserr_noent;
1761 if (!dold->d_inode) 1602 if (!dold->d_inode)
1762 goto out_dput; 1603 goto out_dput;
1763 host_err = nfsd_break_lease(dold->d_inode);
1764 if (host_err) {
1765 err = nfserrno(host_err);
1766 goto out_dput;
1767 }
1768 host_err = vfs_link(dold, dirp, dnew, NULL); 1604 host_err = vfs_link(dold, dirp, dnew, NULL);
1769 if (!host_err) { 1605 if (!host_err) {
1770 err = nfserrno(commit_metadata(ffhp)); 1606 err = nfserrno(commit_metadata(ffhp));
@@ -1858,14 +1694,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1858 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1859 goto out_dput_new; 1695 goto out_dput_new;
1860 1696
1861 host_err = nfsd_break_lease(odentry->d_inode);
1862 if (host_err)
1863 goto out_dput_new;
1864 if (ndentry->d_inode) {
1865 host_err = nfsd_break_lease(ndentry->d_inode);
1866 if (host_err)
1867 goto out_dput_new;
1868 }
1869 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); 1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
1870 if (!host_err) { 1698 if (!host_err) {
1871 host_err = commit_metadata(tfhp); 1699 host_err = commit_metadata(tfhp);
@@ -1935,16 +1763,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1935 if (!type) 1763 if (!type)
1936 type = rdentry->d_inode->i_mode & S_IFMT; 1764 type = rdentry->d_inode->i_mode & S_IFMT;
1937 1765
1938 host_err = nfsd_break_lease(rdentry->d_inode);
1939 if (host_err)
1940 goto out_put;
1941 if (type != S_IFDIR) 1766 if (type != S_IFDIR)
1942 host_err = vfs_unlink(dirp, rdentry, NULL); 1767 host_err = vfs_unlink(dirp, rdentry, NULL);
1943 else 1768 else
1944 host_err = vfs_rmdir(dirp, rdentry); 1769 host_err = vfs_rmdir(dirp, rdentry);
1945 if (!host_err) 1770 if (!host_err)
1946 host_err = commit_metadata(fhp); 1771 host_err = commit_metadata(fhp);
1947out_put:
1948 dput(rdentry); 1772 dput(rdentry);
1949 1773
1950out_nfserr: 1774out_nfserr:
@@ -2284,93 +2108,3 @@ out_nomem:
2284 nfsd_racache_shutdown(); 2108 nfsd_racache_shutdown();
2285 return -ENOMEM; 2109 return -ENOMEM;
2286} 2110}
2287
2288#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
2289struct posix_acl *
2290nfsd_get_posix_acl(struct svc_fh *fhp, int type)
2291{
2292 struct inode *inode = fhp->fh_dentry->d_inode;
2293 char *name;
2294 void *value = NULL;
2295 ssize_t size;
2296 struct posix_acl *acl;
2297
2298 if (!IS_POSIXACL(inode))
2299 return ERR_PTR(-EOPNOTSUPP);
2300
2301 switch (type) {
2302 case ACL_TYPE_ACCESS:
2303 name = POSIX_ACL_XATTR_ACCESS;
2304 break;
2305 case ACL_TYPE_DEFAULT:
2306 name = POSIX_ACL_XATTR_DEFAULT;
2307 break;
2308 default:
2309 return ERR_PTR(-EOPNOTSUPP);
2310 }
2311
2312 size = nfsd_getxattr(fhp->fh_dentry, name, &value);
2313 if (size < 0)
2314 return ERR_PTR(size);
2315
2316 acl = posix_acl_from_xattr(&init_user_ns, value, size);
2317 kfree(value);
2318 return acl;
2319}
2320
2321int
2322nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2323{
2324 struct inode *inode = fhp->fh_dentry->d_inode;
2325 char *name;
2326 void *value = NULL;
2327 size_t size;
2328 int error;
2329
2330 if (!IS_POSIXACL(inode) ||
2331 !inode->i_op->setxattr || !inode->i_op->removexattr)
2332 return -EOPNOTSUPP;
2333 switch(type) {
2334 case ACL_TYPE_ACCESS:
2335 name = POSIX_ACL_XATTR_ACCESS;
2336 break;
2337 case ACL_TYPE_DEFAULT:
2338 name = POSIX_ACL_XATTR_DEFAULT;
2339 break;
2340 default:
2341 return -EOPNOTSUPP;
2342 }
2343
2344 if (acl && acl->a_count) {
2345 size = posix_acl_xattr_size(acl->a_count);
2346 value = kmalloc(size, GFP_KERNEL);
2347 if (!value)
2348 return -ENOMEM;
2349 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
2350 if (error < 0)
2351 goto getout;
2352 size = error;
2353 } else
2354 size = 0;
2355
2356 error = fh_want_write(fhp);
2357 if (error)
2358 goto getout;
2359 if (size)
2360 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
2361 else {
2362 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
2363 error = 0;
2364 else {
2365 error = vfs_removexattr(fhp->fh_dentry, name);
2366 if (error == -ENODATA)
2367 error = 0;
2368 }
2369 }
2370 fh_drop_write(fhp);
2371
2372getout:
2373 kfree(value);
2374 return error;
2375}
2376#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a4be2e389670..fbe90bdb2214 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
52 struct iattr *, int, time_t); 52 struct iattr *, int, time_t);
53int nfsd_mountpoint(struct dentry *, struct svc_export *); 53int nfsd_mountpoint(struct dentry *, struct svc_export *);
54#ifdef CONFIG_NFSD_V4 54#ifdef CONFIG_NFSD_V4
55__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
56 struct nfs4_acl *);
57int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
58__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, 55__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
59 struct xdr_netobj *); 56 struct xdr_netobj *);
60#endif /* CONFIG_NFSD_V4 */ 57#endif /* CONFIG_NFSD_V4 */
@@ -89,8 +86,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
89__be32 nfsd_rename(struct svc_rqst *, 86__be32 nfsd_rename(struct svc_rqst *,
90 struct svc_fh *, char *, int, 87 struct svc_fh *, char *, int,
91 struct svc_fh *, char *, int); 88 struct svc_fh *, char *, int);
92__be32 nfsd_remove(struct svc_rqst *,
93 struct svc_fh *, char *, int);
94__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, 89__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
95 char *name, int len); 90 char *name, int len);
96__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, 91__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
@@ -101,11 +96,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
101__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, 96__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
102 struct dentry *, int); 97 struct dentry *, int);
103 98
104#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
105struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
106int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
107#endif
108
109static inline int fh_want_write(struct svc_fh *fh) 99static inline int fh_want_write(struct svc_fh *fh)
110{ 100{
111 int ret = mnt_want_write(fh->fh_export->ex_path.mnt); 101 int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index b6d5542a4ac8..335e04aaf7db 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -174,6 +174,9 @@ struct nfsd3_linkres {
174struct nfsd3_readdirres { 174struct nfsd3_readdirres {
175 __be32 status; 175 __be32 status;
176 struct svc_fh fh; 176 struct svc_fh fh;
177 /* Just to save kmalloc on every readdirplus entry (svc_fh is a
178 * little large for the stack): */
179 struct svc_fh scratch;
177 int count; 180 int count;
178 __be32 verf[2]; 181 __be32 verf[2];
179 182
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b3ed6446ed8e..d278a0d03496 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -228,7 +228,7 @@ struct nfsd4_open {
228 u32 op_create; /* request */ 228 u32 op_create; /* request */
229 u32 op_createmode; /* request */ 229 u32 op_createmode; /* request */
230 u32 op_bmval[3]; /* request */ 230 u32 op_bmval[3]; /* request */
231 struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ 231 struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
232 nfs4_verifier op_verf __attribute__((aligned(32))); 232 nfs4_verifier op_verf __attribute__((aligned(32)));
233 /* EXCLUSIVE4 */ 233 /* EXCLUSIVE4 */
234 clientid_t op_clientid; /* request */ 234 clientid_t op_clientid; /* request */
@@ -250,7 +250,6 @@ struct nfsd4_open {
250 struct nfs4_acl *op_acl; 250 struct nfs4_acl *op_acl;
251 struct xdr_netobj op_label; 251 struct xdr_netobj op_label;
252}; 252};
253#define op_iattr iattr
254 253
255struct nfsd4_open_confirm { 254struct nfsd4_open_confirm {
256 stateid_t oc_req_stateid /* request */; 255 stateid_t oc_req_stateid /* request */;
@@ -374,7 +373,6 @@ struct nfsd4_test_stateid {
374 373
375struct nfsd4_free_stateid { 374struct nfsd4_free_stateid {
376 stateid_t fr_stateid; /* request */ 375 stateid_t fr_stateid; /* request */
377 __be32 fr_status; /* response */
378}; 376};
379 377
380/* also used for NVERIFY */ 378/* also used for NVERIFY */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b44bdb291b84..2b34021948e4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -37,7 +37,26 @@
37#include "sufile.h" 37#include "sufile.h"
38#include "dat.h" 38#include "dat.h"
39 39
40 40/**
41 * nilfs_ioctl_wrap_copy - wrapper function for getting/setting metadata info
42 * @nilfs: nilfs object
43 * @argv: vector of arguments from userspace
44 * @dir: set of direction flags
45 * @dofunc: concrete function of get/set metadata info
46 *
47 * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by
48 * calling dofunc() according to the @argv argument.
49 *
50 * Return Value: On success, 0 is returned and requested metadata info
51 * is copied into userspace. On error, one of the following
52 * negative error codes is returned.
53 *
54 * %-EINVAL - Invalid arguments from userspace.
55 *
56 * %-ENOMEM - Insufficient amount of memory available.
57 *
58 * %-EFAULT - Failure during execution of requested operation.
59 */
41static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, 60static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
42 struct nilfs_argv *argv, int dir, 61 struct nilfs_argv *argv, int dir,
43 ssize_t (*dofunc)(struct the_nilfs *, 62 ssize_t (*dofunc)(struct the_nilfs *,
@@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
57 if (argv->v_size > PAGE_SIZE) 76 if (argv->v_size > PAGE_SIZE)
58 return -EINVAL; 77 return -EINVAL;
59 78
79 /*
80 * Reject pairs of a start item position (argv->v_index) and a
81 * total count (argv->v_nmembs) which leads position 'pos' to
82 * overflow by the increment at the end of the loop.
83 */
84 if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
85 return -EINVAL;
86
60 buf = (void *)__get_free_pages(GFP_NOFS, 0); 87 buf = (void *)__get_free_pages(GFP_NOFS, 0);
61 if (unlikely(!buf)) 88 if (unlikely(!buf))
62 return -ENOMEM; 89 return -ENOMEM;
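
The new check rejects (v_index, v_nmembs) pairs whose sum would wrap a 64-bit position: pos starts at v_index and is incremented once per member, and v_index > ~(__u64)0 - v_nmembs is exactly the condition under which v_index + v_nmembs overflows. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

static int would_wrap(uint64_t v_index, uint64_t v_nmembs)
{
	return v_index > UINT64_MAX - v_nmembs;
}

int main(void)
{
	printf("%d\n", would_wrap(UINT64_MAX - 1, 1));	/* 0: fits exactly */
	printf("%d\n", would_wrap(UINT64_MAX - 1, 2));	/* 1: rejected */
	return 0;
}
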
@@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
99 return ret; 126 return ret;
100} 127}
101 128
129/**
130 * nilfs_ioctl_getflags - ioctl to support lsattr
131 */
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) 132static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{ 133{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; 134 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
@@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
106 return put_user(flags, (int __user *)argp); 136 return put_user(flags, (int __user *)argp);
107} 137}
108 138
139/**
140 * nilfs_ioctl_setflags - ioctl to support chattr
141 */
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, 142static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp) 143 void __user *argp)
111{ 144{
@@ -158,11 +191,33 @@ out:
158 return ret; 191 return ret;
159} 192}
160 193
194/**
195 * nilfs_ioctl_getversion - get info about a file's version (generation number)
196 */
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) 197static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{ 198{
163 return put_user(inode->i_generation, (int __user *)argp); 199 return put_user(inode->i_generation, (int __user *)argp);
164} 200}
165 201
202/**
203 * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
204 * @inode: inode object
205 * @filp: file object
206 * @cmd: ioctl's request code
207 * @argp: pointer to argument from userspace
208 *
209 * Description: nilfs_ioctl_change_cpmode() changes the mode of the
210 * given checkpoint between the checkpoint and snapshot states. This
211 * ioctl is used by the chcp and mkcp utilities.
212 *
213 * Return Value: On success, 0 is returned and mode of a checkpoint is
214 * changed. On error, one of the following negative error codes
215 * is returned.
216 *
217 * %-EPERM - Operation not permitted.
218 *
219 * %-EFAULT - Failure while changing the checkpoint mode.
220 */
166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 221static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
167 unsigned int cmd, void __user *argp) 222 unsigned int cmd, void __user *argp)
168{ 223{
@@ -198,6 +253,25 @@ out:
198 return ret; 253 return ret;
199} 254}
200 255
256/**
257 * nilfs_ioctl_delete_checkpoint - remove checkpoint
258 * @inode: inode object
259 * @filp: file object
260 * @cmd: ioctl's request code
261 * @argp: pointer to argument from userspace
262 *
263 * Description: nilfs_ioctl_delete_checkpoint() removes a checkpoint
264 * from the NILFS2 file system. This ioctl is used by the rmcp
265 * utility.
266 *
267 * Return Value: On success, 0 is returned and a checkpoint is
268 * removed. On error, one of the following negative error codes
269 * is returned.
270 *
271 * %-EPERM - Operation not permitted.
272 *
273 * %-EFAULT - Failure while removing the checkpoint.
274 */
201static int 275static int
202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 276nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
203 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
@@ -229,6 +303,21 @@ out:
229 return ret; 303 return ret;
230} 304}
231 305
306/**
307 * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints
308 * @nilfs: nilfs object
309 * @posp: pointer to array of checkpoint numbers
310 * @flags: checkpoint mode (checkpoint or snapshot)
311 * @buf: buffer for storing checkpoints' info
312 * @size: size in bytes of one checkpoint info item in array
313 * @nmembs: number of checkpoints in array (numbers and infos)
314 *
315 * Description: nilfs_ioctl_do_get_cpinfo() function returns info about
316 * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in
317 * lscp utility and by nilfs_cleanerd daemon.
318 *
319 * Return value: count of nilfs_cpinfo structures in output buffer.
320 */
232static ssize_t 321static ssize_t
233nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 322nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
234 void *buf, size_t size, size_t nmembs) 323 void *buf, size_t size, size_t nmembs)
@@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
242 return ret; 331 return ret;
243} 332}
244 333
334/**
335 * nilfs_ioctl_get_cpstat - get checkpoints statistics
336 * @inode: inode object
337 * @filp: file object
338 * @cmd: ioctl's request code
339 * @argp: pointer to argument from userspace
340 *
341 * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints.
342 * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities
343 * and by nilfs_cleanerd daemon.
344 *
345 * Return Value: On success, 0 is returned, and checkpoints information is
346 * copied into userspace pointer @argp. On error, one of the following
347 * negative error codes is returned.
348 *
349 * %-EIO - I/O error.
350 *
351 * %-ENOMEM - Insufficient amount of memory available.
352 *
353 * %-EFAULT - Failure while getting checkpoint statistics.
354 */
245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 355static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
246 unsigned int cmd, void __user *argp) 356 unsigned int cmd, void __user *argp)
247{ 357{
@@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
260 return ret; 370 return ret;
261} 371}
262 372
373/**
374 * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info
375 * @nilfs: nilfs object
376 * @posp: pointer to array of segment numbers
377 * @flags: *not used*
378 * @buf: buffer for storing suinfo array
379 * @size: size in bytes of one suinfo item in array
380 * @nmembs: count of segment numbers and suinfos in array
381 *
382 * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage
383 * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used
384 * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon.
385 *
386 * Return value: count of nilfs_suinfo structures in output buffer.
387 */
263static ssize_t 388static ssize_t
264nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 389nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
265 void *buf, size_t size, size_t nmembs) 390 void *buf, size_t size, size_t nmembs)
@@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
273 return ret; 398 return ret;
274} 399}
275 400
401/**
402 * nilfs_ioctl_get_sustat - get segment usage statistics
403 * @inode: inode object
404 * @filp: file object
405 * @cmd: ioctl's request code
405 * @argp: pointer to argument from userspace
407 *
408 * Description: nilfs_ioctl_get_sustat() returns segment usage statistics.
409 * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities
410 * and by nilfs_cleanerd daemon.
411 *
412 * Return Value: On success, 0 is returned, and segment usage information is
413 * copied into userspace pointer @argp. On error, one of the following
414 * negative error codes is returned.
415 *
416 * %-EIO - I/O error.
417 *
418 * %-ENOMEM - Insufficient amount of memory available.
419 *
420 * %-EFAULT - Failure while getting segment usage statistics.
421 */
276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 422static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
277 unsigned int cmd, void __user *argp) 423 unsigned int cmd, void __user *argp)
278{ 424{
@@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
291 return ret; 437 return ret;
292} 438}
293 439
440/**
441 * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info
442 * @nilfs: nilfs object
443 * @posp: *not used*
444 * @flags: *not used*
445 * @buf: buffer for storing array of nilfs_vinfo structures
446 * @size: size in bytes of one vinfo item in array
447 * @nmembs: count of vinfos in array
448 *
449 * Description: nilfs_ioctl_do_get_vinfo() function returns information
450 * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used
451 * by nilfs_cleanerd daemon.
452 *
453 * Return value: count of nilfs_vinfo structures in output buffer.
454 */
294static ssize_t 455static ssize_t
295nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 456nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
296 void *buf, size_t size, size_t nmembs) 457 void *buf, size_t size, size_t nmembs)
@@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
303 return ret; 464 return ret;
304} 465}
305 466
467/**
468 * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors
469 * @nilfs: nilfs object
470 * @posp: *not used*
471 * @flags: *not used*
472 * @buf: buffer for storing array of nilfs_bdesc structures
473 * @size: size in bytes of one bdesc item in array
474 * @nmembs: count of bdescs in array
475 *
476 * Description: nilfs_ioctl_do_get_bdescs() function returns information
477 * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
478 * is used by nilfs_cleanerd daemon.
479 *
480 * Return value: count of nilfs_bdescs structures in output buffer.
481 */
306static ssize_t 482static ssize_t
307nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 483nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
308 void *buf, size_t size, size_t nmembs) 484 void *buf, size_t size, size_t nmembs)
@@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
329 return nmembs; 505 return nmembs;
330} 506}
331 507
508/**
509 * nilfs_ioctl_get_bdescs - get disk block descriptors
510 * @inode: inode object
511 * @filp: file object
512 * @cmd: ioctl's request code
513 * @argp: pointer to argument from userspace
514 *
515 * Description: nilfs_ioctl_get_bdescs() returns information
516 * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
517 * is used by nilfs_cleanerd daemon.
518 *
519 * Return Value: On success, 0 is returned, and disk block descriptors are
520 * copied into userspace pointer @argp. On error, one of the following
521 * negative error codes is returned.
522 *
523 * %-EINVAL - Invalid arguments from userspace.
524 *
525 * %-EIO - I/O error.
526 *
527 * %-ENOMEM - Insufficient amount of memory available.
528 *
529 * %-EFAULT - Failure while getting disk block descriptors.
530 */
332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 531static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
333 unsigned int cmd, void __user *argp) 532 unsigned int cmd, void __user *argp)
334{ 533{
@@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
352 return ret; 551 return ret;
353} 552}
354 553
554/**
555 * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC
556 * @inode: inode object
557 * @vdesc: descriptor of virtual block number
558 * @buffers: list of moving buffers
559 *
560 * Description: nilfs_ioctl_move_inode_block() registers a data/node
561 * buffer in the GC pagecache and submits a read request.
562 *
563 * Return Value: On success, 0 is returned. On error, one of the following
564 * negative error codes is returned.
565 *
566 * %-EIO - I/O error.
567 *
568 * %-ENOMEM - Insufficient amount of memory available.
569 *
570 * %-ENOENT - Requested block doesn't exist.
571 *
572 * %-EEXIST - Blocks conflict is detected.
573 */
355static int nilfs_ioctl_move_inode_block(struct inode *inode, 574static int nilfs_ioctl_move_inode_block(struct inode *inode,
356 struct nilfs_vdesc *vdesc, 575 struct nilfs_vdesc *vdesc,
357 struct list_head *buffers) 576 struct list_head *buffers)
@@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
397 return 0; 616 return 0;
398} 617}
399 618
619/**
620 * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection
621 * @sb: superblock object
622 * @argv: vector of arguments from userspace
623 * @buf: array of nilfs_vdesc structures
624 *
625 * Description: nilfs_ioctl_move_blocks() function reads valid data/node
626 * blocks that the garbage collector specified with the array of nilfs_vdesc
627 * structures and stores them in the page caches of GC inodes.
628 *
629 * Return Value: Number of processed nilfs_vdesc structures on success,
630 * or a negative error code on failure.
631 */
400static int nilfs_ioctl_move_blocks(struct super_block *sb, 632static int nilfs_ioctl_move_blocks(struct super_block *sb,
401 struct nilfs_argv *argv, void *buf) 633 struct nilfs_argv *argv, void *buf)
402{ 634{
@@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
462 return ret; 694 return ret;
463} 695}
464 696
697/**
698 * nilfs_ioctl_delete_checkpoints - delete checkpoints
699 * @nilfs: nilfs object
700 * @argv: vector of arguments from userspace
701 * @buf: array of periods of checkpoints numbers
702 *
703 * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints
704 * in the period from p_start to p_end, excluding p_end itself. Checkpoints
705 * which have already been deleted are ignored.
706 *
707 * Return Value: Number of processed nilfs_period structures on success, or
708 * one of the following negative error codes on failure.
709 *
710 * %-EIO - I/O error.
711 *
712 * %-ENOMEM - Insufficient amount of memory available.
713 *
714 * %-EINVAL - Invalid checkpoints.
715 */
465static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, 716static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
466 struct nilfs_argv *argv, void *buf) 717 struct nilfs_argv *argv, void *buf)
467{ 718{
@@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
479 return nmembs; 730 return nmembs;
480} 731}
481 732
733/**
734 * nilfs_ioctl_free_vblocknrs - free virtual block numbers
735 * @nilfs: nilfs object
736 * @argv: vector of arguments from userspace
737 * @buf: array of virtual block numbers
738 *
739 * Description: nilfs_ioctl_free_vblocknrs() function frees
740 * the virtual block numbers specified by @buf and @argv->v_nmembs.
741 *
742 * Return Value: Number of processed virtual block numbers on success, or
743 * one of the following negative error codes on failure.
744 *
745 * %-EIO - I/O error.
746 *
747 * %-ENOMEM - Insufficient amount of memory available.
748 *
749 * %-ENOENT - The virtual block numbers have not been allocated.
750 */
482static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, 751static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
483 struct nilfs_argv *argv, void *buf) 752 struct nilfs_argv *argv, void *buf)
484{ 753{
@@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
490 return (ret < 0) ? ret : nmembs; 759 return (ret < 0) ? ret : nmembs;
491} 760}
492 761
762/**
763 * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty
764 * @nilfs: nilfs object
765 * @argv: vector of arguments from userspace
766 * @buf: array of block descriptors
767 *
768 * Description: nilfs_ioctl_mark_blocks_dirty() function marks
769 * metadata file or data blocks as dirty.
770 *
771 * Return Value: Number of processed block descriptors on success, or
772 * one of the following negative error codes on failure.
773 *
774 * %-ENOMEM - Insufficient memory available.
775 *
776 * %-EIO - I/O error.
777 *
778 * %-ENOENT - The specified block does not exist (hole block).
779 */
493static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, 780static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
494 struct nilfs_argv *argv, void *buf) 781 struct nilfs_argv *argv, void *buf)
495{ 782{
@@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
571 return ret; 858 return ret;
572} 859}
573 860
861/**
862 * nilfs_ioctl_clean_segments - clean segments
863 * @inode: inode object
864 * @filp: file object
865 * @cmd: ioctl's request code
866 * @argp: pointer to argument from userspace
867 *
868 * Description: nilfs_ioctl_clean_segments() function performs garbage
869 * collection with the parameters requested from userspace. The
870 * NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by the nilfs_cleanerd
871 * daemon.
872 *
873 * Return Value: 0 on success, or a negative error code on failure.
874 */
574static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, 875static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
575 unsigned int cmd, void __user *argp) 876 unsigned int cmd, void __user *argp)
576{ 877{
@@ -682,6 +983,33 @@ out:
682 return ret; 983 return ret;
683} 984}
684 985
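Since the hunk bodies are elided here, a heavily hedged sketch of what userspace hands this ioctl: the five-slot nilfs_argv array and its ordering (vdescs, periods, vblocknrs, bdescs, segment numbers) are inferred from nilfs_ioctl_prepare_clean_segments() and cleanerd usage rather than shown in this diff, and the payload arrays (vdescs, periods, and so on) are caller-supplied names.

	/* Sketch only: one nilfs_argv descriptor per payload array. */
	struct nilfs_argv argv[5] = {
		{ .v_base = (unsigned long)vdescs,    .v_nmembs = nvdesc,
		  .v_size = sizeof(struct nilfs_vdesc) },  /* blocks to move */
		{ .v_base = (unsigned long)periods,   .v_nmembs = nperiod,
		  .v_size = sizeof(struct nilfs_period) }, /* cps to delete */
		{ .v_base = (unsigned long)vblocknrs, .v_nmembs = nvblk,
		  .v_size = sizeof(__u64) },               /* vblocks to free */
		{ .v_base = (unsigned long)bdescs,    .v_nmembs = nbdesc,
		  .v_size = sizeof(struct nilfs_bdesc) },  /* blocks to dirty */
		{ .v_base = (unsigned long)segnums,   .v_nmembs = nseg,
		  .v_size = sizeof(__u64) },               /* segments to free */
	};

	if (ioctl(devfd, NILFS_IOCTL_CLEAN_SEGMENTS, argv) < 0)
		err(1, "NILFS_IOCTL_CLEAN_SEGMENTS");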
986/**
987 * nilfs_ioctl_sync - make a checkpoint
988 * @inode: inode object
989 * @filp: file object
990 * @cmd: ioctl's request code
991 * @argp: pointer to argument from userspace
992 *
993 * Description: nilfs_ioctl_sync() function constructs a logical segment
994 * for checkpointing. This function guarantees that all modified data
995 * and metadata have been written out to the device when it returns
996 * successfully.
997 *
998 * Return Value: On success, 0 is returned. On error, one of the following
999 * negative error codes is returned.
1000 *
1001 * %-EROFS - Read only filesystem.
1002 *
1003 * %-EIO - I/O error.
1004 *
1005 * %-ENOSPC - No space left on device (only in a panic state).
1006 *
1007 * %-ERESTARTSYS - Interrupted.
1008 *
1009 * %-ENOMEM - Insufficient memory available.
1010 *
1011 * %-EFAULT - Failure during execution of requested operation.
1012 */
685static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, 1013static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
686 unsigned int cmd, void __user *argp) 1014 unsigned int cmd, void __user *argp)
687{ 1015{
@@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
710 return 0; 1038 return 0;
711} 1039}
712 1040
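A minimal userspace sketch of the checkpointing call, assuming devfd is an open descriptor for the device; that @argp, when non-NULL, receives the number of the checkpoint just written is an assumption based on how the nilfs utilities use this ioctl, since the hunk body is elided above.

	__u64 cno;

	if (ioctl(devfd, NILFS_IOCTL_SYNC, &cno) < 0)
		err(1, "NILFS_IOCTL_SYNC");
	printf("made checkpoint %llu\n", (unsigned long long)cno);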
1041/**
1042 * nilfs_ioctl_resize - resize NILFS2 volume
1043 * @inode: inode object
1044 * @filp: file object
1045 * @argp: pointer to argument from userspace
1046 *
1047 * Return Value: 0 on success, or a negative error code on failure.
1048 */
713static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, 1049static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
714 void __user *argp) 1050 void __user *argp)
715{ 1051{
@@ -735,6 +1071,17 @@ out:
735 return ret; 1071 return ret;
736} 1072}
737 1073
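No description block was added for this one; for orientation, a sketch of the call, assuming (from the nilfs-resize utility, not from this hunk) that the argument is the new volume size in bytes and that devfd is an open device descriptor.

	__u64 newsize = 8ULL << 30;	/* e.g. resize to 8 GiB; sketch value */

	if (ioctl(devfd, NILFS_IOCTL_RESIZE, &newsize) < 0)
		err(1, "NILFS_IOCTL_RESIZE");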
1074/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object
1077 * @argp: pointer to argument from userspace
1078 *
1079 * Description: nilfs_ioctl_set_alloc_range() function defines the lower and
1080 * upper limits, in bytes, of the range of segments to be allocated.
1081 * The NILFS_IOCTL_SET_ALLOC_RANGE ioctl is used by the nilfs_resize utility.
1082 *
1083 * Return Value: 0 on success, or a negative error code on failure.
1084 */
738static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) 1085static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
739{ 1086{
740 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -767,6 +1114,28 @@ out:
767 return ret; 1114 return ret;
768} 1115}
769 1116
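Matching the kernel-doc above, a sketch that assumes the argument is a two-element __u64 array holding the [start, end] byte offsets of the allowed allocation range; start_bytes and end_bytes are illustrative names, not values from this diff.

	__u64 range[2] = { start_bytes, end_bytes };

	if (ioctl(devfd, NILFS_IOCTL_SET_ALLOC_RANGE, range) < 0)
		err(1, "NILFS_IOCTL_SET_ALLOC_RANGE");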
1117/**
1118 * nilfs_ioctl_get_info - wrapper for getting metadata info
1119 * @inode: inode object
1120 * @filp: file object
1121 * @cmd: ioctl's request code
1122 * @argp: pointer to argument from userspace
1123 * @membsz: size of an item in bytes
1124 * @dofunc: concrete function of getting metadata info
1125 *
1126 * Description: nilfs_ioctl_get_info() gets metadata info by calling
1127 * the dofunc() callback.
1128 *
1129 * Return Value: On success, 0 is returned and requested metadata info
1130 * is copied into userspace. On error, one of the following
1131 * negative error codes is returned.
1132 *
1133 * %-EINVAL - Invalid arguments from userspace.
1134 *
1135 * %-ENOMEM - Insufficient amount of memory available.
1136 *
1137 * %-EFAULT - Failure during execution of requested operation.
1138 */
770static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, 1139static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
771 unsigned int cmd, void __user *argp, 1140 unsigned int cmd, void __user *argp,
772 size_t membsz, 1141 size_t membsz,
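The dofunc parameter is the moving part of this wrapper: all three getter callbacks earlier in the file share one shape, which nilfs_ioctl_get_info() invokes on a kernel bounce buffer before copying the results back through @argp. The signature below is read off those callbacks in this diff, not off a header.

	/* Common callback shape for the NILFS metadata getters. */
	ssize_t (*dofunc)(struct the_nilfs *nilfs, __u64 *posp, int flags,
			  void *buf, size_t size, size_t nmembs);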
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2d8be51f90dc..dc3a9efdaab8 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
416 } 416 }
417 if (likely(bio)) { 417 if (likely(bio)) {
418 bio->bi_bdev = nilfs->ns_bdev; 418 bio->bi_bdev = nilfs->ns_bdev;
419 bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); 419 bio->bi_iter.bi_sector =
420 start << (nilfs->ns_blocksize_bits - 9);
420 } 421 }
421 return bio; 422 return bio;
422} 423}
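This hunk is fallout from the v3.14 immutable-biovec work: the starting sector moved into the bio's embedded iterator. The shift itself just converts filesystem blocks into 512-byte sectors, for example:

	/* With a 4 KiB block size, ns_blocksize_bits == 12, so the shift
	 * is 12 - 9 == 3: block 100 begins at sector 800. */
	sector_t sector = (sector_t)start << (nilfs->ns_blocksize_bits - 9);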
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c01..a1a191634abc 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1440 1440
1441 nilfs_clear_logs(&sci->sc_segbufs); 1441 nilfs_clear_logs(&sci->sc_segbufs);
1442 1442
1443 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1444 if (unlikely(err))
1445 return err;
1446
1447 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1443 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1448 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1444 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1449 sci->sc_freesegs, 1445 sci->sc_freesegs,
1450 sci->sc_nfreesegs, 1446 sci->sc_nfreesegs,
1451 NULL); 1447 NULL);
1452 WARN_ON(err); /* do not happen */ 1448 WARN_ON(err); /* do not happen */
1449 sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
1453 } 1450 }
1451
1452 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1453 if (unlikely(err))
1454 return err;
1455
1454 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1456 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1455 sci->sc_stage = prev_stage; 1457 sci->sc_stage = prev_stage;
1456 } 1458 }
diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c
index 634a8b717b02..266c2d7d50bd 100644
--- a/fs/nls/mac-celtic.c
+++ b/fs/nls/mac-celtic.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macceltic(void) 588static int __init init_nls_macceltic(void)
diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c
index 979e6265ac5e..9789c6057551 100644
--- a/fs/nls/mac-centeuro.c
+++ b/fs/nls/mac-centeuro.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
513 .char2uni = char2uni, 513 .char2uni = char2uni,
514 .charset2lower = charset2lower, 514 .charset2lower = charset2lower,
515 .charset2upper = charset2upper, 515 .charset2upper = charset2upper,
516 .owner = THIS_MODULE,
517}; 516};
518 517
519static int __init init_nls_maccenteuro(void) 518static int __init init_nls_maccenteuro(void)
diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c
index dd3f675911ee..bb19e7a07d43 100644
--- a/fs/nls/mac-croatian.c
+++ b/fs/nls/mac-croatian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_maccroatian(void) 588static int __init init_nls_maccroatian(void)
diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c
index 1112c84dd8bb..2a7dea36acba 100644
--- a/fs/nls/mac-cyrillic.c
+++ b/fs/nls/mac-cyrillic.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
478 .char2uni = char2uni, 478 .char2uni = char2uni,
479 .charset2lower = charset2lower, 479 .charset2lower = charset2lower,
480 .charset2upper = charset2upper, 480 .charset2upper = charset2upper,
481 .owner = THIS_MODULE,
482}; 481};
483 482
484static int __init init_nls_maccyrillic(void) 483static int __init init_nls_maccyrillic(void)
diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c
index 2de9158409c8..77b001653588 100644
--- a/fs/nls/mac-gaelic.c
+++ b/fs/nls/mac-gaelic.c
@@ -548,7 +548,6 @@ static struct nls_table table = {
548 .char2uni = char2uni, 548 .char2uni = char2uni,
549 .charset2lower = charset2lower, 549 .charset2lower = charset2lower,
550 .charset2upper = charset2upper, 550 .charset2upper = charset2upper,
551 .owner = THIS_MODULE,
552}; 551};
553 552
554static int __init init_nls_macgaelic(void) 553static int __init init_nls_macgaelic(void)
diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c
index a86310082802..1eccf499e2eb 100644
--- a/fs/nls/mac-greek.c
+++ b/fs/nls/mac-greek.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
478 .char2uni = char2uni, 478 .char2uni = char2uni,
479 .charset2lower = charset2lower, 479 .charset2lower = charset2lower,
480 .charset2upper = charset2upper, 480 .charset2upper = charset2upper,
481 .owner = THIS_MODULE,
482}; 481};
483 482
484static int __init init_nls_macgreek(void) 483static int __init init_nls_macgreek(void)
diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c
index babe2998d5ce..cbd0875c6d69 100644
--- a/fs/nls/mac-iceland.c
+++ b/fs/nls/mac-iceland.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_maciceland(void) 588static int __init init_nls_maciceland(void)
diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c
index 312364f010dc..fba8357aaf03 100644
--- a/fs/nls/mac-inuit.c
+++ b/fs/nls/mac-inuit.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
513 .char2uni = char2uni, 513 .char2uni = char2uni,
514 .charset2lower = charset2lower, 514 .charset2lower = charset2lower,
515 .charset2upper = charset2upper, 515 .charset2upper = charset2upper,
516 .owner = THIS_MODULE,
517}; 516};
518 517
519static int __init init_nls_macinuit(void) 518static int __init init_nls_macinuit(void)
diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c
index 53ce0809cbd2..b6a98a5208cd 100644
--- a/fs/nls/mac-roman.c
+++ b/fs/nls/mac-roman.c
@@ -618,7 +618,6 @@ static struct nls_table table = {
618 .char2uni = char2uni, 618 .char2uni = char2uni,
619 .charset2lower = charset2lower, 619 .charset2lower = charset2lower,
620 .charset2upper = charset2upper, 620 .charset2upper = charset2upper,
621 .owner = THIS_MODULE,
622}; 621};
623 622
624static int __init init_nls_macroman(void) 623static int __init init_nls_macroman(void)
diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c
index add6f7a0c666..25547f023638 100644
--- a/fs/nls/mac-romanian.c
+++ b/fs/nls/mac-romanian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macromanian(void) 588static int __init init_nls_macromanian(void)
diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c
index dffa96d5de00..b5454bc7b7fa 100644
--- a/fs/nls/mac-turkish.c
+++ b/fs/nls/mac-turkish.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macturkish(void) 588static int __init init_nls_macturkish(void)
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index 7020e940f74e..a2620650d5e4 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -148,7 +148,6 @@ static struct nls_table table = {
148 .char2uni = char2uni, 148 .char2uni = char2uni,
149 .charset2lower = charset2lower, 149 .charset2lower = charset2lower,
150 .charset2upper = charset2upper, 150 .charset2upper = charset2upper,
151 .owner = THIS_MODULE,
152}; 151};
153 152
154static int __init init_nls_ascii(void) 153static int __init init_nls_ascii(void)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index fea6bd5831dc..52ccd34b1e79 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
232} 232}
233EXPORT_SYMBOL(utf16s_to_utf8s); 233EXPORT_SYMBOL(utf16s_to_utf8s);
234 234
235int register_nls(struct nls_table * nls) 235int __register_nls(struct nls_table *nls, struct module *owner)
236{ 236{
237 struct nls_table ** tmp = &tables; 237 struct nls_table ** tmp = &tables;
238 238
239 if (nls->next) 239 if (nls->next)
240 return -EBUSY; 240 return -EBUSY;
241 241
242 nls->owner = owner;
242 spin_lock(&nls_lock); 243 spin_lock(&nls_lock);
243 while (*tmp) { 244 while (*tmp) {
244 if (nls == *tmp) { 245 if (nls == *tmp) {
@@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls)
252 spin_unlock(&nls_lock); 253 spin_unlock(&nls_lock);
253 return 0; 254 return 0;
254} 255}
256EXPORT_SYMBOL(__register_nls);
255 257
256int unregister_nls(struct nls_table * nls) 258int unregister_nls(struct nls_table * nls)
257{ 259{
@@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void)
538 return &default_table; 540 return &default_table;
539} 541}
540 542
541EXPORT_SYMBOL(register_nls);
542EXPORT_SYMBOL(unregister_nls); 543EXPORT_SYMBOL(unregister_nls);
543EXPORT_SYMBOL(unload_nls); 544EXPORT_SYMBOL(unload_nls);
544EXPORT_SYMBOL(load_nls); 545EXPORT_SYMBOL(load_nls);
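Every .owner removal in the NLS tables in this series hangs off this change: ownership is now recorded when the table is registered. Callers keep writing register_nls(); the matching include/linux/nls.h change is outside this section, but it presumably becomes a macro along these lines:

	/* Sketch of the expected wrapper: THIS_MODULE expands at each call
	 * site, so every NLS module still ends up owning its own table. */
	#define register_nls(nls) __register_nls((nls), THIS_MODULE)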
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index c8471fe78e4e..ace3e19d3407 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -329,7 +329,6 @@ static struct nls_table table = {
329 .char2uni = char2uni, 329 .char2uni = char2uni,
330 .charset2lower = charset2lower, 330 .charset2lower = charset2lower,
331 .charset2upper = charset2upper, 331 .charset2upper = charset2upper,
332 .owner = THIS_MODULE,
333}; 332};
334 333
335static int __init init_nls_cp1250(void) 334static int __init init_nls_cp1250(void)
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index 1939b46e772f..9273ddfd08a1 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
283 .char2uni = char2uni, 283 .char2uni = char2uni,
284 .charset2lower = charset2lower, 284 .charset2lower = charset2lower,
285 .charset2upper = charset2upper, 285 .charset2upper = charset2upper,
286 .owner = THIS_MODULE,
287}; 286};
288 287
289static int __init init_nls_cp1251(void) 288static int __init init_nls_cp1251(void)
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index 8120ae2e091a..1caf5dfed85b 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -365,7 +365,6 @@ static struct nls_table table = {
365 .char2uni = char2uni, 365 .char2uni = char2uni,
366 .charset2lower = charset2lower, 366 .charset2lower = charset2lower,
367 .charset2upper = charset2upper, 367 .charset2upper = charset2upper,
368 .owner = THIS_MODULE,
369}; 368};
370 369
371static int __init init_nls_cp1255(void) 370static int __init init_nls_cp1255(void)
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index ff37a4628ce4..7ddb830da3fd 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp437(void) 374static int __init init_nls_cp437(void)
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index f5576b8be1b9..c593f683a0cd 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -332,7 +332,6 @@ static struct nls_table table = {
332 .char2uni = char2uni, 332 .char2uni = char2uni,
333 .charset2lower = charset2lower, 333 .charset2lower = charset2lower,
334 .charset2upper = charset2upper, 334 .charset2upper = charset2upper,
335 .owner = THIS_MODULE,
336}; 335};
337 336
338static int __init init_nls_cp737(void) 337static int __init init_nls_cp737(void)
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index 4905635d1c00..554c863745f2 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -301,7 +301,6 @@ static struct nls_table table = {
301 .char2uni = char2uni, 301 .char2uni = char2uni,
302 .charset2lower = charset2lower, 302 .charset2lower = charset2lower,
303 .charset2upper = charset2upper, 303 .charset2upper = charset2upper,
304 .owner = THIS_MODULE,
305}; 304};
306 305
307static int __init init_nls_cp775(void) 306static int __init init_nls_cp775(void)
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index fe5bdad50e2b..56cccd14b40b 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
297 .char2uni = char2uni, 297 .char2uni = char2uni,
298 .charset2lower = charset2lower, 298 .charset2lower = charset2lower,
299 .charset2upper = charset2upper, 299 .charset2upper = charset2upper,
300 .owner = THIS_MODULE,
301}; 300};
302 301
303static int __init init_nls_cp850(void) 302static int __init init_nls_cp850(void)
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index ceb1c0166dd8..7cdc05ac1d40 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -319,7 +319,6 @@ static struct nls_table table = {
319 .char2uni = char2uni, 319 .char2uni = char2uni,
320 .charset2lower = charset2lower, 320 .charset2lower = charset2lower,
321 .charset2upper = charset2upper, 321 .charset2upper = charset2upper,
322 .owner = THIS_MODULE,
323}; 322};
324 323
325static int __init init_nls_cp852(void) 324static int __init init_nls_cp852(void)
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index cc7f5fb2e0c2..7426eea05663 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -281,7 +281,6 @@ static struct nls_table table = {
281 .char2uni = char2uni, 281 .char2uni = char2uni,
282 .charset2lower = charset2lower, 282 .charset2lower = charset2lower,
283 .charset2upper = charset2upper, 283 .charset2upper = charset2upper,
284 .owner = THIS_MODULE,
285}; 284};
286 285
287static int __init init_nls_cp855(void) 286static int __init init_nls_cp855(void)
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index e418e198e8d8..098309733ebd 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
283 .char2uni = char2uni, 283 .char2uni = char2uni,
284 .charset2lower = charset2lower, 284 .charset2lower = charset2lower,
285 .charset2upper = charset2upper, 285 .charset2upper = charset2upper,
286 .owner = THIS_MODULE,
287}; 286};
288 287
289static int __init init_nls_cp857(void) 288static int __init init_nls_cp857(void)
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index a86c97d1aa34..84224478e731 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -346,7 +346,6 @@ static struct nls_table table = {
346 .char2uni = char2uni, 346 .char2uni = char2uni,
347 .charset2lower = charset2lower, 347 .charset2lower = charset2lower,
348 .charset2upper = charset2upper, 348 .charset2upper = charset2upper,
349 .owner = THIS_MODULE,
350}; 349};
351 350
352static int __init init_nls_cp860(void) 351static int __init init_nls_cp860(void)
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index bd920227acdf..dc873e4be092 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp861(void) 374static int __init init_nls_cp861(void)
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index e9b68eb3daf0..d5263e3c5566 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -403,7 +403,6 @@ static struct nls_table table = {
403 .char2uni = char2uni, 403 .char2uni = char2uni,
404 .charset2lower = charset2lower, 404 .charset2lower = charset2lower,
405 .charset2upper = charset2upper, 405 .charset2upper = charset2upper,
406 .owner = THIS_MODULE,
407}; 406};
408 407
409static int __init init_nls_cp862(void) 408static int __init init_nls_cp862(void)
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index f8a9b07ab4e2..051c9832e36a 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -363,7 +363,6 @@ static struct nls_table table = {
363 .char2uni = char2uni, 363 .char2uni = char2uni,
364 .charset2lower = charset2lower, 364 .charset2lower = charset2lower,
365 .charset2upper = charset2upper, 365 .charset2upper = charset2upper,
366 .owner = THIS_MODULE,
367}; 366};
368 367
369static int __init init_nls_cp863(void) 368static int __init init_nls_cp863(void)
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index 8d31f435fc6f..97eb1273b2f7 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -389,7 +389,6 @@ static struct nls_table table = {
389 .char2uni = char2uni, 389 .char2uni = char2uni,
390 .charset2lower = charset2lower, 390 .charset2lower = charset2lower,
391 .charset2upper = charset2upper, 391 .charset2upper = charset2upper,
392 .owner = THIS_MODULE,
393}; 392};
394 393
395static int __init init_nls_cp864(void) 394static int __init init_nls_cp864(void)
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 4bd902fe3ec9..111214228525 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp865(void) 374static int __init init_nls_cp865(void)
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index bdc7cb391398..ffdcbc3fc38d 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -287,7 +287,6 @@ static struct nls_table table = {
287 .char2uni = char2uni, 287 .char2uni = char2uni,
288 .charset2lower = charset2lower, 288 .charset2lower = charset2lower,
289 .charset2upper = charset2upper, 289 .charset2upper = charset2upper,
290 .owner = THIS_MODULE,
291}; 290};
292 291
293static int __init init_nls_cp866(void) 292static int __init init_nls_cp866(void)
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 9f283a2b151a..3b5a34589354 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
297 .char2uni = char2uni, 297 .char2uni = char2uni,
298 .charset2lower = charset2lower, 298 .charset2lower = charset2lower,
299 .charset2upper = charset2upper, 299 .charset2upper = charset2upper,
300 .owner = THIS_MODULE,
301}; 300};
302 301
303static int __init init_nls_cp869(void) 302static int __init init_nls_cp869(void)
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index 0b3c4886f8c0..8dfaa10710fa 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -256,7 +256,6 @@ static struct nls_table table = {
256 .char2uni = char2uni, 256 .char2uni = char2uni,
257 .charset2lower = charset2lower, 257 .charset2lower = charset2lower,
258 .charset2upper = charset2upper, 258 .charset2upper = charset2upper,
259 .owner = THIS_MODULE,
260}; 259};
261 260
262static int __init init_nls_cp874(void) 261static int __init init_nls_cp874(void)
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 0ffed6f1cebb..67b7398e8483 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -7914,7 +7914,6 @@ static struct nls_table table = {
7914 .char2uni = char2uni, 7914 .char2uni = char2uni,
7915 .charset2lower = charset2lower, 7915 .charset2lower = charset2lower,
7916 .charset2upper = charset2upper, 7916 .charset2upper = charset2upper,
7917 .owner = THIS_MODULE,
7918}; 7917};
7919 7918
7920static int __init init_nls_cp932(void) 7919static int __init init_nls_cp932(void)
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index 82770301bc3d..c96546cfec9f 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -11092,7 +11092,6 @@ static struct nls_table table = {
11092 .char2uni = char2uni, 11092 .char2uni = char2uni,
11093 .charset2lower = charset2lower, 11093 .charset2lower = charset2lower,
11094 .charset2upper = charset2upper, 11094 .charset2upper = charset2upper,
11095 .owner = THIS_MODULE,
11096}; 11095};
11097 11096
11098static int __init init_nls_cp936(void) 11097static int __init init_nls_cp936(void)
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 8a7a2fe85c65..199171e97aa4 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -13927,7 +13927,6 @@ static struct nls_table table = {
13927 .char2uni = char2uni, 13927 .char2uni = char2uni,
13928 .charset2lower = charset2lower, 13928 .charset2lower = charset2lower,
13929 .charset2upper = charset2upper, 13929 .charset2upper = charset2upper,
13930 .owner = THIS_MODULE,
13931}; 13930};
13932 13931
13933static int __init init_nls_cp949(void) 13932static int __init init_nls_cp949(void)
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index ef2536829aa5..8e1418708209 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -9463,7 +9463,6 @@ static struct nls_table table = {
9463 .char2uni = char2uni, 9463 .char2uni = char2uni,
9464 .charset2lower = charset2lower, 9464 .charset2lower = charset2lower,
9465 .charset2upper = charset2upper, 9465 .charset2upper = charset2upper,
9466 .owner = THIS_MODULE,
9467}; 9466};
9468 9467
9469static int __init init_nls_cp950(void) 9468static int __init init_nls_cp950(void)
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 7424929a278b..162b3f160353 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -553,7 +553,6 @@ static struct nls_table table = {
553 .charset = "euc-jp", 553 .charset = "euc-jp",
554 .uni2char = uni2char, 554 .uni2char = uni2char,
555 .char2uni = char2uni, 555 .char2uni = char2uni,
556 .owner = THIS_MODULE,
557}; 556};
558 557
559static int __init init_nls_euc_jp(void) 558static int __init init_nls_euc_jp(void)
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 7b951bb5849c..69ac020d43b1 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -239,7 +239,6 @@ static struct nls_table table = {
239 .char2uni = char2uni, 239 .char2uni = char2uni,
240 .charset2lower = charset2lower, 240 .charset2lower = charset2lower,
241 .charset2upper = charset2upper, 241 .charset2upper = charset2upper,
242 .owner = THIS_MODULE,
243}; 242};
244 243
245static int __init init_nls_iso8859_1(void) 244static int __init init_nls_iso8859_1(void)
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index c4d52ea9f092..afb3f8f275f0 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -267,7 +267,6 @@ static struct nls_table table = {
267 .char2uni = char2uni, 267 .char2uni = char2uni,
268 .charset2lower = charset2lower, 268 .charset2lower = charset2lower,
269 .charset2upper = charset2upper, 269 .charset2upper = charset2upper,
270 .owner = THIS_MODULE,
271}; 270};
272 271
273static int __init init_nls_iso8859_13(void) 272static int __init init_nls_iso8859_13(void)
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index dc02600c7fe1..046370f0b6f0 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -323,7 +323,6 @@ static struct nls_table table = {
323 .char2uni = char2uni, 323 .char2uni = char2uni,
324 .charset2lower = charset2lower, 324 .charset2lower = charset2lower,
325 .charset2upper = charset2upper, 325 .charset2upper = charset2upper,
326 .owner = THIS_MODULE,
327}; 326};
328 327
329static int __init init_nls_iso8859_14(void) 328static int __init init_nls_iso8859_14(void)
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 3c7dfc832ef1..7e34a841a056 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -289,7 +289,6 @@ static struct nls_table table = {
289 .char2uni = char2uni, 289 .char2uni = char2uni,
290 .charset2lower = charset2lower, 290 .charset2lower = charset2lower,
291 .charset2upper = charset2upper, 291 .charset2upper = charset2upper,
292 .owner = THIS_MODULE,
293}; 292};
294 293
295static int __init init_nls_iso8859_15(void) 294static int __init init_nls_iso8859_15(void)
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index a2d2197e4c77..7dd571181741 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_2(void) 295static int __init init_nls_iso8859_2(void)
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index a61e0daa3a86..740b75ec4493 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_3(void) 295static int __init init_nls_iso8859_3(void)
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index e8ff555483b6..8826021e32f5 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_4(void) 295static int __init init_nls_iso8859_4(void)
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 4721e8930124..7c04057a1ad8 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
254 .char2uni = char2uni, 254 .char2uni = char2uni,
255 .charset2lower = charset2lower, 255 .charset2lower = charset2lower,
256 .charset2upper = charset2upper, 256 .charset2upper = charset2upper,
257 .owner = THIS_MODULE,
258}; 257};
259 258
260static int __init init_nls_iso8859_5(void) 259static int __init init_nls_iso8859_5(void)
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index 01a517d6d306..d4a881400d74 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -245,7 +245,6 @@ static struct nls_table table = {
245 .char2uni = char2uni, 245 .char2uni = char2uni,
246 .charset2lower = charset2lower, 246 .charset2lower = charset2lower,
247 .charset2upper = charset2upper, 247 .charset2upper = charset2upper,
248 .owner = THIS_MODULE,
249}; 248};
250 249
251static int __init init_nls_iso8859_6(void) 250static int __init init_nls_iso8859_6(void)
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index 2d27b93ef19e..37b75d825a75 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -299,7 +299,6 @@ static struct nls_table table = {
299 .char2uni = char2uni, 299 .char2uni = char2uni,
300 .charset2lower = charset2lower, 300 .charset2lower = charset2lower,
301 .charset2upper = charset2upper, 301 .charset2upper = charset2upper,
302 .owner = THIS_MODULE,
303}; 302};
304 303
305static int __init init_nls_iso8859_7(void) 304static int __init init_nls_iso8859_7(void)
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 694bf070c721..557b98250d37 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
254 .char2uni = char2uni, 254 .char2uni = char2uni,
255 .charset2lower = charset2lower, 255 .charset2lower = charset2lower,
256 .charset2upper = charset2upper, 256 .charset2upper = charset2upper,
257 .owner = THIS_MODULE,
258}; 257};
259 258
260static int __init init_nls_iso8859_9(void) 259static int __init init_nls_iso8859_9(void)
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 43875310540d..811f232fccfb 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -305,7 +305,6 @@ static struct nls_table table = {
305 .char2uni = char2uni, 305 .char2uni = char2uni,
306 .charset2lower = charset2lower, 306 .charset2lower = charset2lower,
307 .charset2upper = charset2upper, 307 .charset2upper = charset2upper,
308 .owner = THIS_MODULE,
309}; 308};
310 309
311static int __init init_nls_koi8_r(void) 310static int __init init_nls_koi8_r(void)
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index e7bc1d75c78c..a80a741a8676 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -55,7 +55,6 @@ static struct nls_table table = {
55 .charset = "koi8-ru", 55 .charset = "koi8-ru",
56 .uni2char = uni2char, 56 .uni2char = uni2char,
57 .char2uni = char2uni, 57 .char2uni = char2uni,
58 .owner = THIS_MODULE,
59}; 58};
60 59
61static int __init init_nls_koi8_ru(void) 60static int __init init_nls_koi8_ru(void)
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 8c9f0292b5ae..7e029e4c188a 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -312,7 +312,6 @@ static struct nls_table table = {
312 .char2uni = char2uni, 312 .char2uni = char2uni,
313 .charset2lower = charset2lower, 313 .charset2lower = charset2lower,
314 .charset2upper = charset2upper, 314 .charset2upper = charset2upper,
315 .owner = THIS_MODULE,
316}; 315};
317 316
318static int __init init_nls_koi8_u(void) 317static int __init init_nls_koi8_u(void)
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index 0d60a44acacd..afcfbc4a14db 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -46,7 +46,6 @@ static struct nls_table table = {
46 .char2uni = char2uni, 46 .char2uni = char2uni,
47 .charset2lower = identity, /* no conversion */ 47 .charset2lower = identity, /* no conversion */
48 .charset2upper = identity, 48 .charset2upper = identity,
49 .owner = THIS_MODULE,
50}; 49};
51 50
52static int __init init_nls_utf8(void) 51static int __init init_nls_utf8(void)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..abc8cbcfe90e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
82 * events. 82 * events.
83 */ 83 */
84static int dnotify_handle_event(struct fsnotify_group *group, 84static int dnotify_handle_event(struct fsnotify_group *group,
85 struct inode *inode,
85 struct fsnotify_mark *inode_mark, 86 struct fsnotify_mark *inode_mark,
86 struct fsnotify_mark *vfsmount_mark, 87 struct fsnotify_mark *vfsmount_mark,
87 struct fsnotify_event *event) 88 u32 mask, void *data, int data_type,
89 const unsigned char *file_name, u32 cookie)
88{ 90{
89 struct dnotify_mark *dn_mark; 91 struct dnotify_mark *dn_mark;
90 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
95 96
96 BUG_ON(vfsmount_mark); 97 /* not a dir, dnotify doesn't care */
98 if (!S_ISDIR(inode->i_mode))
99 return 0;
97 100
98 to_tell = event->to_tell; 101 BUG_ON(vfsmount_mark);
99 102
100 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
101 104
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
122 return 0; 125 return 0;
123} 126}
124 127
125/*
126 * Given an inode and mask determine if dnotify would be interested in sending
127 * userspace notification for that pair.
128 */
129static bool dnotify_should_send_event(struct fsnotify_group *group,
130 struct inode *inode,
131 struct fsnotify_mark *inode_mark,
132 struct fsnotify_mark *vfsmount_mark,
133 __u32 mask, void *data, int data_type)
134{
135 /* not a dir, dnotify doesn't care */
136 if (!S_ISDIR(inode->i_mode))
137 return false;
138
139 return true;
140}
141
142static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 128static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
143{ 129{
144 struct dnotify_mark *dn_mark = container_of(fsn_mark, 130 struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
152 138
153static struct fsnotify_ops dnotify_fsnotify_ops = { 139static struct fsnotify_ops dnotify_fsnotify_ops = {
154 .handle_event = dnotify_handle_event, 140 .handle_event = dnotify_handle_event,
155 .should_send_event = dnotify_should_send_event,
156 .free_group_priv = NULL,
157 .freeing_mark = NULL,
158 .free_event_priv = NULL,
159}; 141};
160 142
161/* 143/*
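The same consolidation runs through every fsnotify backend in this series: should_send_event() folds into handle_event(), which now receives the raw event data instead of a pre-built fsnotify_event. The callback shape, as read off the dnotify and fanotify handlers in this diff:

	int (*handle_event)(struct fsnotify_group *group, struct inode *inode,
			    struct fsnotify_mark *inode_mark,
			    struct fsnotify_mark *vfsmount_mark,
			    u32 mask, void *data, int data_type,
			    const unsigned char *file_name, u32 cookie);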
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..dc638f786d5c 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,91 +9,59 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11 11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 12#include "fanotify.h"
13
14static bool should_merge(struct fsnotify_event *old_fsn,
15 struct fsnotify_event *new_fsn)
13{ 16{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new); 17 struct fanotify_event_info *old, *new;
15 18
16 if (old->to_tell == new->to_tell && 19 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
17 old->data_type == new->data_type && 20 old = FANOTIFY_E(old_fsn);
18 old->tgid == new->tgid) { 21 new = FANOTIFY_E(new_fsn);
19 switch (old->data_type) { 22
20 case (FSNOTIFY_EVENT_PATH): 23 if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 24 old->path.mnt == new->path.mnt &&
22 /* dont merge two permission events */ 25 old->path.dentry == new->path.dentry)
23 if ((old->mask & FAN_ALL_PERM_EVENTS) && 26 return true;
24 (new->mask & FAN_ALL_PERM_EVENTS))
25 return false;
26#endif
27 if ((old->path.mnt == new->path.mnt) &&
28 (old->path.dentry == new->path.dentry))
29 return true;
30 break;
31 case (FSNOTIFY_EVENT_NONE):
32 return true;
33 default:
34 BUG();
35 };
36 }
37 return false; 27 return false;
38} 28}
39 29
40/* and the list better be locked by something too! */ 30/* and the list better be locked by something too! */
41static struct fsnotify_event *fanotify_merge(struct list_head *list, 31static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
42 struct fsnotify_event *event)
43{ 32{
44 struct fsnotify_event_holder *test_holder; 33 struct fsnotify_event *test_event;
45 struct fsnotify_event *test_event = NULL; 34 bool do_merge = false;
46 struct fsnotify_event *new_event;
47 35
48 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 36 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
49 37
38#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
39 /*
40 * Don't merge a permission event with any other event so that we know
41 * the event structure we have created in fanotify_handle_event() is the
42 * one we should check for permission response.
43 */
44 if (event->mask & FAN_ALL_PERM_EVENTS)
45 return 0;
46#endif
50 47
51 list_for_each_entry_reverse(test_holder, list, event_list) { 48 list_for_each_entry_reverse(test_event, list, list) {
52 if (should_merge(test_holder->event, event)) { 49 if (should_merge(test_event, event)) {
53 test_event = test_holder->event; 50 do_merge = true;
54 break; 51 break;
55 } 52 }
56 } 53 }
57 54
58 if (!test_event) 55 if (!do_merge)
59 return NULL; 56 return 0;
60
61 fsnotify_get_event(test_event);
62
63 /* if they are exactly the same we are done */
64 if (test_event->mask == event->mask)
65 return test_event;
66
67 /*
68 * if the refcnt == 2 this is the only queue
69 * for this event and so we can update the mask
70 * in place.
71 */
72 if (atomic_read(&test_event->refcnt) == 2) {
73 test_event->mask |= event->mask;
74 return test_event;
75 }
76
77 new_event = fsnotify_clone_event(test_event);
78
79 /* done with test_event */
80 fsnotify_put_event(test_event);
81
82 /* couldn't allocate memory, merge was not possible */
83 if (unlikely(!new_event))
84 return ERR_PTR(-ENOMEM);
85
86 /* build new event and replace it on the list */
87 new_event->mask = (test_event->mask | event->mask);
88 fsnotify_replace_event(test_holder, new_event);
89 57
90 /* we hold a reference on new_event from clone_event */ 58 test_event->mask |= event->mask;
91 return new_event; 59 return 1;
92} 60}
93 61
94#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
95static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response_from_access(struct fsnotify_group *group,
96 struct fsnotify_event *event) 64 struct fanotify_event_info *event)
97{ 65{
98 int ret; 66 int ret;
99 67
@@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
106 return 0; 74 return 0;
107 75
108 /* userspace responded, convert to something usable */ 76 /* userspace responded, convert to something usable */
109 spin_lock(&event->lock);
110 switch (event->response) { 77 switch (event->response) {
111 case FAN_ALLOW: 78 case FAN_ALLOW:
112 ret = 0; 79 ret = 0;
@@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
116 ret = -EPERM; 83 ret = -EPERM;
117 } 84 }
118 event->response = 0; 85 event->response = 0;
119 spin_unlock(&event->lock);
120 86
121 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 87 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
122 group, event, ret); 88 group, event, ret);
@@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
125} 91}
126#endif 92#endif
127 93
128static int fanotify_handle_event(struct fsnotify_group *group, 94static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
129 struct fsnotify_mark *inode_mark,
130 struct fsnotify_mark *fanotify_mark,
131 struct fsnotify_event *event)
132{
133 int ret = 0;
134 struct fsnotify_event *notify_event = NULL;
135
136 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
137 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
138 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
139 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
140 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
141 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
142 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
143 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
144 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
145 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
146
147 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
148
149 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
150 if (IS_ERR(notify_event))
151 return PTR_ERR(notify_event);
152
153#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
154 if (event->mask & FAN_ALL_PERM_EVENTS) {
155 /* if we merged we need to wait on the new event */
156 if (notify_event)
157 event = notify_event;
158 ret = fanotify_get_response_from_access(group, event);
159 }
160#endif
161
162 if (notify_event)
163 fsnotify_put_event(notify_event);
164
165 return ret;
166}
167
168static bool fanotify_should_send_event(struct fsnotify_group *group,
169 struct inode *to_tell,
170 struct fsnotify_mark *inode_mark,
171 struct fsnotify_mark *vfsmnt_mark, 95 struct fsnotify_mark *vfsmnt_mark,
172 __u32 event_mask, void *data, int data_type) 96 u32 event_mask,
97 void *data, int data_type)
173{ 98{
174 __u32 marks_mask, marks_ignored_mask; 99 __u32 marks_mask, marks_ignored_mask;
175 struct path *path = data; 100 struct path *path = data;
176 101
177 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 102 pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
178 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 103 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
179 inode_mark, vfsmnt_mark, event_mask, data, data_type); 104 event_mask, data, data_type);
180 105
181 /* if we don't have enough info to send an event to userspace say no */ 106 /* if we don't have enough info to send an event to userspace say no */
182 if (data_type != FSNOTIFY_EVENT_PATH) 107 if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +142,73 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
217 return false; 142 return false;
218} 143}
219 144
145static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode,
147 struct fsnotify_mark *inode_mark,
148 struct fsnotify_mark *fanotify_mark,
149 u32 mask, void *data, int data_type,
150 const unsigned char *file_name, u32 cookie)
151{
152 int ret = 0;
153 struct fanotify_event_info *event;
154 struct fsnotify_event *fsn_event;
155
156 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
157 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
158 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
159 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
160 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
161 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
162 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
163 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
164 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
165 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
166
167 if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
168 data_type))
169 return 0;
170
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask);
173
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
175 if (unlikely(!event))
176 return -ENOMEM;
177
178 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) {
195 /* Permission events shouldn't be merged */
196 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
197 /* Our event wasn't used in the end. Free it. */
198 fsnotify_destroy_event(group, fsn_event);
199
200 return 0;
201 }
202
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event);
206 fsnotify_destroy_event(group, fsn_event);
207 }
208#endif
209 return ret;
210}
211
220static void fanotify_free_group_priv(struct fsnotify_group *group) 212static void fanotify_free_group_priv(struct fsnotify_group *group)
221{ 213{
222 struct user_struct *user; 214 struct user_struct *user;
@@ -226,10 +218,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
226 free_uid(user); 218 free_uid(user);
227} 219}
228 220
221static void fanotify_free_event(struct fsnotify_event *fsn_event)
222{
223 struct fanotify_event_info *event;
224
225 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path);
227 put_pid(event->tgid);
228 kmem_cache_free(fanotify_event_cachep, event);
229}
230
229const struct fsnotify_ops fanotify_fsnotify_ops = { 231const struct fsnotify_ops fanotify_fsnotify_ops = {
230 .handle_event = fanotify_handle_event, 232 .handle_event = fanotify_handle_event,
231 .should_send_event = fanotify_should_send_event,
232 .free_group_priv = fanotify_free_group_priv, 233 .free_group_priv = fanotify_free_group_priv,
233 .free_event_priv = NULL, 234 .free_event = fanotify_free_event,
234 .freeing_mark = NULL,
235}; 235};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..32a2f034fb94
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,30 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/path.h>
3#include <linux/slab.h>
4
5extern struct kmem_cache *fanotify_event_cachep;
6
7/*
8 * Lifetime of the structure differs for normal and permission events. In both
9 * cases the structure is allocated in fanotify_handle_event(). For normal
10 * events the structure is freed immediately after reporting it to userspace.
11 * For permission events we free it only after we receive a response from
12 * userspace.
13 */
14struct fanotify_event_info {
15 struct fsnotify_event fse;
16 /*
17 * We hold ref to this path so it may be dereferenced at any point
18 * during this object's lifetime
19 */
20 struct path path;
21 struct pid *tgid;
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */
24#endif
25};
26
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{
29 return container_of(fse, struct fanotify_event_info, fse);
30}
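This embedding is the heart of the refactor: generic fsnotify code only ever queues the plain fsnotify_event, and fanotify recovers its wrapper with the container_of() cast. A condensed round trip, using only calls that appear in this diff:

	/* Allocate the wrapper and initialise the embedded generic event. */
	struct fanotify_event_info *event =
		kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
	fsnotify_init_event(&event->fse, inode, mask);

	/* The generic layer sees only &event->fse; the backend gets its
	 * wrapper back via the container_of() helper. */
	struct fanotify_event_info *same = FANOTIFY_E(&event->fse);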
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..287a22c04149 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
19 19
20#include "../../mount.h" 20#include "../../mount.h"
21#include "../fdinfo.h" 21#include "../fdinfo.h"
22#include "fanotify.h"
22 23
23#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
24#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
28 29
29static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
30static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly;
31 33
32struct fanotify_response_event { 34struct fanotify_response_event {
33 struct list_head list; 35 struct list_head list;
34 __s32 fd; 36 __s32 fd;
35 struct fsnotify_event *event; 37 struct fanotify_event_info *event;
36}; 38};
37 39
38/* 40/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
61} 63}
62 64
63static int create_fd(struct fsnotify_group *group, 65static int create_fd(struct fsnotify_group *group,
64 struct fsnotify_event *event, 66 struct fanotify_event_info *event,
65 struct file **file) 67 struct file **file)
66{ 68{
67 int client_fd; 69 int client_fd;
68 struct file *new_file; 70 struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
73 if (client_fd < 0) 75 if (client_fd < 0)
74 return client_fd; 76 return client_fd;
75 77
76 if (event->data_type != FSNOTIFY_EVENT_PATH) {
77 WARN_ON(1);
78 put_unused_fd(client_fd);
79 return -EINVAL;
80 }
81
82 /* 78 /*
83 * we need a new file handle for the userspace program so it can read even if it was 79 * we need a new file handle for the userspace program so it can read even if it was
84 * originally opened O_WRONLY. 80 * originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
 }
 
 static int fill_event_metadata(struct fsnotify_group *group,
 			       struct fanotify_event_metadata *metadata,
-			       struct fsnotify_event *event,
+			       struct fsnotify_event *fsn_event,
 			       struct file **file)
 {
 	int ret = 0;
+	struct fanotify_event_info *event;
 
 	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
-		 group, metadata, event);
+		 group, metadata, fsn_event);
 
 	*file = NULL;
+	event = container_of(fsn_event, struct fanotify_event_info, fse);
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
 	metadata->reserved = 0;
-	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
-	if (unlikely(event->mask & FAN_Q_OVERFLOW))
+	if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
 		metadata->fd = FAN_NOFD;
 	else {
 		metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	if (!re)
 		return -ENOMEM;
 
-	re->event = event;
+	re->event = FANOTIFY_E(event);
 	re->fd = fd;
 
 	mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	if (atomic_read(&group->fanotify_data.bypass_perm)) {
 		mutex_unlock(&group->fanotify_data.access_mutex);
 		kmem_cache_free(fanotify_response_event_cache, re);
-		event->response = FAN_ALLOW;
+		FANOTIFY_E(event)->response = FAN_ALLOW;
 		return 0;
 	}
 
@@ -273,7 +271,7 @@ out_close_fd:
 out:
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (event->mask & FAN_ALL_PERM_EVENTS) {
-		event->response = FAN_DENY;
+		FANOTIFY_E(event)->response = FAN_DENY;
 		wake_up(&group->fanotify_data.access_waitq);
 	}
 #endif
@@ -321,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		if (IS_ERR(kevent))
 			break;
 		ret = copy_event_to_user(group, kevent, buf);
-		fsnotify_put_event(kevent);
+		/*
+		 * Permission events get destroyed after we
+		 * receive response
+		 */
+		if (!(kevent->mask & FAN_ALL_PERM_EVENTS))
+			fsnotify_destroy_event(group, kevent);
 		if (ret < 0)
 			break;
 		buf += ret;
@@ -409,7 +412,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct fsnotify_group *group;
-	struct fsnotify_event_holder *holder;
+	struct fsnotify_event *fsn_event;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -421,7 +424,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 	switch (cmd) {
 	case FIONREAD:
 		mutex_lock(&group->notification_mutex);
-		list_for_each_entry(holder, &group->notification_list, event_list)
+		list_for_each_entry(fsn_event, &group->notification_list, list)
 			send_len += FAN_EVENT_METADATA_LEN;
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
@@ -695,6 +698,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	struct fsnotify_group *group;
 	int f_flags, fd;
 	struct user_struct *user;
+	struct fanotify_event_info *oevent;
 
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		 __func__, flags, event_f_flags);
@@ -727,8 +731,20 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	group->fanotify_data.user = user;
 	atomic_inc(&user->fanotify_listeners);
 
+	oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+	if (unlikely(!oevent)) {
+		fd = -ENOMEM;
+		goto out_destroy_group;
+	}
+	group->overflow_event = &oevent->fse;
+	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	oevent->tgid = get_pid(task_tgid(current));
+	oevent->path.mnt = NULL;
+	oevent->path.dentry = NULL;
+
 	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	oevent->response = 0;
 	mutex_init(&group->fanotify_data.access_mutex);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
@@ -888,9 +904,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
 {
 	return sys_fanotify_mark(fanotify_fd, flags,
 #ifdef __BIG_ENDIAN
-				((__u64)mask1 << 32) | mask0,
-#else
 				((__u64)mask0 << 32) | mask1,
+#else
+				((__u64)mask1 << 32) | mask0,
 #endif
 				 dfd, pathname);
 }
@@ -906,6 +922,7 @@ static int __init fanotify_user_setup(void)
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
 	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
 						   SLAB_PANIC);
+	fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
 
 	return 0;
 }
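
[Annotation: the COMPAT_SYSCALL_DEFINE6 hunk above swaps which endianness branch recombines the 64-bit mask. On 32-bit compat, a 64-bit argument arrives as two 32-bit halves whose order depends on the calling convention's endianness. The following standalone sketch shows the arithmetic; join_mask, arg0 and arg1 are hypothetical names for illustration only.]

#include <stdint.h>
#include <stdio.h>

/*
 * Recombine a 64-bit mask that was passed as two 32-bit halves.
 * Which half carries the high word depends on endianness; the hunk
 * above fixes which combination belongs under __BIG_ENDIAN.
 */
static uint64_t join_mask(uint32_t arg0, uint32_t arg1, int big_endian)
{
	if (big_endian)
		return ((uint64_t)arg0 << 32) | arg1;	/* first half is high */
	return ((uint64_t)arg1 << 32) | arg0;		/* second half is high */
}

int main(void)
{
	uint64_t mask = 0x123456789abcdef0ULL;
	uint32_t lo = (uint32_t)mask, hi = (uint32_t)(mask >> 32);

	/* little-endian convention passes (lo, hi) */
	printf("%d\n", join_mask(lo, hi, 0) == mask);	/* 1 */
	/* big-endian convention passes (hi, lo) */
	printf("%d\n", join_mask(hi, lo, 1) == mask);	/* 1 */
	return 0;
}
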
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..9d3e9c50066a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
 			 struct fsnotify_mark *vfsmount_mark,
 			 __u32 mask, void *data,
 			 int data_is, u32 cookie,
-			 const unsigned char *file_name,
-			 struct fsnotify_event **event)
+			 const unsigned char *file_name)
 {
 	struct fsnotify_group *group = NULL;
 	__u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
 
 	pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
 		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
-		 " data=%p data_is=%d cookie=%d event=%p\n",
+		 " data=%p data_is=%d cookie=%d\n",
 		 __func__, group, to_tell, mask, inode_mark,
 		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
-		 data_is, cookie, *event);
+		 data_is, cookie);
 
 	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
 
-	if (group->ops->should_send_event(group, to_tell, inode_mark,
-					  vfsmount_mark, mask, data,
-					  data_is) == false)
-		return 0;
-
-	if (!*event) {
-		*event = fsnotify_create_event(to_tell, mask, data,
-					       data_is, file_name,
-					       cookie, GFP_KERNEL);
-		if (!*event)
-			return -ENOMEM;
-	}
-	return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+	return group->ops->handle_event(group, to_tell, inode_mark,
+					vfsmount_mark, mask, data, data_is,
+					file_name, cookie);
 }
 
 /*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
 	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
 	struct fsnotify_group *inode_group, *vfsmount_group;
-	struct fsnotify_event *event = NULL;
 	struct mount *mnt;
 	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 		if (inode_group > vfsmount_group) {
 			/* handle inode */
-			ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
-					    data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, inode_mark, NULL, mask,
+					    data, data_is, cookie, file_name);
 			/* we didn't use the vfsmount_mark */
 			vfsmount_group = NULL;
 		} else if (vfsmount_group > inode_group) {
-			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
-					    data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
+					    data, data_is, cookie, file_name);
 			inode_group = NULL;
 		} else {
 			ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
-					    mask, data, data_is, cookie, file_name,
-					    &event);
+					    mask, data, data_is, cookie,
+					    file_name);
 		}
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		ret = 0;
 out:
 	srcu_read_unlock(&fsnotify_mark_srcu, idx);
-	/*
-	 * fsnotify_create_event() took a reference so the event can't be cleaned
-	 * up while we are still trying to add it to lists, drop that one.
-	 */
-	if (event)
-		fsnotify_put_event(event);
 
 	return ret;
 }
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ad1995980456 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -55,6 +55,13 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
 	/* clear the notification queue of all events */
 	fsnotify_flush_notify(group);
 
+	/*
+	 * Destroy overflow event (we cannot use fsnotify_destroy_event() as
+	 * that deliberately ignores overflow events.
+	 */
+	if (group->overflow_event)
+		group->ops->free_event(group->overflow_event);
+
 	fsnotify_put_group(group);
 }
 
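
[Annotation: the group.c hunk above establishes the ownership rule for the preallocated per-group overflow event: the normal event destructor deliberately skips it, and it is released exactly once when the group itself is torn down. A toy standalone model of that rule, with hypothetical names, not kernel code:]

#include <stdio.h>
#include <stdlib.h>

struct event { int is_overflow; };
struct group { struct event *overflow_event; };

static void destroy_event(struct event *ev)
{
	if (!ev || ev->is_overflow)	/* normal path ignores overflow */
		return;
	free(ev);
}

static void destroy_group(struct group *g)
{
	free(g->overflow_event);	/* freed exactly once, here */
	g->overflow_event = NULL;
}

int main(void)
{
	struct group g;

	g.overflow_event = malloc(sizeof(*g.overflow_event));
	if (!g.overflow_event)
		return 1;
	g.overflow_event->is_overflow = 1;

	destroy_event(g.overflow_event);	/* no-op by design */
	destroy_group(&g);			/* actual release */
	puts("overflow event released once");
	return 0;
}
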
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..ed855ef6f077 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
 #include <linux/inotify.h>
 #include <linux/slab.h> /* struct kmem_cache */
 
-extern struct kmem_cache *event_priv_cachep;
-
-struct inotify_event_private_data {
-	struct fsnotify_event_private_data fsnotify_event_priv_data;
+struct inotify_event_info {
+	struct fsnotify_event fse;
 	int wd;
+	u32 sync_cookie;
+	int name_len;
+	char name[];
 };
 
 struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
 	int wd;
 };
 
+static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+{
+	return container_of(fse, struct inotify_event_info, fse);
+}
+
 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   struct fsnotify_group *group);
-extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+extern int inotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
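
[Annotation: inotify_event_info above ends with the flexible array member `char name[]`, so one allocation covers both the fixed header and the variable-length file name, exactly as inotify_handle_event() later computes `alloc_len`. A userspace sketch of that allocation pattern follows; the names are hypothetical and only illustrate the technique.]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct event_info {
	int wd;
	int name_len;
	char name[];	/* flexible array member, sized at alloc time */
};

static struct event_info *alloc_event(int wd, const char *file_name)
{
	size_t len = file_name ? strlen(file_name) : 0;
	size_t alloc_len = sizeof(struct event_info) + (len ? len + 1 : 0);
	struct event_info *ev = malloc(alloc_len);

	if (!ev)
		return NULL;
	ev->wd = wd;
	ev->name_len = (int)len;
	if (len)
		strcpy(ev->name, file_name);	/* fits: we allocated len + 1 */
	return ev;
}

int main(void)
{
	struct event_info *ev = alloc_event(3, "foo.txt");

	if (ev) {
		printf("wd=%d name=%s\n", ev->wd, ev->name);
		free(ev);	/* one free releases header and name together */
	}
	return 0;
}
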
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..43ab1e1a07a2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,107 +34,90 @@
 #include "inotify.h"
 
 /*
- * Check if 2 events contain the same information. We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
+ * Check if 2 events contain the same information.
  */
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+static bool event_compare(struct fsnotify_event *old_fsn,
+			  struct fsnotify_event *new_fsn)
 {
-	if ((old->mask == new->mask) &&
-	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type) &&
-	    (old->name_len == new->name_len)) {
-		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_INODE):
-			/* remember, after old was put on the wait_q we aren't
-			 * allowed to look at the inode any more, only thing
-			 * left to check was if the file_name is the same */
-			if (!old->name_len ||
-			    !strcmp(old->file_name, new->file_name))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_NONE):
-			if (old->mask & FS_Q_OVERFLOW)
-				return true;
-			else if (old->mask & FS_IN_IGNORED)
-				return false;
-			return true;
-		};
-	}
+	struct inotify_event_info *old, *new;
+
+	if (old_fsn->mask & FS_IN_IGNORED)
+		return false;
+	old = INOTIFY_E(old_fsn);
+	new = INOTIFY_E(new_fsn);
+	if ((old_fsn->mask == new_fsn->mask) &&
+	    (old_fsn->inode == new_fsn->inode) &&
+	    (old->name_len == new->name_len) &&
+	    (!old->name_len || !strcmp(old->name, new->name)))
+		return true;
 	return false;
 }
 
-static struct fsnotify_event *inotify_merge(struct list_head *list,
-					    struct fsnotify_event *event)
+static int inotify_merge(struct list_head *list,
+			  struct fsnotify_event *event)
 {
-	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
 
-	/* and the list better be locked by something too */
-	spin_lock(&event->lock);
-
-	last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
-	last_event = last_holder->event;
-	if (event_compare(last_event, event))
-		fsnotify_get_event(last_event);
-	else
-		last_event = NULL;
-
-	spin_unlock(&event->lock);
-
-	return last_event;
+	last_event = list_entry(list->prev, struct fsnotify_event, list);
+	return event_compare(last_event, event);
 }
 
-static int inotify_handle_event(struct fsnotify_group *group,
-				struct fsnotify_mark *inode_mark,
-				struct fsnotify_mark *vfsmount_mark,
-				struct fsnotify_event *event)
+int inotify_handle_event(struct fsnotify_group *group,
+			 struct inode *inode,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 u32 mask, void *data, int data_type,
+			 const unsigned char *file_name, u32 cookie)
 {
 	struct inotify_inode_mark *i_mark;
-	struct inode *to_tell;
-	struct inotify_event_private_data *event_priv;
-	struct fsnotify_event_private_data *fsn_event_priv;
-	struct fsnotify_event *added_event;
-	int wd, ret = 0;
+	struct inotify_event_info *event;
+	struct fsnotify_event *fsn_event;
+	int ret;
+	int len = 0;
+	int alloc_len = sizeof(struct inotify_event_info);
 
 	BUG_ON(vfsmount_mark);
 
-	pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
-		 event, event->to_tell, event->mask);
+	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path = data;
+
+		if (d_unlinked(path->dentry))
+			return 0;
+	}
+	if (file_name) {
+		len = strlen(file_name);
+		alloc_len += len + 1;
+	}
 
-	to_tell = event->to_tell;
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+		 mask);
 
 	i_mark = container_of(inode_mark, struct inotify_inode_mark,
 			      fsn_mark);
-	wd = i_mark->wd;
 
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
-	if (unlikely(!event_priv))
+	event = kmalloc(alloc_len, GFP_KERNEL);
+	if (unlikely(!event))
 		return -ENOMEM;
 
-	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
-	fsnotify_get_group(group);
-	fsn_event_priv->group = group;
-	event_priv->wd = wd;
-
-	added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
-	if (added_event) {
-		inotify_free_event_priv(fsn_event_priv);
-		if (!IS_ERR(added_event))
-			fsnotify_put_event(added_event);
-		else
-			ret = PTR_ERR(added_event);
+	fsn_event = &event->fse;
+	fsnotify_init_event(fsn_event, inode, mask);
+	event->wd = i_mark->wd;
+	event->sync_cookie = cookie;
+	event->name_len = len;
+	if (len)
+		strcpy(event->name, file_name);
+
+	ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
+	if (ret) {
+		/* Our event wasn't used in the end. Free it. */
+		fsnotify_destroy_event(group, fsn_event);
 	}
 
 	if (inode_mark->mask & IN_ONESHOT)
 		fsnotify_destroy_mark(inode_mark, group);
 
-	return ret;
+	return 0;
 }
 
 static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
@@ -142,22 +125,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
 	inotify_ignored_and_remove_idr(fsn_mark, group);
 }
 
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      struct fsnotify_mark *inode_mark,
-				      struct fsnotify_mark *vfsmount_mark,
-				      __u32 mask, void *data, int data_type)
-{
-	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
-	    (data_type == FSNOTIFY_EVENT_PATH)) {
-		struct path *path = data;
-
-		if (d_unlinked(path->dentry))
-			return false;
-	}
-
-	return true;
-}
-
 /*
  * This is NEVER supposed to be called.  Inotify marks should either have been
  * removed from the idr when the watch was removed or in the
@@ -202,22 +169,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	free_uid(group->inotify_data.user);
 }
 
-void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+static void inotify_free_event(struct fsnotify_event *fsn_event)
 {
-	struct inotify_event_private_data *event_priv;
-
-
-	event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
-				  fsnotify_event_priv_data);
-
-	fsnotify_put_group(fsn_event_priv->group);
-	kmem_cache_free(event_priv_cachep, event_priv);
+	kfree(INOTIFY_E(fsn_event));
 }
 
 const struct fsnotify_ops inotify_fsnotify_ops = {
 	.handle_event = inotify_handle_event,
-	.should_send_event = inotify_should_send_event,
 	.free_group_priv = inotify_free_group_priv,
-	.free_event_priv = inotify_free_event_priv,
+	.free_event = inotify_free_event,
 	.freeing_mark = inotify_freeing_mark,
 };
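
[Annotation: with per-group events, the merge policy above simplifies to comparing the new event against only the most recently queued one. The standalone sketch below models that tail-merge decision; struct ev, same() and queue_or_merge() are hypothetical names used for illustration only.]

#include <stdio.h>
#include <string.h>

struct ev { unsigned mask; const char *name; };

/* Duplicate check against the queue tail, like event_compare() above. */
static int same(const struct ev *old, const struct ev *new)
{
	return old->mask == new->mask && !strcmp(old->name, new->name);
}

/* Returns 1 if merged (caller keeps nothing), 0 if appended. */
static int queue_or_merge(struct ev *q, int *qlen, struct ev e)
{
	if (*qlen && same(&q[*qlen - 1], &e))
		return 1;	/* merged: the new event is dropped */
	q[(*qlen)++] = e;
	return 0;	/* queued */
}

int main(void)
{
	struct ev q[8];
	int qlen = 0;

	queue_or_merge(q, &qlen, (struct ev){ 0x2, "a" });
	queue_or_merge(q, &qlen, (struct ev){ 0x2, "a" });	/* merged */
	queue_or_merge(q, &qlen, (struct ev){ 0x2, "b" });
	printf("queued %d of 3 events\n", qlen);	/* prints 2 */
	return 0;
}
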
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..78a2ca3966c3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
 static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
-struct kmem_cache *event_priv_cachep __read_mostly;
 
 #ifdef CONFIG_SYSCTL
 
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	return ret;
 }
 
+static int round_event_name_len(struct fsnotify_event *fsn_event)
+{
+	struct inotify_event_info *event;
+
+	event = INOTIFY_E(fsn_event);
+	if (!event->name_len)
+		return 0;
+	return roundup(event->name_len + 1, sizeof(struct inotify_event));
+}
+
 /*
  * Get an inotify_kernel_event if one exists and is small
  * enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	if (event->name_len)
-		event_size += roundup(event->name_len + 1, event_size);
-
+	event_size += round_event_name_len(event);
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
  * buffer we had in "get_one_event()" above.
  */
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
-				  struct fsnotify_event *event,
+				  struct fsnotify_event *fsn_event,
 				  char __user *buf)
 {
 	struct inotify_event inotify_event;
-	struct fsnotify_event_private_data *fsn_priv;
-	struct inotify_event_private_data *priv;
+	struct inotify_event_info *event;
 	size_t event_size = sizeof(struct inotify_event);
-	size_t name_len = 0;
+	size_t name_len;
+	size_t pad_name_len;
 
-	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
-	/* we get the inotify watch descriptor from the event private data */
-	spin_lock(&event->lock);
-	fsn_priv = fsnotify_remove_priv_from_event(group, event);
-	spin_unlock(&event->lock);
-
-	if (!fsn_priv)
-		inotify_event.wd = -1;
-	else {
-		priv = container_of(fsn_priv, struct inotify_event_private_data,
-				    fsnotify_event_priv_data);
-		inotify_event.wd = priv->wd;
-		inotify_free_event_priv(fsn_priv);
-	}
+	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
 
+	event = INOTIFY_E(fsn_event);
+	name_len = event->name_len;
 	/*
-	 * round up event->name_len so it is a multiple of event_size
+	 * round up name length so it is a multiple of event_size
 	 * plus an extra byte for the terminating '\0'.
 	 */
-	if (event->name_len)
-		name_len = roundup(event->name_len + 1, event_size);
-	inotify_event.len = name_len;
-
-	inotify_event.mask = inotify_mask_to_arg(event->mask);
+	pad_name_len = round_event_name_len(fsn_event);
+	inotify_event.len = pad_name_len;
+	inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+	inotify_event.wd = event->wd;
 	inotify_event.cookie = event->sync_cookie;
 
 	/* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	/*
 	 * fsnotify only stores the pathname, so here we have to send the pathname
 	 * and then pad that pathname out to a multiple of sizeof(inotify_event)
-	 * with zeros.  I get my zeros from the nul_inotify_event.
+	 * with zeros.
 	 */
-	if (name_len) {
-		unsigned int len_to_zero = name_len - event->name_len;
+	if (pad_name_len) {
 		/* copy the path name */
-		if (copy_to_user(buf, event->file_name, event->name_len))
+		if (copy_to_user(buf, event->name, name_len))
 			return -EFAULT;
-		buf += event->name_len;
+		buf += name_len;
 
 		/* fill userspace with 0's */
-		if (clear_user(buf, len_to_zero))
+		if (clear_user(buf, pad_name_len - name_len))
 			return -EFAULT;
-		buf += len_to_zero;
-		event_size += name_len;
+		event_size += pad_name_len;
 	}
 
 	return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 		if (IS_ERR(kevent))
 			break;
 		ret = copy_event_to_user(group, kevent, buf);
-		fsnotify_put_event(kevent);
+		fsnotify_destroy_event(group, kevent);
 		if (ret < 0)
 			break;
 		buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 			 unsigned long arg)
 {
 	struct fsnotify_group *group;
-	struct fsnotify_event_holder *holder;
-	struct fsnotify_event *event;
+	struct fsnotify_event *fsn_event;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case FIONREAD:
 		mutex_lock(&group->notification_mutex);
-		list_for_each_entry(holder, &group->notification_list, event_list) {
-			event = holder->event;
+		list_for_each_entry(fsn_event, &group->notification_list,
+				    list) {
 			send_len += sizeof(struct inotify_event);
-			if (event->name_len)
-				send_len += roundup(event->name_len + 1,
-						sizeof(struct inotify_event));
+			send_len += round_event_name_len(fsn_event);
 		}
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark *i_mark;
-	struct fsnotify_event *ignored_event, *notify_event;
-	struct inotify_event_private_data *event_priv;
-	struct fsnotify_event_private_data *fsn_event_priv;
-	int ret;
-
-	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
-	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
-					      FSNOTIFY_EVENT_NONE, NULL, 0,
-					      GFP_NOFS);
-	if (!ignored_event)
-		goto skip_send_ignore;
-
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
-	if (unlikely(!event_priv))
-		goto skip_send_ignore;
-
-	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
-	fsnotify_get_group(group);
-	fsn_event_priv->group = group;
-	event_priv->wd = i_mark->wd;
-
-	notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
-	if (notify_event) {
-		if (IS_ERR(notify_event))
-			ret = PTR_ERR(notify_event);
-		else
-			fsnotify_put_event(notify_event);
-		inotify_free_event_priv(fsn_event_priv);
-	}
 
-skip_send_ignore:
-	/* matches the reference taken when the event was created */
-	if (ignored_event)
-		fsnotify_put_event(ignored_event);
+	/* Queue ignore event for the watch */
+	inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
+			     NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
 
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
 
@@ -675,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 {
 	struct fsnotify_group *group;
+	struct inotify_event_info *oevent;
 
 	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
+	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+	if (unlikely(!oevent)) {
+		fsnotify_destroy_group(group);
+		return ERR_PTR(-ENOMEM);
+	}
+	group->overflow_event = &oevent->fse;
+	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	oevent->wd = -1;
+	oevent->sync_cookie = 0;
+	oevent->name_len = 0;
+
 	group->max_events = max_events;
 
 	spin_lock_init(&group->inotify_data.idr_lock);
@@ -836,7 +806,6 @@ static int __init inotify_user_setup(void)
 	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
-	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
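
[Annotation: round_event_name_len() above pads name length plus a NUL up to a multiple of sizeof(struct inotify_event). The sketch below works the arithmetic with sizeof(struct inotify_event) assumed to be 16 bytes, which is typical on common ABIs but is an assumption here, not something the patch states.]

#include <stdio.h>

#define EVENT_SIZE 16	/* assumed sizeof(struct inotify_event) */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

static int padded_name_len(int name_len)
{
	if (!name_len)
		return 0;
	return ROUNDUP(name_len + 1, EVENT_SIZE);	/* +1 for the NUL */
}

int main(void)
{
	/* "foo.txt" is 7 bytes: 7 + 1 rounds up to 16 */
	printf("%d\n", padded_name_len(7));	/* 16 */
	/* a 16-byte name needs 17 with the NUL, rounding to 32 */
	printf("%d\n", padded_name_len(16));	/* 32 */
	return 0;
}
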
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..1e58402171a5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-static struct kmem_cache *fsnotify_event_cachep;
-static struct kmem_cache *fsnotify_event_holder_cachep;
-/*
- * This is a magic event we send when the q is too full.  Since it doesn't
- * hold real event information we just keep one system wide and use it any time
- * it is needed.  It's refcnt is set 1 at kernel init time and will never
- * get set to 0 so it will never get 'freed'
- */
-static struct fsnotify_event *q_overflow_event;
 static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
 
 /**
@@ -76,186 +67,82 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 	return list_empty(&group->notification_list) ? true : false;
 }
 
-void fsnotify_get_event(struct fsnotify_event *event)
+void fsnotify_destroy_event(struct fsnotify_group *group,
+			    struct fsnotify_event *event)
 {
-	atomic_inc(&event->refcnt);
-}
-
-void fsnotify_put_event(struct fsnotify_event *event)
-{
-	if (!event)
+	/* Overflow events are per-group and we don't want to free them */
+	if (!event || event->mask == FS_Q_OVERFLOW)
 		return;
 
-	if (atomic_dec_and_test(&event->refcnt)) {
-		pr_debug("%s: event=%p\n", __func__, event);
-
-		if (event->data_type == FSNOTIFY_EVENT_PATH)
-			path_put(&event->path);
-
-		BUG_ON(!list_empty(&event->private_data_list));
-
-		kfree(event->file_name);
-		put_pid(event->tgid);
-		kmem_cache_free(fsnotify_event_cachep, event);
-	}
-}
-
-struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
-{
-	return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
-}
-
-void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
-{
-	if (holder)
-		kmem_cache_free(fsnotify_event_holder_cachep, holder);
-}
-
-/*
- * Find the private data that the group previously attached to this event when
- * the group added the event to the notification queue (fsnotify_add_notify_event)
- */
-struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
-{
-	struct fsnotify_event_private_data *lpriv;
-	struct fsnotify_event_private_data *priv = NULL;
-
-	assert_spin_locked(&event->lock);
-
-	list_for_each_entry(lpriv, &event->private_data_list, event_list) {
-		if (lpriv->group == group) {
-			priv = lpriv;
-			list_del(&priv->event_list);
-			break;
-		}
-	}
-	return priv;
+	group->ops->free_event(event);
 }
 
 /*
  * Add an event to the group notification queue.  The group can later pull this
- * event off the queue to deal with.  If the event is successfully added to the
- * group's notification queue, a reference is taken on event.
+ * event off the queue to deal with.  The function returns 0 if the event was
+ * added to the queue, 1 if the event was merged with some other queued event,
+ * 2 if the queue of events has overflown.
  */
-struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-						 struct fsnotify_event_private_data *priv,
-						 struct fsnotify_event *(*merge)(struct list_head *,
-										 struct fsnotify_event *))
+int fsnotify_add_notify_event(struct fsnotify_group *group,
+			      struct fsnotify_event *event,
+			      int (*merge)(struct list_head *,
+					   struct fsnotify_event *))
 {
-	struct fsnotify_event *return_event = NULL;
-	struct fsnotify_event_holder *holder = NULL;
+	int ret = 0;
 	struct list_head *list = &group->notification_list;
 
-	pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
-
-	/*
-	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
-	 * Check if we expect to be able to use that holder.  If not alloc a new
-	 * holder.
-	 * For the overflow event it's possible that something will use the in
-	 * event holder before we get the lock so we may need to jump back and
-	 * alloc a new holder, this can't happen for most events...
-	 */
-	if (!list_empty(&event->holder.event_list)) {
-alloc_holder:
-		holder = fsnotify_alloc_event_holder();
-		if (!holder)
-			return ERR_PTR(-ENOMEM);
-	}
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
 	mutex_lock(&group->notification_mutex);
 
 	if (group->q_len >= group->max_events) {
-		event = q_overflow_event;
-
-		/*
-		 * we need to return the overflow event
-		 * which means we need a ref
-		 */
-		fsnotify_get_event(event);
-		return_event = event;
-
-		/* sorry, no private data on the overflow event */
-		priv = NULL;
-	}
-
-	if (!list_empty(list) && merge) {
-		struct fsnotify_event *tmp;
-
-		tmp = merge(list, event);
-		if (tmp) {
+		ret = 2;
+		/* Queue overflow event only if it isn't already queued */
+		if (!list_empty(&group->overflow_event->list)) {
 			mutex_unlock(&group->notification_mutex);
-
-			if (return_event)
-				fsnotify_put_event(return_event);
-			if (holder != &event->holder)
-				fsnotify_destroy_event_holder(holder);
-			return tmp;
+			return ret;
 		}
+		event = group->overflow_event;
+		goto queue;
 	}
 
-	spin_lock(&event->lock);
-
-	if (list_empty(&event->holder.event_list)) {
-		if (unlikely(holder))
-			fsnotify_destroy_event_holder(holder);
-		holder = &event->holder;
-	} else if (unlikely(!holder)) {
-		/* between the time we checked above and got the lock the in
-		 * event holder was used, go back and get a new one */
-		spin_unlock(&event->lock);
-		mutex_unlock(&group->notification_mutex);
-
-		if (return_event) {
-			fsnotify_put_event(return_event);
-			return_event = NULL;
+	if (!list_empty(list) && merge) {
+		ret = merge(list, event);
+		if (ret) {
+			mutex_unlock(&group->notification_mutex);
+			return ret;
 		}
-
-		goto alloc_holder;
 	}
 
+queue:
 	group->q_len++;
-	holder->event = event;
-
-	fsnotify_get_event(event);
-	list_add_tail(&holder->event_list, list);
-	if (priv)
-		list_add_tail(&priv->event_list, &event->private_data_list);
-	spin_unlock(&event->lock);
+	list_add_tail(&event->list, list);
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
 	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
-	return return_event;
+	return ret;
 }
 
 /*
- * Remove and return the first event from the notification list.  There is a
- * reference held on this event since it was on the list.  It is the responsibility
- * of the caller to drop this reference.
+ * Remove and return the first event from the notification list.  It is the
+ * responsibility of the caller to destroy the obtained event
  */
 struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
-	struct fsnotify_event_holder *holder;
 
 	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
 	pr_debug("%s: group=%p\n", __func__, group);
 
-	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-
-	event = holder->event;
-
-	spin_lock(&event->lock);
-	holder->event = NULL;
-	list_del_init(&holder->event_list);
-	spin_unlock(&event->lock);
-
-	/* event == holder means we are referenced through the in event holder */
-	if (holder != &event->holder)
-		fsnotify_destroy_event_holder(holder);
-
+	event = list_first_entry(&group->notification_list,
+				 struct fsnotify_event, list);
+	/*
+	 * We need to init list head for the case of overflow event so that
+	 * check in fsnotify_add_notify_events() works
+	 */
+	list_del_init(&event->list);
 	group->q_len--;
 
 	return event;
@@ -266,15 +153,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
  */
 struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 {
-	struct fsnotify_event *event;
-	struct fsnotify_event_holder *holder;
-
 	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
-	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-	event = holder->event;
-
-	return event;
+	return list_first_entry(&group->notification_list,
+				struct fsnotify_event, list);
 }
 
 /*
@@ -284,181 +166,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 void fsnotify_flush_notify(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
-	struct fsnotify_event_private_data *priv;
 
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_notify_event(group);
-		/* if they don't implement free_event_priv they better not have attached any */
-		if (group->ops->free_event_priv) {
-			spin_lock(&event->lock);
-			priv = fsnotify_remove_priv_from_event(group, event);
-			spin_unlock(&event->lock);
-			if (priv)
-				group->ops->free_event_priv(priv);
-		}
-		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+		fsnotify_destroy_event(group, event);
 	}
 	mutex_unlock(&group->notification_mutex);
 }
 
-static void initialize_event(struct fsnotify_event *event)
-{
-	INIT_LIST_HEAD(&event->holder.event_list);
-	atomic_set(&event->refcnt, 1);
-
-	spin_lock_init(&event->lock);
-
-	INIT_LIST_HEAD(&event->private_data_list);
-}
-
-/*
- * Caller damn well better be holding whatever mutex is protecting the
- * old_holder->event_list and the new_event must be a clean event which
- * cannot be found anywhere else in the kernel.
- */
-int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
-			   struct fsnotify_event *new_event)
-{
-	struct fsnotify_event *old_event = old_holder->event;
-	struct fsnotify_event_holder *new_holder = &new_event->holder;
-
-	enum event_spinlock_class {
-		SPINLOCK_OLD,
-		SPINLOCK_NEW,
-	};
-
-	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
-
-	/*
-	 * if the new_event's embedded holder is in use someone
-	 * screwed up and didn't give us a clean new event.
-	 */
-	BUG_ON(!list_empty(&new_holder->event_list));
-
-	spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
-	spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
-
-	new_holder->event = new_event;
-	list_replace_init(&old_holder->event_list, &new_holder->event_list);
-
-	spin_unlock(&new_event->lock);
-	spin_unlock(&old_event->lock);
-
-	/* event == holder means we are referenced through the in event holder */
-	if (old_holder != &old_event->holder)
-		fsnotify_destroy_event_holder(old_holder);
-
-	fsnotify_get_event(new_event); /* on the list take reference */
-	fsnotify_put_event(old_event); /* off the list, drop reference */
-
-	return 0;
-}
-
-struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
-{
-	struct fsnotify_event *event;
-
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
-	if (!event)
-		return NULL;
-
-	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
-
-	memcpy(event, old_event, sizeof(*event));
-	initialize_event(event);
-
-	if (event->name_len) {
-		event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
-		if (!event->file_name) {
-			kmem_cache_free(fsnotify_event_cachep, event);
-			return NULL;
-		}
-	}
-	event->tgid = get_pid(old_event->tgid);
-	if (event->data_type == FSNOTIFY_EVENT_PATH)
-		path_get(&event->path);
-
-	return event;
-}
-
 /*
  * fsnotify_create_event - Allocate a new event which will be sent to each
  * group's handle_event function if the group was interested in this
  * particular event.
  *
- * @to_tell the inode which is supposed to receive the event (sometimes a
+ * @inode the inode which is supposed to receive the event (sometimes a
  *	parent of the inode to which the event happened.
  * @mask what actually happened.
  * @data pointer to the object which was actually affected
  * @data_type flag indication if the data is a file, path, inode, nothing...
  * @name the filename, if available
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const unsigned char *name,
-					     u32 cookie, gfp_t gfp)
+void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
+			 u32 mask)
 {
-	struct fsnotify_event *event;
-
-	event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
-	if (!event)
-		return NULL;
-
-	pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
-		 __func__, event, to_tell, mask, data, data_type);
-
-	initialize_event(event);
-
-	if (name) {
-		event->file_name = kstrdup(name, gfp);
-		if (!event->file_name) {
-			kmem_cache_free(fsnotify_event_cachep, event);
-			return NULL;
-		}
-		event->name_len = strlen(event->file_name);
-	}
-
-	event->tgid = get_pid(task_tgid(current));
-	event->sync_cookie = cookie;
-	event->to_tell = to_tell;
-	event->data_type = data_type;
-
-	switch (data_type) {
-	case FSNOTIFY_EVENT_PATH: {
-		struct path *path = data;
-		event->path.dentry = path->dentry;
-		event->path.mnt = path->mnt;
-		path_get(&event->path);
-		break;
-	}
-	case FSNOTIFY_EVENT_INODE:
-		event->inode = data;
-		break;
-	case FSNOTIFY_EVENT_NONE:
-		event->inode = NULL;
-		event->path.dentry = NULL;
-		event->path.mnt = NULL;
-		break;
-	default:
-		BUG();
-	}
-
+	INIT_LIST_HEAD(&event->list);
+	event->inode = inode;
 	event->mask = mask;
-
-	return event;
 }
-
-static __init int fsnotify_notification_init(void)
-{
-	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
-	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
-
-	q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
-						 FSNOTIFY_EVENT_NONE, NULL, 0,
-						 GFP_KERNEL);
-	if (!q_overflow_event)
-		panic("unable to allocate fsnotify q_overflow_event\n");
-
-	return 0;
-}
-subsys_initcall(fsnotify_notification_init);
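
[Annotation: the reworked fsnotify_add_notify_event() above replaces the old pointer-or-error return with a small integer convention: 0 means queued, 1 means merged with an already-queued event, 2 means the queue overflowed. Any nonzero result means the caller's own event was not consumed and must be destroyed, which is exactly what inotify_handle_event() does. A toy standalone model of that convention follows; the names and constants are illustrative, not the kernel's.]

#include <stdio.h>

enum { Q_ADDED = 0, Q_MERGED = 1, Q_OVERFLOW = 2 };

static int add_event(int q_len, int max_events, int dup_of_tail)
{
	if (q_len >= max_events)
		return Q_OVERFLOW;	/* queue the shared overflow event */
	if (dup_of_tail)
		return Q_MERGED;	/* caller must free its own event */
	return Q_ADDED;
}

int main(void)
{
	printf("%d\n", add_event(3, 16, 0));	/* 0: queued */
	printf("%d\n", add_event(3, 16, 1));	/* 1: merged */
	printf("%d\n", add_event(16, 16, 0));	/* 2: overflow */
	return 0;
}
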
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb47..db9bd8a31725 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 	if (ret > 0) {
-		int err = generic_write_sync(file, pos, ret);
+		int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
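
[Annotation: the one-line ntfs fix above changes the start offset passed to generic_write_sync(). After a successful write the file position has already advanced by the byte count, so the range that needs syncing starts at the updated position minus the bytes written; the stale pre-write `pos` can point at the wrong range (for example with O_APPEND). The arithmetic, as a trivial standalone sketch with hypothetical values:]

#include <stdio.h>

int main(void)
{
	long long pos_before = 100;	/* hypothetical starting offset */
	long long bytes_written = 40;
	long long pos_after = pos_before + bytes_written;

	/* the range to sync is [start, start + bytes_written) */
	long long start = pos_after - bytes_written;	/* 100 */

	printf("sync from %lld for %lld bytes\n", start, bytes_written);
	return 0;
}
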
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
 	symlink.o \
 	sysfile.o \
 	uptodate.o \
-	ver.o \
 	quota_local.o \
 	quota_global.o \
 	xattr.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index b4f788e0ca31..555f4cddefe3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
 	return acl;
 }
 
-
-/*
- * Get posix acl.
- */
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
-	struct posix_acl *acl;
-	int ret;
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return NULL;
-
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		acl = ERR_PTR(ret);
-		return acl;
-	}
-
-	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
-
-	ocfs2_inode_unlock(inode, 0);
-
-	brelse(di_bh);
-
-	return acl;
-}
-
 /*
  * Helper function to set i_mode in memory and disk. Some call paths
  * will not have di_bh or a journal handle to pass, in which case it
@@ -250,7 +220,7 @@ out:
 /*
  * Set the access or default ACL of an inode.
  */
-static int ocfs2_set_acl(handle_t *handle,
+int ocfs2_set_acl(handle_t *handle,
 			 struct inode *inode,
 			 struct buffer_head *di_bh,
 			 int type,
@@ -313,6 +283,11 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+}
+
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 {
 	struct ocfs2_super *osb;
@@ -334,200 +309,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 
 	return acl;
 }
-
-int ocfs2_acl_chmod(struct inode *inode)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int ret;
-
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (ret)
-		return ret;
-	ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
-			    acl, NULL, NULL);
-	posix_acl_release(acl);
-	return ret;
-}
-
-/*
- * Initialize the ACLs of a new inode. If parent directory has default ACL,
- * then clone to new inode. Called from ocfs2_mknod.
- */
-int ocfs2_init_acl(handle_t *handle,
-		   struct inode *inode,
-		   struct inode *dir,
-		   struct buffer_head *di_bh,
-		   struct buffer_head *dir_bh,
-		   struct ocfs2_alloc_context *meta_ac,
-		   struct ocfs2_alloc_context *data_ac)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl = NULL;
-	int ret = 0, ret2;
-	umode_t mode;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
-			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
-						   dir_bh);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl) {
-			mode = inode->i_mode & ~current_umask();
-			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
-			if (ret) {
-				mlog_errno(ret);
-				goto cleanup;
-			}
-		}
-	}
-	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
-		if (S_ISDIR(inode->i_mode)) {
-			ret = ocfs2_set_acl(handle, inode, di_bh,
-					    ACL_TYPE_DEFAULT, acl,
-					    meta_ac, data_ac);
-			if (ret)
-				goto cleanup;
-		}
-		mode = inode->i_mode;
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
-		if (ret < 0)
-			return ret;
-
-		ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
-		if (ret2) {
-			mlog_errno(ret2);
-			ret = ret2;
-			goto cleanup;
-		}
-		if (ret > 0) {
-			ret = ocfs2_set_acl(handle, inode,
-					    di_bh, ACL_TYPE_ACCESS,
-					    acl, meta_ac, data_ac);
-		}
-	}
-cleanup:
-	posix_acl_release(acl);
-	return ret;
-}
-
-static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
-					  char *list,
-					  size_t list_len,
-					  const char *name,
-					  size_t name_len,
-					  int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
-					   char *list,
-					   size_t list_len,
-					   const char *name,
-					   size_t name_len,
-					   int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
-			       void *buffer, size_t size, int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	struct posix_acl *acl;
-	int ret;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ocfs2_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return ret;
-}
-
-static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
-			       const void *value, size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int ret = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			ret = posix_acl_valid(acl);
-			if (ret)
-				goto cleanup;
-		}
-	} else
-		acl = NULL;
-
-	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
-
-cleanup:
-	posix_acl_release(acl);
-	return ret;
-}
-
-const struct xattr_handler ocfs2_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ocfs2_xattr_list_acl_access,
-	.get	= ocfs2_xattr_get_acl,
-	.set	= ocfs2_xattr_set_acl,
-};
-
-const struct xattr_handler ocfs2_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ocfs2_xattr_list_acl_default,
-	.get	= ocfs2_xattr_get_acl,
-	.set	= ocfs2_xattr_set_acl,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 071fbd380f2f..3fce68d08625 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -27,10 +27,13 @@ struct ocfs2_acl_entry {
 };
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-extern int ocfs2_acl_chmod(struct inode *);
-extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
-			  struct buffer_head *, struct buffer_head *,
-			  struct ocfs2_alloc_context *,
-			  struct ocfs2_alloc_context *);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+		  struct inode *inode,
+		  struct buffer_head *di_bh,
+		  int type,
+		  struct posix_acl *acl,
+		  struct ocfs2_alloc_context *meta_ac,
+		  struct ocfs2_alloc_context *data_ac);
 
 #endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..e2edff38be52 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 			   enum ocfs2_alloc_restarted *reason_ret)
 {
 	int status = 0, err = 0;
+	int need_free = 0;
 	int free_extents;
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
@@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
-			goto leave;
+			need_free = 1;
+			goto bail;
 		}
 
 		block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 					  num_bits, flags, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
-			goto leave;
+			need_free = 1;
+			goto bail;
 		}
 
 		ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 			reason = RESTART_TRANS;
 	}
 
+bail:
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num_bits);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num_bits);
+	}
+
 leave:
 	if (reason_ret)
 		*reason_ret = reason;
@@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 					 struct buffer_head *di_bh)
 {
 	int ret, i, has_data, num_pages = 0;
+	int need_free = 0;
+	u32 bit_off, num;
 	handle_t *handle;
 	u64 uninitialized_var(block);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	}
 
 	if (has_data) {
-		u32 bit_off, num;
 		unsigned int page_end;
 		u64 phys;
 
@@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6938,6 +6958,18 @@ out_commit:
 		dquot_free_space_nodirty(inode,
 					 ocfs2_clusters_to_bytes(osb->sb, 1));
 
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num);
+	}
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
@@ -7126,7 +7158,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 	if (end > i_size_read(inode))
 		end = i_size_read(inode);
 
-	BUG_ON(start >= end);
+	BUG_ON(start > end);
 
 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7260,14 +7292,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	start = range->start >> osb->s_clustersize_bits;
 	len = range->len >> osb->s_clustersize_bits;
 	minlen = range->minlen >> osb->s_clustersize_bits;
-	trimmed = 0;
 
-	if (!len) {
-		range->len = 0;
-		return 0;
-	}
-
-	if (minlen >= osb->bitmap_cpg)
+	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
 		return -EINVAL;
 
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7319,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		goto out_unlock;
 	}
 
+	len = range->len >> osb->s_clustersize_bits;
 	if (start + len > le32_to_cpu(main_bm->i_clusters))
 		len = le32_to_cpu(main_bm->i_clusters) - start;
 
@@ -7307,6 +7334,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
 	last_bit = osb->bitmap_cpg;
 
+	trimmed = 0;
 	for (group = first_group; group <= last_group;) {
 		if (first_bit + len >= osb->bitmap_cpg)
 			last_bit = osb->bitmap_cpg;
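A note on the ocfs2_trim_fs() hunks above: fstrim_range fields arrive in bytes while ocfs2 trims whole clusters, so start/len/minlen are converted with s_clustersize_bits, len is re-derived after the cluster lock is taken, and sub-block requests now fail with -EINVAL instead of silently succeeding. Worked conversion, assuming a hypothetical 1 MiB cluster size (s_clustersize_bits == 20):

	/* Illustrative values only; cluster size is per-filesystem. */
	u64 start_bytes = 3ULL << 20;		/* 3 MiB into the volume */
	u64 len_bytes = 8ULL << 20;		/* trim 8 MiB */
	u64 minlen_bytes = 4096;		/* user-requested minimum */

	u64 start = start_bytes >> 20;		/* = cluster 3 */
	u64 len = len_bytes >> 20;		/* = 8 clusters */
	u64 minlen = minlen_bytes >> 20;	/* = 0: every free extent qualifies */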
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
-	quorum.o tcp.o netdebug.o ver.o
+	quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73920ffda05b..bf482dfed14f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -413,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	}
 
 	/* Must put everything in 512 byte sectors for the bio... */
-	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+	bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
 	bio->bi_bdev = reg->hr_bdev;
 	bio->bi_private = wc;
 	bio->bi_end_io = o2hb_bio_end_io;
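The shift in o2hb_setup_one_bio() converts a heartbeat block number into the 512-byte sector units a bio expects: a block of 2^bits bytes covers 2^(bits-9) sectors. Quick arithmetic with an assumed 4 KiB heartbeat block size (bits == 12):

	/* Illustrative only; bits comes from the region's slot size. */
	u64 block = 10;				/* hypothetical block number */
	sector_t sect = block << (12 - 9);	/* = 80: each 4 KiB block is 8 sectors */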
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
 #include "heartbeat.h"
 #include "masklog.h"
 #include "sys.h"
-#include "ver.h"
 
 /* for now we operate under the assertion that there can be only one
  * cluster active at a time.  Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
 {
 	int ret = -1;
 
-	cluster_print_version();
-
 	ret = o2hb_init();
 	if (ret)
 		goto out;
@@ -984,6 +981,7 @@ out:
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
 
 module_init(init_o2nm)
 module_exit(exit_o2nm)
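The pattern above (repeated in the dlm and dlmfs hunks below) swaps a printk() boot banner for static module metadata, which userspace can read with modinfo without loading anything. A minimal stand-alone sketch of the same idiom (hypothetical module, not from the patch):

	#include <linux/module.h>

	static int __init demo_init(void) { return 0; }
	static void __exit demo_exit(void) { }

	module_init(demo_init);
	module_exit(demo_exit);

	MODULE_AUTHOR("Example");
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("Demo: metadata reported by 'modinfo demo'");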
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
-	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
 #include "dlmdomain.h"
 #include "dlmdebug.h"
 
-#include "dlmver.h"
-
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
 {
 	int status;
 
-	dlm_print_version();
-
 	status = dlm_init_mle_cache();
 	if (status) {
 		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
 
 module_init(dlm_init);
 module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
 
 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
 
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
 
 #include "stackglue.h"
 #include "userdlm.h"
-#include "dlmfsver.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
 	int status;
 	int cleanup_inode = 0, cleanup_worker = 0;
 
-	dlmfs_print_version();
-
 	status = bdi_init(&dlmfs_backing_dev_info);
 	if (status)
 		return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
 
 module_init(init_dlmfs_fs)
 module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 
 	/* for now, uuid == domain */
 	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+				       osb->osb_cluster_name,
+				       strlen(osb->osb_cluster_name),
 				       osb->uuid_str,
 				       strlen(osb->uuid_str),
 				       &lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_cluster_this_node(&osb->node_num);
+	status = ocfs2_cluster_this_node(conn, &osb->node_num);
 	if (status < 0) {
 		mlog_errno(status);
 		mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..51632c40e896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -185,6 +185,9 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 			      file->f_path.dentry->d_name.name,
 			      (unsigned long long)datasync);
 
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
 	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (err)
 		return err;
@@ -474,11 +477,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail;
 	}
 
-	/* lets handle the simple truncate cases before doing any more
-	 * cluster locking. */
-	if (new_i_size == le64_to_cpu(fe->i_size))
-		goto bail;
-
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	ocfs2_resv_discard(&osb->osb_la_resmap,
@@ -718,7 +716,8 @@ leave:
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
  */
-static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+						      struct buffer_head *di_bh)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle = NULL;
@@ -735,7 +734,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
 	}
 
 	ret = ocfs2_jbd2_file_inode(handle, inode);
-	if (ret < 0)
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
 		mlog_errno(ret);
 
 out:
@@ -751,7 +757,7 @@ out:
  * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
-				 u64 abs_to)
+				 u64 abs_to, struct buffer_head *di_bh)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
@@ -759,6 +765,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	handle_t *handle = NULL;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	BUG_ON(abs_from >= abs_to);
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
@@ -801,7 +808,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	}
 
 	if (!handle) {
-		handle = ocfs2_zero_start_ordered_transaction(inode);
+		handle = ocfs2_zero_start_ordered_transaction(inode,
+							      di_bh);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			handle = NULL;
@@ -818,8 +826,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		ret = 0;
 	}
 
-	if (handle)
+	if (handle) {
+		/*
+		 * fs-writeback will release the dirty pages without page lock
+		 * whose offset are over inode size, the release happens at
+		 * block_write_full_page_endio().
+		 */
+		i_size_write(inode, abs_to);
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+		di->i_size = cpu_to_le64((u64)i_size_read(inode));
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+		di->i_mtime_nsec = di->i_ctime_nsec;
+		ocfs2_journal_dirty(handle, di_bh);
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+	}
 
 out_unlock:
 	unlock_page(page);
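The hunk above makes ocfs2_write_zero_page() push i_size forward inside the same journal transaction that zeroes the page; otherwise fs-writeback may drop dirty pages sitting beyond the old on-disk size before they reach disk. The update follows the usual jbd2 discipline; a condensed sketch using the names from the hunk (error handling elided, illustrative only):

	/* Declare intent to modify the dinode buffer under this handle. */
	ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				OCFS2_JOURNAL_ACCESS_WRITE);
	i_size_write(inode, abs_to);			/* in-core size */
	di->i_size = cpu_to_le64(i_size_read(inode));	/* on-disk dinode */
	ocfs2_journal_dirty(handle, di_bh);		/* log the buffer */
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);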
@@ -915,7 +937,7 @@ out:
  * has made sure that the entire range needs zeroing.
  */
 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
-				   u64 range_end)
+				   u64 range_end, struct buffer_head *di_bh)
 {
 	int rc = 0;
 	u64 next_pos;
@@ -931,7 +953,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
 		if (next_pos > range_end)
 			next_pos = range_end;
-		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
 		if (rc < 0) {
 			mlog_errno(rc);
 			break;
@@ -977,7 +999,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 			range_end = zero_to_size;
 
 		ret = ocfs2_zero_extend_range(inode, range_start,
-					      range_end);
+					      range_end, di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -1145,14 +1167,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock_rw;
 	}
 
-	if (size_change && attr->ia_size != i_size_read(inode)) {
+	if (size_change) {
 		status = inode_newsize_ok(inode, attr->ia_size);
 		if (status)
 			goto bail_unlock;
 
 		inode_dio_wait(inode);
 
-		if (i_size_read(inode) > attr->ia_size) {
+		if (i_size_read(inode) >= attr->ia_size) {
 			if (ocfs2_should_order_data(inode)) {
 				status = ocfs2_begin_ordered_truncate(inode,
 								      attr->ia_size);
@@ -1236,7 +1258,7 @@ bail:
 		dqput(transfer_to[qtype]);
 
 	if (!status && attr->ia_valid & ATTR_MODE) {
-		status = ocfs2_acl_chmod(inode);
+		status = posix_acl_chmod(inode, inode->i_mode);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -1869,7 +1891,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	}
 	size = sr->l_start + sr->l_len;
 
-	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
 		if (sr->l_len <= 0) {
 			ret = -EINVAL;
 			goto out_inode_unlock;
@@ -2370,8 +2393,8 @@ out_dio:
 
 	if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
 	    ((file->f_flags & O_DIRECT) && !direct_io)) {
-		ret = filemap_fdatawrite_range(file->f_mapping, pos,
-					       pos + count - 1);
+		ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+					       *ppos + count - 1);
 		if (ret < 0)
 			written = ret;
 
@@ -2384,8 +2407,8 @@
 		}
 
 		if (!ret)
-			ret = filemap_fdatawait_range(file->f_mapping, pos,
-						      pos + count - 1);
+			ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+						      *ppos + count - 1);
 	}
 
 	/*
@@ -2661,6 +2684,7 @@ const struct inode_operations ocfs2_file_iops = {
 	.removexattr	= generic_removexattr,
 	.fiemap		= ocfs2_fiemap,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
@@ -2668,6 +2692,7 @@ const struct inode_operations ocfs2_special_file_iops = {
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
 
 /*
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/blkdev.h>
 #include <linux/compat.h>
 
 #include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case FITRIM:
 	{
 		struct super_block *sb = inode->i_sb;
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
 		struct fstrim_range range;
 		int ret = 0;
 
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
		if (copy_from_user(&range, argp, sizeof(range)))
 			return -EFAULT;
 
+		range.minlen = max_t(u64, q->limits.discard_granularity,
+				     range.minlen);
 		ret = ocfs2_trim_fs(sb, &range);
 		if (ret < 0)
 			return ret;
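Clamping range.minlen to the queue's discard granularity keeps ocfs2_trim_fs() from hunting for extents the device could never discard. Worked example, assuming a device granularity of 1 MiB and a user-supplied minlen of 64 KiB (both values hypothetical):

	u64 granularity = 1ULL << 20;	/* q->limits.discard_granularity */
	u64 user_minlen = 64ULL << 10;	/* from the fstrim_range */
	u64 minlen = max_t(u64, granularity, user_minlen);	/* = 1 MiB */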
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index cd5496b7a0a3..044013455621 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -781,6 +781,48 @@ bail:
 	return status;
 }
 
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits)
+{
+	int status, start;
+	u32 clear_bits;
+	struct inode *local_alloc_inode;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	bitmap = la->la_bitmap;
+	start = bit_off - le32_to_cpu(la->la_bm_off);
+	clear_bits = num_bits;
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (clear_bits--)
+		ocfs2_clear_bit(start++, bitmap);
+
+	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+	return status;
+}
+
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 {
 	u32 count;
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 1be9b5864460..44a7d1fb2dec 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
55 u32 *bit_off, 55 u32 *bit_off,
56 u32 *num_bits); 56 u32 *num_bits);
57 57
58int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
59 handle_t *handle,
60 struct ocfs2_alloc_context *ac,
61 u32 bit_off,
62 u32 num_bits);
63
58void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, 64void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
59 unsigned int num_clusters); 65 unsigned int num_clusters);
60void ocfs2_la_enable_worker(struct work_struct *work); 66void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
 	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
 }
 
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
-				      handle_t *handle,
-				      struct buffer_head *di_bh,
-				      u32 num_bits,
-				      u16 chain)
-{
-	int ret;
-	u32 tmp_used;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
-	struct ocfs2_chain_list *cl =
-				(struct ocfs2_chain_list *) &di->id2.i_chain;
-
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
-	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
-	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-	ocfs2_journal_dirty(handle, di_bh);
-
-out:
-	return ret;
-}
-
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
-					     struct inode *alloc_inode,
-					     struct ocfs2_group_desc *bg,
-					     struct buffer_head *group_bh,
-					     unsigned int bit_off,
-					     unsigned int num_bits)
-{
-	int status;
-	void *bitmap = bg->bg_bitmap;
-	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
-
-	/* All callers get the descriptor via
-	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
-
-	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
-	     num_bits);
-
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
-	status = ocfs2_journal_access_gd(handle,
-					 INODE_CACHE(alloc_inode),
-					 group_bh,
-					 journal_type);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
-		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
-			    " count %u but claims %u are freed. num_bits %d",
-			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
-			    le16_to_cpu(bg->bg_bits),
-			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
-		return -EROFS;
-	}
-	while (num_bits--)
-		ocfs2_set_bit(bit_off++, bitmap);
-
-	ocfs2_journal_dirty(handle, group_bh);
-
-bail:
-	return status;
-}
-
 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
 			     u32 len, int ext_flags)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4f791f6d27d0..3683643f3f0e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -230,6 +230,7 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
 	int did_block_signals = 0;
+	struct posix_acl *default_acl = NULL, *acl = NULL;
 
 	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			  (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -331,6 +332,12 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
+	status = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
 							    S_ISDIR(mode),
 							    xattr_credits));
@@ -379,8 +386,17 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 	}
 
-	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				meta_ac, data_ac);
+	if (default_acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_DEFAULT, default_acl,
+				       meta_ac, data_ac);
+	}
+	if (!status && acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_ACCESS, acl,
+				       meta_ac, data_ac);
+	}
+
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -419,6 +435,10 @@ static int ocfs2_mknod(struct inode *dir,
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
 	if (handle)
@@ -644,6 +664,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
+	u64 old_de_ino;
 
 	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
 			 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -666,6 +687,22 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out;
 	}
 
+	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+			old_dentry->d_name.len, &old_de_ino);
+	if (err) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/*
+	 * Check whether another node removed the source inode while we
+	 * were in the vfs.
+	 */
+	if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+		err = -ENOENT;
+		goto out;
+	}
+
 	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
 					dentry->d_name.len);
 	if (err)
@@ -948,7 +985,7 @@ leave:
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	if (status && (status != -ENOTEMPTY))
+	if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
 		mlog_errno(status);
 
 	return status;
@@ -2504,4 +2541,5 @@ const struct inode_operations ocfs2_dir_iops = {
 	.removexattr	= generic_removexattr,
 	.fiemap		= ocfs2_fiemap,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
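The new ocfs2_link() check closes a cluster race: between the VFS resolving old_dentry and ocfs2 taking its cluster locks, another node may unlink the source name or reuse it for a different inode, so the name is re-resolved under the lock and its inode number compared against the dentry's inode. A generic sketch of that revalidate-under-lock shape (lookup_ino() and cached_ino are hypothetical stand-ins, not ocfs2 symbols):

	u64 ino_now;

	if (lookup_ino(dir, name, namelen, &ino_now))	/* name vanished */
		return -ENOENT;
	if (ino_now != cached_ino)			/* name now points elsewhere */
		return -ENOENT;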
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
 	u8 osb_stackflags;
 
 	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+	char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index aaa50611ec66..d7b5108789e2 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -717,6 +717,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 	 */
 	if (status < 0)
 		mlog_errno(status);
+	/*
+	 * Clear dq_off so that we search for the structure in quota file next
+	 * time we acquire it. The structure might be deleted and reallocated
+	 * elsewhere by another node while our dquot structure is on freelist.
+	 */
+	dquot->dq_off = 0;
 	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out_trans:
 	ocfs2_commit_trans(osb, handle);
@@ -756,16 +762,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	status = ocfs2_lock_global_qf(info, 1);
 	if (status < 0)
 		goto out;
-	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
-		status = ocfs2_qinfo_lock(info, 0);
-		if (status < 0)
-			goto out_dq;
-		status = qtree_read_dquot(&info->dqi_gi, dquot);
-		ocfs2_qinfo_unlock(info, 0);
-		if (status < 0)
-			goto out_dq;
-	}
-	set_bit(DQ_READ_B, &dquot->dq_flags);
+	status = ocfs2_qinfo_lock(info, 0);
+	if (status < 0)
+		goto out_dq;
+	/*
+	 * We always want to read dquot structure from disk because we don't
+	 * know what happened with it while it was on freelist.
+	 */
+	status = qtree_read_dquot(&info->dqi_gi, dquot);
+	ocfs2_qinfo_unlock(info, 0);
+	if (status < 0)
+		goto out_dq;
 
 	OCFS2_DQUOT(dquot)->dq_use_count++;
 	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2e4344be3b96..2001862bf2b1 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
 	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
 
 out:
-	/* Clear the read bit so that next time someone uses this
-	 * dquot he reads fresh info from disk and allocates local
-	 * dquot structure */
-	clear_bit(DQ_READ_B, &dquot->dq_flags);
 	return status;
 }
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 55767e1ba724..6ba4bcbc4796 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -46,6 +46,7 @@
 #include <linux/quotaops.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/posix_acl.h>
 
 struct ocfs2_cow_context {
 	struct inode *inode;
@@ -4268,11 +4269,20 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 	struct inode *inode = old_dentry->d_inode;
 	struct buffer_head *old_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
+	struct posix_acl *default_acl, *acl;
+	umode_t mode;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
 
-	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+	mode = inode->i_mode;
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_create_inode_in_orphan(dir, mode,
 					     &new_orphan_inode);
 	if (error) {
 		mlog_errno(error);
@@ -4303,11 +4313,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
 		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
-						    &new_dentry->d_name);
+						    &new_dentry->d_name,
+						    default_acl, acl);
 		if (error)
 			mlog_errno(error);
 	}
 out:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
 	if (!error) {
 		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
 						       new_dentry);
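ocfs2_reflink() now uses the directory-based posix_acl_create() (the four-argument form that comes with the generic ACL infrastructure), which hands back both the default and access ACLs the new inode should inherit and masks the requested mode in one step. A hedged sketch of the calling convention:

	struct posix_acl *default_acl, *acl;
	umode_t mode = S_IFREG | 0666;		/* illustrative requested mode */
	int err = posix_acl_create(dir, &mode, &default_acl, &acl);

	if (err)
		return err;			/* no references held on failure */
	/* ... create the inode with the masked mode, store both ACLs ... */
	posix_acl_release(default_acl);		/* either pointer may be NULL */
	posix_acl_release(acl);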
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 	return 0;
 }
 
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+				  unsigned int *node)
 {
 	int node_num;
 
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/reboot.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
 #include "stackglue.h"
@@ -102,6 +103,12 @@
 #define OCFS2_TEXT_UUID_LEN			32
 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2
 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
+#define VERSION_LOCK				"version_lock"
+
+enum ocfs2_connection_type {
+	WITH_CONTROLD,
+	NO_CONTROLD
+};
 
 /*
  * ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
 struct ocfs2_live_connection {
 	struct list_head		oc_list;
 	struct ocfs2_cluster_connection	*oc_conn;
+	enum ocfs2_connection_type	oc_type;
+	atomic_t			oc_this_node;
+	int				oc_our_slot;
+	struct dlm_lksb			oc_version_lksb;
+	char				oc_lvb[DLM_LVB_LEN];
+	struct completion		oc_sync_wait;
+	wait_queue_head_t		oc_wait;
 };
 
 struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
  * mount path.  Since the VFS prevents multiple calls to
 * fill_super(), we can't get dupes here.
  */
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
-				     struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+				     struct ocfs2_live_connection *c)
 {
 	int rc = 0;
-	struct ocfs2_live_connection *c;
-
-	c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
 
 	mutex_lock(&ocfs2_control_lock);
 	c->oc_conn = conn;
 
-	if (atomic_read(&ocfs2_control_opened))
+	if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
 		list_add(&c->oc_list, &ocfs2_live_connection_list);
 	else {
 		printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
 	}
 
 	mutex_unlock(&ocfs2_control_lock);
-
-	if (!rc)
-		*c_ret = c;
-	else
-		kfree(c);
-
 	return rc;
 }
 
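The large hunk below lets the userspace stack run without ocfs2_controld: the fs/dlm lockspace is created directly and the locking protocol version is negotiated through the LVB of a dedicated "version_lock" resource (see get_protocol_version() and its comment below). The first mounter takes the lock EX non-blocking and publishes its version; later mounters get -EAGAIN, take PR, and read the LVB. A condensed sketch of that handshake, reusing the helpers introduced below (error handling elided):

	ret = version_lock(conn, DLM_LOCK_EX, DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
	if (!ret) {				/* first mount: publish version */
		version_to_lvb(&running_proto, lc->oc_lvb);
		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
	} else if (ret == -EAGAIN) {		/* someone already holds it */
		version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
		lvb_to_version(lc->oc_lvb, &pv);	/* then reject major mismatch */
	}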
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
 	return 0;
 }
 
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	ver->pv_major = pv->pv_major;
+	ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	pv->pv_major = ver->pv_major;
+	pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x error %d\n",
+				name, lksb->sb_lkid, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	if (lksb->sb_status != -DLM_EUNLOCK) {
+		printk(KERN_ERR "%s lkid %x status %d\n",
+				name, lksb->sb_lkid, lksb->sb_status);
+		return -1;
+	}
+	return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+		int mode, uint32_t flags,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error, status;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+			name, strlen(name),
+			0, sync_wait_cb, conn, NULL);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+				name, lksb->sb_lkid, flags, mode, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	status = lksb->sb_status;
+
+	if (status && status != -EAGAIN) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+				name, lksb->sb_lkid, flags, mode, status);
+	}
+
+	return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+		int flags)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_lock(conn, mode, flags,
+			&lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ *    version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ *    taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+	int ret;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	struct ocfs2_protocol_version pv;
+
+	running_proto.pv_major =
+		ocfs2_user_plugin.sp_max_proto.pv_major;
+	running_proto.pv_minor =
+		ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+	lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+	ret = version_lock(conn, DLM_LOCK_EX,
+			DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+	if (!ret) {
+		conn->cc_version.pv_major = running_proto.pv_major;
+		conn->cc_version.pv_minor = running_proto.pv_minor;
+		version_to_lvb(&running_proto, lc->oc_lvb);
+		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+	} else if (ret == -EAGAIN) {
+		ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+		if (ret)
+			goto out;
+		lvb_to_version(lc->oc_lvb, &pv);
+
+		if ((pv.pv_major != running_proto.pv_major) ||
+				(pv.pv_minor > running_proto.pv_minor)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		conn->cc_version.pv_major = pv.pv_major;
+		conn->cc_version.pv_minor = pv.pv_minor;
+	}
+out:
+	return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+			slot->nodeid, slot->slot);
+	conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+		int num_slots, int our_slot,
+		uint32_t generation)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	int i;
+
+	for (i = 0; i < num_slots; i++)
+		if (slots[i].slot == our_slot) {
+			atomic_set(&lc->oc_this_node, slots[i].nodeid);
+			break;
+		}
+
+	lc->oc_our_slot = our_slot;
+	wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+	.recover_prep = user_recover_prep,
+	.recover_slot = user_recover_slot,
+	.recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	version_unlock(conn);
+	dlm_release_lockspace(conn->cc_lockspace, 2);
+	conn->cc_lockspace = NULL;
+	ocfs2_live_connection_drop(conn->cc_private);
+	conn->cc_private = NULL;
+	return 0;
+}
+
 static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 {
 	dlm_lockspace_t *fsdlm;
-	struct ocfs2_live_connection *uninitialized_var(control);
-	int rc = 0;
+	struct ocfs2_live_connection *lc;
+	int rc, ops_rv;
 
 	BUG_ON(conn == NULL);
 
-	rc = ocfs2_live_connection_new(conn, &control);
+	lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+	if (!lc) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	init_waitqueue_head(&lc->oc_wait);
+	init_completion(&lc->oc_sync_wait);
+	atomic_set(&lc->oc_this_node, 0);
+	conn->cc_private = lc;
+	lc->oc_type = NO_CONTROLD;
+
+	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+			       DLM_LSFL_FS, DLM_LVB_LEN,
+			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+	if (rc)
+		goto out;
+
+	if (ops_rv == -EOPNOTSUPP) {
+		lc->oc_type = WITH_CONTROLD;
+		printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+				"version of dlm_controld and/or ocfs2-tools."
+				" Please consider upgrading.\n");
+	} else if (ops_rv) {
+		rc = ops_rv;
+		goto out;
+	}
+	conn->cc_lockspace = fsdlm;
+
+	rc = ocfs2_live_connection_attach(conn, lc);
 	if (rc)
 		goto out;
 
+	if (lc->oc_type == NO_CONTROLD) {
+		rc = get_protocol_version(conn);
+		if (rc) {
+			printk(KERN_ERR "ocfs2: Could not determine"
+					" locking version\n");
+			user_cluster_disconnect(conn);
+			goto out;
+		}
+		wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+	}
+
 	/*
 	 * running_proto must have been set before we allowed any mounts
 	 * to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
 		printk(KERN_ERR
 		       "Unable to mount with fs locking protocol version "
-		       "%u.%u because the userspace control daemon has "
+		       "%u.%u because negotiated protocol is %u.%u\n",
822 "negotiated %u.%u\n",
823 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1058 conn->cc_version.pv_major, conn->cc_version.pv_minor,
824 running_proto.pv_major, running_proto.pv_minor); 1059 running_proto.pv_major, running_proto.pv_minor);
825 rc = -EPROTO; 1060 rc = -EPROTO;
826 ocfs2_live_connection_drop(control); 1061 ocfs2_live_connection_drop(lc);
827 goto out; 1062 lc = NULL;
828 }
829
830 rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
831 NULL, NULL, NULL, &fsdlm);
832 if (rc) {
833 ocfs2_live_connection_drop(control);
834 goto out;
835 } 1063 }
836 1064
837 conn->cc_private = control;
838 conn->cc_lockspace = fsdlm;
839out: 1065out:
1066 if (rc && lc)
1067 kfree(lc);
840 return rc; 1068 return rc;
841} 1069}
842 1070
843static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
844{
845 dlm_release_lockspace(conn->cc_lockspace, 2);
846 conn->cc_lockspace = NULL;
847 ocfs2_live_connection_drop(conn->cc_private);
848 conn->cc_private = NULL;
849 return 0;
850}
851 1071
852static int user_cluster_this_node(unsigned int *this_node) 1072static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
1073 unsigned int *this_node)
853{ 1074{
854 int rc; 1075 int rc;
1076 struct ocfs2_live_connection *lc = conn->cc_private;
1077
1078 if (lc->oc_type == WITH_CONTROLD)
1079 rc = ocfs2_control_get_this_node();
1080 else if (lc->oc_type == NO_CONTROLD)
1081 rc = atomic_read(&lc->oc_this_node);
1082 else
1083 rc = -EINVAL;
855 1084
856 rc = ocfs2_control_get_this_node();
857 if (rc < 0) 1085 if (rc < 0)
858 return rc; 1086 return rc;
859 1087
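In the NO_CONTROLD case the local node number is only learned once the DLM finishes its first recovery cycle and calls .recover_done(), so the connect path parks on a waitqueue until user_recover_done() publishes it. The producer/consumer pattern, condensed from the code above:

    /* Producer: .recover_done() callback, DLM recovery context. */
    atomic_set(&lc->oc_this_node, slots[i].nodeid);
    wake_up(&lc->oc_wait);

    /* Consumer: user_cluster_connect(), mount context. */
    wait_event(lc->oc_wait, atomic_read(&lc->oc_this_node) > 0);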
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..ca5ce14cbddc 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
309EXPORT_SYMBOL_GPL(ocfs2_plock); 309EXPORT_SYMBOL_GPL(ocfs2_plock);
310 310
311int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
312 const char *cluster_name,
313 int cluster_name_len,
312 const char *group, 314 const char *group,
313 int grouplen, 315 int grouplen,
314 struct ocfs2_locking_protocol *lproto, 316 struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name,
342 goto out; 344 goto out;
343 } 345 }
344 346
345 memcpy(new_conn->cc_name, group, grouplen); 347 strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
346 new_conn->cc_namelen = grouplen; 348 new_conn->cc_namelen = grouplen;
349 if (cluster_name_len)
350 strlcpy(new_conn->cc_cluster_name, cluster_name,
351 CLUSTER_NAME_MAX + 1);
352 new_conn->cc_cluster_name_len = cluster_name_len;
347 new_conn->cc_recovery_handler = recovery_handler; 353 new_conn->cc_recovery_handler = recovery_handler;
348 new_conn->cc_recovery_data = recovery_data; 354 new_conn->cc_recovery_data = recovery_data;
349 355
@@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
386 392
387 if (cluster_stack_name[0]) 393 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name; 394 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 395 return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
390 recovery_handler, recovery_data, conn); 396 lproto, recovery_handler, recovery_data,
397 conn);
391} 398}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 399EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393 400
@@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
460} 467}
461EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 468EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
462 469
463int ocfs2_cluster_this_node(unsigned int *node) 470int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
471 unsigned int *node)
464{ 472{
465 return active_stack->sp_ops->this_node(node); 473 return active_stack->sp_ops->this_node(conn, node);
466} 474}
467EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 475EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
468 476
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
45 */ 45 */
46#define GROUP_NAME_MAX 64 46#define GROUP_NAME_MAX 64
47 47
48/* This shadows OCFS2_CLUSTER_NAME_LEN */
49#define CLUSTER_NAME_MAX 16
50
48 51
49/* 52/*
50 * ocfs2_protocol_version changes when ocfs2 does something different in 53 * ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
97 * locking compatibility. 100 * locking compatibility.
98 */ 101 */
99struct ocfs2_cluster_connection { 102struct ocfs2_cluster_connection {
100 char cc_name[GROUP_NAME_MAX]; 103 char cc_name[GROUP_NAME_MAX + 1];
101 int cc_namelen; 104 int cc_namelen;
105 char cc_cluster_name[CLUSTER_NAME_MAX + 1];
106 int cc_cluster_name_len;
102 struct ocfs2_protocol_version cc_version; 107 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto; 108 struct ocfs2_locking_protocol *cc_proto;
104 void (*cc_recovery_handler)(int node_num, void *recovery_data); 109 void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
152 * ->this_node() returns the cluster's unique identifier for the 157 * ->this_node() returns the cluster's unique identifier for the
153 * local node. 158 * local node.
154 */ 159 */
155 int (*this_node)(unsigned int *node); 160 int (*this_node)(struct ocfs2_cluster_connection *conn,
161 unsigned int *node);
156 162
157 /* 163 /*
158 * Call the underlying dlm lock function. The ->dlm_lock() 164 * Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
239 245
240/* Used by the filesystem */ 246/* Used by the filesystem */
241int ocfs2_cluster_connect(const char *stack_name, 247int ocfs2_cluster_connect(const char *stack_name,
248 const char *cluster_name,
249 int cluster_name_len,
242 const char *group, 250 const char *group,
243 int grouplen, 251 int grouplen,
244 struct ocfs2_locking_protocol *lproto, 252 struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 268int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
261 int hangup_pending); 269 int hangup_pending);
262void ocfs2_cluster_hangup(const char *group, int grouplen); 270void ocfs2_cluster_hangup(const char *group, int grouplen);
263int ocfs2_cluster_this_node(unsigned int *node); 271int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
272 unsigned int *node);
264 273
265struct ocfs2_lock_res; 274struct ocfs2_lock_res;
266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 275int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
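Threading the connection through ->this_node() is what lets the user-space stack pick, per mount, between asking dlm_controld and returning the value cached from DLM recovery. A caller in the filesystem now looks like this (sketch, error handling trimmed):

    unsigned int node;
    int rc;

    rc = ocfs2_cluster_this_node(conn, &node);  /* conn is now required */
    if (rc)
            return rc;
    /* node: the cluster-unique identifier of the local node */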
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
113 struct ocfs2_suballoc_result *res); 113 struct ocfs2_suballoc_result *res);
114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115 int nr); 115 int nr);
116static inline int ocfs2_block_group_set_bits(handle_t *handle,
117 struct inode *alloc_inode,
118 struct ocfs2_group_desc *bg,
119 struct buffer_head *group_bh,
120 unsigned int bit_off,
121 unsigned int num_bits);
122static int ocfs2_relink_block_group(handle_t *handle, 116static int ocfs2_relink_block_group(handle_t *handle,
123 struct inode *alloc_inode, 117 struct inode *alloc_inode,
124 struct buffer_head *fe_bh, 118 struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1343 return status; 1337 return status;
1344} 1338}
1345 1339
1346static inline int ocfs2_block_group_set_bits(handle_t *handle, 1340int ocfs2_block_group_set_bits(handle_t *handle,
1347 struct inode *alloc_inode, 1341 struct inode *alloc_inode,
1348 struct ocfs2_group_desc *bg, 1342 struct ocfs2_group_desc *bg,
1349 struct buffer_head *group_bh, 1343 struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1388 ocfs2_journal_dirty(handle, group_bh); 1382 ocfs2_journal_dirty(handle, group_bh);
1389 1383
1390bail: 1384bail:
1391 if (status)
1392 mlog_errno(status);
1393 return status; 1385 return status;
1394} 1386}
1395 1387
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 return ret; 1580 return ret;
1589} 1581}
1590 1582
1591static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1583int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1592 handle_t *handle, 1584 handle_t *handle,
1593 struct buffer_head *di_bh, 1585 struct buffer_head *di_bh,
1594 u32 num_bits, 1586 u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
86 u32 bits_wanted, 86 u32 bits_wanted,
87 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
88 88
89int ocfs2_alloc_dinode_update_counts(struct inode *inode,
90 handle_t *handle,
91 struct buffer_head *di_bh,
92 u32 num_bits,
93 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
89int ocfs2_claim_metadata(handle_t *handle, 101int ocfs2_claim_metadata(handle_t *handle,
90 struct ocfs2_alloc_context *ac, 102 struct ocfs2_alloc_context *ac,
91 u32 bits_wanted, 103 u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
68#include "super.h" 68#include "super.h"
69#include "sysfile.h" 69#include "sysfile.h"
70#include "uptodate.h" 70#include "uptodate.h"
71#include "ver.h"
72#include "xattr.h" 71#include "xattr.h"
73#include "quota.h" 72#include "quota.h"
74#include "refcounttree.h" 73#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
90 89
91MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
92MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
92MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94struct mount_options 94struct mount_options
95{ 95{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
1618{ 1618{
1619 int status, i; 1619 int status, i;
1620 1620
1621 ocfs2_print_version();
1622
1623 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1624 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1625 1623
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1947 1945
1948 ocfs2_shutdown_local_alloc(osb); 1946 ocfs2_shutdown_local_alloc(osb);
1949 1947
1950 ocfs2_truncate_log_shutdown(osb);
1951
1952 /* This will disable recovery and flush any recovery work. */ 1948 /* This will disable recovery and flush any recovery work. */
1953 ocfs2_recovery_exit(osb); 1949 ocfs2_recovery_exit(osb);
1954 1950
1951 /*
1952 * During dismount, when it recovers another node it will call
1953 * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
1954 */
1955 ocfs2_truncate_log_shutdown(osb);
1956
1955 ocfs2_journal_shutdown(osb); 1957 ocfs2_journal_shutdown(osb);
1956 1958
1957 ocfs2_sync_blockdev(sb); 1959 ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2225 if (ocfs2_clusterinfo_valid(osb)) { 2227 if (ocfs2_clusterinfo_valid(osb)) {
2226 osb->osb_stackflags = 2228 osb->osb_stackflags =
2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2228 memcpy(osb->osb_cluster_stack, 2230 strlcpy(osb->osb_cluster_stack,
2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2230 OCFS2_STACK_LABEL_LEN); 2232 OCFS2_STACK_LABEL_LEN + 1);
2231 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
2232 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2233 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2233 mlog(ML_ERROR, 2234 mlog(ML_ERROR,
2234 "couldn't mount because of an invalid " 2235 "couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2237 status = -EINVAL; 2238 status = -EINVAL;
2238 goto bail; 2239 goto bail;
2239 } 2240 }
2241 strlcpy(osb->osb_cluster_name,
2242 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2243 OCFS2_CLUSTER_NAME_LEN + 1);
2240 } else { 2244 } else {
2241 /* The empty string is identical with classic tools that 2245 /* The empty string is identical with classic tools that
2242 * don't know about s_cluster_info. */ 2246 * don't know about s_cluster_info. */
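The super.c hunk also swaps the memcpy()-plus-manual-NUL sequence for strlcpy(), which terminates the destination itself. One behavioral difference worth noting: strlcpy() stops at the first NUL in the source, while memcpy() copied the raw bytes regardless. Side by side:

    /* Before: copy a fixed-width label, then terminate by hand. */
    memcpy(dst, src, OCFS2_STACK_LABEL_LEN);
    dst[OCFS2_STACK_LABEL_LEN] = '\0';

    /* After: one call; dst must be OCFS2_STACK_LABEL_LEN + 1 bytes. */
    strlcpy(dst, src, OCFS2_STACK_LABEL_LEN + 1);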
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.5.0"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f0a1326d9bba..185fa3b7f962 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
99 99
100const struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
101 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
102 &ocfs2_xattr_acl_access_handler, 102 &posix_acl_access_xattr_handler,
103 &ocfs2_xattr_acl_default_handler, 103 &posix_acl_default_xattr_handler,
104 &ocfs2_xattr_trusted_handler, 104 &ocfs2_xattr_trusted_handler,
105 &ocfs2_xattr_security_handler, 105 &ocfs2_xattr_security_handler,
106 NULL 106 NULL
@@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = {
109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler, 112 = &posix_acl_access_xattr_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] 113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler, 114 = &posix_acl_default_xattr_handler,
115 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 115 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
116 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, 116 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
117}; 117};
@@ -7190,10 +7190,12 @@ out:
7190 */ 7190 */
7191int ocfs2_init_security_and_acl(struct inode *dir, 7191int ocfs2_init_security_and_acl(struct inode *dir,
7192 struct inode *inode, 7192 struct inode *inode,
7193 const struct qstr *qstr) 7193 const struct qstr *qstr,
7194 struct posix_acl *default_acl,
7195 struct posix_acl *acl)
7194{ 7196{
7195 int ret = 0;
7196 struct buffer_head *dir_bh = NULL; 7197 struct buffer_head *dir_bh = NULL;
7198 int ret = 0;
7197 7199
7198 ret = ocfs2_init_security_get(inode, dir, qstr, NULL); 7200 ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
7199 if (ret) { 7201 if (ret) {
@@ -7207,9 +7209,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7207 goto leave; 7209 goto leave;
7208 } 7210 }
7209 7211
7210 ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); 7212 if (!ret && default_acl)
7211 if (ret) 7213 ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
7212 mlog_errno(ret); 7214 if (!ret && acl)
7215 ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
7213 7216
7214 ocfs2_inode_unlock(dir, 0); 7217 ocfs2_inode_unlock(dir, 0);
7215 brelse(dir_bh); 7218 brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 19f134e896a9..f10d5b93c366 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info {
40extern const struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern const struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern const struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern const struct xattr_handler *ocfs2_xattr_handlers[]; 43extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 44
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 45ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
@@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
96 bool preserve_security); 94 bool preserve_security);
97int ocfs2_init_security_and_acl(struct inode *dir, 95int ocfs2_init_security_and_acl(struct inode *dir,
98 struct inode *inode, 96 struct inode *inode,
99 const struct qstr *qstr); 97 const struct qstr *qstr,
98 struct posix_acl *default_acl,
99 struct posix_acl *acl);
100#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
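With ocfs2's private ACL xattr handlers gone, the shared posix_acl_access_xattr_handler / posix_acl_default_xattr_handler (added to fs/posix_acl.c later in this diff) do the xattr marshalling, and a filesystem only supplies ->get_acl()/->set_acl(). The wiring, sketched for a hypothetical filesystem foo:

    /* Sketch: foo_get_acl/foo_set_acl are hypothetical. */
    const struct xattr_handler *foo_xattr_handlers[] = {
            &posix_acl_access_xattr_handler,   /* system.posix_acl_access */
            &posix_acl_default_xattr_handler,  /* system.posix_acl_default */
            NULL
    };

    static const struct inode_operations foo_file_iops = {
            .get_acl        = foo_get_acl,
            .set_acl        = foo_set_acl,
    };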
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
705 return 0; 705 return 0;
706 } 706 }
707 707
708 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709 if (S_ISREG(inode->i_mode))
710 f->f_mode |= FMODE_ATOMIC_POS;
711
708 f->f_op = fops_get(inode->i_fop); 712 f->f_op = fops_get(inode->i_fop);
709 if (unlikely(WARN_ON(!f->f_op))) { 713 if (unlikely(WARN_ON(!f->f_op))) {
710 error = -ENODEV; 714 error = -ENODEV;
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0752ef2715..78fd0d0788db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -663,10 +663,11 @@ out:
663 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); 663 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
664 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 664 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
665 } 665 }
666 if (ret > 0) { 666 if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
667 int err = file_update_time(filp); 667 int err = file_update_time(filp);
668 if (err) 668 if (err)
669 ret = err; 669 ret = err;
670 sb_end_write(file_inode(filp)->i_sb);
670 } 671 }
671 return ret; 672 return ret;
672} 673}
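The pipe change closes a race with filesystem freezing: file_update_time() dirties the inode, so it must not run against a frozen superblock. Rather than block the writer, the trylock form simply skips the timestamp update while a freeze is in progress:

    if (ret > 0 && sb_start_write_trylock(inode->i_sb)) {
            /* Not frozen: safe to dirty the inode with a new mtime. */
            int err = file_update_time(filp);
            if (err)
                    ret = err;
            sb_end_write(inode->i_sb);
    }
    /* else: superblock freezing/frozen, silently skip the update */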
diff --git a/fs/pnode.c b/fs/pnode.c
index c7221bb19801..88396df725b4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -220,14 +220,14 @@ static struct mount *get_source(struct mount *dest,
220 * @tree_list : list of heads of trees to be attached. 220 * @tree_list : list of heads of trees to be attached.
221 */ 221 */
222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, 222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
223 struct mount *source_mnt, struct list_head *tree_list) 223 struct mount *source_mnt, struct hlist_head *tree_list)
224{ 224{
225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
226 struct mount *m, *child; 226 struct mount *m, *child;
227 int ret = 0; 227 int ret = 0;
228 struct mount *prev_dest_mnt = dest_mnt; 228 struct mount *prev_dest_mnt = dest_mnt;
229 struct mount *prev_src_mnt = source_mnt; 229 struct mount *prev_src_mnt = source_mnt;
230 LIST_HEAD(tmp_list); 230 HLIST_HEAD(tmp_list);
231 231
232 for (m = propagation_next(dest_mnt, dest_mnt); m; 232 for (m = propagation_next(dest_mnt, dest_mnt); m;
233 m = propagation_next(m, dest_mnt)) { 233 m = propagation_next(m, dest_mnt)) {
@@ -246,27 +246,29 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
246 child = copy_tree(source, source->mnt.mnt_root, type); 246 child = copy_tree(source, source->mnt.mnt_root, type);
247 if (IS_ERR(child)) { 247 if (IS_ERR(child)) {
248 ret = PTR_ERR(child); 248 ret = PTR_ERR(child);
249 list_splice(tree_list, tmp_list.prev); 249 tmp_list = *tree_list;
250 tmp_list.first->pprev = &tmp_list.first;
251 INIT_HLIST_HEAD(tree_list);
250 goto out; 252 goto out;
251 } 253 }
252 254
253 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { 255 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
254 mnt_set_mountpoint(m, dest_mp, child); 256 mnt_set_mountpoint(m, dest_mp, child);
255 list_add_tail(&child->mnt_hash, tree_list); 257 hlist_add_head(&child->mnt_hash, tree_list);
256 } else { 258 } else {
257 /* 259 /*
258 * This can happen if the parent mount was bind mounted 260 * This can happen if the parent mount was bind mounted
259 * on some subdirectory of a shared/slave mount. 261 * on some subdirectory of a shared/slave mount.
260 */ 262 */
261 list_add_tail(&child->mnt_hash, &tmp_list); 263 hlist_add_head(&child->mnt_hash, &tmp_list);
262 } 264 }
263 prev_dest_mnt = m; 265 prev_dest_mnt = m;
264 prev_src_mnt = child; 266 prev_src_mnt = child;
265 } 267 }
266out: 268out:
267 lock_mount_hash(); 269 lock_mount_hash();
268 while (!list_empty(&tmp_list)) { 270 while (!hlist_empty(&tmp_list)) {
269 child = list_first_entry(&tmp_list, struct mount, mnt_hash); 271 child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
270 umount_tree(child, 0); 272 umount_tree(child, 0);
271 } 273 }
272 unlock_mount_hash(); 274 unlock_mount_hash();
@@ -338,8 +340,10 @@ static void __propagate_umount(struct mount *mnt)
338 * umount the child only if the child has no 340 * umount the child only if the child has no
339 * other children 341 * other children
340 */ 342 */
341 if (child && list_empty(&child->mnt_mounts)) 343 if (child && list_empty(&child->mnt_mounts)) {
342 list_move_tail(&child->mnt_hash, &mnt->mnt_hash); 344 hlist_del_init_rcu(&child->mnt_hash);
345 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
346 }
343 } 347 }
344} 348}
345 349
@@ -350,11 +354,11 @@ static void __propagate_umount(struct mount *mnt)
350 * 354 *
351 * vfsmount lock must be held for write 355 * vfsmount lock must be held for write
352 */ 356 */
353int propagate_umount(struct list_head *list) 357int propagate_umount(struct hlist_head *list)
354{ 358{
355 struct mount *mnt; 359 struct mount *mnt;
356 360
357 list_for_each_entry(mnt, list, mnt_hash) 361 hlist_for_each_entry(mnt, list, mnt_hash)
358 __propagate_umount(mnt); 362 __propagate_umount(mnt);
359 return 0; 363 return 0;
360} 364}
diff --git a/fs/pnode.h b/fs/pnode.h
index 59e7eda1851e..fc28a27fa892 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -36,8 +36,8 @@ static inline void set_mnt_shared(struct mount *mnt)
36 36
37void change_mnt_propagation(struct mount *, int); 37void change_mnt_propagation(struct mount *, int);
38int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, 38int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
39 struct list_head *); 39 struct hlist_head *);
40int propagate_umount(struct list_head *); 40int propagate_umount(struct hlist_head *);
41int propagate_mount_busy(struct mount *, int); 41int propagate_mount_busy(struct mount *, int);
42void mnt_release_group_id(struct mount *); 42void mnt_release_group_id(struct mount *);
43int get_dominating_id(struct mount *mnt, const struct path *root); 43int get_dominating_id(struct mount *mnt, const struct path *root);
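Moving mnt_hash from list_head to hlist_head halves each hash bucket to a single pointer, at the cost of losing O(1) tail insertion and list_move_tail(); hence the hlist_add_head()/hlist_entry() idioms in propagate_mnt() above. The temporary-list pattern in isolation:

    HLIST_HEAD(tmp_list);                   /* empty head: .first == NULL */

    hlist_add_head(&child->mnt_hash, &tmp_list);

    while (!hlist_empty(&tmp_list)) {
            child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
            umount_tree(child, 0);          /* must unlink child->mnt_hash */
    }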
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..11c54fd51e16 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1,10 +1,8 @@
1/* 1/*
2 * linux/fs/posix_acl.c 2 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
3 * 3 *
4 * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org> 4 * Fixes from William Schumacher incorporated on 15 March 2001.
5 * 5 * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
6 * Fixes from William Schumacher incorporated on 15 March 2001.
7 * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
8 */ 6 */
9 7
10/* 8/*
@@ -18,15 +16,112 @@
18#include <linux/fs.h> 16#include <linux/fs.h>
19#include <linux/sched.h> 17#include <linux/sched.h>
20#include <linux/posix_acl.h> 18#include <linux/posix_acl.h>
19#include <linux/posix_acl_xattr.h>
20#include <linux/xattr.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/user_namespace.h>
22 23
23#include <linux/errno.h> 24struct posix_acl **acl_by_type(struct inode *inode, int type)
25{
26 switch (type) {
27 case ACL_TYPE_ACCESS:
28 return &inode->i_acl;
29 case ACL_TYPE_DEFAULT:
30 return &inode->i_default_acl;
31 default:
32 BUG();
33 }
34}
35EXPORT_SYMBOL(acl_by_type);
24 36
25EXPORT_SYMBOL(posix_acl_init); 37struct posix_acl *get_cached_acl(struct inode *inode, int type)
26EXPORT_SYMBOL(posix_acl_alloc); 38{
27EXPORT_SYMBOL(posix_acl_valid); 39 struct posix_acl **p = acl_by_type(inode, type);
28EXPORT_SYMBOL(posix_acl_equiv_mode); 40 struct posix_acl *acl = ACCESS_ONCE(*p);
29EXPORT_SYMBOL(posix_acl_from_mode); 41 if (acl) {
42 spin_lock(&inode->i_lock);
43 acl = *p;
44 if (acl != ACL_NOT_CACHED)
45 acl = posix_acl_dup(acl);
46 spin_unlock(&inode->i_lock);
47 }
48 return acl;
49}
50EXPORT_SYMBOL(get_cached_acl);
51
52struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
53{
54 return rcu_dereference(*acl_by_type(inode, type));
55}
56EXPORT_SYMBOL(get_cached_acl_rcu);
57
58void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
59{
60 struct posix_acl **p = acl_by_type(inode, type);
61 struct posix_acl *old;
62 spin_lock(&inode->i_lock);
63 old = *p;
64 rcu_assign_pointer(*p, posix_acl_dup(acl));
65 spin_unlock(&inode->i_lock);
66 if (old != ACL_NOT_CACHED)
67 posix_acl_release(old);
68}
69EXPORT_SYMBOL(set_cached_acl);
70
71void forget_cached_acl(struct inode *inode, int type)
72{
73 struct posix_acl **p = acl_by_type(inode, type);
74 struct posix_acl *old;
75 spin_lock(&inode->i_lock);
76 old = *p;
77 *p = ACL_NOT_CACHED;
78 spin_unlock(&inode->i_lock);
79 if (old != ACL_NOT_CACHED)
80 posix_acl_release(old);
81}
82EXPORT_SYMBOL(forget_cached_acl);
83
84void forget_all_cached_acls(struct inode *inode)
85{
86 struct posix_acl *old_access, *old_default;
87 spin_lock(&inode->i_lock);
88 old_access = inode->i_acl;
89 old_default = inode->i_default_acl;
90 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
91 spin_unlock(&inode->i_lock);
92 if (old_access != ACL_NOT_CACHED)
93 posix_acl_release(old_access);
94 if (old_default != ACL_NOT_CACHED)
95 posix_acl_release(old_default);
96}
97EXPORT_SYMBOL(forget_all_cached_acls);
98
99struct posix_acl *get_acl(struct inode *inode, int type)
100{
101 struct posix_acl *acl;
102
103 acl = get_cached_acl(inode, type);
104 if (acl != ACL_NOT_CACHED)
105 return acl;
106
107 if (!IS_POSIXACL(inode))
108 return NULL;
109
110 /*
111 * A filesystem can force a ACL callback by just never filling the
112 * ACL cache. But normally you'd fill the cache either at inode
113 * instantiation time, or on the first ->get_acl call.
114 *
115 * If the filesystem doesn't have a get_acl() function at all, we'll
116 * just create the negative cache entry.
117 */
118 if (!inode->i_op->get_acl) {
119 set_cached_acl(inode, type, NULL);
120 return NULL;
121 }
122 return inode->i_op->get_acl(inode, type);
123}
124EXPORT_SYMBOL(get_acl);
30 125
31/* 126/*
32 * Init a fresh posix_acl 127 * Init a fresh posix_acl
@@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count)
37 atomic_set(&acl->a_refcount, 1); 132 atomic_set(&acl->a_refcount, 1);
38 acl->a_count = count; 133 acl->a_count = count;
39} 134}
135EXPORT_SYMBOL(posix_acl_init);
40 136
41/* 137/*
42 * Allocate a new ACL with the specified number of entries. 138 * Allocate a new ACL with the specified number of entries.
@@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags)
51 posix_acl_init(acl, count); 147 posix_acl_init(acl, count);
52 return acl; 148 return acl;
53} 149}
150EXPORT_SYMBOL(posix_acl_alloc);
54 151
55/* 152/*
56 * Clone an ACL. 153 * Clone an ACL.
@@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl)
78{ 175{
79 const struct posix_acl_entry *pa, *pe; 176 const struct posix_acl_entry *pa, *pe;
80 int state = ACL_USER_OBJ; 177 int state = ACL_USER_OBJ;
81 kuid_t prev_uid = INVALID_UID;
82 kgid_t prev_gid = INVALID_GID;
83 int needs_mask = 0; 178 int needs_mask = 0;
84 179
85 FOREACH_ACL_ENTRY(pa, acl, pe) { 180 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl)
98 return -EINVAL; 193 return -EINVAL;
99 if (!uid_valid(pa->e_uid)) 194 if (!uid_valid(pa->e_uid))
100 return -EINVAL; 195 return -EINVAL;
101 if (uid_valid(prev_uid) &&
102 uid_lte(pa->e_uid, prev_uid))
103 return -EINVAL;
104 prev_uid = pa->e_uid;
105 needs_mask = 1; 196 needs_mask = 1;
106 break; 197 break;
107 198
@@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl)
117 return -EINVAL; 208 return -EINVAL;
118 if (!gid_valid(pa->e_gid)) 209 if (!gid_valid(pa->e_gid))
119 return -EINVAL; 210 return -EINVAL;
120 if (gid_valid(prev_gid) &&
121 gid_lte(pa->e_gid, prev_gid))
122 return -EINVAL;
123 prev_gid = pa->e_gid;
124 needs_mask = 1; 211 needs_mask = 1;
125 break; 212 break;
126 213
@@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl)
146 return 0; 233 return 0;
147 return -EINVAL; 234 return -EINVAL;
148} 235}
236EXPORT_SYMBOL(posix_acl_valid);
149 237
150/* 238/*
151 * Returns 0 if the acl can be exactly represented in the traditional 239 * Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +274,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 274 *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
187 return not_equiv; 275 return not_equiv;
188} 276}
277EXPORT_SYMBOL(posix_acl_equiv_mode);
189 278
190/* 279/*
191 * Create an ACL representing the file mode permission bits of an inode. 280 * Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +296,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 296 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 297 return acl;
209} 298}
299EXPORT_SYMBOL(posix_acl_from_mode);
210 300
211/* 301/*
212 * Return 0 if current is granted want access to the inode 302 * Return 0 if current is granted want access to the inode
@@ -338,7 +428,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
338/* 428/*
339 * Modify the ACL for the chmod syscall. 429 * Modify the ACL for the chmod syscall.
340 */ 430 */
341static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) 431static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
342{ 432{
343 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 433 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
344 struct posix_acl_entry *pa, *pe; 434 struct posix_acl_entry *pa, *pe;
@@ -384,7 +474,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
384} 474}
385 475
386int 476int
387posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) 477__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
388{ 478{
389 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 479 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
390 int err = -ENOMEM; 480 int err = -ENOMEM;
@@ -399,15 +489,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
399 *acl = clone; 489 *acl = clone;
400 return err; 490 return err;
401} 491}
402EXPORT_SYMBOL(posix_acl_create); 492EXPORT_SYMBOL(__posix_acl_create);
403 493
404int 494int
405posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) 495__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
406{ 496{
407 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 497 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
408 int err = -ENOMEM; 498 int err = -ENOMEM;
409 if (clone) { 499 if (clone) {
410 err = posix_acl_chmod_masq(clone, mode); 500 err = __posix_acl_chmod_masq(clone, mode);
411 if (err) { 501 if (err) {
412 posix_acl_release(clone); 502 posix_acl_release(clone);
413 clone = NULL; 503 clone = NULL;
@@ -417,4 +507,388 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
417 *acl = clone; 507 *acl = clone;
418 return err; 508 return err;
419} 509}
510EXPORT_SYMBOL(__posix_acl_chmod);
511
512int
513posix_acl_chmod(struct inode *inode, umode_t mode)
514{
515 struct posix_acl *acl;
516 int ret = 0;
517
518 if (!IS_POSIXACL(inode))
519 return 0;
520 if (!inode->i_op->set_acl)
521 return -EOPNOTSUPP;
522
523 acl = get_acl(inode, ACL_TYPE_ACCESS);
524 if (IS_ERR_OR_NULL(acl)) {
525 if (acl == ERR_PTR(-EOPNOTSUPP))
526 return 0;
527 return PTR_ERR(acl);
528 }
529
530 ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
531 if (ret)
532 return ret;
533 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
534 posix_acl_release(acl);
535 return ret;
536}
420EXPORT_SYMBOL(posix_acl_chmod); 537EXPORT_SYMBOL(posix_acl_chmod);
538
539int
540posix_acl_create(struct inode *dir, umode_t *mode,
541 struct posix_acl **default_acl, struct posix_acl **acl)
542{
543 struct posix_acl *p;
544 int ret;
545
546 if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
547 goto no_acl;
548
549 p = get_acl(dir, ACL_TYPE_DEFAULT);
550 if (IS_ERR(p)) {
551 if (p == ERR_PTR(-EOPNOTSUPP))
552 goto apply_umask;
553 return PTR_ERR(p);
554 }
555
556 if (!p)
557 goto apply_umask;
558
559 *acl = posix_acl_clone(p, GFP_NOFS);
560 if (!*acl)
561 return -ENOMEM;
562
563 ret = posix_acl_create_masq(*acl, mode);
564 if (ret < 0) {
565 posix_acl_release(*acl);
566 return -ENOMEM;
567 }
568
569 if (ret == 0) {
570 posix_acl_release(*acl);
571 *acl = NULL;
572 }
573
574 if (!S_ISDIR(*mode)) {
575 posix_acl_release(p);
576 *default_acl = NULL;
577 } else {
578 *default_acl = p;
579 }
580 return 0;
581
582apply_umask:
583 *mode &= ~current_umask();
584no_acl:
585 *default_acl = NULL;
586 *acl = NULL;
587 return 0;
588}
589EXPORT_SYMBOL_GPL(posix_acl_create);
590
591/*
592 * Fix up the uids and gids in posix acl extended attributes in place.
593 */
594static void posix_acl_fix_xattr_userns(
595 struct user_namespace *to, struct user_namespace *from,
596 void *value, size_t size)
597{
598 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
599 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
600 int count;
601 kuid_t uid;
602 kgid_t gid;
603
604 if (!value)
605 return;
606 if (size < sizeof(posix_acl_xattr_header))
607 return;
608 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
609 return;
610
611 count = posix_acl_xattr_count(size);
612 if (count < 0)
613 return;
614 if (count == 0)
615 return;
616
617 for (end = entry + count; entry != end; entry++) {
618 switch(le16_to_cpu(entry->e_tag)) {
619 case ACL_USER:
620 uid = make_kuid(from, le32_to_cpu(entry->e_id));
621 entry->e_id = cpu_to_le32(from_kuid(to, uid));
622 break;
623 case ACL_GROUP:
624 gid = make_kgid(from, le32_to_cpu(entry->e_id));
625 entry->e_id = cpu_to_le32(from_kgid(to, gid));
626 break;
627 default:
628 break;
629 }
630 }
631}
632
633void posix_acl_fix_xattr_from_user(void *value, size_t size)
634{
635 struct user_namespace *user_ns = current_user_ns();
636 if (user_ns == &init_user_ns)
637 return;
638 posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
639}
640
641void posix_acl_fix_xattr_to_user(void *value, size_t size)
642{
643 struct user_namespace *user_ns = current_user_ns();
644 if (user_ns == &init_user_ns)
645 return;
646 posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
647}
648
649/*
650 * Convert from extended attribute to in-memory representation.
651 */
652struct posix_acl *
653posix_acl_from_xattr(struct user_namespace *user_ns,
654 const void *value, size_t size)
655{
656 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
657 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
658 int count;
659 struct posix_acl *acl;
660 struct posix_acl_entry *acl_e;
661
662 if (!value)
663 return NULL;
664 if (size < sizeof(posix_acl_xattr_header))
665 return ERR_PTR(-EINVAL);
666 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
667 return ERR_PTR(-EOPNOTSUPP);
668
669 count = posix_acl_xattr_count(size);
670 if (count < 0)
671 return ERR_PTR(-EINVAL);
672 if (count == 0)
673 return NULL;
674
675 acl = posix_acl_alloc(count, GFP_NOFS);
676 if (!acl)
677 return ERR_PTR(-ENOMEM);
678 acl_e = acl->a_entries;
679
680 for (end = entry + count; entry != end; acl_e++, entry++) {
681 acl_e->e_tag = le16_to_cpu(entry->e_tag);
682 acl_e->e_perm = le16_to_cpu(entry->e_perm);
683
684 switch(acl_e->e_tag) {
685 case ACL_USER_OBJ:
686 case ACL_GROUP_OBJ:
687 case ACL_MASK:
688 case ACL_OTHER:
689 break;
690
691 case ACL_USER:
692 acl_e->e_uid =
693 make_kuid(user_ns,
694 le32_to_cpu(entry->e_id));
695 if (!uid_valid(acl_e->e_uid))
696 goto fail;
697 break;
698 case ACL_GROUP:
699 acl_e->e_gid =
700 make_kgid(user_ns,
701 le32_to_cpu(entry->e_id));
702 if (!gid_valid(acl_e->e_gid))
703 goto fail;
704 break;
705
706 default:
707 goto fail;
708 }
709 }
710 return acl;
711
712fail:
713 posix_acl_release(acl);
714 return ERR_PTR(-EINVAL);
715}
716EXPORT_SYMBOL (posix_acl_from_xattr);
717
718/*
719 * Convert from in-memory to extended attribute representation.
720 */
721int
722posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size)
724{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
727 int real_size, n;
728
729 real_size = posix_acl_xattr_size(acl->a_count);
730 if (!buffer)
731 return real_size;
732 if (real_size > size)
733 return -ERANGE;
734
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736
737 for (n=0; n < acl->a_count; n++, ext_entry++) {
738 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
739 ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
740 ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
741 switch(acl_e->e_tag) {
742 case ACL_USER:
743 ext_entry->e_id =
744 cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
745 break;
746 case ACL_GROUP:
747 ext_entry->e_id =
748 cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
749 break;
750 default:
751 ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
752 break;
753 }
754 }
755 return real_size;
756}
757EXPORT_SYMBOL (posix_acl_to_xattr);
758
759static int
760posix_acl_xattr_get(struct dentry *dentry, const char *name,
761 void *value, size_t size, int type)
762{
763 struct posix_acl *acl;
764 int error;
765
766 if (!IS_POSIXACL(dentry->d_inode))
767 return -EOPNOTSUPP;
768 if (S_ISLNK(dentry->d_inode->i_mode))
769 return -EOPNOTSUPP;
770
771 acl = get_acl(dentry->d_inode, type);
772 if (IS_ERR(acl))
773 return PTR_ERR(acl);
774 if (acl == NULL)
775 return -ENODATA;
776
777 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
778 posix_acl_release(acl);
779
780 return error;
781}
782
783static int
784posix_acl_xattr_set(struct dentry *dentry, const char *name,
785 const void *value, size_t size, int flags, int type)
786{
787 struct inode *inode = dentry->d_inode;
788 struct posix_acl *acl = NULL;
789 int ret;
790
791 if (!IS_POSIXACL(inode))
792 return -EOPNOTSUPP;
793 if (!inode->i_op->set_acl)
794 return -EOPNOTSUPP;
795
796 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
797 return value ? -EACCES : 0;
798 if (!inode_owner_or_capable(inode))
799 return -EPERM;
800
801 if (value) {
802 acl = posix_acl_from_xattr(&init_user_ns, value, size);
803 if (IS_ERR(acl))
804 return PTR_ERR(acl);
805
806 if (acl) {
807 ret = posix_acl_valid(acl);
808 if (ret)
809 goto out;
810 }
811 }
812
813 ret = inode->i_op->set_acl(inode, acl, type);
814out:
815 posix_acl_release(acl);
816 return ret;
817}
818
819static size_t
820posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
821 const char *name, size_t name_len, int type)
822{
823 const char *xname;
824 size_t size;
825
826 if (!IS_POSIXACL(dentry->d_inode))
827 return -EOPNOTSUPP;
828 if (S_ISLNK(dentry->d_inode->i_mode))
829 return -EOPNOTSUPP;
830
831 if (type == ACL_TYPE_ACCESS)
832 xname = POSIX_ACL_XATTR_ACCESS;
833 else
834 xname = POSIX_ACL_XATTR_DEFAULT;
835
836 size = strlen(xname) + 1;
837 if (list && size <= list_size)
838 memcpy(list, xname, size);
839 return size;
840}
841
842const struct xattr_handler posix_acl_access_xattr_handler = {
843 .prefix = POSIX_ACL_XATTR_ACCESS,
844 .flags = ACL_TYPE_ACCESS,
845 .list = posix_acl_xattr_list,
846 .get = posix_acl_xattr_get,
847 .set = posix_acl_xattr_set,
848};
849EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
850
851const struct xattr_handler posix_acl_default_xattr_handler = {
852 .prefix = POSIX_ACL_XATTR_DEFAULT,
853 .flags = ACL_TYPE_DEFAULT,
854 .list = posix_acl_xattr_list,
855 .get = posix_acl_xattr_get,
856 .set = posix_acl_xattr_set,
857};
858EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
859
860int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
861{
862 int error;
863
864 if (type == ACL_TYPE_ACCESS) {
865 error = posix_acl_equiv_mode(acl, &inode->i_mode);
866 if (error < 0)
867 return 0;
868 if (error == 0)
869 acl = NULL;
870 }
871
872 inode->i_ctime = CURRENT_TIME;
873 set_cached_acl(inode, type, acl);
874 return 0;
875}
876
877int simple_acl_create(struct inode *dir, struct inode *inode)
878{
879 struct posix_acl *default_acl, *acl;
880 int error;
881
882 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
883 if (error)
884 return error;
885
886 set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
887 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
888
889 if (default_acl)
890 posix_acl_release(default_acl);
891 if (acl)
892 posix_acl_release(acl);
893 return 0;
894}
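simple_set_acl()/simple_acl_create() above show the cache-only consumers; a disk filesystem's ->create() path is expected to use the new posix_acl_create() along these lines (sketch; foo_set_acl is hypothetical):

    struct posix_acl *default_acl, *acl;
    int error;

    error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
    if (error)
            return error;

    if (default_acl) {
            error = foo_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
            posix_acl_release(default_acl);
    }
    if (acl) {
            if (!error)
                    error = foo_set_acl(inode, acl, ACL_TYPE_ACCESS);
            posix_acl_release(acl);
    }
    return error;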
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1bd2077187fd..656e401794de 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -140,24 +140,15 @@ static const char * const task_state_array[] = {
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "Z (zombie)", /* 16 */
142 "X (dead)", /* 32 */ 142 "X (dead)", /* 32 */
143 "x (dead)", /* 64 */
144 "K (wakekill)", /* 128 */
145 "W (waking)", /* 256 */
146 "P (parked)", /* 512 */
147}; 143};
148 144
149static inline const char *get_task_state(struct task_struct *tsk) 145static inline const char *get_task_state(struct task_struct *tsk)
150{ 146{
151 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 147 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
152 const char * const *p = &task_state_array[0];
153 148
154 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); 149 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
155 150
156 while (state) { 151 return task_state_array[fls(state)];
157 p++;
158 state >>= 1;
159 }
160 return *p;
161} 152}
162 153
163static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 154static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -453,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
453 min_flt += t->min_flt; 444 min_flt += t->min_flt;
454 maj_flt += t->maj_flt; 445 maj_flt += t->maj_flt;
455 gtime += task_gtime(t); 446 gtime += task_gtime(t);
456 t = next_thread(t); 447 } while_each_thread(task, t);
457 } while (t != task);
458 448
459 min_flt += sig->min_flt; 449 min_flt += sig->min_flt;
460 maj_flt += sig->maj_flt; 450 maj_flt += sig->maj_flt;
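The rewritten get_task_state() replaces the shift-and-count loop with fls(): state is masked to TASK_REPORT, so fls() yields 0 for a running task and the 1-based index of the highest set state bit otherwise, which is exactly the slot in task_state_array. A userspace check of the arithmetic (fls() itself is kernel-only, so a stand-in is used):

    #include <stdio.h>

    /* Stand-in for the kernel's fls(): 1-based index of the highest
     * set bit, 0 if no bit is set. */
    static int fls_demo(unsigned int x)
    {
            int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    int main(void)
    {
            /* 0 -> slot 0 "R (running)", 0x01 -> slot 1 "S (sleeping)",
             * 0x04 -> slot 3 "T (stopped)", 0x10 -> slot 5 "Z (zombie)",
             * matching task_state_array above. */
            printf("%d %d %d %d\n",
                   fls_demo(0), fls_demo(0x01), fls_demo(0x04), fls_demo(0x10));
            return 0;
    }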
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 03c8d747be48..b9760628e1fd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1658 return 0; 1658 return 0;
1659} 1659}
1660 1660
1661static inline bool proc_inode_is_dead(struct inode *inode)
1662{
1663 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1664}
1665
1661int pid_delete_dentry(const struct dentry *dentry) 1666int pid_delete_dentry(const struct dentry *dentry)
1662{ 1667{
1663 /* Is the task we represent dead? 1668 /* Is the task we represent dead?
1664 * If so, then don't put the dentry on the lru list, 1669 * If so, then don't put the dentry on the lru list,
1665 * kill it immediately. 1670 * kill it immediately.
1666 */ 1671 */
1667 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1672 return proc_inode_is_dead(dentry->d_inode);
1668} 1673}
1669 1674
1670const struct dentry_operations pid_dentry_operations = 1675const struct dentry_operations pid_dentry_operations =
@@ -1819,6 +1824,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
1819 if (rc) 1824 if (rc)
1820 goto out_mmput; 1825 goto out_mmput;
1821 1826
1827 rc = -ENOENT;
1822 down_read(&mm->mmap_sem); 1828 down_read(&mm->mmap_sem);
1823 vma = find_exact_vma(mm, vm_start, vm_end); 1829 vma = find_exact_vma(mm, vm_start, vm_end);
1824 if (vma && vma->vm_file) { 1830 if (vma && vma->vm_file) {
@@ -3092,34 +3098,42 @@ out_no_task:
3092 * In the case of a seek we start with the leader and walk nr 3098 * In the case of a seek we start with the leader and walk nr
3093 * threads past it. 3099 * threads past it.
3094 */ 3100 */
3095static struct task_struct *first_tid(struct task_struct *leader, 3101static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3096 int tid, int nr, struct pid_namespace *ns) 3102 struct pid_namespace *ns)
3097{ 3103{
3098 struct task_struct *pos; 3104 struct task_struct *pos, *task;
3105 unsigned long nr = f_pos;
3106
3107 if (nr != f_pos) /* 32bit overflow? */
3108 return NULL;
3099 3109
3100 rcu_read_lock(); 3110 rcu_read_lock();
3101 /* Attempt to start with the pid of a thread */ 3111 task = pid_task(pid, PIDTYPE_PID);
3102 if (tid && (nr > 0)) { 3112 if (!task)
3113 goto fail;
3114
3115 /* Attempt to start with the tid of a thread */
3116 if (tid && nr) {
3103 pos = find_task_by_pid_ns(tid, ns); 3117 pos = find_task_by_pid_ns(tid, ns);
3104 if (pos && (pos->group_leader == leader)) 3118 if (pos && same_thread_group(pos, task))
3105 goto found; 3119 goto found;
3106 } 3120 }
3107 3121
3108 /* If nr exceeds the number of threads there is nothing todo */ 3122 /* If nr exceeds the number of threads there is nothing todo */
3109 pos = NULL; 3123 if (nr >= get_nr_threads(task))
3110 if (nr && nr >= get_nr_threads(leader)) 3124 goto fail;
3111 goto out;
3112 3125
3113 /* If we haven't found our starting place yet start 3126 /* If we haven't found our starting place yet start
3114 * with the leader and walk nr threads forward. 3127 * with the leader and walk nr threads forward.
3115 */ 3128 */
3116 for (pos = leader; nr > 0; --nr) { 3129 pos = task = task->group_leader;
3117 pos = next_thread(pos); 3130 do {
3118 if (pos == leader) { 3131 if (!nr--)
3119 pos = NULL; 3132 goto found;
3120 goto out; 3133 } while_each_thread(task, pos);
3121 } 3134fail:
3122 } 3135 pos = NULL;
3136 goto out;
3123found: 3137found:
3124 get_task_struct(pos); 3138 get_task_struct(pos);
3125out: 3139out:
@@ -3152,25 +3166,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3152/* for the /proc/TGID/task/ directories */ 3166/* for the /proc/TGID/task/ directories */
3153static int proc_task_readdir(struct file *file, struct dir_context *ctx) 3167static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3154{ 3168{
3155 struct task_struct *leader = NULL; 3169 struct inode *inode = file_inode(file);
3156 struct task_struct *task = get_proc_task(file_inode(file)); 3170 struct task_struct *task;
3157 struct pid_namespace *ns; 3171 struct pid_namespace *ns;
3158 int tid; 3172 int tid;
3159 3173
3160 if (!task) 3174 if (proc_inode_is_dead(inode))
3161 return -ENOENT;
3162 rcu_read_lock();
3163 if (pid_alive(task)) {
3164 leader = task->group_leader;
3165 get_task_struct(leader);
3166 }
3167 rcu_read_unlock();
3168 put_task_struct(task);
3169 if (!leader)
3170 return -ENOENT; 3175 return -ENOENT;
3171 3176
3172 if (!dir_emit_dots(file, ctx)) 3177 if (!dir_emit_dots(file, ctx))
3173 goto out; 3178 return 0;
3174 3179
3175 /* f_version caches the tgid value that the last readdir call couldn't 3180 /* f_version caches the tgid value that the last readdir call couldn't
3176 * return. lseek aka telldir automagically resets f_version to 0. 3181 * return. lseek aka telldir automagically resets f_version to 0.
@@ -3178,7 +3183,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3178 ns = file->f_dentry->d_sb->s_fs_info; 3183 ns = file->f_dentry->d_sb->s_fs_info;
3179 tid = (int)file->f_version; 3184 tid = (int)file->f_version;
3180 file->f_version = 0; 3185 file->f_version = 0;
3181 for (task = first_tid(leader, tid, ctx->pos - 2, ns); 3186 for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
3182 task; 3187 task;
3183 task = next_tid(task), ctx->pos++) { 3188 task = next_tid(task), ctx->pos++) {
3184 char name[PROC_NUMBUF]; 3189 char name[PROC_NUMBUF];
@@ -3194,8 +3199,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3194 break; 3199 break;
3195 } 3200 }
3196 } 3201 }
3197out: 3202
3198 put_task_struct(leader);
3199 return 0; 3203 return 0;
3200} 3204}
3201 3205
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3fcd1d..cbd82dff7e81 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void)
26 proc_create("cmdline", 0, NULL, &cmdline_proc_fops); 26 proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
27 return 0; 27 return 0;
28} 28}
29module_init(proc_cmdline_init); 29fs_initcall(proc_cmdline_init);
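The module_init() -> fs_initcall() conversions across fs/proc below are not cosmetic: procfs is bool-only in Kconfig, and module_init() on built-in code silently maps to device_initcall() (level 6). fs_initcall() registers one level earlier and documents that the code can never be modular. The relevant ordering (see include/linux/init.h):

    /* Initcall levels for built-in code. */
    core_initcall(fn)       /* level 1 */
    fs_initcall(fn)         /* level 5: these proc entries now here */
    device_initcall(fn)     /* level 6: where module_init() put them */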
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 51942d5abcec..290ba85cb900 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -109,4 +109,4 @@ static int __init proc_consoles_init(void)
109 proc_create("consoles", 0, NULL, &proc_consoles_operations); 109 proc_create("consoles", 0, NULL, &proc_consoles_operations);
110 return 0; 110 return 0;
111} 111}
112module_init(proc_consoles_init); 112fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539a234b..06f4d31e0396 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void)
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 	return 0;
 }
-module_init(proc_cpuinfo_init);
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index b14347167c35..50493edc30e5 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -67,4 +67,4 @@ static int __init proc_devices_init(void)
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	return 0;
 }
-module_init(proc_devices_init);
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index cca93b6fb9a9..b7f268eb5f45 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
 
-	de->uid = inode->i_uid;
-	de->gid = inode->i_gid;
+	proc_set_user(de, inode->i_uid, inode->i_gid);
 	de->mode = inode->i_mode;
 	return 0;
 }
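proc_set_user() keeps callers from poking struct proc_dir_entry fields directly. The helper itself is a thin setter, essentially (as defined in fs/proc/generic.c):

    void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
    {
    	de->uid = uid;
    	de->gid = gid;
    }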
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 05029c0e2f24..a352d5703b41 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void)
 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 	return 0;
 }
-module_init(proc_interrupts_init);
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 5ed0e52d6aa0..39e6ef32f0bd 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -639,4 +639,4 @@ static int __init proc_kcore_init(void)
 
 	return 0;
 }
-module_init(proc_kcore_init);
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bdfabdaefdce..05f8dcdb086e 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void)
 	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
 	return 0;
 }
-module_init(proc_kmsg_init);
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd4cae2..aec66e6c2060 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void)
 	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
 	return 0;
 }
-module_init(proc_loadavg_init);
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..136e548d9567 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long committed;
 	struct vmalloc_info vmi;
 	long cached;
+	long available;
+	unsigned long pagecache;
+	unsigned long wmark_low = 0;
 	unsigned long pages[NR_LRU_LISTS];
+	struct zone *zone;
 	int lru;
 
 /*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
 		pages[lru] = global_page_state(NR_LRU_BASE + lru);
 
+	for_each_zone(zone)
+		wmark_low += zone->watermark[WMARK_LOW];
+
+	/*
+	 * Estimate the amount of memory available for userspace allocations,
+	 * without causing swapping.
+	 *
+	 * Free memory cannot be taken below the low watermark, before the
+	 * system starts swapping.
+	 */
+	available = i.freeram - wmark_low;
+
+	/*
+	 * Not all the page cache can be freed, otherwise the system will
+	 * start swapping. Assume at least half of the page cache, or the
+	 * low watermark worth of cache, needs to stay.
+	 */
+	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+	pagecache -= min(pagecache / 2, wmark_low);
+	available += pagecache;
+
+	/*
+	 * Part of the reclaimable slab consists of items that are in use,
+	 * and cannot be freed. Cap this estimate at the low watermark.
+	 */
+	available += global_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+	if (available < 0)
+		available = 0;
+
 	/*
 	 * Tagged format, for easy grepping and expansion.
 	 */
 	seq_printf(m,
 		"MemTotal:       %8lu kB\n"
 		"MemFree:        %8lu kB\n"
+		"MemAvailable:   %8lu kB\n"
 		"Buffers:        %8lu kB\n"
 		"Cached:         %8lu kB\n"
 		"SwapCached:     %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		,
 		K(i.totalram),
 		K(i.freeram),
+		K(available),
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages()),
@@ -183,4 +220,4 @@ static int __init proc_meminfo_init(void)
 	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
 	return 0;
 }
-module_init(proc_meminfo_init);
+fs_initcall(proc_meminfo_init);
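The new MemAvailable estimate can be roughly reproduced from the other exported fields. A runnable userspace sketch follows; the zone low-watermark term is not visible from userspace, so it is assumed to be zero here, which gives a slight overestimate:

    #include <stdio.h>
    #include <string.h>

    static long field(const char *name)
    {
    	char line[256];
    	long val = 0;
    	FILE *f = fopen("/proc/meminfo", "r");

    	if (!f)
    		return 0;
    	while (fgets(line, sizeof(line), f))
    		if (!strncmp(line, name, strlen(name))) {
    			sscanf(line + strlen(name), " %ld", &val);
    			break;
    		}
    	fclose(f);
    	return val;	/* value in kB */
    }

    static long min2(long a, long b) { return a < b ? a : b; }

    int main(void)
    {
    	long wmark_low = 0;	/* assumption: watermarks are not exported */
    	long pagecache = field("Active(file):") + field("Inactive(file):");
    	long slab = field("SReclaimable:");
    	long avail = field("MemFree:") - wmark_low;

    	pagecache -= min2(pagecache / 2, wmark_low);
    	avail += pagecache + slab - min2(slab / 2, wmark_low);
    	if (avail < 0)
    		avail = 0;
    	printf("estimated: %ld kB, kernel: %ld kB\n",
    	       avail, field("MemAvailable:"));
    	return 0;
    }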
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 5f9bc8a746c9..d4a35746cab9 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -131,4 +131,4 @@ static int __init proc_nommu_init(void)
 	return 0;
 }
 
-module_init(proc_nommu_init);
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b8730d9ebaee..e647c55275d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -118,10 +118,11 @@ u64 stable_page_flags(struct page *page)
 	/*
 	 * PageTransCompound can be true for non-huge compound pages (slab
 	 * pages or pages allocated by drivers with __GFP_COMP) because it
-	 * just checks PG_head/PG_tail, so we need to check PageLRU to make
-	 * sure a given page is a thp, not a non-huge compound page.
+	 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+	 * to make sure a given page is a thp, not a non-huge compound page.
 	 */
-	else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
+	else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+					     PageAnon(compound_head(page))))
 		u |= 1 << KPF_THP;
 
 	/*
@@ -217,4 +218,4 @@ static int __init proc_page_init(void)
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
 	return 0;
 }
-module_init(proc_page_init);
+fs_initcall(proc_page_init);
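The widened THP test can be read as a predicate: a compound page only counts as a transparent huge page for kpageflags if its head page is on the LRU or is anonymous, which still filters out slab and driver __GFP_COMP allocations but, unlike the old PageLRU-only check, also accepts anonymous THPs that are temporarily off the LRU. A hedged restatement as a helper (the name page_is_thp() is ours, not the kernel's):

    /* sketch only; mirrors the condition used in stable_page_flags() above */
    static inline bool page_is_thp(struct page *page)
    {
    	struct page *head = compound_head(page);

    	return PageTransCompound(page) &&
    	       (PageLRU(head) || PageAnon(head));
    }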
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 70779b2fc209..c82dd5147845 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
 		return NULL;
 
 	if (!strncmp(name, "security-", 9))
-		ent->size = 0; /* don't leak number of password chars */
+		proc_set_size(ent, 0); /* don't leak number of password chars */
 	else
-		ent->size = pp->length;
+		proc_set_size(ent, pp->length);
 
 	return ent;
 }
@@ -232,6 +232,7 @@ void __init proc_device_tree_init(void)
 		return;
 	root = of_find_node_by_path("/");
 	if (root == NULL) {
+		remove_proc_entry("device-tree", NULL);
 		pr_debug("/proc/device-tree: can't find root\n");
 		return;
 	}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 62604be9f58d..ad8a77f94beb 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void)
 	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
 	return 0;
 }
-module_init(proc_softirqs_init);
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cf86c0e8689..6f599c62f0cc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -221,4 +221,4 @@ static int __init proc_stat_init(void)
 	proc_create("stat", 0, NULL, &proc_stat_operations);
 	return 0;
 }
-module_init(proc_stat_init);
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 061894625903..7141b8d0ca9e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -49,4 +49,4 @@ static int __init proc_uptime_init(void)
 	proc_create("uptime", 0, NULL, &uptime_proc_fops);
 	return 0;
 }
-module_init(proc_uptime_init);
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a60678c..d2154eb6d78f 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -31,4 +31,4 @@ static int __init proc_version_init(void)
 	proc_create("version", 0, NULL, &version_proc_fops);
 	return 0;
 }
-module_init(proc_version_init);
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9100d6959886..88d4585b30f1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -468,17 +468,24 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		while (real_sz < max_sz) {
-			if (nhdr_ptr->n_namesz == 0)
-				break;
+		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf64_Nhdr) +
 				((nhdr_ptr->n_namesz + 3) & ~3) +
 				((nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
 			real_sz += sz;
 			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
 		}
 		kfree(notes_section);
 		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -648,17 +655,24 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		while (real_sz < max_sz) {
-			if (nhdr_ptr->n_namesz == 0)
-				break;
+		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf32_Nhdr) +
 				((nhdr_ptr->n_namesz + 3) & ~3) +
 				((nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
 			real_sz += sz;
 			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
 		}
 		kfree(notes_section);
 		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -1082,7 +1096,7 @@ static int __init vmcore_init(void)
 	proc_vmcore->size = vmcore_size;
 	return 0;
 }
-module_init(vmcore_init)
+fs_initcall(vmcore_init);
 
 /* Cleanup function for vmcore module. */
 void vmcore_cleanup(void)
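The reworked PT_NOTE walk trusts the note headers themselves rather than p_memsz: it stops at a terminating zero n_namesz and, as the fix adds, refuses to account a note that would overrun the buffer. The same logic, isolated as a userspace function over an in-memory buffer (a bounds check on the header read is added here for safety; only <elf.h> types are assumed):

    #include <elf.h>
    #include <stddef.h>

    /* Returns the byte size of the valid notes in buf; 0 if none. */
    static size_t notes_real_size(const void *buf, size_t max_sz)
    {
    	const Elf64_Nhdr *n = buf;
    	size_t real_sz = 0;

    	while (real_sz + sizeof(*n) <= max_sz && n->n_namesz != 0) {
    		size_t sz = sizeof(*n) +
    			    ((n->n_namesz + 3) & ~3UL) +	/* 4-byte aligned name */
    			    ((n->n_descsz + 3) & ~3UL);	/* 4-byte aligned desc */

    		if (real_sz + sz > max_sz)	/* corrupt entry: drop it */
    			break;
    		real_sz += sz;
    		n = (const Elf64_Nhdr *)((const char *)n + sz);
    	}
    	return real_sz;
    }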
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 439406e081af..7be26f03a3f5 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 
 	rcu_read_lock();
 	nsp = task_nsproxy(task);
-	if (!nsp) {
+	if (!nsp || !nsp->mnt_ns) {
 		rcu_read_unlock();
 		put_task_struct(task);
 		goto err;
 	}
 	ns = nsp->mnt_ns;
-	if (!ns) {
-		rcu_read_unlock();
-		put_task_struct(task);
-		goto err;
-	}
 	get_mnt_ns(ns);
 	rcu_read_unlock();
 	task_lock(task);
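Folding the two NULL tests is safe because both loads already happen under the same rcu_read_lock() section; only the duplicated unlock/put error path goes away. The shape of the check, pulled out as a helper with a hypothetical name:

    /* sketch; mirrors the consolidated check in mounts_open_common() above */
    static struct mnt_namespace *task_grab_mnt_ns(struct task_struct *task)
    {
    	struct nsproxy *nsp;
    	struct mnt_namespace *ns = NULL;

    	rcu_read_lock();
    	nsp = task_nsproxy(task);
    	if (nsp && nsp->mnt_ns) {
    		ns = nsp->mnt_ns;
    		get_mnt_ns(ns);		/* pin before leaving the RCU section */
    	}
    	rcu_read_unlock();
    	return ns;			/* NULL if the task is exiting */
    }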
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2e8caa62da78..89558810381c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -27,7 +27,6 @@
 
 static const struct super_operations qnx4_sops;
 
-static void qnx4_put_super(struct super_block *sb);
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
@@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops =
 {
 	.alloc_inode	= qnx4_alloc_inode,
 	.destroy_inode	= qnx4_destroy_inode,
-	.put_super	= qnx4_put_super,
 	.statfs		= qnx4_statfs,
 	.remount_fs	= qnx4_remount,
 };
@@ -148,18 +146,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * it really _is_ a qnx4 filesystem, and to check the size
  * of the directory entry.
  */
-static const char *qnx4_checkroot(struct super_block *sb)
+static const char *qnx4_checkroot(struct super_block *sb,
+				  struct qnx4_super_block *s)
 {
 	struct buffer_head *bh;
 	struct qnx4_inode_entry *rootdir;
 	int rd, rl;
 	int i, j;
 
-	if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
+	if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0')
 		return "no qnx4 filesystem (no root dir).";
 	QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
-	rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
-	rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+	rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1;
+	rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size);
 	for (j = 0; j < rl; j++) {
 		bh = sb_bread(sb, rd + j);	/* root dir, first block */
 		if (bh == NULL)
@@ -189,7 +188,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	struct inode *root;
 	const char *errmsg;
 	struct qnx4_sb_info *qs;
-	int ret = -EINVAL;
 
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
 	if (!qs)
@@ -198,67 +196,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 
 	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
 
+	s->s_op = &qnx4_sops;
+	s->s_magic = QNX4_SUPER_MAGIC;
+	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
+
 	/* Check the superblock signature. Since the qnx4 code is
 	   dangerous, we should leave as quickly as possible
 	   if we don't belong here... */
 	bh = sb_bread(s, 1);
 	if (!bh) {
 		printk(KERN_ERR "qnx4: unable to read the superblock\n");
-		goto outnobh;
+		return -EINVAL;
 	}
-	if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
-		if (!silent)
-			printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
-		goto out;
-	}
-	s->s_op = &qnx4_sops;
-	s->s_magic = QNX4_SUPER_MAGIC;
-	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
-	qnx4_sb(s)->sb_buf = bh;
-	qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
-
 
 	/* check before allocating dentries, inodes, .. */
-	errmsg = qnx4_checkroot(s);
+	errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
+	brelse(bh);
 	if (errmsg != NULL) {
 		if (!silent)
 			printk(KERN_ERR "qnx4: %s\n", errmsg);
-		goto out;
+		return -EINVAL;
 	}
 
 	/* does root not have inode number QNX4_ROOT_INO ?? */
 	root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "qnx4: get inode failed\n");
-		ret = PTR_ERR(root);
-		goto outb;
+		return PTR_ERR(root);
 	}
 
-	ret = -ENOMEM;
 	s->s_root = d_make_root(root);
 	if (s->s_root == NULL)
-		goto outb;
+		return -ENOMEM;
 
-	brelse(bh);
 	return 0;
-
-      outb:
-	kfree(qs->BitMap);
-      out:
-	brelse(bh);
-      outnobh:
-	kfree(qs);
-	s->s_fs_info = NULL;
-	return ret;
 }
 
-static void qnx4_put_super(struct super_block *sb)
+static void qnx4_kill_sb(struct super_block *sb)
 {
 	struct qnx4_sb_info *qs = qnx4_sb(sb);
-	kfree( qs->BitMap );
-	kfree( qs );
-	sb->s_fs_info = NULL;
-	return;
+	kill_block_super(sb);
+	if (qs) {
+		kfree(qs->BitMap);
+		kfree(qs);
+	}
 }
 
 static int qnx4_readpage(struct file *file, struct page *page)
@@ -409,7 +390,7 @@ static struct file_system_type qnx4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "qnx4",
 	.mount		= qnx4_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= qnx4_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("qnx4");
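Replacing ->put_super with a ->kill_sb wrapper is a common simplification: kill_block_super() performs the full generic teardown first, after which the private sb_info can be freed unconditionally, so every error path in fill_super can just return. The pattern in general form, with hypothetical foofs_* names standing in:

    static void foofs_kill_sb(struct super_block *sb)
    {
    	struct foofs_sb_info *sbi = sb->s_fs_info;

    	kill_block_super(sb);	/* generic teardown, also after a failed mount */
    	kfree(sbi);		/* kfree(NULL) is a no-op */
    }

    static struct file_system_type foofs_fs_type = {
    	.owner		= THIS_MODULE,
    	.name		= "foofs",
    	.mount		= foofs_mount,		/* hypothetical */
    	.kill_sb	= foofs_kill_sb,
    	.fs_flags	= FS_REQUIRES_DEV,
    };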
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 34e2d329c97e..c9b1be2c164d 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -10,8 +10,6 @@
 #endif
 
 struct qnx4_sb_info {
-	struct buffer_head	*sb_buf;	/* superblock buffer */
-	struct qnx4_super_block	*sb;		/* our superblock */
 	unsigned int		Version;	/* may be useful */
 	struct qnx4_inode_entry	*BitMap;	/* useful */
 };
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 831d49a4111f..cfc8dcc16043 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb,
 		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
-		ret = fn(dquot, priv);
-		if (ret < 0)
-			goto out;
+		/*
+		 * ->release_dquot() can be racing with us. Our reference
+		 * protects us from new calls to it so just wait for any
+		 * outstanding call and recheck the DQ_ACTIVE_B after that.
+		 */
+		wait_on_dquot(dquot);
+		if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+			ret = fn(dquot, priv);
+			if (ret < 0)
+				goto out;
+		}
 		spin_lock(&dq_list_lock);
 		/* We are safe to continue now because our dquot could not
 		 * be moved out of the inuse list while we hold the reference */
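The dquot fix is a wait-then-recheck: holding a reference keeps the dquot from being freed, but not from being deactivated by a racing ->release_dquot(), so the scan must wait the release out and re-test DQ_ACTIVE_B before calling into the filesystem. Reduced to its skeleton (illustrative fragment, not compilable on its own):

    /* inside the scan loop, dquot referenced and on the inuse list */
    wait_on_dquot(dquot);				/* let a racing release finish */
    if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {	/* still active afterwards? */
    	ret = fn(dquot, priv);			/* only then run the callback */
    	if (ret < 0)
    		goto out;
    }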
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 4884ac5ae9be..1e56a4e8cf7c 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -30,13 +30,6 @@
 
 #include "internal.h"
 
-const struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
-};
-
 const struct file_operations ramfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 8d5b438cc188..0b3d8e4cb2fa 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,13 +27,12 @@
 #include "internal.h"
 
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
-
-const struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
-};
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
 const struct file_operations ramfs_file_operations = {
 	.mmap		= ramfs_nommu_mmap,
@@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
  * - the pages to be mapped must exist
  * - the pages be physically contiguous in sequence
  */
-unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 					    unsigned long addr, unsigned long len,
 					    unsigned long pgoff, unsigned long flags)
 {
@@ -256,7 +255,7 @@ out:
 /*
  * set up a mapping for shared memory segments
  */
-int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if (!(vma->vm_flags & VM_SHARED))
 		return -ENOSYS;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..d365b1c4eb3c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -43,6 +43,13 @@
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
+static const struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
+};
+
 static struct backing_dev_info ramfs_backing_dev_info = {
 	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
@@ -275,4 +282,4 @@ int __init init_ramfs_fs(void)
 
 	return err;
 }
-module_init(init_ramfs_fs)
+fs_initcall(init_ramfs_fs);
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 6b330639b51d..a9d8ae88fa15 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,5 +10,4 @@
  */
 
 
-extern const struct address_space_operations ramfs_aops;
 extern const struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..28cc9c810744 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,22 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
+static inline struct fd fdget_pos(int fd)
+{
+	return __to_fd(__fdget_pos(fd));
+}
+
+static inline void fdput_pos(struct fd f)
+{
+	if (f.flags & FDPUT_POS_UNLOCK)
+		mutex_unlock(&f.file->f_pos_lock);
+	fdput(f);
+}
+
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -278,7 +290,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 	}
-	fdput(f);
+	fdput_pos(f);
 	return retval;
 }
 
@@ -295,7 +307,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned int, whence)
 {
 	int retval;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	loff_t offset;
 
 	if (!f.file)
@@ -315,7 +327,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		retval = 0;
 	}
 out_putf:
-	fdput(f);
+	fdput_pos(f);
 	return retval;
 }
 #endif
@@ -498,7 +510,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -506,7 +518,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 		ret = vfs_read(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 	return ret;
 }
@@ -514,7 +526,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -522,7 +534,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		ret = vfs_write(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	return ret;
@@ -797,7 +809,7 @@ EXPORT_SYMBOL(vfs_writev);
 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -805,7 +817,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_readv(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -817,7 +829,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -825,7 +837,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_writev(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -901,10 +913,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
-		goto out;
-
 	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
 					   UIO_FASTIOV, iovstack, &iov);
 	if (ret <= 0)
@@ -968,11 +976,11 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret;
 	loff_t pos;
 
@@ -982,7 +990,7 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
 	ret = compat_readv(f.file, vec, vlen, &pos);
 	if (ret >= 0)
 		f.file->f_pos = pos;
-	fdput(f);
+	fdput_pos(f);
 	return ret;
 }
 
@@ -1005,9 +1013,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_preadv64(fd, vec, vlen, pos);
@@ -1035,11 +1043,11 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 		const struct compat_iovec __user *, vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret;
 	loff_t pos;
 
@@ -1049,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
 	ret = compat_writev(f.file, vec, vlen, &pos);
 	if (ret >= 0)
 		f.file->f_pos = pos;
-	fdput(f);
+	fdput_pos(f);
 	return ret;
 }
 
@@ -1072,9 +1080,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_pwritev64(fd, vec, vlen, pos);
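All of the read/write/seek entry points above switch from fdget() to fdget_pos(), which (via __fdget_pos()) takes f_pos_lock when the struct file is shared, making the read-modify-write of the file position atomic against concurrent syscalls on the same open file. The resulting shape of such a syscall, sketched with a hypothetical do_foo() standing in for the real vfs_* call:

    SYSCALL_DEFINE3(foo, unsigned int, fd, char __user *, buf, size_t, count)
    {
    	struct fd f = fdget_pos(fd);	/* may take f.file->f_pos_lock */
    	ssize_t ret = -EBADF;

    	if (f.file) {
    		loff_t pos = f.file->f_pos;	/* stable while the lock is held */
    		ret = do_foo(f.file, buf, count, &pos);
    		if (ret >= 0)
    			f.file->f_pos = pos;
    		fdput_pos(f);		/* unlocks if fdget_pos() locked */
    	}
    	return ret;
    }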
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index f096b80e73d8..4a211f5b34b8 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size)
 
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 int reiserfs_acl_chmod(struct inode *inode);
 int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 				 struct inode *dir, struct dentry *dentry,
 				 struct inode *inode);
 int reiserfs_cache_default_acl(struct inode *dir);
-extern const struct xattr_handler reiserfs_posix_acl_default_handler;
-extern const struct xattr_handler reiserfs_posix_acl_access_handler;
 
 #else
 
 #define reiserfs_cache_default_acl(inode) 0
 #define reiserfs_get_acl NULL
+#define reiserfs_set_acl NULL
 
 static inline int reiserfs_acl_chmod(struct inode *inode)
 {
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2b7882b508db..9a3c68cf6026 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -324,23 +324,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 	switch (flag) {
 	case M_INSERT:	/* insert item into L[0] */
 
-		if (item_pos == tb->lnum[0] - 1
-		    && tb->lbytes != -1) {
+		if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 			/* part of new item falls into L[0] */
 			int new_item_len;
 			int version;
 
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0] - 1,
-					    -1);
+			ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
 
 			/* Calculate item length to insert to S[0] */
-			new_item_len =
-			    ih_item_len(ih) - tb->lbytes;
+			new_item_len = ih_item_len(ih) - tb->lbytes;
 			/* Calculate and check item length to insert to L[0] */
-			put_ih_item_len(ih,
-					ih_item_len(ih) -
-					new_item_len);
+			put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
 
 			RFALSE(ih_item_len(ih) <= 0,
 			       "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
@@ -349,30 +343,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 			/* Insert new item into L[0] */
 			buffer_info_init_left(tb, &bi);
 			leaf_insert_into_buf(&bi,
-					     n + item_pos -
-					     ret_val, ih, body,
-					     zeros_num >
-					     ih_item_len(ih) ?
-					     ih_item_len(ih) :
-					     zeros_num);
+					n + item_pos - ret_val, ih, body,
+					zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
 
 			version = ih_version(ih);
 
 			/* Calculate key component, item length and body to insert into S[0] */
-			set_le_ih_k_offset(ih,
-					   le_ih_k_offset(ih) +
-					   (tb->
-					    lbytes <<
-					    (is_indirect_le_ih
-					     (ih) ? tb->tb_sb->
-					     s_blocksize_bits -
-					     UNFM_P_SHIFT :
-					     0)));
+			set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
+					(tb-> lbytes << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
 
 			put_ih_item_len(ih, new_item_len);
 			if (tb->lbytes > zeros_num) {
-				body +=
-				    (tb->lbytes - zeros_num);
+				body += (tb->lbytes - zeros_num);
 				zeros_num = 0;
 			} else
 				zeros_num -= tb->lbytes;
@@ -383,15 +365,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 		} else {
 			/* new item in whole falls into L[0] */
 			/* Shift lnum[0]-1 items to L[0] */
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0] - 1,
-					    tb->lbytes);
+			ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
 			/* Insert new item into L[0] */
 			buffer_info_init_left(tb, &bi);
-			leaf_insert_into_buf(&bi,
-					     n + item_pos -
-					     ret_val, ih, body,
-					     zeros_num);
+			leaf_insert_into_buf(&bi, n + item_pos - ret_val, ih, body, zeros_num);
 			tb->insert_size[0] = 0;
 			zeros_num = 0;
 		}
@@ -399,264 +376,117 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 	case M_PASTE:	/* append item in L[0] */
 
-		if (item_pos == tb->lnum[0] - 1
-		    && tb->lbytes != -1) {
+		if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 			/* we must shift the part of the appended item */
-			if (is_direntry_le_ih
-			    (B_N_PITEM_HEAD(tbS0, item_pos))) {
+			if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) {
 
 				RFALSE(zeros_num,
 				       "PAP-12090: invalid parameter in case of a directory");
 				/* directory item */
 				if (tb->lbytes > pos_in_item) {
 					/* new directory entry falls into L[0] */
-					struct item_head
-					    *pasted;
-					int l_pos_in_item =
-					    pos_in_item;
+					struct item_head *pasted;
+					int l_pos_in_item = pos_in_item;
 
 					/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
-					ret_val =
-					    leaf_shift_left(tb,
-							    tb->
-							    lnum
-							    [0],
-							    tb->
-							    lbytes
-							    -
-							    1);
-					if (ret_val
-					    && !item_pos) {
-						pasted =
-						    B_N_PITEM_HEAD
-						    (tb->L[0],
-						     B_NR_ITEMS
-						     (tb->
-						      L[0]) -
-						     1);
-						l_pos_in_item +=
-						    I_ENTRY_COUNT
-						    (pasted) -
-						    (tb->
-						     lbytes -
-						     1);
+					ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes-1);
+					if (ret_val && !item_pos) {
+						pasted = B_N_PITEM_HEAD(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
+						l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes -1);
 					}
 
 					/* Append given directory entry to directory item */
 					buffer_info_init_left(tb, &bi);
-					leaf_paste_in_buffer
-					    (&bi,
-					     n + item_pos -
-					     ret_val,
-					     l_pos_in_item,
-					     tb->insert_size[0],
-					     body, zeros_num);
+					leaf_paste_in_buffer(&bi, n + item_pos - ret_val, l_pos_in_item, tb->insert_size[0], body, zeros_num);
 
 					/* previous string prepared space for pasting new entry, following string pastes this entry */
 
 					/* when we have merge directory item, pos_in_item has been changed too */
 
 					/* paste new directory entry. 1 is entry number */
-					leaf_paste_entries(&bi,
-							   n +
-							   item_pos
-							   -
-							   ret_val,
-							   l_pos_in_item,
-							   1,
-							   (struct
-							    reiserfs_de_head
-							    *)
-							   body,
-							   body
-							   +
-							   DEH_SIZE,
-							   tb->
-							   insert_size
-							   [0]
-					    );
+					leaf_paste_entries(&bi, n + item_pos - ret_val, l_pos_in_item,
+							   1, (struct reiserfs_de_head *) body,
+							   body + DEH_SIZE, tb->insert_size[0]);
 					tb->insert_size[0] = 0;
 				} else {
 					/* new directory item doesn't fall into L[0] */
 					/* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
-					leaf_shift_left(tb,
-							tb->
-							lnum[0],
-							tb->
-							lbytes);
+					leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 				}
 				/* Calculate new position to append in item body */
 				pos_in_item -= tb->lbytes;
 			} else {
 				/* regular object */
-				RFALSE(tb->lbytes <= 0,
-				       "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
-				       tb->lbytes);
-				RFALSE(pos_in_item !=
-				       ih_item_len
-				       (B_N_PITEM_HEAD
-					(tbS0, item_pos)),
+				RFALSE(tb->lbytes <= 0, "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", tb->lbytes);
+				RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
 				       "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
-				       ih_item_len
-				       (B_N_PITEM_HEAD
-					(tbS0, item_pos)),
-				       pos_in_item);
+				       ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),pos_in_item);
 
 				if (tb->lbytes >= pos_in_item) {
 					/* appended item will be in L[0] in whole */
 					int l_n;
 
 					/* this bytes number must be appended to the last item of L[h] */
-					l_n =
-					    tb->lbytes -
-					    pos_in_item;
+					l_n = tb->lbytes - pos_in_item;
 
 					/* Calculate new insert_size[0] */
-					tb->insert_size[0] -=
-					    l_n;
+					tb->insert_size[0] -= l_n;
 
-					RFALSE(tb->
-					       insert_size[0] <=
-					       0,
+					RFALSE(tb->insert_size[0] <= 0,
 					       "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
-					       tb->
-					       insert_size[0]);
-					ret_val =
-					    leaf_shift_left(tb,
-							    tb->
-							    lnum
-							    [0],
-							    ih_item_len
-							    (B_N_PITEM_HEAD
-							     (tbS0,
-							      item_pos)));
+					       tb->insert_size[0]);
+					ret_val = leaf_shift_left(tb, tb->lnum[0], ih_item_len
+							    (B_N_PITEM_HEAD(tbS0, item_pos)));
 					/* Append to body of item in L[0] */
 					buffer_info_init_left(tb, &bi);
 					leaf_paste_in_buffer
-					    (&bi,
-					     n + item_pos -
-					     ret_val,
-					     ih_item_len
-					     (B_N_PITEM_HEAD
-					      (tb->L[0],
-					       n + item_pos -
-					       ret_val)), l_n,
-					     body,
-					     zeros_num >
-					     l_n ? l_n :
-					     zeros_num);
+					    (&bi, n + item_pos - ret_val, ih_item_len
+					     (B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val)),
+					     l_n, body,
+					     zeros_num > l_n ? l_n : zeros_num);
 					/* 0-th item in S0 can be only of DIRECT type when l_n != 0 */
 					{
 						int version;
-						int temp_l =
-						    l_n;
-
-						RFALSE
-						    (ih_item_len
-						     (B_N_PITEM_HEAD
-						      (tbS0,
-						       0)),
+						int temp_l = l_n;
+
+						RFALSE(ih_item_len(B_N_PITEM_HEAD(tbS0, 0)),
 						     "PAP-12106: item length must be 0");
-						RFALSE
-						    (comp_short_le_keys
-						     (B_N_PKEY
-						      (tbS0, 0),
-						      B_N_PKEY
-						      (tb->L[0],
-						       n +
-						       item_pos
-						       -
-						       ret_val)),
+						RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY
+						     (tb->L[0], n + item_pos - ret_val)),
 						     "PAP-12107: items must be of the same file");
 						if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) {
-							temp_l =
-							    l_n
-							    <<
-							    (tb->
-							     tb_sb->
-							     s_blocksize_bits
-							     -
-							     UNFM_P_SHIFT);
+							temp_l = l_n << (tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT);
 						}
 						/* update key of first item in S0 */
-						version =
-						    ih_version
-						    (B_N_PITEM_HEAD
-						     (tbS0, 0));
-						set_le_key_k_offset
-						    (version,
-						     B_N_PKEY
-						     (tbS0, 0),
-						     le_key_k_offset
-						     (version,
-						      B_N_PKEY
-						      (tbS0,
-						       0)) +
-						     temp_l);
+						version = ih_version(B_N_PITEM_HEAD(tbS0, 0));
+						set_le_key_k_offset(version, B_N_PKEY(tbS0, 0),
+						     le_key_k_offset(version,B_N_PKEY(tbS0, 0)) + temp_l);
 						/* update left delimiting key */
-						set_le_key_k_offset
-						    (version,
-						     B_N_PDELIM_KEY
-						     (tb->
-						      CFL[0],
-						      tb->
-						      lkey[0]),
-						     le_key_k_offset
-						     (version,
-						      B_N_PDELIM_KEY
-						      (tb->
-						       CFL[0],
-						       tb->
-						       lkey[0]))
-						     + temp_l);
+						set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
+						     le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0])) + temp_l);
 					}
 
 					/* Calculate new body, position in item and insert_size[0] */
 					if (l_n > zeros_num) {
-						body +=
-						    (l_n -
-						     zeros_num);
+						body += (l_n - zeros_num);
 						zeros_num = 0;
 					} else
-						zeros_num -=
-						    l_n;
+						zeros_num -= l_n;
 					pos_in_item = 0;
 
-					RFALSE
-					    (comp_short_le_keys
-					     (B_N_PKEY(tbS0, 0),
-					      B_N_PKEY(tb->L[0],
-						       B_NR_ITEMS
-						       (tb->
-							L[0]) -
-						       1))
-					     ||
-					     !op_is_left_mergeable
-					     (B_N_PKEY(tbS0, 0),
-					      tbS0->b_size)
-					     ||
-					     !op_is_left_mergeable
-					     (B_N_PDELIM_KEY
-					      (tb->CFL[0],
-					       tb->lkey[0]),
-					      tbS0->b_size),
+					RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1))
+					     || !op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)
+					     || !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), tbS0->b_size),
 					     "PAP-12120: item must be merge-able with left neighboring item");
 				} else {	/* only part of the appended item will be in L[0] */
 
 					/* Calculate position in item for append in S[0] */
-					pos_in_item -=
-					    tb->lbytes;
+					pos_in_item -= tb->lbytes;
 
-					RFALSE(pos_in_item <= 0,
-					       "PAP-12125: no place for paste. pos_in_item=%d",
-					       pos_in_item);
+					RFALSE(pos_in_item <= 0, "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
 
 					/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-					leaf_shift_left(tb,
-							tb->
-							lnum[0],
-							tb->
-							lbytes);
+					leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 				}
 			}
 		} else {	/* appended item will be in L[0] in whole */
@@ -665,52 +495,30 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 			if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) {	/* if we paste into first item of S[0] and it is left mergable */
 				/* then increment pos_in_item by the size of the last item in L[0] */
-				pasted =
-				    B_N_PITEM_HEAD(tb->L[0],
-						   n - 1);
+				pasted = B_N_PITEM_HEAD(tb->L[0], n - 1);
 				if (is_direntry_le_ih(pasted))
-					pos_in_item +=
-					    ih_entry_count
-					    (pasted);
+					pos_in_item += ih_entry_count(pasted);
 				else
-					pos_in_item +=
-					    ih_item_len(pasted);
+					pos_in_item += ih_item_len(pasted);
 			}
 
 			/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0],
-					    tb->lbytes);
+			ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 			/* Append to body of item in L[0] */
 			buffer_info_init_left(tb, &bi);
-			leaf_paste_in_buffer(&bi,
-					     n + item_pos -
-					     ret_val,
+			leaf_paste_in_buffer(&bi, n + item_pos - ret_val,
 					     pos_in_item,
 					     tb->insert_size[0],
 					     body, zeros_num);
 
 			/* if appended item is directory, paste entry */
-			pasted =
-			    B_N_PITEM_HEAD(tb->L[0],
-					   n + item_pos -
-					   ret_val);
+			pasted = B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val);
 			if (is_direntry_le_ih(pasted))
-				leaf_paste_entries(&bi,
-						   n +
-						   item_pos -
-						   ret_val,
-						   pos_in_item,
-						   1,
-						   (struct
-						    reiserfs_de_head
-						    *)body,
-						   body +
-						   DEH_SIZE,
-						   tb->
-						   insert_size
-						   [0]
-				    );
+				leaf_paste_entries(&bi, n + item_pos - ret_val,
+						   pos_in_item, 1,
+						   (struct reiserfs_de_head *) body,
+						   body + DEH_SIZE,
+						   tb->insert_size[0]);
 			/* if appended item is indirect item, put unformatted node into un list */
 			if (is_indirect_le_ih(pasted))
 				set_ih_free_space(pasted, 0);
@@ -722,13 +530,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 			reiserfs_panic(tb->tb_sb, "PAP-12130",
 				       "lnum > 0: unexpected mode: "
 				       " %s(%d)",
-				       (flag ==
-					M_DELETE) ? "DELETE" : ((flag ==
-								 M_CUT)
-								? "CUT"
-								:
-								"UNKNOWN"),
-				       flag);
+				       (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
 		}
 	} else {
 		/* new item doesn't fall into L[0] */
@@ -748,14 +550,12 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 	case M_INSERT:	/* insert item */
 		if (n - tb->rnum[0] < item_pos) {	/* new item or its part falls to R[0] */
 			if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {	/* part of new item falls into R[0] */
-				loff_t old_key_comp, old_len,
-				    r_zeros_number;
+				loff_t old_key_comp, old_len, r_zeros_number;
 				const char *r_body;
 				int version;
 				loff_t offset;
 
-				leaf_shift_right(tb, tb->rnum[0] - 1,
-						 -1);
+				leaf_shift_right(tb, tb->rnum[0] - 1, -1);
 
 				version = ih_version(ih);
 				/* Remember key component and item length */
@@ -763,29 +563,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 				old_len = ih_item_len(ih);
 
 				/* Calculate key component and item length to insert into R[0] */
-				offset =
-				    le_ih_k_offset(ih) +
-				    ((old_len -
-				      tb->
-				      rbytes) << (is_indirect_le_ih(ih)
-						  ? tb->tb_sb->
-						  s_blocksize_bits -
-						  UNFM_P_SHIFT : 0));
+				offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0));
 				set_le_ih_k_offset(ih, offset);
 				put_ih_item_len(ih, tb->rbytes);
 				/* Insert part of the item into R[0] */
 				buffer_info_init_right(tb, &bi);
 				if ((old_len - tb->rbytes) > zeros_num) {
 					r_zeros_number = 0;
-					r_body =
-					    body + (old_len -
-						    tb->rbytes) -
-					    zeros_num;
+					r_body = body + (old_len - tb->rbytes) - zeros_num;
 				} else {
 					r_body = body;
-					r_zeros_number =
-					    zeros_num - (old_len -
-							 tb->rbytes);
+					r_zeros_number = zeros_num - (old_len - tb->rbytes);
 					zeros_num -= r_zeros_number;
 				}
 
@@ -798,25 +586,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 				/* Calculate key component and item length to insert into S[0] */
 				set_le_ih_k_offset(ih, old_key_comp);
-				put_ih_item_len(ih,
-						old_len - tb->rbytes);
+				put_ih_item_len(ih, old_len - tb->rbytes);
 
 				tb->insert_size[0] -= tb->rbytes;
 
 			} else {	/* whole new item falls into R[0] */
 
 				/* Shift rnum[0]-1 items to R[0] */
-				ret_val =
-				    leaf_shift_right(tb,
-						     tb->rnum[0] - 1,
-						     tb->rbytes);
+				ret_val = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
 				/* Insert new item into R[0] */
 				buffer_info_init_right(tb, &bi);
-				leaf_insert_into_buf(&bi,
-						     item_pos - n +
-						     tb->rnum[0] - 1,
-						     ih, body,
-						     zeros_num);
+				leaf_insert_into_buf(&bi, item_pos - n + tb->rnum[0] - 1,
+						     ih, body, zeros_num);
 
 				if (item_pos - n + tb->rnum[0] - 1 == 0) {
 					replace_key(tb, tb->CFR[0],
@@ -841,200 +622,97 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 				RFALSE(zeros_num,
 				       "PAP-12145: invalid parameter in case of a directory");
-				entry_count =
-				    I_ENTRY_COUNT(B_N_PITEM_HEAD
-						  (tbS0,
-						   item_pos));
+				entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD
+						  (tbS0, item_pos));
 				if (entry_count - tb->rbytes <
 				    pos_in_item)
 					/* new directory entry falls into R[0] */
 				{
 					int paste_entry_position;
 
-					RFALSE(tb->rbytes - 1 >=
-					       entry_count
-					       || !tb->
-					       insert_size[0],
+					RFALSE(tb->rbytes - 1 >= entry_count || !tb-> insert_size[0],
 					       "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
-					       tb->rbytes,
-					       entry_count);
+					       tb->rbytes, entry_count);
 					/* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
-					leaf_shift_right(tb,
-							 tb->
-							 rnum
-							 [0],
-							 tb->
-							 rbytes
-							 - 1);
+					leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
 					/* Paste given directory entry to directory item */
-					paste_entry_position =
-					    pos_in_item -
-					    entry_count +
-					    tb->rbytes - 1;
+					paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
 					buffer_info_init_right(tb, &bi);
-					leaf_paste_in_buffer
-					    (&bi, 0,
-					     paste_entry_position,
-					     tb->insert_size[0],
-					     body, zeros_num);
+					leaf_paste_in_buffer(&bi, 0, paste_entry_position, tb->insert_size[0], body, zeros_num);
 					/* paste entry */
-					leaf_paste_entries(&bi,
-							   0,
-							   paste_entry_position,
-							   1,
-							   (struct
-							    reiserfs_de_head
-							    *)
-							   body,
-							   body
-							   +
-							   DEH_SIZE,
-							   tb->
-							   insert_size
-							   [0]
-					    );
-
-					if (paste_entry_position
-					    == 0) {
+					leaf_paste_entries(&bi, 0, paste_entry_position, 1,
+							   (struct reiserfs_de_head *) body,
+							   body + DEH_SIZE, tb->insert_size[0]);
+
+					if (paste_entry_position == 0) {
 						/* change delimiting keys */
-						replace_key(tb,
-							    tb->
-							    CFR
-							    [0],
-							    tb->
-							    rkey
-							    [0],
-							    tb->
-							    R
-							    [0],
-							    0);
+						replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0],0);
 					}
 
 					tb->insert_size[0] = 0;
 					pos_in_item++;
 				} else {	/* new directory entry doesn't fall into R[0] */
 
-					leaf_shift_right(tb,
-							 tb->
-							 rnum
-							 [0],
-							 tb->
-							 rbytes);
+					leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
 				}
 			} else {	/* regular object */
 
-				int n_shift, n_rem,
-				    r_zeros_number;
+				int n_shift, n_rem, r_zeros_number;
 				const char *r_body;
 
 				/* Calculate number of bytes which must be shifted from appended item */
931 if ((n_shift = 664 if ((n_shift = tb->rbytes - tb->insert_size[0]) < 0)
932 tb->rbytes -
933 tb->insert_size[0]) < 0)
934 n_shift = 0; 665 n_shift = 0;
935 666
936 RFALSE(pos_in_item != 667 RFALSE(pos_in_item != ih_item_len
937 ih_item_len 668 (B_N_PITEM_HEAD(tbS0, item_pos)),
938 (B_N_PITEM_HEAD
939 (tbS0, item_pos)),
940 "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d", 669 "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
941 pos_in_item, 670 pos_in_item, ih_item_len
942 ih_item_len 671 (B_N_PITEM_HEAD(tbS0, item_pos)));
943 (B_N_PITEM_HEAD 672
944 (tbS0, item_pos))); 673 leaf_shift_right(tb, tb->rnum[0], n_shift);
945
946 leaf_shift_right(tb,
947 tb->rnum[0],
948 n_shift);
949 /* Calculate number of bytes which must remain in body after appending to R[0] */ 674 /* Calculate number of bytes which must remain in body after appending to R[0] */
950 if ((n_rem = 675 if ((n_rem = tb->insert_size[0] - tb->rbytes) < 0)
951 tb->insert_size[0] -
952 tb->rbytes) < 0)
953 n_rem = 0; 676 n_rem = 0;
954 677
955 { 678 {
956 int version; 679 int version;
957 unsigned long temp_rem = 680 unsigned long temp_rem = n_rem;
958 n_rem; 681
959 682 version = ih_version(B_N_PITEM_HEAD(tb->R[0], 0));
960 version = 683 if (is_indirect_le_key(version, B_N_PKEY(tb->R[0], 0))) {
961 ih_version 684 temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
962 (B_N_PITEM_HEAD
963 (tb->R[0], 0));
964 if (is_indirect_le_key
965 (version,
966 B_N_PKEY(tb->R[0],
967 0))) {
968 temp_rem =
969 n_rem <<
970 (tb->tb_sb->
971 s_blocksize_bits
972 -
973 UNFM_P_SHIFT);
974 } 685 }
975 set_le_key_k_offset 686 set_le_key_k_offset(version, B_N_PKEY(tb->R[0], 0),
976 (version, 687 le_key_k_offset(version, B_N_PKEY(tb->R[0], 0)) + temp_rem);
977 B_N_PKEY(tb->R[0], 688 set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]),
978 0), 689 le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])) + temp_rem);
979 le_key_k_offset
980 (version,
981 B_N_PKEY(tb->R[0],
982 0)) +
983 temp_rem);
984 set_le_key_k_offset
985 (version,
986 B_N_PDELIM_KEY(tb->
987 CFR
988 [0],
989 tb->
990 rkey
991 [0]),
992 le_key_k_offset
993 (version,
994 B_N_PDELIM_KEY
995 (tb->CFR[0],
996 tb->rkey[0])) +
997 temp_rem);
998 } 690 }
999/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; 691/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
1000 k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ 692 k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
1001 do_balance_mark_internal_dirty 693 do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
1002 (tb, tb->CFR[0], 0);
1003 694
1004 /* Append part of body into R[0] */ 695 /* Append part of body into R[0] */
1005 buffer_info_init_right(tb, &bi); 696 buffer_info_init_right(tb, &bi);
1006 if (n_rem > zeros_num) { 697 if (n_rem > zeros_num) {
1007 r_zeros_number = 0; 698 r_zeros_number = 0;
1008 r_body = 699 r_body = body + n_rem - zeros_num;
1009 body + n_rem -
1010 zeros_num;
1011 } else { 700 } else {
1012 r_body = body; 701 r_body = body;
1013 r_zeros_number = 702 r_zeros_number = zeros_num - n_rem;
1014 zeros_num - n_rem; 703 zeros_num -= r_zeros_number;
1015 zeros_num -=
1016 r_zeros_number;
1017 } 704 }
1018 705
1019 leaf_paste_in_buffer(&bi, 0, 706 leaf_paste_in_buffer(&bi, 0, n_shift,
1020 n_shift, 707 tb->insert_size[0] - n_rem,
1021 tb-> 708 r_body, r_zeros_number);
1022 insert_size 709
1023 [0] - 710 if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->R[0], 0))) {
1024 n_rem,
1025 r_body,
1026 r_zeros_number);
1027
1028 if (is_indirect_le_ih
1029 (B_N_PITEM_HEAD
1030 (tb->R[0], 0))) {
1031#if 0 711#if 0
1032 RFALSE(n_rem, 712 RFALSE(n_rem,
1033 "PAP-12160: paste more than one unformatted node pointer"); 713 "PAP-12160: paste more than one unformatted node pointer");
1034#endif 714#endif
1035 set_ih_free_space 715 set_ih_free_space(B_N_PITEM_HEAD(tb->R[0], 0), 0);
1036 (B_N_PITEM_HEAD
1037 (tb->R[0], 0), 0);
1038 } 716 }
1039 tb->insert_size[0] = n_rem; 717 tb->insert_size[0] = n_rem;
1040 if (!n_rem) 718 if (!n_rem)
@@ -1044,58 +722,28 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1044 722
1045 struct item_head *pasted; 723 struct item_head *pasted;
1046 724
1047 ret_val = 725 ret_val = leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
1048 leaf_shift_right(tb, tb->rnum[0],
1049 tb->rbytes);
1050 /* append item in R[0] */ 726 /* append item in R[0] */
1051 if (pos_in_item >= 0) { 727 if (pos_in_item >= 0) {
1052 buffer_info_init_right(tb, &bi); 728 buffer_info_init_right(tb, &bi);
1053 leaf_paste_in_buffer(&bi, 729 leaf_paste_in_buffer(&bi, item_pos - n + tb->rnum[0], pos_in_item,
1054 item_pos - 730 tb->insert_size[0], body, zeros_num);
1055 n +
1056 tb->
1057 rnum[0],
1058 pos_in_item,
1059 tb->
1060 insert_size
1061 [0], body,
1062 zeros_num);
1063 } 731 }
1064 732
1065 /* paste new entry, if item is directory item */ 733 /* paste new entry, if item is directory item */
1066 pasted = 734 pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
1067 B_N_PITEM_HEAD(tb->R[0], 735 if (is_direntry_le_ih(pasted) && pos_in_item >= 0) {
1068 item_pos - n + 736 leaf_paste_entries(&bi, item_pos - n + tb->rnum[0],
1069 tb->rnum[0]); 737 pos_in_item, 1,
1070 if (is_direntry_le_ih(pasted) 738 (struct reiserfs_de_head *) body,
1071 && pos_in_item >= 0) { 739 body + DEH_SIZE, tb->insert_size[0]);
1072 leaf_paste_entries(&bi,
1073 item_pos -
1074 n +
1075 tb->rnum[0],
1076 pos_in_item,
1077 1,
1078 (struct
1079 reiserfs_de_head
1080 *)body,
1081 body +
1082 DEH_SIZE,
1083 tb->
1084 insert_size
1085 [0]
1086 );
1087 if (!pos_in_item) { 740 if (!pos_in_item) {
1088 741
1089 RFALSE(item_pos - n + 742 RFALSE(item_pos - n + tb->rnum[0],
1090 tb->rnum[0],
1091 "PAP-12165: directory item must be first item of node when pasting is in 0th position"); 743 "PAP-12165: directory item must be first item of node when pasting is in 0th position");
1092 744
1093 /* update delimiting keys */ 745 /* update delimiting keys */
1094 replace_key(tb, 746 replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
1095 tb->CFR[0],
1096 tb->rkey[0],
1097 tb->R[0],
1098 0);
1099 } 747 }
1100 } 748 }
1101 749
@@ -1111,22 +759,16 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1111 default: /* cases d and t */ 759 default: /* cases d and t */
1112 reiserfs_panic(tb->tb_sb, "PAP-12175", 760 reiserfs_panic(tb->tb_sb, "PAP-12175",
1113 "rnum > 0: unexpected mode: %s(%d)", 761 "rnum > 0: unexpected mode: %s(%d)",
1114 (flag == 762 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
1115 M_DELETE) ? "DELETE" : ((flag ==
1116 M_CUT) ? "CUT"
1117 : "UNKNOWN"),
1118 flag);
1119 } 763 }
1120 764
1121 } 765 }
1122 766
1123 /* tb->rnum[0] > 0 */ 767 /* tb->rnum[0] > 0 */
1124 RFALSE(tb->blknum[0] > 3, 768 RFALSE(tb->blknum[0] > 3,
1125 "PAP-12180: blknum can not be %d. It must be <= 3", 769 "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
1126 tb->blknum[0]);
1127 RFALSE(tb->blknum[0] < 0, 770 RFALSE(tb->blknum[0] < 0,
1128 "PAP-12185: blknum can not be %d. It must be >= 0", 771 "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
1129 tb->blknum[0]);
1130 772
1131 /* if while adding to a node we discover that it is possible to split 773 /* if while adding to a node we discover that it is possible to split
1132 it in two, and merge the left part into the left neighbor and the 774 it in two, and merge the left part into the left neighbor and the
@@ -1177,8 +819,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1177 819
1178 if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ 820 if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */
1179 if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ 821 if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */
1180 int old_key_comp, old_len, 822 int old_key_comp, old_len, r_zeros_number;
1181 r_zeros_number;
1182 const char *r_body; 823 const char *r_body;
1183 int version; 824 int version;
1184 825
@@ -1192,15 +833,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1192 old_len = ih_item_len(ih); 833 old_len = ih_item_len(ih);
1193 834
1194 /* Calculate key component and item length to insert into S_new[i] */ 835 /* Calculate key component and item length to insert into S_new[i] */
1195 set_le_ih_k_offset(ih, 836 set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
1196 le_ih_k_offset(ih) + 837 ((old_len - sbytes[i]) << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
1197 ((old_len -
1198 sbytes[i]) <<
1199 (is_indirect_le_ih
1200 (ih) ? tb->tb_sb->
1201 s_blocksize_bits -
1202 UNFM_P_SHIFT :
1203 0)));
1204 838
1205 put_ih_item_len(ih, sbytes[i]); 839 put_ih_item_len(ih, sbytes[i]);
1206 840
@@ -1209,39 +843,29 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1209 843
1210 if ((old_len - sbytes[i]) > zeros_num) { 844 if ((old_len - sbytes[i]) > zeros_num) {
1211 r_zeros_number = 0; 845 r_zeros_number = 0;
1212 r_body = 846 r_body = body + (old_len - sbytes[i]) - zeros_num;
1213 body + (old_len -
1214 sbytes[i]) -
1215 zeros_num;
1216 } else { 847 } else {
1217 r_body = body; 848 r_body = body;
1218 r_zeros_number = 849 r_zeros_number = zeros_num - (old_len - sbytes[i]);
1219 zeros_num - (old_len -
1220 sbytes[i]);
1221 zeros_num -= r_zeros_number; 850 zeros_num -= r_zeros_number;
1222 } 851 }
1223 852
1224 leaf_insert_into_buf(&bi, 0, ih, r_body, 853 leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeros_number);
1225 r_zeros_number);
1226 854
1227 /* Calculate key component and item length to insert into S[i] */ 855 /* Calculate key component and item length to insert into S[i] */
1228 set_le_ih_k_offset(ih, old_key_comp); 856 set_le_ih_k_offset(ih, old_key_comp);
1229 put_ih_item_len(ih, 857 put_ih_item_len(ih, old_len - sbytes[i]);
1230 old_len - sbytes[i]);
1231 tb->insert_size[0] -= sbytes[i]; 858 tb->insert_size[0] -= sbytes[i];
1232 } else { /* whole new item falls into S_new[i] */ 859 } else { /* whole new item falls into S_new[i] */
1233 860
1234 /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ 861 /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
1235 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, 862 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
1236 snum[i] - 1, sbytes[i], 863 snum[i] - 1, sbytes[i], S_new[i]);
1237 S_new[i]);
1238 864
1239 /* Insert new item into S_new[i] */ 865 /* Insert new item into S_new[i] */
1240 buffer_info_init_bh(tb, &bi, S_new[i]); 866 buffer_info_init_bh(tb, &bi, S_new[i]);
1241 leaf_insert_into_buf(&bi, 867 leaf_insert_into_buf(&bi, item_pos - n + snum[i] - 1,
1242 item_pos - n + 868 ih, body, zeros_num);
1243 snum[i] - 1, ih,
1244 body, zeros_num);
1245 869
1246 zeros_num = tb->insert_size[0] = 0; 870 zeros_num = tb->insert_size[0] = 0;
1247 } 871 }
@@ -1268,150 +892,73 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1268 892
1269 int entry_count; 893 int entry_count;
1270 894
1271 entry_count = 895 entry_count = ih_entry_count(aux_ih);
1272 ih_entry_count(aux_ih);
1273 896
1274 if (entry_count - sbytes[i] < 897 if (entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count) {
1275 pos_in_item
1276 && pos_in_item <=
1277 entry_count) {
1278 /* new directory entry falls into S_new[i] */ 898 /* new directory entry falls into S_new[i] */
1279 899
1280 RFALSE(!tb-> 900 RFALSE(!tb->insert_size[0], "PAP-12215: insert_size is already 0");
1281 insert_size[0], 901 RFALSE(sbytes[i] - 1 >= entry_count,
1282 "PAP-12215: insert_size is already 0");
1283 RFALSE(sbytes[i] - 1 >=
1284 entry_count,
1285 "PAP-12220: there are no so much entries (%d), only %d", 902 "PAP-12220: there are no so much entries (%d), only %d",
1286 sbytes[i] - 1, 903 sbytes[i] - 1, entry_count);
1287 entry_count);
1288 904
1289 /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ 905 /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
1290 leaf_move_items 906 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i] - 1, S_new[i]);
1291 (LEAF_FROM_S_TO_SNEW,
1292 tb, snum[i],
1293 sbytes[i] - 1,
1294 S_new[i]);
1295 /* Paste given directory entry to directory item */ 907 /* Paste given directory entry to directory item */
1296 buffer_info_init_bh(tb, &bi, S_new[i]); 908 buffer_info_init_bh(tb, &bi, S_new[i]);
1297 leaf_paste_in_buffer 909 leaf_paste_in_buffer(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
1298 (&bi, 0, 910 tb->insert_size[0], body, zeros_num);
1299 pos_in_item -
1300 entry_count +
1301 sbytes[i] - 1,
1302 tb->insert_size[0],
1303 body, zeros_num);
1304 /* paste new directory entry */ 911 /* paste new directory entry */
1305 leaf_paste_entries(&bi, 912 leaf_paste_entries(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, 1,
1306 0, 913 (struct reiserfs_de_head *) body,
1307 pos_in_item 914 body + DEH_SIZE, tb->insert_size[0]);
1308 -
1309 entry_count
1310 +
1311 sbytes
1312 [i] -
1313 1, 1,
1314 (struct
1315 reiserfs_de_head
1316 *)
1317 body,
1318 body
1319 +
1320 DEH_SIZE,
1321 tb->
1322 insert_size
1323 [0]
1324 );
1325 tb->insert_size[0] = 0; 915 tb->insert_size[0] = 0;
1326 pos_in_item++; 916 pos_in_item++;
1327 } else { /* new directory entry doesn't fall into S_new[i] */ 917 } else { /* new directory entry doesn't fall into S_new[i] */
1328 leaf_move_items 918 leaf_move_items(LEAF_FROM_S_TO_SNEW,tb, snum[i], sbytes[i], S_new[i]);
1329 (LEAF_FROM_S_TO_SNEW,
1330 tb, snum[i],
1331 sbytes[i],
1332 S_new[i]);
1333 } 919 }
1334 } else { /* regular object */ 920 } else { /* regular object */
1335 921
1336 int n_shift, n_rem, 922 int n_shift, n_rem, r_zeros_number;
1337 r_zeros_number;
1338 const char *r_body; 923 const char *r_body;
1339 924
1340 RFALSE(pos_in_item != 925 RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)) || tb->insert_size[0] <= 0,
1341 ih_item_len
1342 (B_N_PITEM_HEAD
1343 (tbS0, item_pos))
1344 || tb->insert_size[0] <=
1345 0,
1346 "PAP-12225: item too short or insert_size <= 0"); 926 "PAP-12225: item too short or insert_size <= 0");
1347 927
1348 /* Calculate number of bytes which must be shifted from appended item */ 928 /* Calculate number of bytes which must be shifted from appended item */
1349 n_shift = 929 n_shift = sbytes[i] - tb->insert_size[0];
1350 sbytes[i] -
1351 tb->insert_size[0];
1352 if (n_shift < 0) 930 if (n_shift < 0)
1353 n_shift = 0; 931 n_shift = 0;
1354 leaf_move_items 932 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
1355 (LEAF_FROM_S_TO_SNEW, tb,
1356 snum[i], n_shift,
1357 S_new[i]);
1358 933
1359 /* Calculate number of bytes which must remain in body after append to S_new[i] */ 934 /* Calculate number of bytes which must remain in body after append to S_new[i] */
1360 n_rem = 935 n_rem = tb->insert_size[0] - sbytes[i];
1361 tb->insert_size[0] -
1362 sbytes[i];
1363 if (n_rem < 0) 936 if (n_rem < 0)
1364 n_rem = 0; 937 n_rem = 0;
1365 /* Append part of body into S_new[0] */ 938 /* Append part of body into S_new[0] */
1366 buffer_info_init_bh(tb, &bi, S_new[i]); 939 buffer_info_init_bh(tb, &bi, S_new[i]);
1367 if (n_rem > zeros_num) { 940 if (n_rem > zeros_num) {
1368 r_zeros_number = 0; 941 r_zeros_number = 0;
1369 r_body = 942 r_body = body + n_rem - zeros_num;
1370 body + n_rem -
1371 zeros_num;
1372 } else { 943 } else {
1373 r_body = body; 944 r_body = body;
1374 r_zeros_number = 945 r_zeros_number = zeros_num - n_rem;
1375 zeros_num - n_rem; 946 zeros_num -= r_zeros_number;
1376 zeros_num -=
1377 r_zeros_number;
1378 } 947 }
1379 948
1380 leaf_paste_in_buffer(&bi, 0, 949 leaf_paste_in_buffer(&bi, 0, n_shift,
1381 n_shift, 950 tb->insert_size[0] - n_rem,
1382 tb-> 951 r_body, r_zeros_number);
1383 insert_size
1384 [0] -
1385 n_rem,
1386 r_body,
1387 r_zeros_number);
1388 { 952 {
1389 struct item_head *tmp; 953 struct item_head *tmp;
1390 954
1391 tmp = 955 tmp = B_N_PITEM_HEAD(S_new[i], 0);
1392 B_N_PITEM_HEAD(S_new
1393 [i],
1394 0);
1395 if (is_indirect_le_ih 956 if (is_indirect_le_ih
1396 (tmp)) { 957 (tmp)) {
1397 set_ih_free_space 958 set_ih_free_space(tmp, 0);
1398 (tmp, 0); 959 set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
1399 set_le_ih_k_offset
1400 (tmp,
1401 le_ih_k_offset
1402 (tmp) +
1403 (n_rem <<
1404 (tb->
1405 tb_sb->
1406 s_blocksize_bits
1407 -
1408 UNFM_P_SHIFT)));
1409 } else { 960 } else {
1410 set_le_ih_k_offset 961 set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + n_rem);
1411 (tmp,
1412 le_ih_k_offset
1413 (tmp) +
1414 n_rem);
1415 } 962 }
1416 } 963 }
1417 964
@@ -1426,8 +973,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1426 struct item_head *pasted; 973 struct item_head *pasted;
1427 974
1428#ifdef CONFIG_REISERFS_CHECK 975#ifdef CONFIG_REISERFS_CHECK
1429 struct item_head *ih_check = 976 struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos);
1430 B_N_PITEM_HEAD(tbS0, item_pos);
1431 977
1432 if (!is_direntry_le_ih(ih_check) 978 if (!is_direntry_le_ih(ih_check)
1433 && (pos_in_item != ih_item_len(ih_check) 979 && (pos_in_item != ih_item_len(ih_check)
@@ -1439,8 +985,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1439 "to ih_item_len"); 985 "to ih_item_len");
1440#endif /* CONFIG_REISERFS_CHECK */ 986#endif /* CONFIG_REISERFS_CHECK */
1441 987
1442 leaf_mi = 988 leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW,
1443 leaf_move_items(LEAF_FROM_S_TO_SNEW,
1444 tb, snum[i], 989 tb, snum[i],
1445 sbytes[i], 990 sbytes[i],
1446 S_new[i]); 991 S_new[i]);
@@ -1452,30 +997,19 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1452 /* paste into item */ 997 /* paste into item */
1453 buffer_info_init_bh(tb, &bi, S_new[i]); 998 buffer_info_init_bh(tb, &bi, S_new[i]);
1454 leaf_paste_in_buffer(&bi, 999 leaf_paste_in_buffer(&bi,
1455 item_pos - n + 1000 item_pos - n + snum[i],
1456 snum[i],
1457 pos_in_item, 1001 pos_in_item,
1458 tb->insert_size[0], 1002 tb->insert_size[0],
1459 body, zeros_num); 1003 body, zeros_num);
1460 1004
1461 pasted = 1005 pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
1462 B_N_PITEM_HEAD(S_new[i],
1463 item_pos - n +
1464 snum[i]);
1465 if (is_direntry_le_ih(pasted)) { 1006 if (is_direntry_le_ih(pasted)) {
1466 leaf_paste_entries(&bi, 1007 leaf_paste_entries(&bi,
1467 item_pos - 1008 item_pos - n + snum[i],
1468 n + snum[i], 1009 pos_in_item, 1,
1469 pos_in_item, 1010 (struct reiserfs_de_head *)body,
1470 1, 1011 body + DEH_SIZE,
1471 (struct 1012 tb->insert_size[0]
1472 reiserfs_de_head
1473 *)body,
1474 body +
1475 DEH_SIZE,
1476 tb->
1477 insert_size
1478 [0]
1479 ); 1013 );
1480 } 1014 }
1481 1015
@@ -1495,11 +1029,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1495 default: /* cases d and t */ 1029 default: /* cases d and t */
1496 reiserfs_panic(tb->tb_sb, "PAP-12245", 1030 reiserfs_panic(tb->tb_sb, "PAP-12245",
1497 "blknum > 2: unexpected mode: %s(%d)", 1031 "blknum > 2: unexpected mode: %s(%d)",
1498 (flag == 1032 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
1499 M_DELETE) ? "DELETE" : ((flag ==
1500 M_CUT) ? "CUT"
1501 : "UNKNOWN"),
1502 flag);
1503 } 1033 }
1504 1034
1505 memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); 1035 memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE);
@@ -1524,9 +1054,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1524 /* If we insert the first key change the delimiting key */ 1054 /* If we insert the first key change the delimiting key */
1525 if (item_pos == 0) { 1055 if (item_pos == 0) {
1526 if (tb->CFL[0]) /* can be 0 in reiserfsck */ 1056 if (tb->CFL[0]) /* can be 0 in reiserfsck */
1527 replace_key(tb, tb->CFL[0], tb->lkey[0], 1057 replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
1528 tbS0, 0);
1529
1530 } 1058 }
1531 break; 1059 break;
1532 1060
@@ -1536,53 +1064,27 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1536 pasted = B_N_PITEM_HEAD(tbS0, item_pos); 1064 pasted = B_N_PITEM_HEAD(tbS0, item_pos);
1537 /* when directory, may be new entry already pasted */ 1065 /* when directory, may be new entry already pasted */
1538 if (is_direntry_le_ih(pasted)) { 1066 if (is_direntry_le_ih(pasted)) {
1539 if (pos_in_item >= 0 && 1067 if (pos_in_item >= 0 && pos_in_item <= ih_entry_count(pasted)) {
1540 pos_in_item <=
1541 ih_entry_count(pasted)) {
1542 1068
1543 RFALSE(!tb->insert_size[0], 1069 RFALSE(!tb->insert_size[0],
1544 "PAP-12260: insert_size is 0 already"); 1070 "PAP-12260: insert_size is 0 already");
1545 1071
1546 /* prepare space */ 1072 /* prepare space */
1547 buffer_info_init_tbS0(tb, &bi); 1073 buffer_info_init_tbS0(tb, &bi);
1548 leaf_paste_in_buffer(&bi, 1074 leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
1549 item_pos, 1075 tb->insert_size[0], body,
1550 pos_in_item,
1551 tb->
1552 insert_size
1553 [0], body,
1554 zeros_num); 1076 zeros_num);
1555 1077
1556 /* paste entry */ 1078 /* paste entry */
1557 leaf_paste_entries(&bi, 1079 leaf_paste_entries(&bi, item_pos, pos_in_item, 1,
1558 item_pos, 1080 (struct reiserfs_de_head *)body,
1559 pos_in_item, 1081 body + DEH_SIZE,
1560 1, 1082 tb->insert_size[0]);
1561 (struct
1562 reiserfs_de_head
1563 *)body,
1564 body +
1565 DEH_SIZE,
1566 tb->
1567 insert_size
1568 [0]
1569 );
1570 if (!item_pos && !pos_in_item) { 1083 if (!item_pos && !pos_in_item) {
1571 RFALSE(!tb->CFL[0] 1084 RFALSE(!tb->CFL[0] || !tb->L[0],
1572 || !tb->L[0],
1573 "PAP-12270: CFL[0]/L[0] must be specified"); 1085 "PAP-12270: CFL[0]/L[0] must be specified");
1574 if (tb->CFL[0]) { 1086 if (tb->CFL[0])
1575 replace_key(tb, 1087 replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
1576 tb->
1577 CFL
1578 [0],
1579 tb->
1580 lkey
1581 [0],
1582 tbS0,
1583 0);
1584
1585 }
1586 } 1088 }
1587 tb->insert_size[0] = 0; 1089 tb->insert_size[0] = 0;
1588 } 1090 }
@@ -1593,13 +1095,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1593 "PAP-12275: insert size must not be %d", 1095 "PAP-12275: insert size must not be %d",
1594 tb->insert_size[0]); 1096 tb->insert_size[0]);
1595 buffer_info_init_tbS0(tb, &bi); 1097 buffer_info_init_tbS0(tb, &bi);
1596 leaf_paste_in_buffer(&bi, 1098 leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
1597 item_pos, 1099 tb->insert_size[0], body, zeros_num);
1598 pos_in_item,
1599 tb->
1600 insert_size
1601 [0], body,
1602 zeros_num);
1603 1100
1604 if (is_indirect_le_ih(pasted)) { 1101 if (is_indirect_le_ih(pasted)) {
1605#if 0 1102#if 0
@@ -1611,8 +1108,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1611 tb-> 1108 tb->
1612 insert_size[0]); 1109 insert_size[0]);
1613#endif 1110#endif
1614 set_ih_free_space 1111 set_ih_free_space(pasted, 0);
1615 (pasted, 0);
1616 } 1112 }
1617 tb->insert_size[0] = 0; 1113 tb->insert_size[0] = 0;
1618 } 1114 }
@@ -1620,8 +1116,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1620 else { 1116 else {
1621 if (tb->insert_size[0]) { 1117 if (tb->insert_size[0]) {
1622 print_cur_tb("12285"); 1118 print_cur_tb("12285");
1623 reiserfs_panic(tb-> 1119 reiserfs_panic(tb->tb_sb,
1624 tb_sb,
1625 "PAP-12285", 1120 "PAP-12285",
1626 "insert_size " 1121 "insert_size "
1627 "must be 0 " 1122 "must be 0 "
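A note on the balance_leaf() hunks above: the expression that keeps reappearing, offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0)), converts a count of item-body bytes into a key-offset delta. For an indirect item every 4-byte unformatted-node pointer addresses one whole block, so byte counts scale by blocksize / 4 (a shift by s_blocksize_bits - UNFM_P_SHIFT, where UNFM_P_SHIFT is 2); direct items map one-to-one. A standalone sketch of that calculation (illustrative userspace C, not kernel code; blocksize_bits chosen for a 4K-block filesystem):

#include <stdio.h>

/* Sketch: how many key-offset bytes a run of `len_delta` item-body
 * bytes represents, for direct vs. indirect reiserfs items.
 * UNFM_P_SHIFT is log2(size of an unformatted-node pointer) == 2. */
#define UNFM_P_SHIFT 2

static unsigned long offset_delta(unsigned long len_delta,
                                  int is_indirect,
                                  int blocksize_bits)
{
        return len_delta << (is_indirect ? blocksize_bits - UNFM_P_SHIFT : 0);
}

int main(void)
{
        /* 12 bytes of an indirect item == 3 block pointers == 3 blocks. */
        printf("%lu\n", offset_delta(12, 1, 12));   /* 12288 on 4K blocks */
        printf("%lu\n", offset_delta(12, 0, 12));   /* 12: direct maps 1:1 */
        return 0;
}

So splitting 12 bytes off an indirect item on a 4K-block filesystem moves the right neighbor's first key forward by 12 KiB, which is what the set_le_ih_k_offset()/set_le_key_k_offset() calls in these hunks implement.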
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index dcaafcfc23b0..ed58d843d578 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -260,4 +260,5 @@ const struct inode_operations reiserfs_file_inode_operations = {
260 .removexattr = reiserfs_removexattr, 260 .removexattr = reiserfs_removexattr,
261 .permission = reiserfs_permission, 261 .permission = reiserfs_permission,
262 .get_acl = reiserfs_get_acl, 262 .get_acl = reiserfs_get_acl,
263 .set_acl = reiserfs_set_acl,
263}; 264};
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index dc5236f6de1b..e825f8b63e6b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1522,6 +1522,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
1522 .removexattr = reiserfs_removexattr, 1522 .removexattr = reiserfs_removexattr,
1523 .permission = reiserfs_permission, 1523 .permission = reiserfs_permission,
1524 .get_acl = reiserfs_get_acl, 1524 .get_acl = reiserfs_get_acl,
1525 .set_acl = reiserfs_set_acl,
1525}; 1526};
1526 1527
1527/* 1528/*
@@ -1538,8 +1539,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
1538 .listxattr = reiserfs_listxattr, 1539 .listxattr = reiserfs_listxattr,
1539 .removexattr = reiserfs_removexattr, 1540 .removexattr = reiserfs_removexattr,
1540 .permission = reiserfs_permission, 1541 .permission = reiserfs_permission,
1541 .get_acl = reiserfs_get_acl,
1542
1543}; 1542};
1544 1543
1545/* 1544/*
@@ -1553,4 +1552,5 @@ const struct inode_operations reiserfs_special_inode_operations = {
1553 .removexattr = reiserfs_removexattr, 1552 .removexattr = reiserfs_removexattr,
1554 .permission = reiserfs_permission, 1553 .permission = reiserfs_permission,
1555 .get_acl = reiserfs_get_acl, 1554 .get_acl = reiserfs_get_acl,
1555 .set_acl = reiserfs_set_acl,
1556}; 1556};
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index a958444a75fc..02b0b7d0f7d5 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
419 char *s; 419 char *s;
420 420
421 /* Some block devices use /'s */ 421 /* Some block devices use /'s */
422 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); 422 strlcpy(b, sb->s_id, BDEVNAME_SIZE);
423 s = strchr(b, '/'); 423 s = strchr(b, '/');
424 if (s) 424 if (s)
425 *s = '!'; 425 *s = '!';
@@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
449 char *s; 449 char *s;
450 450
451 /* Some block devices use /'s */ 451 /* Some block devices use /'s */
452 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); 452 strlcpy(b, sb->s_id, BDEVNAME_SIZE);
453 s = strchr(b, '/'); 453 s = strchr(b, '/');
454 if (s) 454 if (s)
455 *s = '!'; 455 *s = '!';
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f8adaee537c2..8d06adf89948 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -608,14 +608,6 @@ int reiserfs_resize(struct super_block *, unsigned long);
608 608
609#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) 609#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
610 610
611/* A safe version of the "bdevname", which returns the "s_id" field of
612 * a superblock or else "Null superblock" if the super block is NULL.
613 */
614static inline char *reiserfs_bdevname(struct super_block *s)
615{
616 return (s == NULL) ? "Null superblock" : s->s_id;
617}
618
619#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) 611#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
620static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal 612static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
621 *journal) 613 *journal)
@@ -1958,8 +1950,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
1958#define MAX_US_INT 0xffff 1950#define MAX_US_INT 0xffff
1959 1951
1960// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset 1952// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
1961#define U32_MAX (~(__u32)0)
1962
1963static inline loff_t max_reiserfs_offset(struct inode *inode) 1953static inline loff_t max_reiserfs_offset(struct inode *inode)
1964{ 1954{
1965 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) 1955 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
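The deleted U32_MAX definition above is part of a tree-wide cleanup: v3.14 gained central integer-limit macros (U8_MAX through U64_MAX and their signed counterparts) in include/linux/kernel.h, so per-filesystem copies like this one go away. The generic definition is equivalent to the sketch below (paraphrased, not quoted from kernel.h):

#include <linux/types.h>

/* Shape of the centralized macro that replaces the local copy. */
#define U32_MAX ((u32)~0U)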
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3ead145dadc4..2c803353f8ac 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1479,7 +1479,7 @@ static int read_super_block(struct super_block *s, int offset)
1479 if (!bh) { 1479 if (!bh) {
1480 reiserfs_warning(s, "sh-2006", 1480 reiserfs_warning(s, "sh-2006",
1481 "bread failed (dev %s, block %lu, size %lu)", 1481 "bread failed (dev %s, block %lu, size %lu)",
1482 reiserfs_bdevname(s), offset / s->s_blocksize, 1482 s->s_id, offset / s->s_blocksize,
1483 s->s_blocksize); 1483 s->s_blocksize);
1484 return 1; 1484 return 1;
1485 } 1485 }
@@ -1500,7 +1500,7 @@ static int read_super_block(struct super_block *s, int offset)
1500 if (!bh) { 1500 if (!bh) {
1501 reiserfs_warning(s, "sh-2007", 1501 reiserfs_warning(s, "sh-2007",
1502 "bread failed (dev %s, block %lu, size %lu)", 1502 "bread failed (dev %s, block %lu, size %lu)",
1503 reiserfs_bdevname(s), offset / s->s_blocksize, 1503 s->s_id, offset / s->s_blocksize,
1504 s->s_blocksize); 1504 s->s_blocksize);
1505 return 1; 1505 return 1;
1506 } 1506 }
@@ -1509,7 +1509,7 @@ static int read_super_block(struct super_block *s, int offset)
1509 if (sb_blocksize(rs) != s->s_blocksize) { 1509 if (sb_blocksize(rs) != s->s_blocksize) {
1510 reiserfs_warning(s, "sh-2011", "can't find a reiserfs " 1510 reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
1511 "filesystem on (dev %s, block %Lu, size %lu)", 1511 "filesystem on (dev %s, block %Lu, size %lu)",
1512 reiserfs_bdevname(s), 1512 s->s_id,
1513 (unsigned long long)bh->b_blocknr, 1513 (unsigned long long)bh->b_blocknr,
1514 s->s_blocksize); 1514 s->s_blocksize);
1515 brelse(bh); 1515 brelse(bh);
@@ -1825,7 +1825,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1825 /* try new format (64-th 1k block), which can contain reiserfs super block */ 1825 /* try new format (64-th 1k block), which can contain reiserfs super block */
1826 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { 1826 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1827 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", 1827 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
1828 reiserfs_bdevname(s)); 1828 s->s_id);
1829 goto error_unlocked; 1829 goto error_unlocked;
1830 } 1830 }
1831 1831
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8a9e2dcfe004..5cdfbd638b5c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -50,6 +50,7 @@
50#include <linux/stat.h> 50#include <linux/stat.h>
51#include <linux/quotaops.h> 51#include <linux/quotaops.h>
52#include <linux/security.h> 52#include <linux/security.h>
53#include <linux/posix_acl_xattr.h>
53 54
54#define PRIVROOT_NAME ".reiserfs_priv" 55#define PRIVROOT_NAME ".reiserfs_priv"
55#define XAROOT_NAME "xattrs" 56#define XAROOT_NAME "xattrs"
@@ -904,8 +905,8 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = {
904 &reiserfs_xattr_security_handler, 905 &reiserfs_xattr_security_handler,
905#endif 906#endif
906#ifdef CONFIG_REISERFS_FS_POSIX_ACL 907#ifdef CONFIG_REISERFS_FS_POSIX_ACL
907 &reiserfs_posix_acl_access_handler, 908 &posix_acl_access_xattr_handler,
908 &reiserfs_posix_acl_default_handler, 909 &posix_acl_default_xattr_handler,
909#endif 910#endif
910 NULL 911 NULL
911}; 912};
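With posix_acl_access_xattr_handler and posix_acl_default_xattr_handler installed above, the xattr-to-ACL decoding and permission checks reiserfs used to open-code now live in generic VFS code, which dispatches through the inode's ->get_acl/->set_acl operations. A rough sketch of the set-side flow under v3.14 (illustrative only; example_acl_set is a stand-in, not the actual fs/posix_acl.c function):

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>

/* Sketch of what the generic handler does before the filesystem ever
 * sees the request: check the caller, decode the raw xattr blob into a
 * struct posix_acl, then call the inode operation. */
static int example_acl_set(struct inode *inode, const void *value,
                           size_t size, int type)
{
        struct posix_acl *acl = NULL;
        int error = 0;

        if (!inode_owner_or_capable(inode))
                return -EPERM;

        if (value) {
                acl = posix_acl_from_xattr(&init_user_ns, value, size);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (acl) {
                        error = posix_acl_valid(acl);
                        if (error)
                                goto out;
                }
        }

        /* For reiserfs this lands in reiserfs_set_acl() above. */
        error = inode->i_op->set_acl(inode, acl, type);
out:
        posix_acl_release(acl);
        return error;
}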
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 06c04f73da65..a6ce532402dc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -11,35 +11,19 @@
11#include "acl.h" 11#include "acl.h"
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13 13
14static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, 14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
15 struct inode *inode, int type, 15 struct inode *inode, int type,
16 struct posix_acl *acl); 16 struct posix_acl *acl);
17 17
18static int 18
19posix_acl_set(struct dentry *dentry, const char *name, const void *value, 19int
20 size_t size, int flags, int type) 20reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
21{ 21{
22 struct inode *inode = dentry->d_inode;
23 struct posix_acl *acl;
24 int error, error2; 22 int error, error2;
25 struct reiserfs_transaction_handle th; 23 struct reiserfs_transaction_handle th;
26 size_t jcreate_blocks; 24 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 25 int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
28 return -EOPNOTSUPP; 26
29 if (!inode_owner_or_capable(inode))
30 return -EPERM;
31
32 if (value) {
33 acl = posix_acl_from_xattr(&init_user_ns, value, size);
34 if (IS_ERR(acl)) {
35 return PTR_ERR(acl);
36 } else if (acl) {
37 error = posix_acl_valid(acl);
38 if (error)
39 goto release_and_out;
40 }
41 } else
42 acl = NULL;
43 27
44 /* Pessimism: We can't assume that anything from the xattr root up 28 /* Pessimism: We can't assume that anything from the xattr root up
45 * has been created. */ 29 * has been created. */
@@ -51,7 +35,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
51 error = journal_begin(&th, inode->i_sb, jcreate_blocks); 35 error = journal_begin(&th, inode->i_sb, jcreate_blocks);
52 reiserfs_write_unlock(inode->i_sb); 36 reiserfs_write_unlock(inode->i_sb);
53 if (error == 0) { 37 if (error == 0) {
54 error = reiserfs_set_acl(&th, inode, type, acl); 38 error = __reiserfs_set_acl(&th, inode, type, acl);
55 reiserfs_write_lock(inode->i_sb); 39 reiserfs_write_lock(inode->i_sb);
56 error2 = journal_end(&th, inode->i_sb, jcreate_blocks); 40 error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
57 reiserfs_write_unlock(inode->i_sb); 41 reiserfs_write_unlock(inode->i_sb);
@@ -59,36 +43,13 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
59 error = error2; 43 error = error2;
60 } 44 }
61 45
62 release_and_out:
63 posix_acl_release(acl);
64 return error;
65}
66
67static int
68posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
69 size_t size, int type)
70{
71 struct posix_acl *acl;
72 int error;
73
74 if (!reiserfs_posixacl(dentry->d_sb))
75 return -EOPNOTSUPP;
76
77 acl = reiserfs_get_acl(dentry->d_inode, type);
78 if (IS_ERR(acl))
79 return PTR_ERR(acl);
80 if (acl == NULL)
81 return -ENODATA;
82 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
83 posix_acl_release(acl);
84
85 return error; 46 return error;
86} 47}
87 48
88/* 49/*
89 * Convert from filesystem to in-memory representation. 50 * Convert from filesystem to in-memory representation.
90 */ 51 */
91static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) 52static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
92{ 53{
93 const char *end = (char *)value + size; 54 const char *end = (char *)value + size;
94 int n, count; 55 int n, count;
@@ -158,7 +119,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
158/* 119/*
159 * Convert from in-memory to filesystem representation. 120 * Convert from in-memory to filesystem representation.
160 */ 121 */
161static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) 122static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
162{ 123{
163 reiserfs_acl_header *ext_acl; 124 reiserfs_acl_header *ext_acl;
164 char *e; 125 char *e;
@@ -221,10 +182,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
221 int size; 182 int size;
222 int retval; 183 int retval;
223 184
224 acl = get_cached_acl(inode, type);
225 if (acl != ACL_NOT_CACHED)
226 return acl;
227
228 switch (type) { 185 switch (type) {
229 case ACL_TYPE_ACCESS: 186 case ACL_TYPE_ACCESS:
230 name = POSIX_ACL_XATTR_ACCESS; 187 name = POSIX_ACL_XATTR_ACCESS;
@@ -257,7 +214,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
257 } else if (retval < 0) { 214 } else if (retval < 0) {
258 acl = ERR_PTR(retval); 215 acl = ERR_PTR(retval);
259 } else { 216 } else {
260 acl = posix_acl_from_disk(value, retval); 217 acl = reiserfs_posix_acl_from_disk(value, retval);
261 } 218 }
262 if (!IS_ERR(acl)) 219 if (!IS_ERR(acl))
263 set_cached_acl(inode, type, acl); 220 set_cached_acl(inode, type, acl);
@@ -273,7 +230,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
273 * BKL held [before 2.5.x] 230 * BKL held [before 2.5.x]
274 */ 231 */
275static int 232static int
276reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, 233__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
277 int type, struct posix_acl *acl) 234 int type, struct posix_acl *acl)
278{ 235{
279 char *name; 236 char *name;
@@ -281,9 +238,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
281 size_t size = 0; 238 size_t size = 0;
282 int error; 239 int error;
283 240
284 if (S_ISLNK(inode->i_mode))
285 return -EOPNOTSUPP;
286
287 switch (type) { 241 switch (type) {
288 case ACL_TYPE_ACCESS: 242 case ACL_TYPE_ACCESS:
289 name = POSIX_ACL_XATTR_ACCESS; 243 name = POSIX_ACL_XATTR_ACCESS;
@@ -307,7 +261,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
307 } 261 }
308 262
309 if (acl) { 263 if (acl) {
310 value = posix_acl_to_disk(acl, &size); 264 value = reiserfs_posix_acl_to_disk(acl, &size);
311 if (IS_ERR(value)) 265 if (IS_ERR(value))
312 return (int)PTR_ERR(value); 266 return (int)PTR_ERR(value);
313 } 267 }
@@ -343,7 +297,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
343 struct inode *dir, struct dentry *dentry, 297 struct inode *dir, struct dentry *dentry,
344 struct inode *inode) 298 struct inode *inode)
345{ 299{
346 struct posix_acl *acl; 300 struct posix_acl *default_acl, *acl;
347 int err = 0; 301 int err = 0;
348 302
349 /* ACLs only get applied to files and directories */ 303 /* ACLs only get applied to files and directories */
@@ -363,37 +317,28 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
363 goto apply_umask; 317 goto apply_umask;
364 } 318 }
365 319
366 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); 320 err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
367 if (IS_ERR(acl)) 321 if (err)
368 return PTR_ERR(acl); 322 return err;
369 323
324 if (default_acl) {
325 err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
326 default_acl);
327 posix_acl_release(default_acl);
328 }
370 if (acl) { 329 if (acl) {
371 /* Copy the default ACL to the default ACL of a new directory */ 330 if (!err)
372 if (S_ISDIR(inode->i_mode)) { 331 err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
373 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, 332 acl);
374 acl);
375 if (err)
376 goto cleanup;
377 }
378
379 /* Now we reconcile the new ACL and the mode,
380 potentially modifying both */
381 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
382 if (err < 0)
383 return err;
384
385 /* If we need an ACL.. */
386 if (err > 0)
387 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
388 cleanup:
389 posix_acl_release(acl); 333 posix_acl_release(acl);
390 } else {
391 apply_umask:
392 /* no ACL, apply umask */
393 inode->i_mode &= ~current_umask();
394 } 334 }
395 335
396 return err; 336 return err;
337
338 apply_umask:
339 /* no ACL, apply umask */
340 inode->i_mode &= ~current_umask();
341 return err;
397} 342}
398 343
399/* This is used to cache the default acl before a new object is created. 344/* This is used to cache the default acl before a new object is created.
@@ -442,84 +387,11 @@ int reiserfs_cache_default_acl(struct inode *inode)
442 */ 387 */
443int reiserfs_acl_chmod(struct inode *inode) 388int reiserfs_acl_chmod(struct inode *inode)
444{ 389{
445 struct reiserfs_transaction_handle th;
446 struct posix_acl *acl;
447 size_t size;
448 int error;
449
450 if (IS_PRIVATE(inode)) 390 if (IS_PRIVATE(inode))
451 return 0; 391 return 0;
452
453 if (S_ISLNK(inode->i_mode))
454 return -EOPNOTSUPP;
455
456 if (get_inode_sd_version(inode) == STAT_DATA_V1 || 392 if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
457 !reiserfs_posixacl(inode->i_sb)) { 393 !reiserfs_posixacl(inode->i_sb))
458 return 0; 394 return 0;
459 }
460 395
461 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 396 return posix_acl_chmod(inode, inode->i_mode);
462 if (!acl)
463 return 0;
464 if (IS_ERR(acl))
465 return PTR_ERR(acl);
466 error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode);
467 if (error)
468 return error;
469
470 size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
471 reiserfs_write_lock(inode->i_sb);
472 error = journal_begin(&th, inode->i_sb, size * 2);
473 reiserfs_write_unlock(inode->i_sb);
474 if (!error) {
475 int error2;
476 error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
477 reiserfs_write_lock(inode->i_sb);
478 error2 = journal_end(&th, inode->i_sb, size * 2);
479 reiserfs_write_unlock(inode->i_sb);
480 if (error2)
481 error = error2;
482 }
483 posix_acl_release(acl);
484 return error;
485}
486
487static size_t posix_acl_access_list(struct dentry *dentry, char *list,
488 size_t list_size, const char *name,
489 size_t name_len, int type)
490{
491 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
492 if (!reiserfs_posixacl(dentry->d_sb))
493 return 0;
494 if (list && size <= list_size)
495 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
496 return size;
497} 397}
498
499const struct xattr_handler reiserfs_posix_acl_access_handler = {
500 .prefix = POSIX_ACL_XATTR_ACCESS,
501 .flags = ACL_TYPE_ACCESS,
502 .get = posix_acl_get,
503 .set = posix_acl_set,
504 .list = posix_acl_access_list,
505};
506
507static size_t posix_acl_default_list(struct dentry *dentry, char *list,
508 size_t list_size, const char *name,
509 size_t name_len, int type)
510{
511 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
512 if (!reiserfs_posixacl(dentry->d_sb))
513 return 0;
514 if (list && size <= list_size)
515 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
516 return size;
517}
518
519const struct xattr_handler reiserfs_posix_acl_default_handler = {
520 .prefix = POSIX_ACL_XATTR_DEFAULT,
521 .flags = ACL_TYPE_DEFAULT,
522 .get = posix_acl_get,
523 .set = posix_acl_set,
524 .list = posix_acl_default_list,
525};
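The rewritten reiserfs_inherit_default_acl() above follows the common v3.14 idiom: posix_acl_create() reads the parent directory's default ACL, applies the umask where appropriate, and hands back both the default and access ACLs with inode->i_mode already adjusted, so the filesystem only has to store them. A condensed sketch of that idiom, with set_one_acl() as a hypothetical stand-in for the filesystem's own setter (__reiserfs_set_acl() here):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Stand-in for the filesystem-specific setter. */
static int set_one_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        return 0;
}

static int example_init_acl(struct inode *inode, struct inode *dir)
{
        struct posix_acl *default_acl, *acl;
        int err;

        /* Computes both ACLs and fixes up inode->i_mode in one call. */
        err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (err)
                return err;

        if (default_acl) {
                err = set_one_acl(inode, ACL_TYPE_DEFAULT, default_acl);
                posix_acl_release(default_acl);
        }
        if (acl) {
                if (!err)
                        err = set_one_acl(inode, ACL_TYPE_ACCESS, acl);
                posix_acl_release(acl);
        }
        return err;
}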
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ff1d3d42e72a..d8418782862b 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
533 533
534 root = romfs_iget(sb, pos); 534 root = romfs_iget(sb, pos);
535 if (IS_ERR(root)) 535 if (IS_ERR(root))
536 goto error; 536 return PTR_ERR(root);
537 537
538 sb->s_root = d_make_root(root); 538 sb->s_root = d_make_root(root);
539 if (!sb->s_root) 539 if (!sb->s_root)
540 goto error; 540 return -ENOMEM;
541 541
542 return 0; 542 return 0;
543 543
544error:
545 return -EINVAL;
546error_rsb_inval: 544error_rsb_inval:
547 ret = -EINVAL; 545 ret = -EINVAL;
548error_rsb: 546error_rsb:
diff --git a/fs/splice.c b/fs/splice.c
index 46a08f772d7d..12028fa41def 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = {
555 .get = generic_pipe_buf_get, 555 .get = generic_pipe_buf_get,
556}; 556};
557 557
558static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
559 struct pipe_buffer *buf)
560{
561 return 1;
562}
563
564/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal,
572 .get = generic_pipe_buf_get,
573};
574EXPORT_SYMBOL(nosteal_pipe_buf_ops);
575
558static ssize_t kernel_readv(struct file *file, const struct iovec *vec, 576static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
559 unsigned long vlen, loff_t offset) 577 unsigned long vlen, loff_t offset)
560{ 578{
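nosteal_pipe_buf_ops, added and exported above, is for producers whose pages must never be stolen out of the pipe: its .steal hook always returns nonzero, so splice falls back to copying rather than taking ownership of the page. A hedged sketch of a producer installing it on a buffer (hypothetical driver code; field names as in struct pipe_buffer):

#include <linux/pipe_fs_i.h>

/* Hypothetical producer: expose PAGE_SIZE bytes of `page` through a
 * pipe buffer that readers may only copy, never steal. */
static void example_fill_buf(struct pipe_buffer *buf, struct page *page)
{
        buf->page = page;
        buf->offset = 0;
        buf->len = PAGE_SIZE;
        buf->ops = &nosteal_pipe_buf_ops;   /* .steal fails -> always copy */
        buf->flags = 0;
}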
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..80d5cf2ca765 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
166 if (!s) 166 if (!s)
167 return NULL; 167 return NULL;
168 168
169 INIT_LIST_HEAD(&s->s_mounts);
170
169 if (security_sb_alloc(s)) 171 if (security_sb_alloc(s))
170 goto fail; 172 goto fail;
171 173
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
188 if (list_lru_init(&s->s_inode_lru)) 190 if (list_lru_init(&s->s_inode_lru))
189 goto fail; 191 goto fail;
190 192
191 INIT_LIST_HEAD(&s->s_mounts);
192 init_rwsem(&s->s_umount); 193 init_rwsem(&s->s_umount);
193 lockdep_set_class(&s->s_umount, &type->s_umount_key); 194 lockdep_set_class(&s->s_umount, &type->s_umount_key);
194 /* 195 /*
@@ -702,7 +703,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
702 if (flags & MS_RDONLY) 703 if (flags & MS_RDONLY)
703 acct_auto_close(sb); 704 acct_auto_close(sb);
704 shrink_dcache_sb(sb); 705 shrink_dcache_sb(sb);
705 sync_filesystem(sb);
706 706
707 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 707 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
708 708
@@ -719,6 +719,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
722 if (sb->s_op->remount_fs) { 724 if (sb->s_op->remount_fs) {
723 retval = sb->s_op->remount_fs(sb, &flags, data); 725 retval = sb->s_op->remount_fs(sb, &flags, data);
724 if (retval) { 726 if (retval) {
diff --git a/fs/sync.c b/fs/sync.c
index f15537452231..b28d1dd10e8b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,11 +27,10 @@
27 * wait == 1 case since in that case write_inode() functions do 27 * wait == 1 case since in that case write_inode() functions do
28 * sync_dirty_buffer() and thus effectively write one block at a time. 28 * sync_dirty_buffer() and thus effectively write one block at a time.
29 */ 29 */
30static int __sync_filesystem(struct super_block *sb, int wait, 30static int __sync_filesystem(struct super_block *sb, int wait)
31 unsigned long start)
32{ 31{
33 if (wait) 32 if (wait)
34 sync_inodes_sb(sb, start); 33 sync_inodes_sb(sb);
35 else 34 else
36 writeback_inodes_sb(sb, WB_REASON_SYNC); 35 writeback_inodes_sb(sb, WB_REASON_SYNC);
37 36
@@ -48,7 +47,6 @@ static int __sync_filesystem(struct super_block *sb, int wait,
48int sync_filesystem(struct super_block *sb) 47int sync_filesystem(struct super_block *sb)
49{ 48{
50 int ret; 49 int ret;
51 unsigned long start = jiffies;
52 50
53 /* 51 /*
54 * We need to be protected against the filesystem going from 52 * We need to be protected against the filesystem going from
@@ -62,17 +60,17 @@ int sync_filesystem(struct super_block *sb)
62 if (sb->s_flags & MS_RDONLY) 60 if (sb->s_flags & MS_RDONLY)
63 return 0; 61 return 0;
64 62
65 ret = __sync_filesystem(sb, 0, start); 63 ret = __sync_filesystem(sb, 0);
66 if (ret < 0) 64 if (ret < 0)
67 return ret; 65 return ret;
68 return __sync_filesystem(sb, 1, start); 66 return __sync_filesystem(sb, 1);
69} 67}
70EXPORT_SYMBOL_GPL(sync_filesystem); 68EXPORT_SYMBOL_GPL(sync_filesystem);
71 69
72static void sync_inodes_one_sb(struct super_block *sb, void *arg) 70static void sync_inodes_one_sb(struct super_block *sb, void *arg)
73{ 71{
74 if (!(sb->s_flags & MS_RDONLY)) 72 if (!(sb->s_flags & MS_RDONLY))
75 sync_inodes_sb(sb, *((unsigned long *)arg)); 73 sync_inodes_sb(sb);
76} 74}
77 75
78static void sync_fs_one_sb(struct super_block *sb, void *arg) 76static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -104,10 +102,9 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
104SYSCALL_DEFINE0(sync) 102SYSCALL_DEFINE0(sync)
105{ 103{
106 int nowait = 0, wait = 1; 104 int nowait = 0, wait = 1;
107 unsigned long start = jiffies;
108 105
109 wakeup_flusher_threads(0, WB_REASON_SYNC); 106 wakeup_flusher_threads(0, WB_REASON_SYNC);
110 iterate_supers(sync_inodes_one_sb, &start); 107 iterate_supers(sync_inodes_one_sb, NULL);
111 iterate_supers(sync_fs_one_sb, &nowait); 108 iterate_supers(sync_fs_one_sb, &nowait);
112 iterate_supers(sync_fs_one_sb, &wait); 109 iterate_supers(sync_fs_one_sb, &wait);
113 iterate_bdevs(fdatawrite_one_bdev, NULL); 110 iterate_bdevs(fdatawrite_one_bdev, NULL);
@@ -222,23 +219,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
222 return do_fsync(fd, 1); 219 return do_fsync(fd, 1);
223} 220}
224 221
225/**
226 * generic_write_sync - perform syncing after a write if file / inode is sync
227 * @file: file to which the write happened
228 * @pos: offset where the write started
229 * @count: length of the write
230 *
231 * This is just a simple wrapper about our general syncing function.
232 */
233int generic_write_sync(struct file *file, loff_t pos, loff_t count)
234{
235 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
236 return 0;
237 return vfs_fsync_range(file, pos, pos + count - 1,
238 (file->f_flags & __O_SYNC) ? 0 : 1);
239}
240EXPORT_SYMBOL(generic_write_sync);
241
242/* 222/*
243 * sys_sync_file_range() permits finely controlled syncing over a segment of 223 * sys_sync_file_range() permits finely controlled syncing over a segment of
244 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 224 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
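The fs/sync.c hunks above come from reverting the "writeback: do not sync data dirtied after sync start" approach: sync_inodes_sb() loses its start-time argument, so __sync_filesystem() and the sync(2) path stop threading a jiffies value through. The resulting calling convention is a plain two-pass pattern, sketched below (same shape as the new __sync_filesystem(), shown as a standalone caller):

#include <linux/fs.h>
#include <linux/writeback.h>

/* Sketch of the post-revert two-pass sync: kick off writeback without
 * waiting, then do the waiting pass over all dirty inodes. */
static void example_two_pass_sync(struct super_block *sb)
{
        writeback_inodes_sb(sb, WB_REASON_SYNC);    /* wait == 0 pass */
        sync_inodes_sb(sb);                         /* wait == 1 pass */
}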
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
2# Makefile for the sysfs virtual filesystem 2# Makefile for the sysfs virtual filesystem
3# 3#
4 4
5obj-y := inode.o file.o dir.o symlink.o mount.o group.o 5obj-y := file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..ee0d761c3179 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
13#undef DEBUG 13#undef DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h>
17#include <linux/module.h>
18#include <linux/kobject.h> 16#include <linux/kobject.h>
19#include <linux/namei.h>
20#include <linux/idr.h>
21#include <linux/completion.h>
22#include <linux/mutex.h>
23#include <linux/slab.h> 17#include <linux/slab.h>
24#include <linux/security.h>
25#include <linux/hash.h>
26#include "sysfs.h" 18#include "sysfs.h"
27 19
28DEFINE_MUTEX(sysfs_mutex);
29DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
30 21
31#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32
33static DEFINE_SPINLOCK(sysfs_ino_lock);
34static DEFINE_IDA(sysfs_ino_ida);
35
36/**
37 * sysfs_name_hash
38 * @name: Null terminated string to hash
39 * @ns: Namespace tag to hash
40 *
41 * Returns 31 bit hash of ns + name (so it fits in an off_t )
42 */
43static unsigned int sysfs_name_hash(const char *name, const void *ns)
44{
45 unsigned long hash = init_name_hash();
46 unsigned int len = strlen(name);
47 while (len--)
48 hash = partial_name_hash(*name++, hash);
49 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
50 hash &= 0x7fffffffU;
51 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
52 if (hash < 1)
53 hash += 2;
54 if (hash >= INT_MAX)
55 hash = INT_MAX - 1;
56 return hash;
57}
58
59static int sysfs_name_compare(unsigned int hash, const char *name,
60 const void *ns, const struct sysfs_dirent *sd)
61{
62 if (hash != sd->s_hash)
63 return hash - sd->s_hash;
64 if (ns != sd->s_ns)
65 return ns - sd->s_ns;
66 return strcmp(name, sd->s_name);
67}
68
69static int sysfs_sd_compare(const struct sysfs_dirent *left,
70 const struct sysfs_dirent *right)
71{
72 return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 right);
74}
75
76/**
77 * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 * @sd: sysfs_dirent of interest
79 *
80 * Link @sd into its sibling rbtree which starts from
81 * sd->s_parent->s_dir.children.
82 *
83 * Locking:
84 * mutex_lock(sysfs_mutex)
85 *
86 * RETURNS:
87 * 0 on susccess -EEXIST on failure.
88 */
89static int sysfs_link_sibling(struct sysfs_dirent *sd)
90{
91 struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
92 struct rb_node *parent = NULL;
93
94 if (sysfs_type(sd) == SYSFS_DIR)
95 sd->s_parent->s_dir.subdirs++;
96
97 while (*node) {
98 struct sysfs_dirent *pos;
99 int result;
100
101 pos = to_sysfs_dirent(*node);
102 parent = *node;
103 result = sysfs_sd_compare(sd, pos);
104 if (result < 0)
105 node = &pos->s_rb.rb_left;
106 else if (result > 0)
107 node = &pos->s_rb.rb_right;
108 else
109 return -EEXIST;
110 }
111 /* add new node and rebalance the tree */
112 rb_link_node(&sd->s_rb, parent, node);
113 rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
114 return 0;
115}
116
117/**
118 * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 * @sd: sysfs_dirent of interest
120 *
121 * Unlink @sd from its sibling rbtree which starts from
122 * sd->s_parent->s_dir.children.
123 *
124 * Locking:
125 * mutex_lock(sysfs_mutex)
126 */
127static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128{
129 if (sysfs_type(sd) == SYSFS_DIR)
130 sd->s_parent->s_dir.subdirs--;
131
132 rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133}
134
135/**
136 * sysfs_get_active - get an active reference to sysfs_dirent
137 * @sd: sysfs_dirent to get an active reference to
138 *
139 * Get an active reference of @sd. This function is a noop if @sd
140 * is NULL.
141 *
142 * RETURNS:
143 * Pointer to @sd on success, NULL on failure.
144 */
145struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146{
147 if (unlikely(!sd))
148 return NULL;
149
150 if (!atomic_inc_unless_negative(&sd->s_active))
151 return NULL;
152
153 if (likely(!sysfs_ignore_lockdep(sd)))
154 rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
155 return sd;
156}
157
158/**
159 * sysfs_put_active - put an active reference to sysfs_dirent
160 * @sd: sysfs_dirent to put an active reference to
161 *
162 * Put an active reference to @sd. This function is a noop if @sd
163 * is NULL.
164 */
165void sysfs_put_active(struct sysfs_dirent *sd)
166{
167 int v;
168
169 if (unlikely(!sd))
170 return;
171
172 if (likely(!sysfs_ignore_lockdep(sd)))
173 rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 v = atomic_dec_return(&sd->s_active);
175 if (likely(v != SD_DEACTIVATED_BIAS))
176 return;
177
178 /* atomic_dec_return() is a mb(), we'll always see the updated
179 * sd->u.completion.
180 */
181 complete(sd->u.completion);
182}
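A minimal caller sketch of the active-reference pairing above (do_work() is
a hypothetical payload, not part of this patch): every operation on an
attribute is bracketed by get/put so a concurrent removal can drain users.

	static ssize_t example_op(struct sysfs_dirent *sd)
	{
		ssize_t ret;

		if (!sysfs_get_active(sd))	/* node is being deactivated */
			return -ENODEV;
		ret = do_work(sd);		/* hypothetical work under active ref */
		sysfs_put_active(sd);		/* may complete a waiting sysfs_deactivate() */
		return ret;
	}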
183
184/**
185 * sysfs_deactivate - deactivate sysfs_dirent
186 * @sd: sysfs_dirent to deactivate
187 *
188 * Deny new active references and drain existing ones.
189 */
190static void sysfs_deactivate(struct sysfs_dirent *sd)
191{
192 DECLARE_COMPLETION_ONSTACK(wait);
193 int v;
194
195 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196
197 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
198 return;
199
200 sd->u.completion = (void *)&wait;
201
202 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203 /* atomic_add_return() is a mb(), put_active() will always see
204 * the updated sd->u.completion.
205 */
206 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
207
208 if (v != SD_DEACTIVATED_BIAS) {
209 lock_contended(&sd->dep_map, _RET_IP_);
210 wait_for_completion(&wait);
211 }
212
213 lock_acquired(&sd->dep_map, _RET_IP_);
214 rwsem_release(&sd->dep_map, 1, _RET_IP_);
215}
216
217static int sysfs_alloc_ino(unsigned int *pino)
218{
219 int ino, rc;
220
221 retry:
222 spin_lock(&sysfs_ino_lock);
223 rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
224 spin_unlock(&sysfs_ino_lock);
225
226 if (rc == -EAGAIN) {
227 if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
228 goto retry;
229 rc = -ENOMEM;
230 }
231
232 *pino = ino;
233 return rc;
234}
235
236static void sysfs_free_ino(unsigned int ino)
237{
238 spin_lock(&sysfs_ino_lock);
239 ida_remove(&sysfs_ino_ida, ino);
240 spin_unlock(&sysfs_ino_lock);
241}
242
243void release_sysfs_dirent(struct sysfs_dirent *sd)
244{
245 struct sysfs_dirent *parent_sd;
246
247 repeat:
248 /* Moving/renaming is always done while holding a reference.
249 * sd->s_parent won't change beneath us.
250 */
251 parent_sd = sd->s_parent;
252
253 WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
254 "sysfs: free using entry: %s/%s\n",
255 parent_sd ? parent_sd->s_name : "", sd->s_name);
256
257 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
258 sysfs_put(sd->s_symlink.target_sd);
259 if (sysfs_type(sd) & SYSFS_COPY_NAME)
260 kfree(sd->s_name);
261 if (sd->s_iattr && sd->s_iattr->ia_secdata)
262 security_release_secctx(sd->s_iattr->ia_secdata,
263 sd->s_iattr->ia_secdata_len);
264 kfree(sd->s_iattr);
265 sysfs_free_ino(sd->s_ino);
266 kmem_cache_free(sysfs_dir_cachep, sd);
267
268 sd = parent_sd;
269 if (sd && atomic_dec_and_test(&sd->s_count))
270 goto repeat;
271}
272
273static int sysfs_dentry_delete(const struct dentry *dentry)
274{
275 struct sysfs_dirent *sd = dentry->d_fsdata;
276 return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
277}
278
279static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
280{
281 struct sysfs_dirent *sd;
282 int type;
283
284 if (flags & LOOKUP_RCU)
285 return -ECHILD;
286
287 sd = dentry->d_fsdata;
288 mutex_lock(&sysfs_mutex);
289
290 /* The sysfs dirent has been deleted */
291 if (sd->s_flags & SYSFS_FLAG_REMOVED)
292 goto out_bad;
293
294 /* The sysfs dirent has been moved? */
295 if (dentry->d_parent->d_fsdata != sd->s_parent)
296 goto out_bad;
297
298 /* The sysfs dirent has been renamed */
299 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
300 goto out_bad;
301
302 /* The sysfs dirent has been moved to a different namespace */
303 type = KOBJ_NS_TYPE_NONE;
304 if (sd->s_parent) {
305 type = sysfs_ns_type(sd->s_parent);
306 if (type != KOBJ_NS_TYPE_NONE &&
307 sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
308 goto out_bad;
309 }
310
311 mutex_unlock(&sysfs_mutex);
312out_valid:
313 return 1;
314out_bad:
315 /* Remove the dentry from the dcache hashes.
316 * If this is a deleted dentry, we use d_drop instead of d_delete
317 * so sysfs doesn't need to cope with negative dentries.
318 *
319 * If this is a dentry that has simply been renamed, we
320 * use d_drop to remove it from the dcache lookup on its
321 * old parent. If this dentry persists, then when a lookup
322 * is later performed at its new name, the dentry will be re-added
323 * to the dcache hashes.
324 */
325 mutex_unlock(&sysfs_mutex);
326
327 /* If we have submounts we must allow the vfs caches
328 * to lie about the state of the filesystem to prevent
329 * leaks and other nasty things.
330 */
331 if (check_submounts_and_drop(dentry) != 0)
332 goto out_valid;
333
334 return 0;
335}
336
337static void sysfs_dentry_release(struct dentry *dentry)
338{
339 sysfs_put(dentry->d_fsdata);
340}
341
342const struct dentry_operations sysfs_dentry_ops = {
343 .d_revalidate = sysfs_dentry_revalidate,
344 .d_delete = sysfs_dentry_delete,
345 .d_release = sysfs_dentry_release,
346};
347
348struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
349{
350 char *dup_name = NULL;
351 struct sysfs_dirent *sd;
352
353 if (type & SYSFS_COPY_NAME) {
354 name = dup_name = kstrdup(name, GFP_KERNEL);
355 if (!name)
356 return NULL;
357 }
358
359 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
360 if (!sd)
361 goto err_out1;
362
363 if (sysfs_alloc_ino(&sd->s_ino))
364 goto err_out2;
365
366 atomic_set(&sd->s_count, 1);
367 atomic_set(&sd->s_active, 0);
368
369 sd->s_name = name;
370 sd->s_mode = mode;
371 sd->s_flags = type | SYSFS_FLAG_REMOVED;
372
373 return sd;
374
375 err_out2:
376 kmem_cache_free(sysfs_dir_cachep, sd);
377 err_out1:
378 kfree(dup_name);
379 return NULL;
380}
381
382/**
383 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
384 * @acxt: pointer to sysfs_addrm_cxt to be used
385 *
386 * This function is called when the caller is about to add or remove
387 * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used
388 * to keep and pass context to other addrm functions.
389 *
390 * LOCKING:
391 * Kernel thread context (may sleep). sysfs_mutex is locked on
392 * return.
393 */
394void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
395 __acquires(sysfs_mutex)
396{
397 memset(acxt, 0, sizeof(*acxt));
398
399 mutex_lock(&sysfs_mutex);
400}
401
402/**
403 * __sysfs_add_one - add sysfs_dirent to parent without warning
404 * @acxt: addrm context to use
405 * @sd: sysfs_dirent to be added
406 * @parent_sd: the parent sysfs_dirent to add @sd to
407 *
408 * Get @parent_sd, set @sd->s_parent to it, increment nlink of the
409 * parent inode if @sd is a directory, and link @sd into the parent's
410 * children list.
411 *
412 * This function should be called between calls to
413 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
414 * passed the same @acxt as passed to sysfs_addrm_start().
415 *
416 * LOCKING:
417 * Determined by sysfs_addrm_start().
418 *
419 * RETURNS:
420 * 0 on success, -EEXIST if entry with the given name already
421 * exists.
422 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
424 struct sysfs_dirent *parent_sd)
425{
426 struct sysfs_inode_attrs *ps_iattr;
427 int ret;
428
429 if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
430 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
431 sysfs_ns_type(parent_sd) ? "required" : "invalid",
432 parent_sd->s_name, sd->s_name);
433 return -EINVAL;
434 }
435
436 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
437 sd->s_parent = sysfs_get(parent_sd);
438
439 ret = sysfs_link_sibling(sd);
440 if (ret)
441 return ret;
442
443 /* Update timestamps on the parent */
444 ps_iattr = parent_sd->s_iattr;
445 if (ps_iattr) {
446 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
447 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
448 }
449
450 /* Mark the entry added into directory tree */
451 sd->s_flags &= ~SYSFS_FLAG_REMOVED;
452
453 return 0;
454}
455
456/** 22/**
457 * sysfs_pathname - return full path to sysfs dirent 23 * sysfs_pathname - return full path to sysfs dirent
458 * @sd: sysfs_dirent whose path we want 24 * @kn: kernfs_node whose path we want
459 * @path: caller allocated buffer of size PATH_MAX 25 * @path: caller allocated buffer of size PATH_MAX
460 * 26 *
461 * Gives the name "/" to the sysfs_root entry; any path returned 27 * Gives the name "/" to the sysfs_root entry; any path returned
462 * is relative to wherever sysfs is mounted. 28 * is relative to wherever sysfs is mounted.
463 */ 29 */
464static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) 30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
465{ 31{
466 if (sd->s_parent) { 32 if (kn->parent) {
467 sysfs_pathname(sd->s_parent, path); 33 sysfs_pathname(kn->parent, path);
468 strlcat(path, "/", PATH_MAX); 34 strlcat(path, "/", PATH_MAX);
469 } 35 }
470 strlcat(path, sd->s_name, PATH_MAX); 36 strlcat(path, kn->name, PATH_MAX);
471 return path; 37 return path;
472} 38}
473 39
474void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) 40void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
475{ 41{
476 char *path; 42 char *path;
477 43
@@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
489} 55}
490 56
491/** 57/**
492 * sysfs_add_one - add sysfs_dirent to parent
493 * @acxt: addrm context to use
494 * @sd: sysfs_dirent to be added
495 * @parent_sd: the parent sysfs_dirent to add @sd to
496 *
497 * Get @parent_sd, set @sd->s_parent to it, increment nlink of the
498 * parent inode if @sd is a directory, and link @sd into the parent's
499 * children list.
500 *
501 * This function should be called between calls to
502 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
503 * passed the same @acxt as passed to sysfs_addrm_start().
504 *
505 * LOCKING:
506 * Determined by sysfs_addrm_start().
507 *
508 * RETURNS:
509 * 0 on success, -EEXIST if entry with the given name already
510 * exists.
511 */
512int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
513 struct sysfs_dirent *parent_sd)
514{
515 int ret;
516
517 ret = __sysfs_add_one(acxt, sd, parent_sd);
518
519 if (ret == -EEXIST)
520 sysfs_warn_dup(parent_sd, sd->s_name);
521 return ret;
522}
523
524/**
525 * sysfs_remove_one - remove sysfs_dirent from parent
526 * @acxt: addrm context to use
527 * @sd: sysfs_dirent to be removed
528 *
529 * Mark @sd removed and drop nlink of parent inode if @sd is a
530 * directory. @sd is unlinked from the children list.
531 *
532 * This function should be called between calls to
533 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
534 * passed the same @acxt as passed to sysfs_addrm_start().
535 *
536 * LOCKING:
537 * Determined by sysfs_addrm_start().
538 */
539static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
540 struct sysfs_dirent *sd)
541{
542 struct sysfs_inode_attrs *ps_iattr;
543
544 /*
545 * Removal can be called multiple times on the same node. Only the
546 * first invocation is effective and puts the base ref.
547 */
548 if (sd->s_flags & SYSFS_FLAG_REMOVED)
549 return;
550
551 sysfs_unlink_sibling(sd);
552
553 /* Update timestamps on the parent */
554 ps_iattr = sd->s_parent->s_iattr;
555 if (ps_iattr) {
556 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
557 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
558 }
559
560 sd->s_flags |= SYSFS_FLAG_REMOVED;
561 sd->u.removed_list = acxt->removed;
562 acxt->removed = sd;
563}
564
565/**
566 * sysfs_addrm_finish - finish up sysfs_dirent add/remove
567 * @acxt: addrm context to finish up
568 *
569 * Finish up sysfs_dirent add/remove. Resources acquired by
570 * sysfs_addrm_start() are released and removed sysfs_dirents are
571 * cleaned up.
572 *
573 * LOCKING:
574 * sysfs_mutex is released.
575 */
576void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
577 __releases(sysfs_mutex)
578{
579 /* release resources acquired by sysfs_addrm_start() */
580 mutex_unlock(&sysfs_mutex);
581
582 /* kill removed sysfs_dirents */
583 while (acxt->removed) {
584 struct sysfs_dirent *sd = acxt->removed;
585
586 acxt->removed = sd->u.removed_list;
587
588 sysfs_deactivate(sd);
589 sysfs_unmap_bin_file(sd);
590 sysfs_put(sd);
591 }
592}
593
594/**
595 * sysfs_find_dirent - find sysfs_dirent with the given name
596 * @parent_sd: sysfs_dirent to search under
597 * @name: name to look for
598 * @ns: the namespace tag to use
599 *
600 * Look for sysfs_dirent with name @name under @parent_sd.
601 *
602 * LOCKING:
603 * mutex_lock(sysfs_mutex)
604 *
605 * RETURNS:
606 * Pointer to sysfs_dirent if found, NULL if not.
607 */
608struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
609 const unsigned char *name,
610 const void *ns)
611{
612 struct rb_node *node = parent_sd->s_dir.children.rb_node;
613 unsigned int hash;
614
615 if (!!sysfs_ns_type(parent_sd) != !!ns) {
616 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
617 sysfs_ns_type(parent_sd) ? "required" : "invalid",
618 parent_sd->s_name, name);
619 return NULL;
620 }
621
622 hash = sysfs_name_hash(name, ns);
623 while (node) {
624 struct sysfs_dirent *sd;
625 int result;
626
627 sd = to_sysfs_dirent(node);
628 result = sysfs_name_compare(hash, name, ns, sd);
629 if (result < 0)
630 node = node->rb_left;
631 else if (result > 0)
632 node = node->rb_right;
633 else
634 return sd;
635 }
636 return NULL;
637}
638
639/**
640 * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
641 * @parent_sd: sysfs_dirent to search under
642 * @name: name to look for
643 * @ns: the namespace tag to use
644 *
645 * Look for sysfs_dirent with name @name under @parent_sd and get
646 * it if found.
647 *
648 * LOCKING:
649 * Kernel thread context (may sleep). Grabs sysfs_mutex.
650 *
651 * RETURNS:
652 * Pointer to sysfs_dirent if found, NULL if not.
653 */
654struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
655 const unsigned char *name,
656 const void *ns)
657{
658 struct sysfs_dirent *sd;
659
660 mutex_lock(&sysfs_mutex);
661 sd = sysfs_find_dirent(parent_sd, name, ns);
662 sysfs_get(sd);
663 mutex_unlock(&sysfs_mutex);
664
665 return sd;
666}
667EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
668
669static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
670 enum kobj_ns_type type,
671 const char *name, const void *ns,
672 struct sysfs_dirent **p_sd)
673{
674 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
675 struct sysfs_addrm_cxt acxt;
676 struct sysfs_dirent *sd;
677 int rc;
678
679 /* allocate */
680 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
681 if (!sd)
682 return -ENOMEM;
683
684 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
685 sd->s_ns = ns;
686 sd->s_dir.kobj = kobj;
687
688 /* link in */
689 sysfs_addrm_start(&acxt);
690 rc = sysfs_add_one(&acxt, sd, parent_sd);
691 sysfs_addrm_finish(&acxt);
692
693 if (rc == 0)
694 *p_sd = sd;
695 else
696 sysfs_put(sd);
697
698 return rc;
699}
700
701int sysfs_create_subdir(struct kobject *kobj, const char *name,
702 struct sysfs_dirent **p_sd)
703{
704 return create_dir(kobj, kobj->sd,
705 KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
706}
707
708/**
709 * sysfs_read_ns_type: return associated ns_type
710 * @kobj: the kobject being queried
711 *
712 * Each kobject can be tagged with exactly one namespace type
713 * (i.e. network or user). Return the ns_type associated with
714 * this object if any
715 */
716static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
717{
718 const struct kobj_ns_type_operations *ops;
719 enum kobj_ns_type type;
720
721 ops = kobj_child_ns_ops(kobj);
722 if (!ops)
723 return KOBJ_NS_TYPE_NONE;
724
725 type = ops->type;
726 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
727 BUG_ON(type >= KOBJ_NS_TYPES);
728 BUG_ON(!kobj_ns_type_registered(type));
729
730 return type;
731}
732
733/**
734 * sysfs_create_dir_ns - create a directory for an object with a namespace tag 58 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
735 * @kobj: object we're creating directory for 59 * @kobj: object we're creating directory for
736 * @ns: the namespace tag to use 60 * @ns: the namespace tag to use
737 */ 61 */
738int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) 62int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
739{ 63{
740 enum kobj_ns_type type; 64 struct kernfs_node *parent, *kn;
741 struct sysfs_dirent *parent_sd, *sd;
742 int error = 0;
743 65
744 BUG_ON(!kobj); 66 BUG_ON(!kobj);
745 67
746 if (kobj->parent) 68 if (kobj->parent)
747 parent_sd = kobj->parent->sd; 69 parent = kobj->parent->sd;
748 else 70 else
749 parent_sd = &sysfs_root; 71 parent = sysfs_root_kn;
750 72
751 if (!parent_sd) 73 if (!parent)
752 return -ENOENT; 74 return -ENOENT;
753 75
754 type = sysfs_read_ns_type(kobj); 76 kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
755 77 S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
756 error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); 78 if (IS_ERR(kn)) {
757 if (!error) 79 if (PTR_ERR(kn) == -EEXIST)
758 kobj->sd = sd; 80 sysfs_warn_dup(parent, kobject_name(kobj));
759 return error; 81 return PTR_ERR(kn);
760}
761
762static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 unsigned int flags)
764{
765 struct dentry *ret = NULL;
766 struct dentry *parent = dentry->d_parent;
767 struct sysfs_dirent *parent_sd = parent->d_fsdata;
768 struct sysfs_dirent *sd;
769 struct inode *inode;
770 enum kobj_ns_type type;
771 const void *ns;
772
773 mutex_lock(&sysfs_mutex);
774
775 type = sysfs_ns_type(parent_sd);
776 ns = sysfs_info(dir->i_sb)->ns[type];
777
778 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
779
780 /* no such entry */
781 if (!sd) {
782 ret = ERR_PTR(-ENOENT);
783 goto out_unlock;
784 }
785 dentry->d_fsdata = sysfs_get(sd);
786
787 /* attach dentry and inode */
788 inode = sysfs_get_inode(dir->i_sb, sd);
789 if (!inode) {
790 ret = ERR_PTR(-ENOMEM);
791 goto out_unlock;
792 }
793
794 /* instantiate and hash dentry */
795 ret = d_materialise_unique(dentry, inode);
796 out_unlock:
797 mutex_unlock(&sysfs_mutex);
798 return ret;
799}
800
801const struct inode_operations sysfs_dir_inode_operations = {
802 .lookup = sysfs_lookup,
803 .permission = sysfs_permission,
804 .setattr = sysfs_setattr,
805 .getattr = sysfs_getattr,
806 .setxattr = sysfs_setxattr,
807};
808
809static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
810{
811 struct sysfs_dirent *last;
812
813 while (true) {
814 struct rb_node *rbn;
815
816 last = pos;
817
818 if (sysfs_type(pos) != SYSFS_DIR)
819 break;
820
821 rbn = rb_first(&pos->s_dir.children);
822 if (!rbn)
823 break;
824
825 pos = to_sysfs_dirent(rbn);
826 }
827
828 return last;
829}
830
831/**
832 * sysfs_next_descendant_post - find the next descendant for post-order walk
833 * @pos: the current position (%NULL to initiate traversal)
834 * @root: sysfs_dirent whose descendants to walk
835 *
836 * Find the next descendant to visit for post-order traversal of @root's
837 * descendants. @root is included in the iteration and is the last node to be
838 * visited.
839 */
840static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
841 struct sysfs_dirent *root)
842{
843 struct rb_node *rbn;
844
845 lockdep_assert_held(&sysfs_mutex);
846
847 /* if first iteration, visit leftmost descendant which may be root */
848 if (!pos)
849 return sysfs_leftmost_descendant(root);
850
851 /* if we visited @root, we're done */
852 if (pos == root)
853 return NULL;
854
855 /* if there's an unvisited sibling, visit its leftmost descendant */
856 rbn = rb_next(&pos->s_rb);
857 if (rbn)
858 return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
859
860 /* no sibling left, visit parent */
861 return pos->s_parent;
862}
863
864static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
865 struct sysfs_dirent *sd)
866{
867 struct sysfs_dirent *pos, *next;
868
869 if (!sd)
870 return;
871
872 pr_debug("sysfs %s: removing\n", sd->s_name);
873
874 next = NULL;
875 do {
876 pos = next;
877 next = sysfs_next_descendant_post(pos, sd);
878 if (pos)
879 sysfs_remove_one(acxt, pos);
880 } while (next);
881}
882
883/**
884 * sysfs_remove - remove a sysfs_dirent recursively
885 * @sd: the sysfs_dirent to remove
886 *
887 * Remove @sd along with all its subdirectories and files.
888 */
889void sysfs_remove(struct sysfs_dirent *sd)
890{
891 struct sysfs_addrm_cxt acxt;
892
893 sysfs_addrm_start(&acxt);
894 __sysfs_remove(&acxt, sd);
895 sysfs_addrm_finish(&acxt);
896}
897
898/**
899 * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
900 * @dir_sd: parent of the target
901 * @name: name of the sysfs_dirent to remove
902 * @ns: namespace tag of the sysfs_dirent to remove
903 *
904 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
905 * it. Returns 0 on success, -ENOENT if such entry doesn't exist.
906 */
907int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
908 const void *ns)
909{
910 struct sysfs_addrm_cxt acxt;
911 struct sysfs_dirent *sd;
912
913 if (!dir_sd) {
914 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
915 name);
916 return -ENOENT;
917 } 82 }
918 83
919 sysfs_addrm_start(&acxt); 84 kobj->sd = kn;
920 85 return 0;
921 sd = sysfs_find_dirent(dir_sd, name, ns);
922 if (sd)
923 __sysfs_remove(&acxt, sd);
924
925 sysfs_addrm_finish(&acxt);
926
927 if (sd)
928 return 0;
929 else
930 return -ENOENT;
931} 86}
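A hedged usage sketch: drivers rarely call sysfs_create_dir_ns() directly;
kobject_add() and friends invoke it on their behalf. example_ktype is an
assumed kobj_type defined elsewhere.

	static struct kobject example_kobj;

	static int example_register(struct kobject *parent)
	{
		/* kobject_init_and_add() ends up in sysfs_create_dir_ns() */
		return kobject_init_and_add(&example_kobj, &example_ktype,
					    parent, "example");
	}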
932 87
933/** 88/**
@@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
940 */ 95 */
941void sysfs_remove_dir(struct kobject *kobj) 96void sysfs_remove_dir(struct kobject *kobj)
942{ 97{
943 struct sysfs_dirent *sd = kobj->sd; 98 struct kernfs_node *kn = kobj->sd;
944 99
945 /* 100 /*
946 * In general, the kobject owner is responsible for ensuring removal 101 * In general, the kobject owner is responsible for ensuring removal
947 * doesn't race with other operations and sysfs doesn't provide any 102 * doesn't race with other operations and sysfs doesn't provide any
948 * protection; however, when @kobj is used as a symlink target, the 103 * protection; however, when @kobj is used as a symlink target, the
949 * symlinking entity usually doesn't own @kobj and thus has no 104 * symlinking entity usually doesn't own @kobj and thus has no
950 * control over removal. @kobj->sd may be removed anytime and 105 * control over removal. @kobj->sd may be removed anytime
951 * symlink code may end up dereferencing an already freed sd. 106 * and symlink code may end up dereferencing an already freed node.
952 * 107 *
953 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation 108 * sysfs_symlink_target_lock synchronizes @kobj->sd
954 * against symlink operations so that symlink code can safely 109 * disassociation against symlink operations so that symlink code
955 * dereference @kobj->sd. 110 * can safely dereference @kobj->sd.
956 */ 111 */
957 spin_lock(&sysfs_symlink_target_lock); 112 spin_lock(&sysfs_symlink_target_lock);
958 kobj->sd = NULL; 113 kobj->sd = NULL;
959 spin_unlock(&sysfs_symlink_target_lock); 114 spin_unlock(&sysfs_symlink_target_lock);
960 115
961 if (sd) { 116 if (kn) {
962 WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); 117 WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
963 sysfs_remove(sd); 118 kernfs_remove(kn);
964 } 119 }
965} 120}
966 121
967int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
968 const char *new_name, const void *new_ns)
969{
970 int error;
971
972 mutex_lock(&sysfs_mutex);
973
974 error = 0;
975 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
976 (strcmp(sd->s_name, new_name) == 0))
977 goto out; /* nothing to rename */
978
979 error = -EEXIST;
980 if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
981 goto out;
982
983 /* rename sysfs_dirent */
984 if (strcmp(sd->s_name, new_name) != 0) {
985 error = -ENOMEM;
986 new_name = kstrdup(new_name, GFP_KERNEL);
987 if (!new_name)
988 goto out;
989
990 kfree(sd->s_name);
991 sd->s_name = new_name;
992 }
993
994 /*
995 * Move to the appropriate place in the appropriate directories rbtree.
996 */
997 sysfs_unlink_sibling(sd);
998 sysfs_get(new_parent_sd);
999 sysfs_put(sd->s_parent);
1000 sd->s_ns = new_ns;
1001 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
1002 sd->s_parent = new_parent_sd;
1003 sysfs_link_sibling(sd);
1004
1005 error = 0;
1006 out:
1007 mutex_unlock(&sysfs_mutex);
1008 return error;
1009}
1010
1011int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
1012 const void *new_ns) 123 const void *new_ns)
1013{ 124{
1014 struct sysfs_dirent *parent_sd = kobj->sd->s_parent; 125 struct kernfs_node *parent = kobj->sd->parent;
1015 126
1016 return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); 127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
1017} 128}
1018 129
1019int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
1020 const void *new_ns) 131 const void *new_ns)
1021{ 132{
1022 struct sysfs_dirent *sd = kobj->sd; 133 struct kernfs_node *kn = kobj->sd;
1023 struct sysfs_dirent *new_parent_sd; 134 struct kernfs_node *new_parent;
1024 135
1025 BUG_ON(!sd->s_parent); 136 BUG_ON(!kn->parent);
1026 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 137 new_parent = new_parent_kobj && new_parent_kobj->sd ?
1027 new_parent_kobj->sd : &sysfs_root; 138 new_parent_kobj->sd : sysfs_root_kn;
1028 139
1029 return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); 140 return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
1030} 141}
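Both helpers are reached through the public kobject API; a sketch with the
hypothetical kobject from the earlier example:

	static int example_relocate(struct kobject *new_parent)
	{
		int err;

		err = kobject_rename(&example_kobj, "renamed"); /* -> sysfs_rename_dir_ns() */
		if (err)
			return err;
		return kobject_move(&example_kobj, new_parent); /* -> sysfs_move_dir_ns() */
	}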
1031
1032/* Relationship between s_mode and the DT_xxx types */
1033static inline unsigned char dt_type(struct sysfs_dirent *sd)
1034{
1035 return (sd->s_mode >> 12) & 15;
1036}
1037
1038static int sysfs_dir_release(struct inode *inode, struct file *filp)
1039{
1040 sysfs_put(filp->private_data);
1041 return 0;
1042}
1043
1044static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
1045 struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
1046{
1047 if (pos) {
1048 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
1049 pos->s_parent == parent_sd &&
1050 hash == pos->s_hash;
1051 sysfs_put(pos);
1052 if (!valid)
1053 pos = NULL;
1054 }
1055 if (!pos && (hash > 1) && (hash < INT_MAX)) {
1056 struct rb_node *node = parent_sd->s_dir.children.rb_node;
1057 while (node) {
1058 pos = to_sysfs_dirent(node);
1059
1060 if (hash < pos->s_hash)
1061 node = node->rb_left;
1062 else if (hash > pos->s_hash)
1063 node = node->rb_right;
1064 else
1065 break;
1066 }
1067 }
1068 /* Skip over entries in the wrong namespace */
1069 while (pos && pos->s_ns != ns) {
1070 struct rb_node *node = rb_next(&pos->s_rb);
1071 if (!node)
1072 pos = NULL;
1073 else
1074 pos = to_sysfs_dirent(node);
1075 }
1076 return pos;
1077}
1078
1079static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
1080 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
1081{
1082 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
1083 if (pos)
1084 do {
1085 struct rb_node *node = rb_next(&pos->s_rb);
1086 if (!node)
1087 pos = NULL;
1088 else
1089 pos = to_sysfs_dirent(node);
1090 } while (pos && pos->s_ns != ns);
1091 return pos;
1092}
1093
1094static int sysfs_readdir(struct file *file, struct dir_context *ctx)
1095{
1096 struct dentry *dentry = file->f_path.dentry;
1097 struct sysfs_dirent *parent_sd = dentry->d_fsdata;
1098 struct sysfs_dirent *pos = file->private_data;
1099 enum kobj_ns_type type;
1100 const void *ns;
1101
1102 type = sysfs_ns_type(parent_sd);
1103 ns = sysfs_info(dentry->d_sb)->ns[type];
1104
1105 if (!dir_emit_dots(file, ctx))
1106 return 0;
1107 mutex_lock(&sysfs_mutex);
1108 for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1109 pos;
1110 pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
1111 const char *name = pos->s_name;
1112 unsigned int type = dt_type(pos);
1113 int len = strlen(name);
1114 ino_t ino = pos->s_ino;
1115 ctx->pos = pos->s_hash;
1116 file->private_data = sysfs_get(pos);
1117
1118 mutex_unlock(&sysfs_mutex);
1119 if (!dir_emit(ctx, name, len, ino, type))
1120 return 0;
1121 mutex_lock(&sysfs_mutex);
1122 }
1123 mutex_unlock(&sysfs_mutex);
1124 file->private_data = NULL;
1125 ctx->pos = INT_MAX;
1126 return 0;
1127}
1128
1129static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
1130{
1131 struct inode *inode = file_inode(file);
1132 loff_t ret;
1133
1134 mutex_lock(&inode->i_mutex);
1135 ret = generic_file_llseek(file, offset, whence);
1136 mutex_unlock(&inode->i_mutex);
1137
1138 return ret;
1139}
1140
1141const struct file_operations sysfs_dir_operations = {
1142 .read = generic_read_dir,
1143 .iterate = sysfs_readdir,
1144 .release = sysfs_dir_release,
1145 .llseek = sysfs_dir_llseek,
1146};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 35e7d08fe629..810cf6e613e5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/fsnotify.h>
18#include <linux/namei.h>
19#include <linux/poll.h>
20#include <linux/list.h> 17#include <linux/list.h>
21#include <linux/mutex.h> 18#include <linux/mutex.h>
22#include <linux/limits.h>
23#include <linux/uaccess.h>
24#include <linux/seq_file.h> 19#include <linux/seq_file.h>
25#include <linux/mm.h>
26 20
27#include "sysfs.h" 21#include "sysfs.h"
22#include "../kernfs/kernfs-internal.h"
28 23
29/* 24/*
30 * There's one sysfs_open_file for each open file and one sysfs_open_dirent 25 * Determine ktype->sysfs_ops for the given kernfs_node. This function
31 * for each sysfs_dirent with one or more open files.
32 *
33 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is
34 * protected by sysfs_open_dirent_lock.
35 *
36 * filp->private_data points to seq_file whose ->private points to
37 * sysfs_open_file. sysfs_open_files are chained at
38 * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
39 */
40static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
41static DEFINE_MUTEX(sysfs_open_file_mutex);
42
43struct sysfs_open_dirent {
44 atomic_t refcnt;
45 atomic_t event;
46 wait_queue_head_t poll;
47 struct list_head files; /* goes through sysfs_open_file.list */
48};
49
50struct sysfs_open_file {
51 struct sysfs_dirent *sd;
52 struct file *file;
53 struct mutex mutex;
54 int event;
55 struct list_head list;
56
57 bool mmapped;
58 const struct vm_operations_struct *vm_ops;
59};
60
61static bool sysfs_is_bin(struct sysfs_dirent *sd)
62{
63 return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
64}
65
66static struct sysfs_open_file *sysfs_of(struct file *file)
67{
68 return ((struct seq_file *)file->private_data)->private;
69}
70
71/*
72 * Determine ktype->sysfs_ops for the given sysfs_dirent. This function
73 * must be called while holding an active reference. 26 * must be called while holding an active reference.
74 */ 27 */
75static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) 28static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
76{ 29{
77 struct kobject *kobj = sd->s_parent->s_dir.kobj; 30 struct kobject *kobj = kn->parent->priv;
78 31
79 if (!sysfs_ignore_lockdep(sd)) 32 if (kn->flags & KERNFS_LOCKDEP)
80 lockdep_assert_held(sd); 33 lockdep_assert_held(kn);
81 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; 34 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
82} 35}
83 36
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
86 * details like buffering and seeking. The following function pipes 39 * details like buffering and seeking. The following function pipes
87 * sysfs_ops->show() result through seq_file. 40 * sysfs_ops->show() result through seq_file.
88 */ 41 */
89static int sysfs_seq_show(struct seq_file *sf, void *v) 42static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
90{ 43{
91 struct sysfs_open_file *of = sf->private; 44 struct kernfs_open_file *of = sf->private;
92 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 45 struct kobject *kobj = of->kn->parent->priv;
93 const struct sysfs_ops *ops; 46 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
94 char *buf;
95 ssize_t count; 47 ssize_t count;
48 char *buf;
96 49
97 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE */
98 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
102 } 55 }
103 56
104 /* 57 /*
105 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex 58 * Invoke show(). Control may reach here via seq file lseek even
106 * nests outside active ref and is just to ensure that the ops 59 * if @ops->show() isn't implemented.
107 * aren't called concurrently for the same open file.
108 */ 60 */
109 mutex_lock(&of->mutex); 61 if (ops->show) {
110 if (!sysfs_get_active(of->sd)) { 62 count = ops->show(kobj, of->kn->priv, buf);
111 mutex_unlock(&of->mutex); 63 if (count < 0)
112 return -ENODEV; 64 return count;
113 } 65 }
114 66
115 of->event = atomic_read(&of->sd->s_attr.open->event);
116
117 /*
118 * Lookup @ops and invoke show(). Control may reach here via seq
119 * file lseek even if @ops->show() isn't implemented.
120 */
121 ops = sysfs_file_ops(of->sd);
122 if (ops->show)
123 count = ops->show(kobj, of->sd->s_attr.attr, buf);
124 else
125 count = 0;
126
127 sysfs_put_active(of->sd);
128 mutex_unlock(&of->mutex);
129
130 if (count < 0)
131 return count;
132
133 /* 67 /*
134 * The code works fine with PAGE_SIZE return but it's likely to 68 * The code works fine with PAGE_SIZE return but it's likely to
135 * indicate truncated result or overflow in normal use cases. 69 * indicate truncated result or overflow in normal use cases.
@@ -144,726 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
144 return 0; 78 return 0;
145} 79}
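A hedged example of the sysfs_ops->show() that the seq path above ends up
calling (example_value is hypothetical). The buffer is one page, so
scnprintf() keeps the result within PAGE_SIZE:

	static int example_value;

	static ssize_t example_show(struct kobject *kobj, struct attribute *attr,
				    char *buf)
	{
		return scnprintf(buf, PAGE_SIZE, "%d\n", example_value);
	}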
146 80
147/* 81static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
148 * Read method for bin files. As reading a bin file can have side-effects, 82 size_t count, loff_t pos)
149 * the exact offset and bytes specified in read(2) call should be passed to
150 * the read callback making it difficult to use seq_file. Implement
151 * simplistic custom buffering for bin files.
152 */
153static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
154 size_t bytes, loff_t *off)
155{ 83{
156 struct sysfs_open_file *of = sysfs_of(file); 84 struct bin_attribute *battr = of->kn->priv;
157 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 85 struct kobject *kobj = of->kn->parent->priv;
158 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 86 loff_t size = file_inode(of->file)->i_size;
159 loff_t size = file_inode(file)->i_size;
160 int count = min_t(size_t, bytes, PAGE_SIZE);
161 loff_t offs = *off;
162 char *buf;
163 87
164 if (!bytes) 88 if (!count)
165 return 0; 89 return 0;
166 90
167 if (size) { 91 if (size) {
168 if (offs > size) 92 if (pos > size)
169 return 0; 93 return 0;
170 if (offs + count > size) 94 if (pos + count > size)
171 count = size - offs; 95 count = size - pos;
172 }
173
174 buf = kmalloc(count, GFP_KERNEL);
175 if (!buf)
176 return -ENOMEM;
177
178 /* need of->sd for battr, its parent for kobj */
179 mutex_lock(&of->mutex);
180 if (!sysfs_get_active(of->sd)) {
181 count = -ENODEV;
182 mutex_unlock(&of->mutex);
183 goto out_free;
184 }
185
186 if (battr->read)
187 count = battr->read(file, kobj, battr, buf, offs, count);
188 else
189 count = -EIO;
190
191 sysfs_put_active(of->sd);
192 mutex_unlock(&of->mutex);
193
194 if (count < 0)
195 goto out_free;
196
197 if (copy_to_user(userbuf, buf, count)) {
198 count = -EFAULT;
199 goto out_free;
200 } 96 }
201 97
202 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); 98 if (!battr->read)
203 99 return -EIO;
204 *off = offs + count;
205 100
206 out_free: 101 return battr->read(of->file, kobj, battr, buf, pos, count);
207 kfree(buf);
208 return count;
209} 102}
210 103
211/** 104/* kernfs write callback for regular sysfs files */
212 * flush_write_buffer - push buffer to kobject 105static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
213 * @of: open file 106 size_t count, loff_t pos)
214 * @buf: data buffer for file
215 * @off: file offset to write to
216 * @count: number of bytes
217 *
218 * Get the correct pointers for the kobject and the attribute we're dealing
219 * with, then call the store() method for it with @buf.
220 */
221static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
222 size_t count)
223{ 107{
224 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 108 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
225 int rc = 0; 109 struct kobject *kobj = of->kn->parent->priv;
226
227 /*
228 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex
229 * nests outside active ref and is just to ensure that the ops
230 * aren't called concurrently for the same open file.
231 */
232 mutex_lock(&of->mutex);
233 if (!sysfs_get_active(of->sd)) {
234 mutex_unlock(&of->mutex);
235 return -ENODEV;
236 }
237 110
238 if (sysfs_is_bin(of->sd)) { 111 if (!count)
239 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 112 return 0;
240
241 rc = -EIO;
242 if (battr->write)
243 rc = battr->write(of->file, kobj, battr, buf, off,
244 count);
245 } else {
246 const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
247
248 rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
249 }
250
251 sysfs_put_active(of->sd);
252 mutex_unlock(&of->mutex);
253 113
254 return rc; 114 return ops->store(kobj, of->kn->priv, buf, count);
255} 115}
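The matching sysfs_ops->store() sketch (same hypothetical example_value);
the buffer kernfs passes in is NUL-terminated kernel memory, so it can be
parsed directly:

	static ssize_t example_store(struct kobject *kobj, struct attribute *attr,
				     const char *buf, size_t count)
	{
		unsigned long val;
		int err = kstrtoul(buf, 0, &val);

		if (err)
			return err;
		example_value = val;
		return count;	/* consume the whole buffer */
	}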
256 116
257/** 117/* kernfs write callback for bin sysfs files */
258 * sysfs_write_file - write an attribute 118static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
259 * @file: file pointer 119 size_t count, loff_t pos)
260 * @user_buf: data to write
261 * @count: number of bytes
262 * @ppos: starting offset
263 *
264 * Copy data in from userland and pass it to the matching
265 * sysfs_ops->store() by invoking flush_write_buffer().
266 *
267 * There is no easy way for us to know if userspace is only doing a partial
268 * write, so we don't support them. We expect the entire buffer to come on
269 * the first write. Hint: if you're writing a value, first read the file,
270 * modify only the the value you're changing, then write entire buffer
271 * back.
272 */
273static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
274 size_t count, loff_t *ppos)
275{ 120{
276 struct sysfs_open_file *of = sysfs_of(file); 121 struct bin_attribute *battr = of->kn->priv;
277 ssize_t len = min_t(size_t, count, PAGE_SIZE); 122 struct kobject *kobj = of->kn->parent->priv;
278 loff_t size = file_inode(file)->i_size; 123 loff_t size = file_inode(of->file)->i_size;
279 char *buf;
280 124
281 if (sysfs_is_bin(of->sd) && size) { 125 if (size) {
282 if (size <= *ppos) 126 if (size <= pos)
283 return 0; 127 return 0;
284 len = min_t(ssize_t, len, size - *ppos); 128 count = min_t(ssize_t, count, size - pos);
285 } 129 }
286 130 if (!count)
287 if (!len)
288 return 0; 131 return 0;
289 132
290 buf = kmalloc(len + 1, GFP_KERNEL); 133 if (!battr->write)
291 if (!buf) 134 return -EIO;
292 return -ENOMEM;
293 135
294 if (copy_from_user(buf, user_buf, len)) { 136 return battr->write(of->file, kobj, battr, buf, pos, count);
295 len = -EFAULT;
296 goto out_free;
297 }
298 buf[len] = '\0'; /* guarantee string termination */
299
300 len = flush_write_buffer(of, buf, *ppos, len);
301 if (len > 0)
302 *ppos += len;
303out_free:
304 kfree(buf);
305 return len;
306}
307
308static void sysfs_bin_vma_open(struct vm_area_struct *vma)
309{
310 struct file *file = vma->vm_file;
311 struct sysfs_open_file *of = sysfs_of(file);
312
313 if (!of->vm_ops)
314 return;
315
316 if (!sysfs_get_active(of->sd))
317 return;
318
319 if (of->vm_ops->open)
320 of->vm_ops->open(vma);
321
322 sysfs_put_active(of->sd);
323} 137}
324 138
325static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 139static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
140 struct vm_area_struct *vma)
326{ 141{
327 struct file *file = vma->vm_file; 142 struct bin_attribute *battr = of->kn->priv;
328 struct sysfs_open_file *of = sysfs_of(file); 143 struct kobject *kobj = of->kn->parent->priv;
329 int ret;
330 144
331 if (!of->vm_ops) 145 return battr->mmap(of->file, kobj, battr, vma);
332 return VM_FAULT_SIGBUS;
333
334 if (!sysfs_get_active(of->sd))
335 return VM_FAULT_SIGBUS;
336
337 ret = VM_FAULT_SIGBUS;
338 if (of->vm_ops->fault)
339 ret = of->vm_ops->fault(vma, vmf);
340
341 sysfs_put_active(of->sd);
342 return ret;
343} 146}
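A sketch of a bin_attribute these callbacks would service (the name, size
and zero-filled payload are all hypothetical):

	static ssize_t example_bin_read(struct file *file, struct kobject *kobj,
					struct bin_attribute *attr, char *buf,
					loff_t pos, size_t count)
	{
		memset(buf, 0, count);	/* placeholder payload */
		return count;
	}

	static struct bin_attribute example_bin = {
		.attr	= { .name = "blob", .mode = S_IRUGO },
		.size	= 512,
		.read	= example_bin_read,	/* reached via sysfs_kf_bin_read() */
	};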
344 147
345static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, 148void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
346 struct vm_fault *vmf)
347{ 149{
348 struct file *file = vma->vm_file; 150 struct kernfs_node *kn = kobj->sd, *tmp;
349 struct sysfs_open_file *of = sysfs_of(file);
350 int ret;
351
352 if (!of->vm_ops)
353 return VM_FAULT_SIGBUS;
354 151
355 if (!sysfs_get_active(of->sd)) 152 if (kn && dir)
356 return VM_FAULT_SIGBUS; 153 kn = kernfs_find_and_get(kn, dir);
357
358 ret = 0;
359 if (of->vm_ops->page_mkwrite)
360 ret = of->vm_ops->page_mkwrite(vma, vmf);
361 else 154 else
362 file_update_time(file); 155 kernfs_get(kn);
363
364 sysfs_put_active(of->sd);
365 return ret;
366}
367
368static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
369 void *buf, int len, int write)
370{
371 struct file *file = vma->vm_file;
372 struct sysfs_open_file *of = sysfs_of(file);
373 int ret;
374
375 if (!of->vm_ops)
376 return -EINVAL;
377
378 if (!sysfs_get_active(of->sd))
379 return -EINVAL;
380
381 ret = -EINVAL;
382 if (of->vm_ops->access)
383 ret = of->vm_ops->access(vma, addr, buf, len, write);
384
385 sysfs_put_active(of->sd);
386 return ret;
387}
388
389#ifdef CONFIG_NUMA
390static int sysfs_bin_set_policy(struct vm_area_struct *vma,
391 struct mempolicy *new)
392{
393 struct file *file = vma->vm_file;
394 struct sysfs_open_file *of = sysfs_of(file);
395 int ret;
396
397 if (!of->vm_ops)
398 return 0;
399
400 if (!sysfs_get_active(of->sd))
401 return -EINVAL;
402
403 ret = 0;
404 if (of->vm_ops->set_policy)
405 ret = of->vm_ops->set_policy(vma, new);
406
407 sysfs_put_active(of->sd);
408 return ret;
409}
410
411static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
412 unsigned long addr)
413{
414 struct file *file = vma->vm_file;
415 struct sysfs_open_file *of = sysfs_of(file);
416 struct mempolicy *pol;
417
418 if (!of->vm_ops)
419 return vma->vm_policy;
420
421 if (!sysfs_get_active(of->sd))
422 return vma->vm_policy;
423
424 pol = vma->vm_policy;
425 if (of->vm_ops->get_policy)
426 pol = of->vm_ops->get_policy(vma, addr);
427
428 sysfs_put_active(of->sd);
429 return pol;
430}
431
432static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
433 const nodemask_t *to, unsigned long flags)
434{
435 struct file *file = vma->vm_file;
436 struct sysfs_open_file *of = sysfs_of(file);
437 int ret;
438
439 if (!of->vm_ops)
440 return 0;
441
442 if (!sysfs_get_active(of->sd))
443 return 0;
444
445 ret = 0;
446 if (of->vm_ops->migrate)
447 ret = of->vm_ops->migrate(vma, from, to, flags);
448
449 sysfs_put_active(of->sd);
450 return ret;
451}
452#endif
453
454static const struct vm_operations_struct sysfs_bin_vm_ops = {
455 .open = sysfs_bin_vma_open,
456 .fault = sysfs_bin_fault,
457 .page_mkwrite = sysfs_bin_page_mkwrite,
458 .access = sysfs_bin_access,
459#ifdef CONFIG_NUMA
460 .set_policy = sysfs_bin_set_policy,
461 .get_policy = sysfs_bin_get_policy,
462 .migrate = sysfs_bin_migrate,
463#endif
464};
465
466static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
467{
468 struct sysfs_open_file *of = sysfs_of(file);
469 struct bin_attribute *battr = of->sd->s_attr.bin_attr;
470 struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
471 int rc;
472
473 mutex_lock(&of->mutex);
474
475 /* need of->sd for battr, its parent for kobj */
476 rc = -ENODEV;
477 if (!sysfs_get_active(of->sd))
478 goto out_unlock;
479
480 if (!battr->mmap)
481 goto out_put;
482
483 rc = battr->mmap(file, kobj, battr, vma);
484 if (rc)
485 goto out_put;
486
487 /*
488 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
489 * to satisfy versions of X which crash if the mmap fails: that
490 * substitutes a new vm_file, and we don't then want bin_vm_ops.
491 */
492 if (vma->vm_file != file)
493 goto out_put;
494
495 rc = -EINVAL;
496 if (of->mmapped && of->vm_ops != vma->vm_ops)
497 goto out_put;
498 156
499 /* 157 if (kn && attr) {
500 * It is not possible to successfully wrap close. 158 tmp = kernfs_find_and_get(kn, attr);
501 * So error if someone is trying to use close. 159 kernfs_put(kn);
502 */ 160 kn = tmp;
503 rc = -EINVAL;
504 if (vma->vm_ops && vma->vm_ops->close)
505 goto out_put;
506
507 rc = 0;
508 of->mmapped = 1;
509 of->vm_ops = vma->vm_ops;
510 vma->vm_ops = &sysfs_bin_vm_ops;
511out_put:
512 sysfs_put_active(of->sd);
513out_unlock:
514 mutex_unlock(&of->mutex);
515
516 return rc;
517}
518
519/**
520 * sysfs_get_open_dirent - get or create sysfs_open_dirent
521 * @sd: target sysfs_dirent
522 * @of: sysfs_open_file for this instance of open
523 *
524 * If @sd->s_attr.open exists, increment its reference count;
525 * otherwise, create one. @of is chained to the files list.
526 *
527 * LOCKING:
528 * Kernel thread context (may sleep).
529 *
530 * RETURNS:
531 * 0 on success, -errno on failure.
532 */
533static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
534 struct sysfs_open_file *of)
535{
536 struct sysfs_open_dirent *od, *new_od = NULL;
537
538 retry:
539 mutex_lock(&sysfs_open_file_mutex);
540 spin_lock_irq(&sysfs_open_dirent_lock);
541
542 if (!sd->s_attr.open && new_od) {
543 sd->s_attr.open = new_od;
544 new_od = NULL;
545 } 161 }
546 162
547 od = sd->s_attr.open; 163 if (kn) {
548 if (od) { 164 kernfs_notify(kn);
549 atomic_inc(&od->refcnt); 165 kernfs_put(kn);
550 list_add_tail(&of->list, &od->files);
551 }
552
553 spin_unlock_irq(&sysfs_open_dirent_lock);
554 mutex_unlock(&sysfs_open_file_mutex);
555
556 if (od) {
557 kfree(new_od);
558 return 0;
559 } 166 }
167}
168EXPORT_SYMBOL_GPL(sysfs_notify);
560 169
561 /* not there, initialize a new one and retry */ 170static const struct kernfs_ops sysfs_file_kfops_empty = {
562 new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); 171};
563 if (!new_od)
564 return -ENOMEM;
565 172
566 atomic_set(&new_od->refcnt, 0); 173static const struct kernfs_ops sysfs_file_kfops_ro = {
567 atomic_set(&new_od->event, 1); 174 .seq_show = sysfs_kf_seq_show,
568 init_waitqueue_head(&new_od->poll); 175};
569 INIT_LIST_HEAD(&new_od->files);
570 goto retry;
571}
572 176
573/** 177static const struct kernfs_ops sysfs_file_kfops_wo = {
574 * sysfs_put_open_dirent - put sysfs_open_dirent 178 .write = sysfs_kf_write,
575 * @sd: target sysfs_dirent 179};
576 * @of: associated sysfs_open_file
577 *
578 * Put @sd->s_attr.open and unlink @of from the files list. If
579 * reference count reaches zero, disassociate and free it.
580 *
581 * LOCKING:
582 * None.
583 */
584static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
585 struct sysfs_open_file *of)
586{
587 struct sysfs_open_dirent *od = sd->s_attr.open;
588 unsigned long flags;
589 180
590 mutex_lock(&sysfs_open_file_mutex); 181static const struct kernfs_ops sysfs_file_kfops_rw = {
591 spin_lock_irqsave(&sysfs_open_dirent_lock, flags); 182 .seq_show = sysfs_kf_seq_show,
183 .write = sysfs_kf_write,
184};
592 185
593 if (of) 186static const struct kernfs_ops sysfs_bin_kfops_ro = {
594 list_del(&of->list); 187 .read = sysfs_kf_bin_read,
188};
595 189
596 if (atomic_dec_and_test(&od->refcnt)) 190static const struct kernfs_ops sysfs_bin_kfops_wo = {
597 sd->s_attr.open = NULL; 191 .write = sysfs_kf_bin_write,
598 else 192};
599 od = NULL;
600 193
601 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); 194static const struct kernfs_ops sysfs_bin_kfops_rw = {
602 mutex_unlock(&sysfs_open_file_mutex); 195 .read = sysfs_kf_bin_read,
196 .write = sysfs_kf_bin_write,
197};
603 198
604 kfree(od); 199static const struct kernfs_ops sysfs_bin_kfops_mmap = {
605} 200 .read = sysfs_kf_bin_read,
201 .write = sysfs_kf_bin_write,
202 .mmap = sysfs_kf_bin_mmap,
203};
606 204
607static int sysfs_open_file(struct inode *inode, struct file *file) 205int sysfs_add_file_mode_ns(struct kernfs_node *parent,
206 const struct attribute *attr, bool is_bin,
207 umode_t mode, const void *ns)
608{ 208{
609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 209 struct lock_class_key *key = NULL;
610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 210 const struct kernfs_ops *ops;
611 struct sysfs_open_file *of; 211 struct kernfs_node *kn;
612 bool has_read, has_write; 212 loff_t size;
613 int error = -EACCES;
614
615 /* need attr_sd for attr and ops, its parent for kobj */
616 if (!sysfs_get_active(attr_sd))
617 return -ENODEV;
618 213
619 if (sysfs_is_bin(attr_sd)) { 214 if (!is_bin) {
620 struct bin_attribute *battr = attr_sd->s_attr.bin_attr; 215 struct kobject *kobj = parent->priv;
621 216 const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
622 has_read = battr->read || battr->mmap;
623 has_write = battr->write || battr->mmap;
624 } else {
625 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
626 217
627 /* every kobject with an attribute needs a ktype assigned */ 218 /* every kobject with an attribute needs a ktype assigned */
628 if (WARN(!ops, KERN_ERR 219 if (WARN(!sysfs_ops, KERN_ERR
629 "missing sysfs attribute operations for kobject: %s\n", 220 "missing sysfs attribute operations for kobject: %s\n",
630 kobject_name(kobj))) 221 kobject_name(kobj)))
631 goto err_out; 222 return -EINVAL;
632 223
633 has_read = ops->show; 224 if (sysfs_ops->show && sysfs_ops->store)
634 has_write = ops->store; 225 ops = &sysfs_file_kfops_rw;
635 } 226 else if (sysfs_ops->show)
636 227 ops = &sysfs_file_kfops_ro;
637 /* check perms and supported operations */ 228 else if (sysfs_ops->store)
638 if ((file->f_mode & FMODE_WRITE) && 229 ops = &sysfs_file_kfops_wo;
639 (!(inode->i_mode & S_IWUGO) || !has_write)) 230 else
640 goto err_out; 231 ops = &sysfs_file_kfops_empty;
641 232
642 if ((file->f_mode & FMODE_READ) && 233 size = PAGE_SIZE;
643 (!(inode->i_mode & S_IRUGO) || !has_read)) 234 } else {
644 goto err_out; 235 struct bin_attribute *battr = (void *)attr;
645 236
646 /* allocate a sysfs_open_file for the file */ 237 if (battr->mmap)
647 error = -ENOMEM; 238 ops = &sysfs_bin_kfops_mmap;
648 of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); 239 else if (battr->read && battr->write)
649 if (!of) 240 ops = &sysfs_bin_kfops_rw;
650 goto err_out; 241 else if (battr->read)
651 242 ops = &sysfs_bin_kfops_ro;
652 /* 243 else if (battr->write)
653 * The following is done to give a different lockdep key to 244 ops = &sysfs_bin_kfops_wo;
654 * @of->mutex for files which implement mmap. This is a rather 245 else
655 * crude way to avoid false positive lockdep warning around 246 ops = &sysfs_file_kfops_empty;
656 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and 247
657 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under 248 size = battr->size;
658 * which mm->mmap_sem nests, while holding @of->mutex. As each
659 * open file has a separate mutex, it's okay as long as those don't
660 * happen on the same file. At this point, we can't easily give
661 * each file a separate locking class. Let's differentiate on
662 * whether the file is bin or not for now.
663 */
664 if (sysfs_is_bin(attr_sd))
665 mutex_init(&of->mutex);
666 else
667 mutex_init(&of->mutex);
668
669 of->sd = attr_sd;
670 of->file = file;
671
672 /*
673 * Always instantiate seq_file even if read access doesn't use
674 * seq_file or is not requested. This unifies private data access
675 * and readable regular files are the vast majority anyway.
676 */
677 if (sysfs_is_bin(attr_sd))
678 error = single_open(file, NULL, of);
679 else
680 error = single_open(file, sysfs_seq_show, of);
681 if (error)
682 goto err_free;
683
684 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
685 if (file->f_mode & FMODE_WRITE)
686 file->f_mode |= FMODE_PWRITE;
687
688 /* make sure we have open dirent struct */
689 error = sysfs_get_open_dirent(attr_sd, of);
690 if (error)
691 goto err_close;
692
693 /* open succeeded, put active references */
694 sysfs_put_active(attr_sd);
695 return 0;
696
697err_close:
698 single_release(inode, file);
699err_free:
700 kfree(of);
701err_out:
702 sysfs_put_active(attr_sd);
703 return error;
704}
705
706static int sysfs_release(struct inode *inode, struct file *filp)
707{
708 struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
709 struct sysfs_open_file *of = sysfs_of(filp);
710
711 sysfs_put_open_dirent(sd, of);
712 single_release(inode, filp);
713 kfree(of);
714
715 return 0;
716}
717
718void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
719{
720 struct sysfs_open_dirent *od;
721 struct sysfs_open_file *of;
722
723 if (!sysfs_is_bin(sd))
724 return;
725
726 spin_lock_irq(&sysfs_open_dirent_lock);
727 od = sd->s_attr.open;
728 if (od)
729 atomic_inc(&od->refcnt);
730 spin_unlock_irq(&sysfs_open_dirent_lock);
731 if (!od)
732 return;
733
734 mutex_lock(&sysfs_open_file_mutex);
735 list_for_each_entry(of, &od->files, list) {
736 struct inode *inode = file_inode(of->file);
737 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
738 } 249 }
739 mutex_unlock(&sysfs_open_file_mutex);
740
741 sysfs_put_open_dirent(sd, NULL);
742}
743
744/* Sysfs attribute files are pollable. The idea is that you read
745 * the content and then you use 'poll' or 'select' to wait for
746 * the content to change. When the content changes (assuming the
747 * manager for the kobject supports notification), poll will
748 * return POLLERR|POLLPRI, and select will return the fd whether
749 * it is waiting for read, write, or exceptions.
750 * Once poll/select indicates that the value has changed, you
751 * need to close and re-open the file, or seek to 0 and read again.
752 * Reminder: this only works for attributes which actively support
753 * it, and it is not possible to test an attribute from userspace
754 * to see if it supports poll (neither 'poll' nor 'select' returns
755 * an appropriate error code). When in doubt, set a suitable timeout value.
756 */
757static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
758{
759 struct sysfs_open_file *of = sysfs_of(filp);
760 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
761 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
762
763 /* need parent for the kobj, grab both */
764 if (!sysfs_get_active(attr_sd))
765 goto trigger;
766
767 poll_wait(filp, &od->poll, wait);
768 250
769 sysfs_put_active(attr_sd); 251#ifdef CONFIG_DEBUG_LOCK_ALLOC
770 252 if (!attr->ignore_lockdep)
771 if (of->event != atomic_read(&od->event)) 253 key = attr->key ?: (struct lock_class_key *)&attr->skey;
772 goto trigger; 254#endif
773 255 kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
774 return DEFAULT_POLLMASK; 256 (void *)attr, ns, true, key);
775 257 if (IS_ERR(kn)) {
776 trigger: 258 if (PTR_ERR(kn) == -EEXIST)
777 return DEFAULT_POLLMASK|POLLERR|POLLPRI; 259 sysfs_warn_dup(parent, attr->name);
778} 260 return PTR_ERR(kn);
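A hedged userspace sketch of the protocol described in the comment above:
read once, wait for POLLERR|POLLPRI, then seek back to 0 and re-read (the
attribute path is hypothetical):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];
		ssize_t n;
		int fd = open("/sys/devices/example/state", O_RDONLY);
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		if (fd < 0)
			return 1;
		read(fd, buf, sizeof(buf));	/* must read before polling */
		while (poll(&pfd, 1, -1) > 0) {
			lseek(fd, 0, SEEK_SET);	/* value changed: re-read from 0 */
			n = read(fd, buf, sizeof(buf));
			if (n > 0)
				fwrite(buf, 1, (size_t)n, stdout);
		}
		close(fd);
		return 0;
	}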
779
780void sysfs_notify_dirent(struct sysfs_dirent *sd)
781{
782 struct sysfs_open_dirent *od;
783 unsigned long flags;
784
785 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
786
787 if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
788 od = sd->s_attr.open;
789 if (od) {
790 atomic_inc(&od->event);
791 wake_up_interruptible(&od->poll);
792 }
793 } 261 }
794 262 return 0;
795 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
796}
797EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
798
799void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
800{
801 struct sysfs_dirent *sd = k->sd;
802
803 mutex_lock(&sysfs_mutex);
804
805 if (sd && dir)
806 sd = sysfs_find_dirent(sd, dir, NULL);
807 if (sd && attr)
808 sd = sysfs_find_dirent(sd, attr, NULL);
809 if (sd)
810 sysfs_notify_dirent(sd);
811
812 mutex_unlock(&sysfs_mutex);
813}
814EXPORT_SYMBOL_GPL(sysfs_notify);
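On the producer side, a driver only has to call sysfs_notify() after updating its backing state; that increments od->event and wakes the waitqueue used by sysfs_poll() above. A sketch with an invented foo_device and attribute name, where only sysfs_notify() is real API:

#include <linux/device.h>
#include <linux/sysfs.h>

/* hypothetical driver state; only sysfs_notify() below is real API */
struct foo_device {
	struct device *dev;
	int alarm;
};

static void foo_set_alarm(struct foo_device *foo, int val)
{
	foo->alarm = val;
	/* bump the event count and wake anyone poll()ing "alarm" */
	sysfs_notify(&foo->dev->kobj, NULL, "alarm");
}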
815
816const struct file_operations sysfs_file_operations = {
817 .read = seq_read,
818 .write = sysfs_write_file,
819 .llseek = generic_file_llseek,
820 .open = sysfs_open_file,
821 .release = sysfs_release,
822 .poll = sysfs_poll,
823};
824
825const struct file_operations sysfs_bin_operations = {
826 .read = sysfs_bin_read,
827 .write = sysfs_write_file,
828 .llseek = generic_file_llseek,
829 .mmap = sysfs_bin_mmap,
830 .open = sysfs_open_file,
831 .release = sysfs_release,
832 .poll = sysfs_poll,
833};
834
835int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
836 const struct attribute *attr, int type,
837 umode_t amode, const void *ns)
838{
839 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
840 struct sysfs_addrm_cxt acxt;
841 struct sysfs_dirent *sd;
842 int rc;
843
844 sd = sysfs_new_dirent(attr->name, mode, type);
845 if (!sd)
846 return -ENOMEM;
847
848 sd->s_ns = ns;
849 sd->s_attr.attr = (void *)attr;
850 sysfs_dirent_init_lockdep(sd);
851
852 sysfs_addrm_start(&acxt);
853 rc = sysfs_add_one(&acxt, sd, dir_sd);
854 sysfs_addrm_finish(&acxt);
855
856 if (rc)
857 sysfs_put(sd);
858
859 return rc;
860} 263}
861 264
862 265int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
863int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, 266 bool is_bin)
864 int type)
865{ 267{
866 return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); 268 return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
867} 269}
868 270
869/** 271/**
@@ -877,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
877{ 279{
878 BUG_ON(!kobj || !kobj->sd || !attr); 280 BUG_ON(!kobj || !kobj->sd || !attr);
879 281
880 return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, 282 return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
881 attr->mode, ns);
882 283
883} 284}
884EXPORT_SYMBOL_GPL(sysfs_create_file_ns); 285EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -906,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
906int sysfs_add_file_to_group(struct kobject *kobj, 307int sysfs_add_file_to_group(struct kobject *kobj,
907 const struct attribute *attr, const char *group) 308 const struct attribute *attr, const char *group)
908{ 309{
909 struct sysfs_dirent *dir_sd; 310 struct kernfs_node *parent;
910 int error; 311 int error;
911 312
912 if (group) 313 if (group) {
913 dir_sd = sysfs_get_dirent(kobj->sd, group); 314 parent = kernfs_find_and_get(kobj->sd, group);
914 else 315 } else {
915 dir_sd = sysfs_get(kobj->sd); 316 parent = kobj->sd;
317 kernfs_get(parent);
318 }
916 319
917 if (!dir_sd) 320 if (!parent)
918 return -ENOENT; 321 return -ENOENT;
919 322
920 error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); 323 error = sysfs_add_file(parent, attr, false);
921 sysfs_put(dir_sd); 324 kernfs_put(parent);
922 325
923 return error; 326 return error;
924} 327}
@@ -934,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
934int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, 337int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
935 umode_t mode) 338 umode_t mode)
936{ 339{
937 struct sysfs_dirent *sd; 340 struct kernfs_node *kn;
938 struct iattr newattrs; 341 struct iattr newattrs;
939 int rc; 342 int rc;
940 343
941 mutex_lock(&sysfs_mutex); 344 kn = kernfs_find_and_get(kobj->sd, attr->name);
942 345 if (!kn)
943 rc = -ENOENT; 346 return -ENOENT;
944 sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
945 if (!sd)
946 goto out;
947 347
948 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); 348 newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
949 newattrs.ia_valid = ATTR_MODE; 349 newattrs.ia_valid = ATTR_MODE;
950 rc = sysfs_sd_setattr(sd, &newattrs);
951 350
952 out: 351 rc = kernfs_setattr(kn, &newattrs);
953 mutex_unlock(&sysfs_mutex); 352
353 kernfs_put(kn);
954 return rc; 354 return rc;
955} 355}
956EXPORT_SYMBOL_GPL(sysfs_chmod_file); 356EXPORT_SYMBOL_GPL(sysfs_chmod_file);
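This hunk shows the conversion recipe that repeats through the rest of the series: where the old code took sysfs_mutex and called sysfs_find_dirent(), the new code calls kernfs_find_and_get(), which returns the node with its own reference (or NULL) and is paired with kernfs_put(). Condensed into a standalone sketch, with "frob" as a hypothetical attribute name:

#include <linux/fs.h>
#include <linux/kernfs.h>
#include <linux/kobject.h>
#include <linux/stat.h>

static int frob_chmod(struct kobject *kobj, umode_t mode)
{
	struct kernfs_node *kn;
	struct iattr newattrs;
	int rc;

	kn = kernfs_find_and_get(kobj->sd, "frob");	/* takes a reference */
	if (!kn)
		return -ENOENT;

	newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
	newattrs.ia_valid = ATTR_MODE;
	rc = kernfs_setattr(kn, &newattrs);		/* locks internally */

	kernfs_put(kn);					/* drop the reference */
	return rc;
}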
@@ -966,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
966void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, 366void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
967 const void *ns) 367 const void *ns)
968{ 368{
969 struct sysfs_dirent *dir_sd = kobj->sd; 369 struct kernfs_node *parent = kobj->sd;
970 370
971 sysfs_hash_and_remove(dir_sd, attr->name, ns); 371 kernfs_remove_by_name_ns(parent, attr->name, ns);
972} 372}
973EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
974 374
@@ -989,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
989void sysfs_remove_file_from_group(struct kobject *kobj, 389void sysfs_remove_file_from_group(struct kobject *kobj,
990 const struct attribute *attr, const char *group) 390 const struct attribute *attr, const char *group)
991{ 391{
992 struct sysfs_dirent *dir_sd; 392 struct kernfs_node *parent;
993 393
994 if (group) 394 if (group) {
995 dir_sd = sysfs_get_dirent(kobj->sd, group); 395 parent = kernfs_find_and_get(kobj->sd, group);
996 else 396 } else {
997 dir_sd = sysfs_get(kobj->sd); 397 parent = kobj->sd;
998 if (dir_sd) { 398 kernfs_get(parent);
999 sysfs_hash_and_remove(dir_sd, attr->name, NULL); 399 }
1000 sysfs_put(dir_sd); 400
401 if (parent) {
402 kernfs_remove_by_name(parent, attr->name);
403 kernfs_put(parent);
1001 } 404 }
1002} 405}
1003EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 406EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -1012,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
1012{ 415{
1013 BUG_ON(!kobj || !kobj->sd || !attr); 416 BUG_ON(!kobj || !kobj->sd || !attr);
1014 417
1015 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 418 return sysfs_add_file(kobj->sd, &attr->attr, true);
1016} 419}
1017EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 420EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1018 421
@@ -1024,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1024void sysfs_remove_bin_file(struct kobject *kobj, 427void sysfs_remove_bin_file(struct kobject *kobj,
1025 const struct bin_attribute *attr) 428 const struct bin_attribute *attr)
1026{ 429{
1027 sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); 430 kernfs_remove_by_name(kobj->sd, attr->attr.name);
1028} 431}
1029EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
1030 433
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..6b579387c67a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
18#include "sysfs.h" 18#include "sysfs.h"
19 19
20 20
21static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 21static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
22 const struct attribute_group *grp) 22 const struct attribute_group *grp)
23{ 23{
24 struct attribute *const *attr; 24 struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
26 26
27 if (grp->attrs) 27 if (grp->attrs)
28 for (attr = grp->attrs; *attr; attr++) 28 for (attr = grp->attrs; *attr; attr++)
29 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 29 kernfs_remove_by_name(parent, (*attr)->name);
30 if (grp->bin_attrs) 30 if (grp->bin_attrs)
31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) 31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
32 sysfs_remove_bin_file(kobj, *bin_attr); 32 sysfs_remove_bin_file(kobj, *bin_attr);
33} 33}
34 34
35static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 35static int create_files(struct kernfs_node *parent, struct kobject *kobj,
36 const struct attribute_group *grp, int update) 36 const struct attribute_group *grp, int update)
37{ 37{
38 struct attribute *const *attr; 38 struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
49 * re-adding (if required) the file. 49 * re-adding (if required) the file.
50 */ 50 */
51 if (update) 51 if (update)
52 sysfs_hash_and_remove(dir_sd, (*attr)->name, 52 kernfs_remove_by_name(parent, (*attr)->name);
53 NULL);
54 if (grp->is_visible) { 53 if (grp->is_visible) {
55 mode = grp->is_visible(kobj, *attr, i); 54 mode = grp->is_visible(kobj, *attr, i);
56 if (!mode) 55 if (!mode)
57 continue; 56 continue;
58 } 57 }
59 error = sysfs_add_file_mode_ns(dir_sd, *attr, 58 error = sysfs_add_file_mode_ns(parent, *attr, false,
60 SYSFS_KOBJ_ATTR,
61 (*attr)->mode | mode, 59 (*attr)->mode | mode,
62 NULL); 60 NULL);
63 if (unlikely(error)) 61 if (unlikely(error))
64 break; 62 break;
65 } 63 }
66 if (error) { 64 if (error) {
67 remove_files(dir_sd, kobj, grp); 65 remove_files(parent, kobj, grp);
68 goto exit; 66 goto exit;
69 } 67 }
70 } 68 }
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
78 break; 76 break;
79 } 77 }
80 if (error) 78 if (error)
81 remove_files(dir_sd, kobj, grp); 79 remove_files(parent, kobj, grp);
82 } 80 }
83exit: 81exit:
84 return error; 82 return error;
@@ -88,7 +86,7 @@ exit:
88static int internal_create_group(struct kobject *kobj, int update, 86static int internal_create_group(struct kobject *kobj, int update,
89 const struct attribute_group *grp) 87 const struct attribute_group *grp)
90{ 88{
91 struct sysfs_dirent *sd; 89 struct kernfs_node *kn;
92 int error; 90 int error;
93 91
94 BUG_ON(!kobj || (!update && !kobj->sd)); 92 BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update,
102 return -EINVAL; 100 return -EINVAL;
103 } 101 }
104 if (grp->name) { 102 if (grp->name) {
105 error = sysfs_create_subdir(kobj, grp->name, &sd); 103 kn = kernfs_create_dir(kobj->sd, grp->name,
106 if (error) 104 S_IRWXU | S_IRUGO | S_IXUGO, kobj);
107 return error; 105 if (IS_ERR(kn)) {
106 if (PTR_ERR(kn) == -EEXIST)
107 sysfs_warn_dup(kobj->sd, grp->name);
108 return PTR_ERR(kn);
109 }
108 } else 110 } else
109 sd = kobj->sd; 111 kn = kobj->sd;
110 sysfs_get(sd); 112 kernfs_get(kn);
111 error = create_files(sd, kobj, grp, update); 113 error = create_files(kn, kobj, grp, update);
112 if (error) { 114 if (error) {
113 if (grp->name) 115 if (grp->name)
114 sysfs_remove(sd); 116 kernfs_remove(kn);
115 } 117 }
116 sysfs_put(sd); 118 kernfs_put(kn);
117 return error; 119 return error;
118} 120}
119 121
@@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
203void sysfs_remove_group(struct kobject *kobj, 205void sysfs_remove_group(struct kobject *kobj,
204 const struct attribute_group *grp) 206 const struct attribute_group *grp)
205{ 207{
206 struct sysfs_dirent *dir_sd = kobj->sd; 208 struct kernfs_node *parent = kobj->sd;
207 struct sysfs_dirent *sd; 209 struct kernfs_node *kn;
208 210
209 if (grp->name) { 211 if (grp->name) {
210 sd = sysfs_get_dirent(dir_sd, grp->name); 212 kn = kernfs_find_and_get(parent, grp->name);
211 if (!sd) { 213 if (!kn) {
212 WARN(!sd, KERN_WARNING 214 WARN(!kn, KERN_WARNING
213 "sysfs group %p not found for kobject '%s'\n", 215 "sysfs group %p not found for kobject '%s'\n",
214 grp, kobject_name(kobj)); 216 grp, kobject_name(kobj));
215 return; 217 return;
216 } 218 }
217 } else 219 } else {
218 sd = sysfs_get(dir_sd); 220 kn = parent;
221 kernfs_get(kn);
222 }
219 223
220 remove_files(sd, kobj, grp); 224 remove_files(kn, kobj, grp);
221 if (grp->name) 225 if (grp->name)
222 sysfs_remove(sd); 226 kernfs_remove(kn);
223 227
224 sysfs_put(sd); 228 kernfs_put(kn);
225} 229}
226EXPORT_SYMBOL_GPL(sysfs_remove_group); 230EXPORT_SYMBOL_GPL(sysfs_remove_group);
227 231
@@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
257int sysfs_merge_group(struct kobject *kobj, 261int sysfs_merge_group(struct kobject *kobj,
258 const struct attribute_group *grp) 262 const struct attribute_group *grp)
259{ 263{
260 struct sysfs_dirent *dir_sd; 264 struct kernfs_node *parent;
261 int error = 0; 265 int error = 0;
262 struct attribute *const *attr; 266 struct attribute *const *attr;
263 int i; 267 int i;
264 268
265 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 269 parent = kernfs_find_and_get(kobj->sd, grp->name);
266 if (!dir_sd) 270 if (!parent)
267 return -ENOENT; 271 return -ENOENT;
268 272
269 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) 273 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
270 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); 274 error = sysfs_add_file(parent, *attr, false);
271 if (error) { 275 if (error) {
272 while (--i >= 0) 276 while (--i >= 0)
273 sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); 277 kernfs_remove_by_name(parent, (*--attr)->name);
274 } 278 }
275 sysfs_put(dir_sd); 279 kernfs_put(parent);
276 280
277 return error; 281 return error;
278} 282}
@@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
286void sysfs_unmerge_group(struct kobject *kobj, 290void sysfs_unmerge_group(struct kobject *kobj,
287 const struct attribute_group *grp) 291 const struct attribute_group *grp)
288{ 292{
289 struct sysfs_dirent *dir_sd; 293 struct kernfs_node *parent;
290 struct attribute *const *attr; 294 struct attribute *const *attr;
291 295
292 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 296 parent = kernfs_find_and_get(kobj->sd, grp->name);
293 if (dir_sd) { 297 if (parent) {
294 for (attr = grp->attrs; *attr; ++attr) 298 for (attr = grp->attrs; *attr; ++attr)
295 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 299 kernfs_remove_by_name(parent, (*attr)->name);
296 sysfs_put(dir_sd); 300 kernfs_put(parent);
297 } 301 }
298} 302}
299EXPORT_SYMBOL_GPL(sysfs_unmerge_group); 303EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
308int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, 312int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
309 struct kobject *target, const char *link_name) 313 struct kobject *target, const char *link_name)
310{ 314{
311 struct sysfs_dirent *dir_sd; 315 struct kernfs_node *parent;
312 int error = 0; 316 int error = 0;
313 317
314 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 318 parent = kernfs_find_and_get(kobj->sd, group_name);
315 if (!dir_sd) 319 if (!parent)
316 return -ENOENT; 320 return -ENOENT;
317 321
318 error = sysfs_create_link_sd(dir_sd, target, link_name); 322 error = sysfs_create_link_sd(parent, target, link_name);
319 sysfs_put(dir_sd); 323 kernfs_put(parent);
320 324
321 return error; 325 return error;
322} 326}
@@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
331void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, 335void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
332 const char *link_name) 336 const char *link_name)
333{ 337{
334 struct sysfs_dirent *dir_sd; 338 struct kernfs_node *parent;
335 339
336 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 340 parent = kernfs_find_and_get(kobj->sd, group_name);
337 if (dir_sd) { 341 if (parent) {
338 sysfs_hash_and_remove(dir_sd, link_name, NULL); 342 kernfs_remove_by_name(parent, link_name);
339 sysfs_put(dir_sd); 343 kernfs_put(parent);
340 } 344 }
341} 345}
342EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); 346EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * fs/sysfs/inode.c - basic sysfs inode and dentry operations
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
11 */
12
13#undef DEBUG
14
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/backing-dev.h>
18#include <linux/capability.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/sysfs.h>
23#include <linux/xattr.h>
24#include <linux/security.h>
25#include "sysfs.h"
26
27static const struct address_space_operations sysfs_aops = {
28 .readpage = simple_readpage,
29 .write_begin = simple_write_begin,
30 .write_end = simple_write_end,
31};
32
33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
35 .ra_pages = 0, /* No readahead */
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
37};
38
39static const struct inode_operations sysfs_inode_operations = {
40 .permission = sysfs_permission,
41 .setattr = sysfs_setattr,
42 .getattr = sysfs_getattr,
43 .setxattr = sysfs_setxattr,
44};
45
46int __init sysfs_inode_init(void)
47{
48 return bdi_init(&sysfs_backing_dev_info);
49}
50
51static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
52{
53 struct sysfs_inode_attrs *attrs;
54 struct iattr *iattrs;
55
56 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
57 if (!attrs)
58 return NULL;
59 iattrs = &attrs->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = sd->s_mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 return attrs;
68}
69
70int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
71{
72 struct sysfs_inode_attrs *sd_attrs;
73 struct iattr *iattrs;
74 unsigned int ia_valid = iattr->ia_valid;
75
76 sd_attrs = sd->s_iattr;
77
78 if (!sd_attrs) {
79 /* setting attributes for the first time, allocate now */
80 sd_attrs = sysfs_init_inode_attrs(sd);
81 if (!sd_attrs)
82 return -ENOMEM;
83 sd->s_iattr = sd_attrs;
84 }
85	/* attributes were changed at least once in the past */
86 iattrs = &sd_attrs->ia_iattr;
87
88 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 return 0;
103}
104
105int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
106{
107 struct inode *inode = dentry->d_inode;
108 struct sysfs_dirent *sd = dentry->d_fsdata;
109 int error;
110
111 if (!sd)
112 return -EINVAL;
113
114 mutex_lock(&sysfs_mutex);
115 error = inode_change_ok(inode, iattr);
116 if (error)
117 goto out;
118
119 error = sysfs_sd_setattr(sd, iattr);
120 if (error)
121 goto out;
122
123 /* this ignores size changes */
124 setattr_copy(inode, iattr);
125
126out:
127 mutex_unlock(&sysfs_mutex);
128 return error;
129}
130
131static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
132 u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 if (!sd->s_iattr) {
139 sd->s_iattr = sysfs_init_inode_attrs(sd);
140 if (!sd->s_iattr)
141 return -ENOMEM;
142 }
143
144 iattrs = sd->s_iattr;
145 old_secdata = iattrs->ia_secdata;
146 old_secdata_len = iattrs->ia_secdata_len;
147
148 iattrs->ia_secdata = *secdata;
149 iattrs->ia_secdata_len = *secdata_len;
150
151 *secdata = old_secdata;
152 *secdata_len = old_secdata_len;
153 return 0;
154}
155
156int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
157 size_t size, int flags)
158{
159 struct sysfs_dirent *sd = dentry->d_fsdata;
160 void *secdata;
161 int error;
162 u32 secdata_len = 0;
163
164 if (!sd)
165 return -EINVAL;
166
167 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
168 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
169 error = security_inode_setsecurity(dentry->d_inode, suffix,
170 value, size, flags);
171 if (error)
172 goto out;
173 error = security_inode_getsecctx(dentry->d_inode,
174 &secdata, &secdata_len);
175 if (error)
176 goto out;
177
178 mutex_lock(&sysfs_mutex);
179 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
180 mutex_unlock(&sysfs_mutex);
181
182 if (secdata)
183 security_release_secctx(secdata, secdata_len);
184 } else
185 return -EINVAL;
186out:
187 return error;
188}
189
190static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
191{
192 inode->i_mode = mode;
193 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
194}
195
196static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
197{
198 inode->i_uid = iattr->ia_uid;
199 inode->i_gid = iattr->ia_gid;
200 inode->i_atime = iattr->ia_atime;
201 inode->i_mtime = iattr->ia_mtime;
202 inode->i_ctime = iattr->ia_ctime;
203}
204
205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
206{
207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
208
209 inode->i_mode = sd->s_mode;
210 if (iattrs) {
211		/* sysfs_dirent has non-default attributes;
212		 * get them from the persistent copy in sysfs_dirent
213 */
214 set_inode_attr(inode, &iattrs->ia_iattr);
215 security_inode_notifysecctx(inode,
216 iattrs->ia_secdata,
217 iattrs->ia_secdata_len);
218 }
219
220 if (sysfs_type(sd) == SYSFS_DIR)
221 set_nlink(inode, sd->s_dir.subdirs + 2);
222}
223
224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
225 struct kstat *stat)
226{
227 struct sysfs_dirent *sd = dentry->d_fsdata;
228 struct inode *inode = dentry->d_inode;
229
230 mutex_lock(&sysfs_mutex);
231 sysfs_refresh_inode(sd, inode);
232 mutex_unlock(&sysfs_mutex);
233
234 generic_fillattr(inode, stat);
235 return 0;
236}
237
238static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
239{
240 struct bin_attribute *bin_attr;
241
242 inode->i_private = sysfs_get(sd);
243 inode->i_mapping->a_ops = &sysfs_aops;
244 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
245 inode->i_op = &sysfs_inode_operations;
246
247 set_default_inode_attr(inode, sd->s_mode);
248 sysfs_refresh_inode(sd, inode);
249
250 /* initialize inode according to type */
251 switch (sysfs_type(sd)) {
252 case SYSFS_DIR:
253 inode->i_op = &sysfs_dir_inode_operations;
254 inode->i_fop = &sysfs_dir_operations;
255 break;
256 case SYSFS_KOBJ_ATTR:
257 inode->i_size = PAGE_SIZE;
258 inode->i_fop = &sysfs_file_operations;
259 break;
260 case SYSFS_KOBJ_BIN_ATTR:
261 bin_attr = sd->s_attr.bin_attr;
262 inode->i_size = bin_attr->size;
263 inode->i_fop = &sysfs_bin_operations;
264 break;
265 case SYSFS_KOBJ_LINK:
266 inode->i_op = &sysfs_symlink_inode_operations;
267 break;
268 default:
269 BUG();
270 }
271
272 unlock_new_inode(inode);
273}
274
275/**
276 * sysfs_get_inode - get inode for sysfs_dirent
277 * @sb: super block
278 * @sd: sysfs_dirent to allocate inode for
279 *
280 * Get the inode for @sd. If no such inode exists, a new one is
281 * allocated and initialized; sysfs_init_inode() unlocks it before
282 * it is returned.
283 *
284 * LOCKING:
285 * Kernel thread context (may sleep).
286 *
287 * RETURNS:
288 * Pointer to allocated inode on success, NULL on failure.
289 */
290struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
291{
292 struct inode *inode;
293
294 inode = iget_locked(sb, sd->s_ino);
295 if (inode && (inode->i_state & I_NEW))
296 sysfs_init_inode(sd, inode);
297
298 return inode;
299}
300
301/*
302 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
303 * To prevent the sysfs inode numbers from being freed prematurely we take a
304 * reference to sysfs_dirent from the sysfs inode. A
305 * super_operations.evict_inode() implementation is needed to drop that
306 * reference upon inode destruction.
307 */
308void sysfs_evict_inode(struct inode *inode)
309{
310 struct sysfs_dirent *sd = inode->i_private;
311
312 truncate_inode_pages(&inode->i_data, 0);
313 clear_inode(inode);
314 sysfs_put(sd);
315}
316
317int sysfs_permission(struct inode *inode, int mask)
318{
319 struct sysfs_dirent *sd;
320
321 if (mask & MAY_NOT_BLOCK)
322 return -ECHILD;
323
324 sd = inode->i_private;
325
326 mutex_lock(&sysfs_mutex);
327 sysfs_refresh_inode(sd, inode);
328 mutex_unlock(&sysfs_mutex);
329
330 return generic_permission(inode, mask);
331}
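The deleted sysfs_get_inode() above is an instance of the standard VFS get-or-create idiom: iget_locked() returns either the existing inode or a fresh one flagged I_NEW and locked, and only the allocating caller initializes it and drops the lock with unlock_new_inode(). The skeleton, with my_fill_inode() as a placeholder initializer:

#include <linux/fs.h>

static void my_fill_inode(struct inode *inode)
{
	/* placeholder: set i_op, i_fop, mode, etc. for the new inode */
}

static struct inode *my_get_inode(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		/* we allocated it: initialize, then publish */
		my_fill_inode(inode);
		unlock_new_inode(inode);
	}
	return inode;
}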
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..3eaf5c6622eb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,42 @@
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/pagemap.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/magic.h>
21#include <linux/slab.h>
22#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
23 19
24#include "sysfs.h" 20#include "sysfs.h"
25 21
26 22static struct kernfs_root *sysfs_root;
27static struct vfsmount *sysfs_mnt; 23struct kernfs_node *sysfs_root_kn;
28struct kmem_cache *sysfs_dir_cachep;
29
30static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode,
33 .evict_inode = sysfs_evict_inode,
34};
35
36struct sysfs_dirent sysfs_root = {
37 .s_name = "",
38 .s_count = ATOMIC_INIT(1),
39 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
40 .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
41 .s_ino = 1,
42};
43
44static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
45{
46 struct inode *inode;
47 struct dentry *root;
48
49 sb->s_blocksize = PAGE_CACHE_SIZE;
50 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
51 sb->s_magic = SYSFS_MAGIC;
52 sb->s_op = &sysfs_ops;
53 sb->s_time_gran = 1;
54
55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex);
59 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n");
61 return -ENOMEM;
62 }
63
64 /* instantiate and link root dentry */
65 root = d_make_root(inode);
66 if (!root) {
67 pr_debug("%s: could not get root dentry!\n", __func__);
68 return -ENOMEM;
69 }
70 root->d_fsdata = &sysfs_root;
71 sb->s_root = root;
72 sb->s_d_op = &sysfs_dentry_ops;
73 return 0;
74}
75
76static int sysfs_test_super(struct super_block *sb, void *data)
77{
78 struct sysfs_super_info *sb_info = sysfs_info(sb);
79 struct sysfs_super_info *info = data;
80 enum kobj_ns_type type;
81 int found = 1;
82
83 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
84 if (sb_info->ns[type] != info->ns[type])
85 found = 0;
86 }
87 return found;
88}
89
90static int sysfs_set_super(struct super_block *sb, void *data)
91{
92 int error;
93 error = set_anon_super(sb, data);
94 if (!error)
95 sb->s_fs_info = data;
96 return error;
97}
98
99static void free_sysfs_super_info(struct sysfs_super_info *info)
100{
101 int type;
102 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
103 kobj_ns_drop(type, info->ns[type]);
104 kfree(info);
105}
106 24
107static struct dentry *sysfs_mount(struct file_system_type *fs_type, 25static struct dentry *sysfs_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data) 26 int flags, const char *dev_name, void *data)
109{ 27{
110 struct sysfs_super_info *info; 28 struct dentry *root;
111 enum kobj_ns_type type; 29 void *ns;
112 struct super_block *sb; 30 bool new_sb;
113 int error;
114 31
115 if (!(flags & MS_KERNMOUNT)) { 32 if (!(flags & MS_KERNMOUNT)) {
116 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) 33 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
117 return ERR_PTR(-EPERM); 34 return ERR_PTR(-EPERM);
118 35
119 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { 36 if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
120 if (!kobj_ns_current_may_mount(type)) 37 return ERR_PTR(-EPERM);
121 return ERR_PTR(-EPERM);
122 }
123 }
124
125 info = kzalloc(sizeof(*info), GFP_KERNEL);
126 if (!info)
127 return ERR_PTR(-ENOMEM);
128
129 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
130 info->ns[type] = kobj_ns_grab_current(type);
131
132 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
133 if (IS_ERR(sb) || sb->s_fs_info != info)
134 free_sysfs_super_info(info);
135 if (IS_ERR(sb))
136 return ERR_CAST(sb);
137 if (!sb->s_root) {
138 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
139 if (error) {
140 deactivate_locked_super(sb);
141 return ERR_PTR(error);
142 }
143 sb->s_flags |= MS_ACTIVE;
144 } 38 }
145 39
146 return dget(sb->s_root); 40 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
41 root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
42 if (IS_ERR(root) || !new_sb)
43 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
44 return root;
147} 45}
148 46
149static void sysfs_kill_sb(struct super_block *sb) 47static void sysfs_kill_sb(struct super_block *sb)
150{ 48{
151 struct sysfs_super_info *info = sysfs_info(sb); 49 void *ns = (void *)kernfs_super_ns(sb);
152 /* Remove the superblock from fs_supers/s_instances 50
153 * so we can't find it, before freeing sysfs_super_info. 51 kernfs_kill_sb(sb);
154 */ 52 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
155 kill_anon_super(sb);
156 free_sysfs_super_info(info);
157} 53}
158 54
159static struct file_system_type sysfs_fs_type = { 55static struct file_system_type sysfs_fs_type = {
@@ -165,48 +61,19 @@ static struct file_system_type sysfs_fs_type = {
165 61
166int __init sysfs_init(void) 62int __init sysfs_init(void)
167{ 63{
168 int err = -ENOMEM; 64 int err;
169 65
170 sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", 66 sysfs_root = kernfs_create_root(NULL, NULL);
171 sizeof(struct sysfs_dirent), 67 if (IS_ERR(sysfs_root))
172 0, 0, NULL); 68 return PTR_ERR(sysfs_root);
173 if (!sysfs_dir_cachep)
174 goto out;
175 69
176 err = sysfs_inode_init(); 70 sysfs_root_kn = sysfs_root->kn;
177 if (err)
178 goto out_err;
179 71
180 err = register_filesystem(&sysfs_fs_type); 72 err = register_filesystem(&sysfs_fs_type);
181 if (!err) { 73 if (err) {
182 sysfs_mnt = kern_mount(&sysfs_fs_type); 74 kernfs_destroy_root(sysfs_root);
183 if (IS_ERR(sysfs_mnt)) { 75 return err;
184 printk(KERN_ERR "sysfs: could not mount!\n"); 76 }
185 err = PTR_ERR(sysfs_mnt);
186 sysfs_mnt = NULL;
187 unregister_filesystem(&sysfs_fs_type);
188 goto out_err;
189 }
190 } else
191 goto out_err;
192out:
193 return err;
194out_err:
195 kmem_cache_destroy(sysfs_dir_cachep);
196 sysfs_dir_cachep = NULL;
197 goto out;
198}
199
200#undef sysfs_get
201struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
202{
203 return __sysfs_get(sd);
204}
205EXPORT_SYMBOL_GPL(sysfs_get);
206 77
207#undef sysfs_put 78 return 0;
208void sysfs_put(struct sysfs_dirent *sd)
209{
210 __sysfs_put(sd);
211} 79}
212EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
15#include <linux/mount.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/kobject.h> 15#include <linux/kobject.h>
18#include <linux/namei.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/security.h> 17#include <linux/security.h>
21 18
22#include "sysfs.h" 19#include "sysfs.h"
23 20
24static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, 21static int sysfs_do_create_link_sd(struct kernfs_node *parent,
25 struct kobject *target, 22 struct kobject *target_kobj,
26 const char *name, int warn) 23 const char *name, int warn)
27{ 24{
28 struct sysfs_dirent *target_sd = NULL; 25 struct kernfs_node *kn, *target = NULL;
29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
32 int error;
33 26
34 BUG_ON(!name || !parent_sd); 27 BUG_ON(!name || !parent);
35 28
36 /* 29 /*
37 * We don't own @target and it may be removed at any time. 30 * We don't own @target_kobj and it may be removed at any time.
38 * Synchronize using sysfs_symlink_target_lock. See 31 * Synchronize using sysfs_symlink_target_lock. See
39 * sysfs_remove_dir() for details. 32 * sysfs_remove_dir() for details.
40 */ 33 */
41 spin_lock(&sysfs_symlink_target_lock); 34 spin_lock(&sysfs_symlink_target_lock);
42 if (target->sd) 35 if (target_kobj->sd) {
43 target_sd = sysfs_get(target->sd); 36 target = target_kobj->sd;
37 kernfs_get(target);
38 }
44 spin_unlock(&sysfs_symlink_target_lock); 39 spin_unlock(&sysfs_symlink_target_lock);
45 40
46 error = -ENOENT; 41 if (!target)
47 if (!target_sd) 42 return -ENOENT;
48 goto out_put;
49
50 error = -ENOMEM;
51 sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
52 if (!sd)
53 goto out_put;
54 43
55 ns_type = sysfs_ns_type(parent_sd); 44 kn = kernfs_create_link(parent, name, target);
56 if (ns_type) 45 kernfs_put(target);
57 sd->s_ns = target_sd->s_ns;
58 sd->s_symlink.target_sd = target_sd;
59 target_sd = NULL; /* reference is now owned by the symlink */
60
61 sysfs_addrm_start(&acxt);
62 /* Symlinks must be between directories with the same ns_type */
63 if (!ns_type ||
64 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
65 if (warn)
66 error = sysfs_add_one(&acxt, sd, parent_sd);
67 else
68 error = __sysfs_add_one(&acxt, sd, parent_sd);
69 } else {
70 error = -EINVAL;
71 WARN(1, KERN_WARNING
72 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
73 parent_sd->s_name,
74 sd->s_name,
75 sd->s_symlink.target_sd->s_parent->s_name,
76 sd->s_symlink.target_sd->s_name);
77 }
78 sysfs_addrm_finish(&acxt);
79 46
80 if (error) 47 if (!IS_ERR(kn))
81 goto out_put; 48 return 0;
82 49
83 return 0; 50 if (warn && PTR_ERR(kn) == -EEXIST)
84 51 sysfs_warn_dup(parent, name);
85 out_put: 52 return PTR_ERR(kn);
86 sysfs_put(target_sd);
87 sysfs_put(sd);
88 return error;
89} 53}
90 54
91/** 55/**
92 * sysfs_create_link_sd - create symlink to a given object. 56 * sysfs_create_link_sd - create symlink to a given object.
93 * @sd: directory we're creating the link in. 57 * @kn: directory we're creating the link in.
94 * @target: object we're pointing to. 58 * @target: object we're pointing to.
95 * @name: name of the symlink. 59 * @name: name of the symlink.
96 */ 60 */
97int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, 61int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
98 const char *name) 62 const char *name)
99{ 63{
100 return sysfs_do_create_link_sd(sd, target, name, 1); 64 return sysfs_do_create_link_sd(kn, target, name, 1);
101} 65}
102 66
103static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, 67static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
104 const char *name, int warn) 68 const char *name, int warn)
105{ 69{
106 struct sysfs_dirent *parent_sd = NULL; 70 struct kernfs_node *parent = NULL;
107 71
108 if (!kobj) 72 if (!kobj)
109 parent_sd = &sysfs_root; 73 parent = sysfs_root_kn;
110 else 74 else
111 parent_sd = kobj->sd; 75 parent = kobj->sd;
112 76
113 if (!parent_sd) 77 if (!parent)
114 return -EFAULT; 78 return -EFAULT;
115 79
116 return sysfs_do_create_link_sd(parent_sd, target, name, warn); 80 return sysfs_do_create_link_sd(parent, target, name, warn);
117} 81}
118 82
119/** 83/**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
164 * sysfs_remove_dir() for details. 128 * sysfs_remove_dir() for details.
165 */ 129 */
166 spin_lock(&sysfs_symlink_target_lock); 130 spin_lock(&sysfs_symlink_target_lock);
167 if (targ->sd && sysfs_ns_type(kobj->sd)) 131 if (targ->sd && kernfs_ns_enabled(kobj->sd))
168 ns = targ->sd->s_ns; 132 ns = targ->sd->ns;
169 spin_unlock(&sysfs_symlink_target_lock); 133 spin_unlock(&sysfs_symlink_target_lock);
170 sysfs_hash_and_remove(kobj->sd, name, ns); 134 kernfs_remove_by_name_ns(kobj->sd, name, ns);
171} 135}
172 136
173/** 137/**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
177 */ 141 */
178void sysfs_remove_link(struct kobject *kobj, const char *name) 142void sysfs_remove_link(struct kobject *kobj, const char *name)
179{ 143{
180 struct sysfs_dirent *parent_sd = NULL; 144 struct kernfs_node *parent = NULL;
181 145
182 if (!kobj) 146 if (!kobj)
183 parent_sd = &sysfs_root; 147 parent = sysfs_root_kn;
184 else 148 else
185 parent_sd = kobj->sd; 149 parent = kobj->sd;
186 150
187 sysfs_hash_and_remove(parent_sd, name, NULL); 151 kernfs_remove_by_name(parent, name);
188} 152}
189EXPORT_SYMBOL_GPL(sysfs_remove_link); 153EXPORT_SYMBOL_GPL(sysfs_remove_link);
190 154
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
201int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, 165int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
202 const char *old, const char *new, const void *new_ns) 166 const char *old, const char *new, const void *new_ns)
203{ 167{
204 struct sysfs_dirent *parent_sd, *sd = NULL; 168 struct kernfs_node *parent, *kn = NULL;
205 const void *old_ns = NULL; 169 const void *old_ns = NULL;
206 int result; 170 int result;
207 171
208 if (!kobj) 172 if (!kobj)
209 parent_sd = &sysfs_root; 173 parent = sysfs_root_kn;
210 else 174 else
211 parent_sd = kobj->sd; 175 parent = kobj->sd;
212 176
213 if (targ->sd) 177 if (targ->sd)
214 old_ns = targ->sd->s_ns; 178 old_ns = targ->sd->ns;
215 179
216 result = -ENOENT; 180 result = -ENOENT;
217 sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); 181 kn = kernfs_find_and_get_ns(parent, old, old_ns);
218 if (!sd) 182 if (!kn)
219 goto out; 183 goto out;
220 184
221 result = -EINVAL; 185 result = -EINVAL;
222 if (sysfs_type(sd) != SYSFS_KOBJ_LINK) 186 if (kernfs_type(kn) != KERNFS_LINK)
223 goto out; 187 goto out;
224 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 188 if (kn->symlink.target_kn->priv != targ)
225 goto out; 189 goto out;
226 190
227 result = sysfs_rename(sd, parent_sd, new, new_ns); 191 result = kernfs_rename_ns(kn, parent, new, new_ns);
228 192
229out: 193out:
230 sysfs_put(sd); 194 kernfs_put(kn);
231 return result; 195 return result;
232} 196}
233EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); 197EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
234
235static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
236 struct sysfs_dirent *target_sd, char *path)
237{
238 struct sysfs_dirent *base, *sd;
239 char *s = path;
240 int len = 0;
241
242 /* go up to the root, stop at the base */
243 base = parent_sd;
244 while (base->s_parent) {
245 sd = target_sd->s_parent;
246 while (sd->s_parent && base != sd)
247 sd = sd->s_parent;
248
249 if (base == sd)
250 break;
251
252 strcpy(s, "../");
253 s += 3;
254 base = base->s_parent;
255 }
256
257 /* determine end of target string for reverse fillup */
258 sd = target_sd;
259 while (sd->s_parent && sd != base) {
260 len += strlen(sd->s_name) + 1;
261 sd = sd->s_parent;
262 }
263
264 /* check limits */
265 if (len < 2)
266 return -EINVAL;
267 len--;
268 if ((s - path) + len > PATH_MAX)
269 return -ENAMETOOLONG;
270
271 /* reverse fillup of target string from target to base */
272 sd = target_sd;
273 while (sd->s_parent && sd != base) {
274 int slen = strlen(sd->s_name);
275
276 len -= slen;
277 strncpy(s + len, sd->s_name, slen);
278 if (len)
279 s[--len] = '/';
280
281 sd = sd->s_parent;
282 }
283
284 return 0;
285}
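A concrete trace of the helper being deleted here: for a link created in /sys/class/net whose target is the node backing /sys/devices/virtual/net/lo, the first loop climbs from the link's directory to the common ancestor (the root, in this case), emitting one "../" per level, and the reverse fill-up then writes the target's component names back to front. The result is "../../devices/virtual/net/lo", exactly the shape of the symlinks visible under /sys.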
286
287static int sysfs_getlink(struct dentry *dentry, char *path)
288{
289 struct sysfs_dirent *sd = dentry->d_fsdata;
290 struct sysfs_dirent *parent_sd = sd->s_parent;
291 struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
292 int error;
293
294 mutex_lock(&sysfs_mutex);
295 error = sysfs_get_target_path(parent_sd, target_sd, path);
296 mutex_unlock(&sysfs_mutex);
297
298 return error;
299}
300
301static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
302{
303 int error = -ENOMEM;
304 unsigned long page = get_zeroed_page(GFP_KERNEL);
305 if (page) {
306 error = sysfs_getlink(dentry, (char *) page);
307 if (error < 0)
308 free_page((unsigned long)page);
309 }
310 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
311 return NULL;
312}
313
314static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
315 void *cookie)
316{
317 char *page = nd_get_link(nd);
318 if (!IS_ERR(page))
319 free_page((unsigned long)page);
320}
321
322const struct inode_operations sysfs_symlink_inode_operations = {
323 .setxattr = sysfs_setxattr,
324 .readlink = generic_readlink,
325 .follow_link = sysfs_follow_link,
326 .put_link = sysfs_put_link,
327 .setattr = sysfs_setattr,
328 .getattr = sysfs_getattr,
329 .permission = sysfs_permission,
330};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#ifndef __SYSFS_INTERNAL_H
12#include <linux/kobject_ns.h> 12#define __SYSFS_INTERNAL_H
13#include <linux/fs.h>
14#include <linux/rbtree.h>
15 13
16struct sysfs_open_dirent; 14#include <linux/sysfs.h>
17
18/* type-specific structures for sysfs_dirent->s_* union members */
19struct sysfs_elem_dir {
20 struct kobject *kobj;
21
22 unsigned long subdirs;
23 /* children rbtree starts here and goes through sd->s_rb */
24 struct rb_root children;
25};
26
27struct sysfs_elem_symlink {
28 struct sysfs_dirent *target_sd;
29};
30
31struct sysfs_elem_attr {
32 union {
33 struct attribute *attr;
34 struct bin_attribute *bin_attr;
35 };
36 struct sysfs_open_dirent *open;
37};
38
39struct sysfs_inode_attrs {
40 struct iattr ia_iattr;
41 void *ia_secdata;
42 u32 ia_secdata_len;
43};
44
45/*
46 * sysfs_dirent - the building block of the sysfs hierarchy. Each and
47 * every sysfs node is represented by a single sysfs_dirent.
48 *
49 * As long as an s_count reference is held, the sysfs_dirent itself is
50 * accessible. Dereferencing s_elem or any other outer entity
51 * requires an s_active reference.
52 */
53struct sysfs_dirent {
54 atomic_t s_count;
55 atomic_t s_active;
56#ifdef CONFIG_DEBUG_LOCK_ALLOC
57 struct lockdep_map dep_map;
58#endif
59 struct sysfs_dirent *s_parent;
60 const char *s_name;
61
62 struct rb_node s_rb;
63
64 union {
65 struct completion *completion;
66 struct sysfs_dirent *removed_list;
67 } u;
68
69 const void *s_ns; /* namespace tag */
70 unsigned int s_hash; /* ns + name hash */
71 union {
72 struct sysfs_elem_dir s_dir;
73 struct sysfs_elem_symlink s_symlink;
74 struct sysfs_elem_attr s_attr;
75 };
76
77 unsigned short s_flags;
78 umode_t s_mode;
79 unsigned int s_ino;
80 struct sysfs_inode_attrs *s_iattr;
81};
82
83#define SD_DEACTIVATED_BIAS INT_MIN
84
85#define SYSFS_TYPE_MASK 0x00ff
86#define SYSFS_DIR 0x0001
87#define SYSFS_KOBJ_ATTR 0x0002
88#define SYSFS_KOBJ_BIN_ATTR 0x0004
89#define SYSFS_KOBJ_LINK 0x0008
90#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
91#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
92
93/* identify any namespace tag on sysfs_dirents */
94#define SYSFS_NS_TYPE_MASK 0xf00
95#define SYSFS_NS_TYPE_SHIFT 8
96
97#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
98#define SYSFS_FLAG_REMOVED 0x02000
99
100static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
101{
102 return sd->s_flags & SYSFS_TYPE_MASK;
103}
104
105/*
106 * Return any namespace tags on this dirent.
107 * enum kobj_ns_type is defined in linux/kobject.h
108 */
109static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
110{
111 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
112}
113
114#ifdef CONFIG_DEBUG_LOCK_ALLOC
115
116#define sysfs_dirent_init_lockdep(sd) \
117do { \
118 struct attribute *attr = sd->s_attr.attr; \
119 struct lock_class_key *key = attr->key; \
120 if (!key) \
121 key = &attr->skey; \
122 \
123 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
124} while (0)
125
126/* Test for attributes that want to ignore lockdep for read-locking */
127static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
128{
129 int type = sysfs_type(sd);
130
131 return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
132 sd->s_attr.attr->ignore_lockdep;
133}
134
135#else
136
137#define sysfs_dirent_init_lockdep(sd) do {} while (0)
138
139static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
140{
141 return true;
142}
143
144#endif
145
146/*
147 * Context structure to be used while adding/removing nodes.
148 */
149struct sysfs_addrm_cxt {
150 struct sysfs_dirent *removed;
151};
152 15
153/* 16/*
154 * mount.c 17 * mount.c
155 */ 18 */
156 19extern struct kernfs_node *sysfs_root_kn;
157/*
158 * Each sb is associated with a set of namespace tags (i.e.
159 * the network namespace of the task which mounted this sysfs
160 * instance).
161 */
162struct sysfs_super_info {
163 void *ns[KOBJ_NS_TYPES];
164};
165#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
166extern struct sysfs_dirent sysfs_root;
167extern struct kmem_cache *sysfs_dir_cachep;
168 20
169/* 21/*
170 * dir.c 22 * dir.c
171 */ 23 */
172extern struct mutex sysfs_mutex;
173extern spinlock_t sysfs_symlink_target_lock; 24extern spinlock_t sysfs_symlink_target_lock;
174extern const struct dentry_operations sysfs_dentry_ops;
175
176extern const struct file_operations sysfs_dir_operations;
177extern const struct inode_operations sysfs_dir_inode_operations;
178 25
179struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); 26void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
180void sysfs_put_active(struct sysfs_dirent *sd);
181void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
182void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
183int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
184 struct sysfs_dirent *parent_sd);
185int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
186 struct sysfs_dirent *parent_sd);
187void sysfs_remove(struct sysfs_dirent *sd);
188int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
189 const void *ns);
190void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
191
192struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
193 const unsigned char *name,
194 const void *ns);
195struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
196
197void release_sysfs_dirent(struct sysfs_dirent *sd);
198
199int sysfs_create_subdir(struct kobject *kobj, const char *name,
200 struct sysfs_dirent **p_sd);
201
202int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
203 const char *new_name, const void *new_ns);
204
205static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
206{
207 if (sd) {
208 WARN_ON(!atomic_read(&sd->s_count));
209 atomic_inc(&sd->s_count);
210 }
211 return sd;
212}
213#define sysfs_get(sd) __sysfs_get(sd)
214
215static inline void __sysfs_put(struct sysfs_dirent *sd)
216{
217 if (sd && atomic_dec_and_test(&sd->s_count))
218 release_sysfs_dirent(sd);
219}
220#define sysfs_put(sd) __sysfs_put(sd)
221
222/*
223 * inode.c
224 */
225struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
226void sysfs_evict_inode(struct inode *inode);
227int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
228int sysfs_permission(struct inode *inode, int mask);
229int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
230int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
231 struct kstat *stat);
232int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
233 size_t size, int flags);
234int sysfs_inode_init(void);
235 27
236/* 28/*
237 * file.c 29 * file.c
238 */ 30 */
239extern const struct file_operations sysfs_file_operations; 31int sysfs_add_file(struct kernfs_node *parent,
240extern const struct file_operations sysfs_bin_operations; 32 const struct attribute *attr, bool is_bin);
241 33int sysfs_add_file_mode_ns(struct kernfs_node *parent,
242int sysfs_add_file(struct sysfs_dirent *dir_sd, 34 const struct attribute *attr, bool is_bin,
243 const struct attribute *attr, int type);
244
245int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
246 const struct attribute *attr, int type,
247 umode_t amode, const void *ns); 35 umode_t amode, const void *ns);
248void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
249 36
250/* 37/*
251 * symlink.c 38 * symlink.c
252 */ 39 */
253extern const struct inode_operations sysfs_symlink_inode_operations; 40int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
254int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
255 const char *name); 41 const char *name);
42
43#endif /* __SYSFS_INTERNAL_H */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index cc1febd8fadf..5157b866a853 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2118,26 +2118,10 @@ out_free:
2118 */ 2118 */
2119static void free_inodes(struct fsck_data *fsckd) 2119static void free_inodes(struct fsck_data *fsckd)
2120{ 2120{
2121 struct rb_node *this = fsckd->inodes.rb_node; 2121 struct fsck_inode *fscki, *n;
2122 struct fsck_inode *fscki;
2123 2122
2124 while (this) { 2123 rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb)
2125 if (this->rb_left) 2124 kfree(fscki);
2126 this = this->rb_left;
2127 else if (this->rb_right)
2128 this = this->rb_right;
2129 else {
2130 fscki = rb_entry(this, struct fsck_inode, rb);
2131 this = rb_parent(this);
2132 if (this) {
2133 if (this->rb_left == &fscki->rb)
2134 this->rb_left = NULL;
2135 else
2136 this->rb_right = NULL;
2137 }
2138 kfree(fscki);
2139 }
2140 }
2141} 2125}
2142 2126
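This and the following ubifs hunks all apply the same transformation: the hand-rolled postorder walks that pruned parent pointers by hand are replaced with rbtree_postorder_for_each_entry_safe() (in include/linux/rbtree.h since v3.12), which visits every node after its children and therefore lets the body free the entry as it goes. A self-contained sketch with an invented entry type:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct item {
	struct rb_node rb;	/* linkage inside the tree */
	int key;
};

static void free_all_items(struct rb_root *root)
{
	struct item *it, *n;

	/*
	 * Postorder guarantees the children of 'it' were already
	 * visited, so freeing 'it' here cannot invalidate the walk.
	 * No rb_erase(), hence no rebalancing during teardown.
	 */
	rbtree_postorder_for_each_entry_safe(it, n, root, rb)
		kfree(it);

	*root = RB_ROOT;
}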
2143/** 2127/**
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36bd4efd0819..a902c5919e42 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum)
574 */ 574 */
575static void destroy_done_tree(struct rb_root *done_tree) 575static void destroy_done_tree(struct rb_root *done_tree)
576{ 576{
577 struct rb_node *this = done_tree->rb_node; 577 struct done_ref *dr, *n;
578 struct done_ref *dr;
579 578
580 while (this) { 579 rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb)
581 if (this->rb_left) {
582 this = this->rb_left;
583 continue;
584 } else if (this->rb_right) {
585 this = this->rb_right;
586 continue;
587 }
588 dr = rb_entry(this, struct done_ref, rb);
589 this = rb_parent(this);
590 if (this) {
591 if (this->rb_left == &dr->rb)
592 this->rb_left = NULL;
593 else
594 this->rb_right = NULL;
595 }
596 kfree(dr); 580 kfree(dr);
597 }
598} 581}
599 582
600/** 583/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ba32da3fe08a..f1c3e5a1b315 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
815 815
816static void dbg_free_check_tree(struct rb_root *root) 816static void dbg_free_check_tree(struct rb_root *root)
817{ 817{
818 struct rb_node *this = root->rb_node; 818 struct check_orphan *o, *n;
819 struct check_orphan *o;
820 819
821 while (this) { 820 rbtree_postorder_for_each_entry_safe(o, n, root, rb)
822 if (this->rb_left) {
823 this = this->rb_left;
824 continue;
825 } else if (this->rb_right) {
826 this = this->rb_right;
827 continue;
828 }
829 o = rb_entry(this, struct check_orphan, rb);
830 this = rb_parent(this);
831 if (this) {
832 if (this->rb_left == &o->rb)
833 this->rb_left = NULL;
834 else
835 this->rb_right = NULL;
836 }
837 kfree(o); 821 kfree(o);
838 }
839} 822}
840 823
841static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, 824static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 065096e36ed9..c14adb2f420c 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum)
1335 */ 1335 */
1336void ubifs_destroy_size_tree(struct ubifs_info *c) 1336void ubifs_destroy_size_tree(struct ubifs_info *c)
1337{ 1337{
1338 struct rb_node *this = c->size_tree.rb_node; 1338 struct size_entry *e, *n;
1339 struct size_entry *e;
1340 1339
1341 while (this) { 1340 rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
1342 if (this->rb_left) {
1343 this = this->rb_left;
1344 continue;
1345 } else if (this->rb_right) {
1346 this = this->rb_right;
1347 continue;
1348 }
1349 e = rb_entry(this, struct size_entry, rb);
1350 if (e->inode) 1341 if (e->inode)
1351 iput(e->inode); 1342 iput(e->inode);
1352 this = rb_parent(this);
1353 if (this) {
1354 if (this->rb_left == &e->rb)
1355 this->rb_left = NULL;
1356 else
1357 this->rb_right = NULL;
1358 }
1359 kfree(e); 1343 kfree(e);
1360 } 1344 }
1345
1361 c->size_tree = RB_ROOT; 1346 c->size_tree = RB_ROOT;
1362} 1347}
1363 1348
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f69daa514a57..5ded8490c0c6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c)
873 */ 873 */
874static void free_buds(struct ubifs_info *c) 874static void free_buds(struct ubifs_info *c)
875{ 875{
876 struct rb_node *this = c->buds.rb_node; 876 struct ubifs_bud *bud, *n;
877 struct ubifs_bud *bud; 877
878 878 rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
879 while (this) { 879 kfree(bud);
880 if (this->rb_left)
881 this = this->rb_left;
882 else if (this->rb_right)
883 this = this->rb_right;
884 else {
885 bud = rb_entry(this, struct ubifs_bud, rb);
886 this = rb_parent(this);
887 if (this) {
888 if (this->rb_left == &bud->rb)
889 this->rb_left = NULL;
890 else
891 this->rb_right = NULL;
892 }
893 kfree(bud);
894 }
895 }
896} 880}
897 881
898/** 882/**
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 349f31a30f40..9083bc7ed4ae 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c,
178 */ 178 */
179void destroy_old_idx(struct ubifs_info *c) 179void destroy_old_idx(struct ubifs_info *c)
180{ 180{
181 struct rb_node *this = c->old_idx.rb_node; 181 struct ubifs_old_idx *old_idx, *n;
182 struct ubifs_old_idx *old_idx;
183 182
184 while (this) { 183 rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb)
185 if (this->rb_left) {
186 this = this->rb_left;
187 continue;
188 } else if (this->rb_right) {
189 this = this->rb_right;
190 continue;
191 }
192 old_idx = rb_entry(this, struct ubifs_old_idx, rb);
193 this = rb_parent(this);
194 if (this) {
195 if (this->rb_left == &old_idx->rb)
196 this->rb_left = NULL;
197 else
198 this->rb_right = NULL;
199 }
200 kfree(old_idx); 184 kfree(old_idx);
201 } 185
202 c->old_idx = RB_ROOT; 186 c->old_idx = RB_ROOT;
203} 187}
204 188
diff --git a/fs/udf/file.c b/fs/udf/file.c
index c02a27a19c6d..1037637957c7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -144,6 +144,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 size_t count = iocb->ki_nbytes; 144 size_t count = iocb->ki_nbytes;
145 struct udf_inode_info *iinfo = UDF_I(inode); 145 struct udf_inode_info *iinfo = UDF_I(inode);
146 146
147 mutex_lock(&inode->i_mutex);
147 down_write(&iinfo->i_data_sem); 148 down_write(&iinfo->i_data_sem);
148 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 149 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
149 if (file->f_flags & O_APPEND) 150 if (file->f_flags & O_APPEND)
@@ -156,6 +157,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
156 pos + count)) { 157 pos + count)) {
157 err = udf_expand_file_adinicb(inode); 158 err = udf_expand_file_adinicb(inode);
158 if (err) { 159 if (err) {
160 mutex_unlock(&inode->i_mutex);
159 udf_debug("udf_expand_adinicb: err=%d\n", err); 161 udf_debug("udf_expand_adinicb: err=%d\n", err);
160 return err; 162 return err;
161 } 163 }
@@ -169,9 +171,17 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
169 } else 171 } else
170 up_write(&iinfo->i_data_sem); 172 up_write(&iinfo->i_data_sem);
171 173
172 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 174 retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
173 if (retval > 0) 175 mutex_unlock(&inode->i_mutex);
176
177 if (retval > 0) {
178 ssize_t err;
179
174 mark_inode_dirty(inode); 180 mark_inode_dirty(inode);
181 err = generic_write_sync(file, iocb->ki_pos - retval, retval);
182 if (err < 0)
183 retval = err;
184 }
175 185
176 return retval; 186 return retval;
177} 187}
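The udf_file_aio_write() change keeps the shape of generic_file_aio_write() but puts the filesystem in charge of the lock: i_mutex is taken before the in-ICB checks so udf_expand_file_adinicb() runs under it (the WARN_ON_ONCE added in inode.c below enforces that), the __generic_file_aio_write() variant that expects i_mutex to be held performs the write, the lock is dropped, and only then is the O_SYNC work done. Because a successful write has already advanced ->ki_pos, the range to sync starts at ki_pos - retval. Condensed, assuming the same names as the hunk:

	mutex_lock(&inode->i_mutex);
	/* ... in-ICB expansion and other fs-private work under i_mutex ... */
	retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	if (retval > 0) {
		ssize_t err;

		mark_inode_dirty(inode);
		/* ki_pos is already past the write: sync [ki_pos - retval, ki_pos) */
		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
		if (err < 0)
			retval = err;
	}
	return retval;

The same ki_pos - ret start offset shows up again in the xfs_file.c hunk further down.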
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 062b7925bca0..982ce05c87ed 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -265,6 +265,7 @@ int udf_expand_file_adinicb(struct inode *inode)
265 .nr_to_write = 1, 265 .nr_to_write = 1,
266 }; 266 };
267 267
268 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
268 if (!iinfo->i_lenAlloc) { 269 if (!iinfo->i_lenAlloc) {
269 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 270 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
270 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 271 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5f6fc17d6bc5..9737cba1357d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1010 else 1010 else
1011 udf_truncate_tail_extent(inode); 1011 udf_truncate_tail_extent(inode);
1012 mark_inode_dirty(inode); 1012 mark_inode_dirty(inode);
1013 up_write(&iinfo->i_data_sem);
1013 1014
1014 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1015 if (!fi) 1016 if (!fi)
@@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1023 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1024 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1024 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1025 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1025 mark_inode_dirty(dir); 1026 mark_inode_dirty(dir);
1026 up_write(&iinfo->i_data_sem);
1027 if (fibh.sbh != fibh.ebh) 1027 if (fibh.sbh != fibh.ebh)
1028 brelse(fibh.ebh); 1028 brelse(fibh.ebh);
1029 brelse(fibh.sbh); 1029 brelse(fibh.sbh);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
deleted file mode 100644
index 9fbea87fdb6e..000000000000
--- a/fs/xattr_acl.c
+++ /dev/null
@@ -1,180 +0,0 @@
1/*
2 * linux/fs/xattr_acl.c
3 *
4 * Almost all from linux/fs/ext2/acl.c:
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/export.h>
9#include <linux/fs.h>
10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12#include <linux/user_namespace.h>
13
14/*
15 * Fix up the uids and gids in posix acl extended attributes in place.
16 */
17static void posix_acl_fix_xattr_userns(
18 struct user_namespace *to, struct user_namespace *from,
19 void *value, size_t size)
20{
21 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
22 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
23 int count;
24 kuid_t uid;
25 kgid_t gid;
26
27 if (!value)
28 return;
29 if (size < sizeof(posix_acl_xattr_header))
30 return;
31 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
32 return;
33
34 count = posix_acl_xattr_count(size);
35 if (count < 0)
36 return;
37 if (count == 0)
38 return;
39
40 for (end = entry + count; entry != end; entry++) {
41 switch(le16_to_cpu(entry->e_tag)) {
42 case ACL_USER:
43 uid = make_kuid(from, le32_to_cpu(entry->e_id));
44 entry->e_id = cpu_to_le32(from_kuid(to, uid));
45 break;
46 case ACL_GROUP:
47 gid = make_kgid(from, le32_to_cpu(entry->e_id));
48 entry->e_id = cpu_to_le32(from_kgid(to, gid));
49 break;
50 default:
51 break;
52 }
53 }
54}
55
56void posix_acl_fix_xattr_from_user(void *value, size_t size)
57{
58 struct user_namespace *user_ns = current_user_ns();
59 if (user_ns == &init_user_ns)
60 return;
61 posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
62}
63
64void posix_acl_fix_xattr_to_user(void *value, size_t size)
65{
66 struct user_namespace *user_ns = current_user_ns();
67 if (user_ns == &init_user_ns)
68 return;
69 posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
70}
71
72/*
73 * Convert from extended attribute to in-memory representation.
74 */
75struct posix_acl *
76posix_acl_from_xattr(struct user_namespace *user_ns,
77 const void *value, size_t size)
78{
79 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
80 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
81 int count;
82 struct posix_acl *acl;
83 struct posix_acl_entry *acl_e;
84
85 if (!value)
86 return NULL;
87 if (size < sizeof(posix_acl_xattr_header))
88 return ERR_PTR(-EINVAL);
89 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
90 return ERR_PTR(-EOPNOTSUPP);
91
92 count = posix_acl_xattr_count(size);
93 if (count < 0)
94 return ERR_PTR(-EINVAL);
95 if (count == 0)
96 return NULL;
97
98 acl = posix_acl_alloc(count, GFP_NOFS);
99 if (!acl)
100 return ERR_PTR(-ENOMEM);
101 acl_e = acl->a_entries;
102
103 for (end = entry + count; entry != end; acl_e++, entry++) {
104 acl_e->e_tag = le16_to_cpu(entry->e_tag);
105 acl_e->e_perm = le16_to_cpu(entry->e_perm);
106
107 switch(acl_e->e_tag) {
108 case ACL_USER_OBJ:
109 case ACL_GROUP_OBJ:
110 case ACL_MASK:
111 case ACL_OTHER:
112 break;
113
114 case ACL_USER:
115 acl_e->e_uid =
116 make_kuid(user_ns,
117 le32_to_cpu(entry->e_id));
118 if (!uid_valid(acl_e->e_uid))
119 goto fail;
120 break;
121 case ACL_GROUP:
122 acl_e->e_gid =
123 make_kgid(user_ns,
124 le32_to_cpu(entry->e_id));
125 if (!gid_valid(acl_e->e_gid))
126 goto fail;
127 break;
128
129 default:
130 goto fail;
131 }
132 }
133 return acl;
134
135fail:
136 posix_acl_release(acl);
137 return ERR_PTR(-EINVAL);
138}
139EXPORT_SYMBOL (posix_acl_from_xattr);
140
141/*
142 * Convert from in-memory to extended attribute representation.
143 */
144int
145posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
146 void *buffer, size_t size)
147{
148 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
149 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
150 int real_size, n;
151
152 real_size = posix_acl_xattr_size(acl->a_count);
153 if (!buffer)
154 return real_size;
155 if (real_size > size)
156 return -ERANGE;
157
158 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
159
160 for (n=0; n < acl->a_count; n++, ext_entry++) {
161 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
162 ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
163 ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
164 switch(acl_e->e_tag) {
165 case ACL_USER:
166 ext_entry->e_id =
167 cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
168 break;
169 case ACL_GROUP:
170 ext_entry->e_id =
171 cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
172 break;
173 default:
174 ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
175 break;
176 }
177 }
178 return real_size;
179}
180EXPORT_SYMBOL (posix_acl_to_xattr);
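fs/xattr_acl.c is deleted outright rather than reworked; as far as I can tell this merge folds these helpers unchanged into fs/posix_acl.c alongside the new generic ACL code, so the exports survive the file's removal. For reference, the calling convention the pair gives you — posix_acl_to_xattr() sizes the buffer when handed NULL, exactly as the deleted body above implements it (value/value_len below stand in for a caller's raw xattr payload):

	struct posix_acl *acl;
	void *buf;
	int size;

	/* decode a raw xattr value into the in-memory representation */
	acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (!acl)
		return 0;		/* empty value: nothing to apply */

	/* encode: a NULL buffer is a size probe */
	size = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
	buf = kmalloc(size, GFP_KERNEL);
	if (buf)
		size = posix_acl_to_xattr(&init_user_ns, acl, buf, size);
	/* ... hand buf/size to the xattr layer, then ... */
	kfree(buf);
	posix_acl_release(acl);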
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 370eb3e121d1..0ecec1896f25 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -124,16 +124,12 @@ struct posix_acl *
124xfs_get_acl(struct inode *inode, int type) 124xfs_get_acl(struct inode *inode, int type)
125{ 125{
126 struct xfs_inode *ip = XFS_I(inode); 126 struct xfs_inode *ip = XFS_I(inode);
127 struct posix_acl *acl; 127 struct posix_acl *acl = NULL;
128 struct xfs_acl *xfs_acl; 128 struct xfs_acl *xfs_acl;
129 unsigned char *ea_name; 129 unsigned char *ea_name;
130 int error; 130 int error;
131 int len; 131 int len;
132 132
133 acl = get_cached_acl(inode, type);
134 if (acl != ACL_NOT_CACHED)
135 return acl;
136
137 trace_xfs_get_acl(ip); 133 trace_xfs_get_acl(ip);
138 134
139 switch (type) { 135 switch (type) {
@@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type)
164 * cache entry, for any other error assume it is transient and 160 * cache entry, for any other error assume it is transient and
165 * leave the cache entry as ACL_NOT_CACHED. 161 * leave the cache entry as ACL_NOT_CACHED.
166 */ 162 */
167 if (error == -ENOATTR) { 163 if (error == -ENOATTR)
168 acl = NULL;
169 goto out_update_cache; 164 goto out_update_cache;
170 }
171 goto out; 165 goto out;
172 } 166 }
173 167
@@ -183,15 +177,12 @@ out:
183} 177}
184 178
185STATIC int 179STATIC int
186xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 180__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
187{ 181{
188 struct xfs_inode *ip = XFS_I(inode); 182 struct xfs_inode *ip = XFS_I(inode);
189 unsigned char *ea_name; 183 unsigned char *ea_name;
190 int error; 184 int error;
191 185
192 if (S_ISLNK(inode->i_mode))
193 return -EOPNOTSUPP;
194
195 switch (type) { 186 switch (type) {
196 case ACL_TYPE_ACCESS: 187 case ACL_TYPE_ACCESS:
197 ea_name = SGI_ACL_FILE; 188 ea_name = SGI_ACL_FILE;
@@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode)
282 return xfs_acl_exists(inode, SGI_ACL_DEFAULT); 273 return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
283} 274}
284 275
285/*
286 * No need for i_mutex because the inode is not yet exposed to the VFS.
287 */
288int 276int
289xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) 277xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
290{ 278{
291 umode_t mode = inode->i_mode;
292 int error = 0, inherit = 0;
293
294 if (S_ISDIR(inode->i_mode)) {
295 error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
296 if (error)
297 goto out;
298 }
299
300 error = posix_acl_create(&acl, GFP_KERNEL, &mode);
301 if (error < 0)
302 return error;
303
304 /*
305 * If posix_acl_create returns a positive value we need to
306 * inherit a permission that can't be represented using the Unix
307 * mode bits and we actually need to set an ACL.
308 */
309 if (error > 0)
310 inherit = 1;
311
312 error = xfs_set_mode(inode, mode);
313 if (error)
314 goto out;
315
316 if (inherit)
317 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
318
319out:
320 posix_acl_release(acl);
321 return error;
322}
323
324int
325xfs_acl_chmod(struct inode *inode)
326{
327 struct posix_acl *acl;
328 int error;
329
330 if (S_ISLNK(inode->i_mode))
331 return -EOPNOTSUPP;
332
333 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
334 if (IS_ERR(acl) || !acl)
335 return PTR_ERR(acl);
336
337 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
338 if (error)
339 return error;
340
341 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
342 posix_acl_release(acl);
343 return error;
344}
345
346static int
347xfs_xattr_acl_get(struct dentry *dentry, const char *name,
348 void *value, size_t size, int type)
349{
350 struct posix_acl *acl;
351 int error;
352
353 acl = xfs_get_acl(dentry->d_inode, type);
354 if (IS_ERR(acl))
355 return PTR_ERR(acl);
356 if (acl == NULL)
357 return -ENODATA;
358
359 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
360 posix_acl_release(acl);
361
362 return error;
363}
364
365static int
366xfs_xattr_acl_set(struct dentry *dentry, const char *name,
367 const void *value, size_t size, int flags, int type)
368{
369 struct inode *inode = dentry->d_inode;
370 struct posix_acl *acl = NULL;
371 int error = 0; 279 int error = 0;
372 280
373 if (flags & XATTR_CREATE) 281 if (!acl)
374 return -EINVAL;
375 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
376 return value ? -EACCES : 0;
377 if (!inode_owner_or_capable(inode))
378 return -EPERM;
379
380 if (!value)
381 goto set_acl; 282 goto set_acl;
382 283
383 acl = posix_acl_from_xattr(&init_user_ns, value, size);
384 if (!acl) {
385 /*
386 * acl_set_file(3) may request that we set default ACLs with
387 * zero length -- defend (gracefully) against that here.
388 */
389 goto out;
390 }
391 if (IS_ERR(acl)) {
392 error = PTR_ERR(acl);
393 goto out;
394 }
395
396 error = posix_acl_valid(acl);
397 if (error)
398 goto out_release;
399
400 error = -EINVAL; 284 error = -EINVAL;
401 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) 285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
402 goto out_release; 286 return error;
403 287
404 if (type == ACL_TYPE_ACCESS) { 288 if (type == ACL_TYPE_ACCESS) {
405 umode_t mode = inode->i_mode; 289 umode_t mode = inode->i_mode;
406 error = posix_acl_equiv_mode(acl, &mode); 290 error = posix_acl_equiv_mode(acl, &mode);
407 291
408 if (error <= 0) { 292 if (error <= 0) {
409 posix_acl_release(acl);
410 acl = NULL; 293 acl = NULL;
411 294
412 if (error < 0) 295 if (error < 0)
@@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
415 298
416 error = xfs_set_mode(inode, mode); 299 error = xfs_set_mode(inode, mode);
417 if (error) 300 if (error)
418 goto out_release; 301 return error;
419 } 302 }
420 303
421 set_acl: 304 set_acl:
422 error = xfs_set_acl(inode, type, acl); 305 return __xfs_set_acl(inode, type, acl);
423 out_release:
424 posix_acl_release(acl);
425 out:
426 return error;
427} 306}
428
429const struct xattr_handler xfs_xattr_acl_access_handler = {
430 .prefix = POSIX_ACL_XATTR_ACCESS,
431 .flags = ACL_TYPE_ACCESS,
432 .get = xfs_xattr_acl_get,
433 .set = xfs_xattr_acl_set,
434};
435
436const struct xattr_handler xfs_xattr_acl_default_handler = {
437 .prefix = POSIX_ACL_XATTR_DEFAULT,
438 .flags = ACL_TYPE_DEFAULT,
439 .get = xfs_xattr_acl_get,
440 .set = xfs_xattr_acl_set,
441};
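Net effect of the xfs_acl.c rewrite: the get_cached_acl() fast path leaves xfs_get_acl() (the VFS now consults the cache before calling in), and the setter loses its validation, permission checks, zero-length-value quirk and the two per-prefix xattr handlers — all of which moved into the 3.14 generic POSIX ACL code (a ->set_acl() inode operation plus shared posix_acl xattr handlers). What remains is purely the XFS-specific on-disk work. Illustrative sketch of how a filesystem plugs in after this series; the real XFS wiring lives in xfs_iops.c, which this diff does not show:

	static const struct inode_operations sketch_inode_ops = {
		.get_acl = xfs_get_acl,	/* called only on an ACL cache miss */
		.set_acl = xfs_set_acl,	/* acl/type already validated by the VFS */
		/* ... */
	};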
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 4016a567b83c..5dc163744511 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -60,20 +60,15 @@ struct xfs_acl {
60 60
61#ifdef CONFIG_XFS_POSIX_ACL 61#ifdef CONFIG_XFS_POSIX_ACL
62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
63extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 63extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
64extern int xfs_acl_chmod(struct inode *inode);
65extern int posix_acl_access_exists(struct inode *inode); 64extern int posix_acl_access_exists(struct inode *inode);
66extern int posix_acl_default_exists(struct inode *inode); 65extern int posix_acl_default_exists(struct inode *inode);
67
68extern const struct xattr_handler xfs_xattr_acl_access_handler;
69extern const struct xattr_handler xfs_xattr_acl_default_handler;
70#else 66#else
71static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) 67static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
72{ 68{
73 return NULL; 69 return NULL;
74} 70}
75# define xfs_inherit_acl(inode, default_acl) 0 71# define xfs_set_acl NULL
76# define xfs_acl_chmod(inode) 0
77# define posix_acl_access_exists(inode) 0 72# define posix_acl_access_exists(inode) 0
78# define posix_acl_default_exists(inode) 0 73# define posix_acl_default_exists(inode) 0
79#endif /* CONFIG_XFS_POSIX_ACL */ 74#endif /* CONFIG_XFS_POSIX_ACL */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71c8c9d2b882..db2cfb067d0b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -407,7 +407,7 @@ xfs_alloc_ioend_bio(
407 struct bio *bio = bio_alloc(GFP_NOIO, nvecs); 407 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
408 408
409 ASSERT(bio->bi_private == NULL); 409 ASSERT(bio->bi_private == NULL);
410 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 410 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
411 bio->bi_bdev = bh->b_bdev; 411 bio->bi_bdev = bh->b_bdev;
412 return bio; 412 return bio;
413} 413}
@@ -1217,7 +1217,7 @@ __xfs_get_blocks(
1217 lockmode = XFS_ILOCK_EXCL; 1217 lockmode = XFS_ILOCK_EXCL;
1218 xfs_ilock(ip, lockmode); 1218 xfs_ilock(ip, lockmode);
1219 } else { 1219 } else {
1220 lockmode = xfs_ilock_map_shared(ip); 1220 lockmode = xfs_ilock_data_map_shared(ip);
1221 } 1221 }
1222 1222
1223 ASSERT(offset <= mp->m_super->s_maxbytes); 1223 ASSERT(offset <= mp->m_super->s_maxbytes);
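The bio->bi_sector to bio->bi_iter.bi_sector change here (and bi_size/bi_sector likewise in xfs_buf.c below) is mechanical fallout from 3.14's immutable-biovec work: the completion cursor — sector, remaining byte count, vector index — was gathered into an embedded struct bvec_iter so the bio itself can stay immutable while the block layer walks it. For a submitter only the spelling changes; a sketch with hypothetical names for everything not shown in the hunk:

	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);

	bio->bi_bdev = bdev;
	/* the cursor now lives in the embedded iterator ... */
	bio->bi_iter.bi_sector = blkno * (blocksize >> 9);	/* 512-byte units */
	bio->bi_end_io = sketch_end_io;
	bio->bi_private = ctx;
	/* ... and bio->bi_iter.bi_size grows as pages are added */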
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b86127072ac3..01b6a0102fbd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -164,6 +164,7 @@ xfs_attr_get(
164{ 164{
165 int error; 165 int error;
166 struct xfs_name xname; 166 struct xfs_name xname;
167 uint lock_mode;
167 168
168 XFS_STATS_INC(xs_attr_get); 169 XFS_STATS_INC(xs_attr_get);
169 170
@@ -174,9 +175,9 @@ xfs_attr_get(
174 if (error) 175 if (error)
175 return error; 176 return error;
176 177
177 xfs_ilock(ip, XFS_ILOCK_SHARED); 178 lock_mode = xfs_ilock_attr_map_shared(ip);
178 error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); 179 error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
179 xfs_iunlock(ip, XFS_ILOCK_SHARED); 180 xfs_iunlock(ip, lock_mode);
180 return(error); 181 return(error);
181} 182}
182 183
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d174b128153..01db96f60cf0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -507,17 +507,17 @@ xfs_attr_list_int(
507{ 507{
508 int error; 508 int error;
509 xfs_inode_t *dp = context->dp; 509 xfs_inode_t *dp = context->dp;
510 uint lock_mode;
510 511
511 XFS_STATS_INC(xs_attr_list); 512 XFS_STATS_INC(xs_attr_list);
512 513
513 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 514 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
514 return EIO; 515 return EIO;
515 516
516 xfs_ilock(dp, XFS_ILOCK_SHARED);
517
518 /* 517 /*
519 * Decide on what work routines to call based on the inode size. 518 * Decide on what work routines to call based on the inode size.
520 */ 519 */
520 lock_mode = xfs_ilock_attr_map_shared(dp);
521 if (!xfs_inode_hasattr(dp)) { 521 if (!xfs_inode_hasattr(dp)) {
522 error = 0; 522 error = 0;
523 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 523 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
@@ -527,9 +527,7 @@ xfs_attr_list_int(
527 } else { 527 } else {
528 error = xfs_attr_node_list(context); 528 error = xfs_attr_node_list(context);
529 } 529 }
530 530 xfs_iunlock(dp, lock_mode);
531 xfs_iunlock(dp, XFS_ILOCK_SHARED);
532
533 return error; 531 return error;
534} 532}
535 533
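Both attr-side callers stop hard-coding XFS_ILOCK_SHARED in favour of xfs_ilock_attr_map_shared(); its data-fork twin, xfs_ilock_data_map_shared(), does the same job in the readdir, quota, bmap-util and file.c hunks below. My reading of the helpers: they take the ILOCK exclusively when the relevant in-core extent map still has to be read in (reading it modifies the fork) and shared otherwise, returning whichever mode was taken — which is why every conversion also threads a lock_mode variable through to xfs_iunlock():

	uint lock_mode;

	lock_mode = xfs_ilock_attr_map_shared(ip);	/* SHARED, or EXCL on first use */
	error = do_the_attr_work(ip);			/* hypothetical */
	xfs_iunlock(ip, lock_mode);			/* must echo the returned mode */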
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 739e0a52deda..5549d69ddb45 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -110,7 +110,7 @@ xfs_attr3_rmt_verify(
110 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) 110 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
111 return false; 111 return false;
112 if (be32_to_cpu(rmt->rm_offset) + 112 if (be32_to_cpu(rmt->rm_offset) +
113 be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) 113 be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
114 return false; 114 return false;
115 if (rmt->rm_owner == 0) 115 if (rmt->rm_owner == 0)
116 return false; 116 return false;
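The xfs_attr3_rmt_verify() change is a one-character off-by-one fix: a remote attribute that ends exactly at XATTR_SIZE_MAX is legal, and the old >= test rejected it. Worked through with the real limit (XATTR_SIZE_MAX is 65536 in the uapi limits header), as a standalone check:

	#include <assert.h>

	#define XATTR_SIZE_MAX 65536	/* from include/uapi/linux/limits.h */

	int main(void)
	{
		unsigned int off = 65024, bytes = 512;	/* ends exactly at the cap */

		assert(off + bytes >= XATTR_SIZE_MAX);	 /* old test fired: bogus reject */
		assert(!(off + bytes > XATTR_SIZE_MAX)); /* new test passes: accepted */
		return 0;
	}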
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3b2c14b6f0fb..152543c4ca70 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4013,6 +4013,7 @@ xfs_bmapi_read(
4013 ASSERT(*nmap >= 1); 4013 ASSERT(*nmap >= 1);
4014 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| 4014 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
4015 XFS_BMAPI_IGSTATE))); 4015 XFS_BMAPI_IGSTATE)));
4016 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
4016 4017
4017 if (unlikely(XFS_TEST_ERROR( 4018 if (unlikely(XFS_TEST_ERROR(
4018 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4019 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4207,6 +4208,7 @@ xfs_bmapi_delay(
4207 ASSERT(*nmap >= 1); 4208 ASSERT(*nmap >= 1);
4208 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4209 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4209 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); 4210 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4211 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4210 4212
4211 if (unlikely(XFS_TEST_ERROR( 4213 if (unlikely(XFS_TEST_ERROR(
4212 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && 4214 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4500,6 +4502,7 @@ xfs_bmapi_write(
4500 ASSERT(tp != NULL); 4502 ASSERT(tp != NULL);
4501 ASSERT(len > 0); 4503 ASSERT(len > 0);
4502 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); 4504 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4505 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4503 4506
4504 if (unlikely(XFS_TEST_ERROR( 4507 if (unlikely(XFS_TEST_ERROR(
4505 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4508 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5051,6 +5054,7 @@ xfs_bunmapi(
5051 if (XFS_FORCED_SHUTDOWN(mp)) 5054 if (XFS_FORCED_SHUTDOWN(mp))
5052 return XFS_ERROR(EIO); 5055 return XFS_ERROR(EIO);
5053 5056
5057 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5054 ASSERT(len > 0); 5058 ASSERT(len > 0);
5055 ASSERT(nexts >= 0); 5059 ASSERT(nexts >= 0);
5056 5060
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1394106ed22d..f264616080ca 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -287,6 +287,7 @@ xfs_bmapi_allocate(
287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); 287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
288 queue_work(xfs_alloc_wq, &args->work); 288 queue_work(xfs_alloc_wq, &args->work);
289 wait_for_completion(&done); 289 wait_for_completion(&done);
290 destroy_work_on_stack(&args->work);
290 return args->result; 291 return args->result;
291} 292}
292 293
@@ -617,22 +618,27 @@ xfs_getbmap(
617 return XFS_ERROR(ENOMEM); 618 return XFS_ERROR(ENOMEM);
618 619
619 xfs_ilock(ip, XFS_IOLOCK_SHARED); 620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
620 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 621 if (whichfork == XFS_DATA_FORK) {
621 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 622 if (!(iflags & BMV_IF_DELALLOC) &&
623 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
622 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); 624 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
623 if (error) 625 if (error)
624 goto out_unlock_iolock; 626 goto out_unlock_iolock;
627
628 /*
629 * Even after flushing the inode, there can still be
630 * delalloc blocks on the inode beyond EOF due to
631 * speculative preallocation. These are not removed
632 * until the release function is called or the inode
633 * is inactivated. Hence we cannot assert here that
634 * ip->i_delayed_blks == 0.
635 */
625 } 636 }
626 /*
627 * even after flushing the inode, there can still be delalloc
628 * blocks on the inode beyond EOF due to speculative
629 * preallocation. These are not removed until the release
630 * function is called or the inode is inactivated. Hence we
631 * cannot assert here that ip->i_delayed_blks == 0.
632 */
633 }
634 637
635 lock = xfs_ilock_map_shared(ip); 638 lock = xfs_ilock_data_map_shared(ip);
639 } else {
640 lock = xfs_ilock_attr_map_shared(ip);
641 }
636 642
637 /* 643 /*
638 * Don't let nex be bigger than the number of extents 644 * Don't let nex be bigger than the number of extents
@@ -737,7 +743,7 @@ xfs_getbmap(
737 out_free_map: 743 out_free_map:
738 kmem_free(map); 744 kmem_free(map);
739 out_unlock_ilock: 745 out_unlock_ilock:
740 xfs_iunlock_map_shared(ip, lock); 746 xfs_iunlock(ip, lock);
741 out_unlock_iolock: 747 out_unlock_iolock:
742 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 748 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
743 749
@@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes(
1168 xfs_buf_unlock(bp); 1174 xfs_buf_unlock(bp);
1169 1175
1170 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1176 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1177 uint lock_mode;
1178
1171 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1179 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1172 nimap = 1; 1180 nimap = 1;
1181
1182 lock_mode = xfs_ilock_data_map_shared(ip);
1173 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); 1183 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1184 xfs_iunlock(ip, lock_mode);
1185
1174 if (error || nimap < 1) 1186 if (error || nimap < 1)
1175 break; 1187 break;
1176 ASSERT(imap.br_blockcount >= 1); 1188 ASSERT(imap.br_blockcount >= 1);
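The destroy_work_on_stack() one-liner in xfs_bmapi_allocate() pairs with the INIT_WORK_ONSTACK() just above it: under CONFIG_DEBUG_OBJECTS_WORK the init registers the on-stack work item with debugobjects, and it has to be deregistered before the frame is torn down or the tracker is left pointing at dead stack memory. The whole pattern, with a hypothetical argument block standing in for the hunk's args structure:

	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct sketch_args {
		struct work_struct work;
		struct completion done;
		int result;
	};

	static void sketch_worker(struct work_struct *work)
	{
		struct sketch_args *args =
			container_of(work, struct sketch_args, work);

		args->result = 0;	/* ... the actual work ... */
		complete(&args->done);
	}

	static int run_on_wq(struct workqueue_struct *wq, struct sketch_args *args)
	{
		init_completion(&args->done);
		INIT_WORK_ONSTACK(&args->work, sketch_worker);
		queue_work(wq, &args->work);
		wait_for_completion(&args->done);
		destroy_work_on_stack(&args->work);	/* pairs with INIT_WORK_ONSTACK */
		return args->result;
	}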
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index afe7645e4b2b..9c061ef2b0d9 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -445,8 +445,8 @@ _xfs_buf_find(
445 numbytes = BBTOB(numblks); 445 numbytes = BBTOB(numblks);
446 446
447 /* Check for IOs smaller than the sector size / not sector aligned */ 447 /* Check for IOs smaller than the sector size / not sector aligned */
448 ASSERT(!(numbytes < (1 << btp->bt_sshift))); 448 ASSERT(!(numbytes < btp->bt_meta_sectorsize));
449 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); 449 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
450 450
451 /* 451 /*
452 * Corrupted block numbers can get through to here, unfortunately, so we 452 * Corrupted block numbers can get through to here, unfortunately, so we
@@ -1240,7 +1240,7 @@ next_chunk:
1240 1240
1241 bio = bio_alloc(GFP_NOIO, nr_pages); 1241 bio = bio_alloc(GFP_NOIO, nr_pages);
1242 bio->bi_bdev = bp->b_target->bt_bdev; 1242 bio->bi_bdev = bp->b_target->bt_bdev;
1243 bio->bi_sector = sector; 1243 bio->bi_iter.bi_sector = sector;
1244 bio->bi_end_io = xfs_buf_bio_end_io; 1244 bio->bi_end_io = xfs_buf_bio_end_io;
1245 bio->bi_private = bp; 1245 bio->bi_private = bp;
1246 1246
@@ -1262,7 +1262,7 @@ next_chunk:
1262 total_nr_pages--; 1262 total_nr_pages--;
1263 } 1263 }
1264 1264
1265 if (likely(bio->bi_size)) { 1265 if (likely(bio->bi_iter.bi_size)) {
1266 if (xfs_buf_is_vmapped(bp)) { 1266 if (xfs_buf_is_vmapped(bp)) {
1267 flush_kernel_vmap_range(bp->b_addr, 1267 flush_kernel_vmap_range(bp->b_addr,
1268 xfs_buf_vmap_len(bp)); 1268 xfs_buf_vmap_len(bp));
@@ -1593,16 +1593,15 @@ xfs_free_buftarg(
1593 kmem_free(btp); 1593 kmem_free(btp);
1594} 1594}
1595 1595
1596STATIC int 1596int
1597xfs_setsize_buftarg_flags( 1597xfs_setsize_buftarg(
1598 xfs_buftarg_t *btp, 1598 xfs_buftarg_t *btp,
1599 unsigned int blocksize, 1599 unsigned int blocksize,
1600 unsigned int sectorsize, 1600 unsigned int sectorsize)
1601 int verbose)
1602{ 1601{
1603 btp->bt_bsize = blocksize; 1602 /* Set up metadata sector size info */
1604 btp->bt_sshift = ffs(sectorsize) - 1; 1603 btp->bt_meta_sectorsize = sectorsize;
1605 btp->bt_smask = sectorsize - 1; 1604 btp->bt_meta_sectormask = sectorsize - 1;
1606 1605
1607 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1606 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1608 char name[BDEVNAME_SIZE]; 1607 char name[BDEVNAME_SIZE];
@@ -1615,30 +1614,25 @@ xfs_setsize_buftarg_flags(
1615 return EINVAL; 1614 return EINVAL;
1616 } 1615 }
1617 1616
1617 /* Set up device logical sector size mask */
1618 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1619 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1620
1618 return 0; 1621 return 0;
1619} 1622}
1620 1623
1621/* 1624/*
1622 * When allocating the initial buffer target we have not yet 1625 * When allocating the initial buffer target we have not yet
1623 * read in the superblock, so don't know what sized sectors 1626 * read in the superblock, so don't know what sized sectors
1624 * are being used at this early stage. Play safe. 1627 * are being used at this early stage. Play safe.
1625 */ 1628 */
1626STATIC int 1629STATIC int
1627xfs_setsize_buftarg_early( 1630xfs_setsize_buftarg_early(
1628 xfs_buftarg_t *btp, 1631 xfs_buftarg_t *btp,
1629 struct block_device *bdev) 1632 struct block_device *bdev)
1630{ 1633{
1631 return xfs_setsize_buftarg_flags(btp, 1634 return xfs_setsize_buftarg(btp, PAGE_SIZE,
1632 PAGE_SIZE, bdev_logical_block_size(bdev), 0); 1635 bdev_logical_block_size(bdev));
1633}
1634
1635int
1636xfs_setsize_buftarg(
1637 xfs_buftarg_t *btp,
1638 unsigned int blocksize,
1639 unsigned int sectorsize)
1640{
1641 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1642} 1636}
1643 1637
1644xfs_buftarg_t * 1638xfs_buftarg_t *
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1cf21a4a9f22..995339534db6 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -88,14 +88,28 @@ typedef unsigned int xfs_buf_flags_t;
88 */ 88 */
89#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ 89#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
90 90
91/*
92 * The xfs_buftarg contains 2 notions of "sector size" -
93 *
94 * 1) The metadata sector size, which is the minimum unit and
95 * alignment of IO which will be performed by metadata operations.
96 * 2) The device logical sector size
97 *
98 * The first is specified at mkfs time, and is stored on-disk in the
99 * superblock's sb_sectsize.
100 *
101 * The latter is derived from the underlying device, and controls direct IO
102 * alignment constraints.
103 */
91typedef struct xfs_buftarg { 104typedef struct xfs_buftarg {
92 dev_t bt_dev; 105 dev_t bt_dev;
93 struct block_device *bt_bdev; 106 struct block_device *bt_bdev;
94 struct backing_dev_info *bt_bdi; 107 struct backing_dev_info *bt_bdi;
95 struct xfs_mount *bt_mount; 108 struct xfs_mount *bt_mount;
96 unsigned int bt_bsize; 109 unsigned int bt_meta_sectorsize;
97 unsigned int bt_sshift; 110 size_t bt_meta_sectormask;
98 size_t bt_smask; 111 size_t bt_logical_sectorsize;
112 size_t bt_logical_sectormask;
99 113
100 /* LRU control structures */ 114 /* LRU control structures */
101 struct shrinker bt_shrinker; 115 struct shrinker bt_shrinker;
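The new comment block pins down the split: bt_meta_sectorsize is the mkfs-time sb_sectsize and bounds metadata IO, while bt_logical_sectorsize comes from bdev_logical_block_size() and bounds direct-IO alignment (used by the xfs_file.c hunks below). Both are powers of two, so each mask is simply size - 1 and an alignment test collapses to a single AND. A user-space illustration of the arithmetic, including the OR trick the file.c hunks use to test offset and length together:

	#include <stdio.h>

	#define SECTOR_SIZE	4096ULL			/* power of two */
	#define SECTOR_MASK	(SECTOR_SIZE - 1)	/* 0xfff */

	/* same shape as "(pos | count) & bt_logical_sectormask" */
	static int sector_aligned(unsigned long long pos, unsigned long long len)
	{
		return ((pos | len) & SECTOR_MASK) == 0;
	}

	int main(void)
	{
		printf("%d\n", sector_aligned(8192, 4096));	/* 1 */
		printf("%d\n", sector_aligned(8192, 512));	/* 0 */
		return 0;
	}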
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2227b9b050bb..33149113e333 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -182,21 +182,47 @@ xfs_buf_item_size(
182 trace_xfs_buf_item_size(bip); 182 trace_xfs_buf_item_size(bip);
183} 183}
184 184
185static struct xfs_log_iovec * 185static inline void
186xfs_buf_item_copy_iovec(
187 struct xfs_log_vec *lv,
188 struct xfs_log_iovec **vecp,
189 struct xfs_buf *bp,
190 uint offset,
191 int first_bit,
192 uint nbits)
193{
194 offset += first_bit * XFS_BLF_CHUNK;
195 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
196 xfs_buf_offset(bp, offset),
197 nbits * XFS_BLF_CHUNK);
198}
199
200static inline bool
201xfs_buf_item_straddle(
202 struct xfs_buf *bp,
203 uint offset,
204 int next_bit,
205 int last_bit)
206{
207 return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
208 (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
209 XFS_BLF_CHUNK);
210}
211
212static void
186xfs_buf_item_format_segment( 213xfs_buf_item_format_segment(
187 struct xfs_buf_log_item *bip, 214 struct xfs_buf_log_item *bip,
188 struct xfs_log_iovec *vecp, 215 struct xfs_log_vec *lv,
216 struct xfs_log_iovec **vecp,
189 uint offset, 217 uint offset,
190 struct xfs_buf_log_format *blfp) 218 struct xfs_buf_log_format *blfp)
191{ 219{
192 struct xfs_buf *bp = bip->bli_buf; 220 struct xfs_buf *bp = bip->bli_buf;
193 uint base_size; 221 uint base_size;
194 uint nvecs;
195 int first_bit; 222 int first_bit;
196 int last_bit; 223 int last_bit;
197 int next_bit; 224 int next_bit;
198 uint nbits; 225 uint nbits;
199 uint buffer_offset;
200 226
201 /* copy the flags across from the base format item */ 227 /* copy the flags across from the base format item */
202 blfp->blf_flags = bip->__bli_format.blf_flags; 228 blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -208,21 +234,17 @@ xfs_buf_item_format_segment(
208 */ 234 */
209 base_size = xfs_buf_log_format_size(blfp); 235 base_size = xfs_buf_log_format_size(blfp);
210 236
211 nvecs = 0;
212 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 237 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
213 if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { 238 if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
214 /* 239 /*
215 * If the map is not be dirty in the transaction, mark 240 * If the map is not be dirty in the transaction, mark
216 * the size as zero and do not advance the vector pointer. 241 * the size as zero and do not advance the vector pointer.
217 */ 242 */
218 goto out; 243 return;
219 } 244 }
220 245
221 vecp->i_addr = blfp; 246 blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
222 vecp->i_len = base_size; 247 blfp->blf_size = 1;
223 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
224 vecp++;
225 nvecs = 1;
226 248
227 if (bip->bli_flags & XFS_BLI_STALE) { 249 if (bip->bli_flags & XFS_BLI_STALE) {
228 /* 250 /*
@@ -232,14 +254,13 @@ xfs_buf_item_format_segment(
232 */ 254 */
233 trace_xfs_buf_item_format_stale(bip); 255 trace_xfs_buf_item_format_stale(bip);
234 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); 256 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
235 goto out; 257 return;
236 } 258 }
237 259
238 260
239 /* 261 /*
240 * Fill in an iovec for each set of contiguous chunks. 262 * Fill in an iovec for each set of contiguous chunks.
241 */ 263 */
242
243 last_bit = first_bit; 264 last_bit = first_bit;
244 nbits = 1; 265 nbits = 1;
245 for (;;) { 266 for (;;) {
@@ -252,42 +273,22 @@ xfs_buf_item_format_segment(
252 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 273 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
253 (uint)last_bit + 1); 274 (uint)last_bit + 1);
254 /* 275 /*
255 * If we run out of bits fill in the last iovec and get 276 * If we run out of bits fill in the last iovec and get out of
256 * out of the loop. 277 * the loop. Else if we start a new set of bits then fill in
257 * Else if we start a new set of bits then fill in the 278 * the iovec for the series we were looking at and start
258 * iovec for the series we were looking at and start 279 * counting the bits in the new one. Else we're still in the
259 * counting the bits in the new one. 280 * same set of bits so just keep counting and scanning.
260 * Else we're still in the same set of bits so just
261 * keep counting and scanning.
262 */ 281 */
263 if (next_bit == -1) { 282 if (next_bit == -1) {
264 buffer_offset = offset + first_bit * XFS_BLF_CHUNK; 283 xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
265 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 284 first_bit, nbits);
266 vecp->i_len = nbits * XFS_BLF_CHUNK; 285 blfp->blf_size++;
267 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
268 nvecs++;
269 break; 286 break;
270 } else if (next_bit != last_bit + 1) { 287 } else if (next_bit != last_bit + 1 ||
271 buffer_offset = offset + first_bit * XFS_BLF_CHUNK; 288 xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
272 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 289 xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
273 vecp->i_len = nbits * XFS_BLF_CHUNK; 290 first_bit, nbits);
274 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 291 blfp->blf_size++;
275 nvecs++;
276 vecp++;
277 first_bit = next_bit;
278 last_bit = next_bit;
279 nbits = 1;
280 } else if (xfs_buf_offset(bp, offset +
281 (next_bit << XFS_BLF_SHIFT)) !=
282 (xfs_buf_offset(bp, offset +
283 (last_bit << XFS_BLF_SHIFT)) +
284 XFS_BLF_CHUNK)) {
285 buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
286 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
287 vecp->i_len = nbits * XFS_BLF_CHUNK;
288 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
289 nvecs++;
290 vecp++;
291 first_bit = next_bit; 292 first_bit = next_bit;
292 last_bit = next_bit; 293 last_bit = next_bit;
293 nbits = 1; 294 nbits = 1;
@@ -296,9 +297,6 @@ xfs_buf_item_format_segment(
296 nbits++; 297 nbits++;
297 } 298 }
298 } 299 }
299out:
300 blfp->blf_size = nvecs;
301 return vecp;
302} 300}
303 301
304/* 302/*
@@ -310,10 +308,11 @@ out:
310STATIC void 308STATIC void
311xfs_buf_item_format( 309xfs_buf_item_format(
312 struct xfs_log_item *lip, 310 struct xfs_log_item *lip,
313 struct xfs_log_iovec *vecp) 311 struct xfs_log_vec *lv)
314{ 312{
315 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 313 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
316 struct xfs_buf *bp = bip->bli_buf; 314 struct xfs_buf *bp = bip->bli_buf;
315 struct xfs_log_iovec *vecp = NULL;
317 uint offset = 0; 316 uint offset = 0;
318 int i; 317 int i;
319 318
@@ -354,8 +353,8 @@ xfs_buf_item_format(
354 } 353 }
355 354
356 for (i = 0; i < bip->bli_format_count; i++) { 355 for (i = 0; i < bip->bli_format_count; i++) {
357 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 356 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
358 &bip->bli_formats[i]); 357 &bip->bli_formats[i]);
359 offset += bp->b_maps[i].bm_len; 358 offset += bp->b_maps[i].bm_len;
360 } 359 }
361 360
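The xfs_buf_item_format_segment() rewrite is the same log-vector API conversion as the dquot and EFI/EFD items below, plus two extracted helpers that collapse the old loop's three near-identical branches. One subtlety worth noting: xlog_copy_iovec() copies the region into the log vector's own buffer and returns the address of that copy, which is why blfp is reassigned before blf_size is touched — the count must be bumped in the logged copy, not the source. Reduced to its shape, the new loop body reads:

	if (next_bit == -1) {
		/* out of dirty bits: emit the final run */
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits);
		blfp->blf_size++;
		break;
	} else if (next_bit != last_bit + 1 ||
		   xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
		/* run broken by a gap in the bitmap, or by discontiguous
		 * backing pages: close this run, start the next */
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits);
		blfp->blf_size++;
		first_bit = last_bit = next_bit;
		nbits = 1;
	} else {
		last_bit = next_bit;
		nbits++;
	}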
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index c4e50c6ed584..aead369e1c30 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -674,6 +674,7 @@ xfs_readdir(
674{ 674{
675 int rval; /* return value */ 675 int rval; /* return value */
676 int v; /* type-checking value */ 676 int v; /* type-checking value */
677 uint lock_mode;
677 678
678 trace_xfs_readdir(dp); 679 trace_xfs_readdir(dp);
679 680
@@ -683,6 +684,7 @@ xfs_readdir(
683 ASSERT(S_ISDIR(dp->i_d.di_mode)); 684 ASSERT(S_ISDIR(dp->i_d.di_mode));
684 XFS_STATS_INC(xs_dir_getdents); 685 XFS_STATS_INC(xs_dir_getdents);
685 686
687 lock_mode = xfs_ilock_data_map_shared(dp);
686 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 688 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
687 rval = xfs_dir2_sf_getdents(dp, ctx); 689 rval = xfs_dir2_sf_getdents(dp, ctx);
688 else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) 690 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
@@ -691,5 +693,7 @@ xfs_readdir(
691 rval = xfs_dir2_block_getdents(dp, ctx); 693 rval = xfs_dir2_block_getdents(dp, ctx);
692 else 694 else
693 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); 695 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
696 xfs_iunlock(dp, lock_mode);
697
694 return rval; 698 return rval;
695} 699}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index aafc6e46cb58..3725fb1b902b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -170,6 +170,7 @@ xfs_dir2_block_to_sf(
170 char *ptr; /* current data pointer */ 170 char *ptr; /* current data pointer */
171 xfs_dir2_sf_entry_t *sfep; /* shortform entry */ 171 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
172 xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ 172 xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
173 xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
173 174
174 trace_xfs_dir2_block_to_sf(args); 175 trace_xfs_dir2_block_to_sf(args);
175 176
@@ -177,35 +178,20 @@ xfs_dir2_block_to_sf(
177 mp = dp->i_mount; 178 mp = dp->i_mount;
178 179
179 /* 180 /*
180 * Make a copy of the block data, so we can shrink the inode 181 * allocate a temporary destination buffer the size of the inode
181 * and add local data. 182 * to format the data into. Once we have formatted the data, we
183 * can free the block and copy the formatted data into the inode literal
184 * area.
182 */ 185 */
183 hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); 186 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
184 memcpy(hdr, bp->b_addr, mp->m_dirblksize); 187 hdr = bp->b_addr;
185 logflags = XFS_ILOG_CORE;
186 if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
187 ASSERT(error != ENOSPC);
188 goto out;
189 }
190 188
191 /* 189 /*
192 * The buffer is now unconditionally gone, whether
193 * xfs_dir2_shrink_inode worked or not.
194 *
195 * Convert the inode to local format.
196 */
197 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
198 dp->i_df.if_flags |= XFS_IFINLINE;
199 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
200 ASSERT(dp->i_df.if_bytes == 0);
201 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
202 logflags |= XFS_ILOG_DDATA;
203 /*
204 * Copy the header into the newly allocate local space. 190 * Copy the header into the newly allocate local space.
205 */ 191 */
206 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 192 sfp = (xfs_dir2_sf_hdr_t *)dst;
207 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); 193 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
208 dp->i_d.di_size = size; 194
209 /* 195 /*
210 * Set up to loop over the block's entries. 196 * Set up to loop over the block's entries.
211 */ 197 */
@@ -258,10 +244,34 @@ xfs_dir2_block_to_sf(
258 ptr += dp->d_ops->data_entsize(dep->namelen); 244 ptr += dp->d_ops->data_entsize(dep->namelen);
259 } 245 }
260 ASSERT((char *)sfep - (char *)sfp == size); 246 ASSERT((char *)sfep - (char *)sfp == size);
247
248 /* now we are done with the block, we can shrink the inode */
249 logflags = XFS_ILOG_CORE;
250 error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp);
251 if (error) {
252 ASSERT(error != ENOSPC);
253 goto out;
254 }
255
256 /*
257 * The buffer is now unconditionally gone, whether
258 * xfs_dir2_shrink_inode worked or not.
259 *
260 * Convert the inode to local format and copy the data in.
261 */
262 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
263 dp->i_df.if_flags |= XFS_IFINLINE;
264 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
265 ASSERT(dp->i_df.if_bytes == 0);
266 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
267
268 logflags |= XFS_ILOG_DDATA;
269 memcpy(dp->i_df.if_u1.if_data, dst, size);
270 dp->i_d.di_size = size;
261 xfs_dir2_sf_check(args); 271 xfs_dir2_sf_check(args);
262out: 272out:
263 xfs_trans_log_inode(args->trans, dp, logflags); 273 xfs_trans_log_inode(args->trans, dp, logflags);
264 kmem_free(hdr); 274 kmem_free(dst);
265 return error; 275 return error;
266} 276}
267 277
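xfs_dir2_block_to_sf() keeps the same steps but reorders them: the old code freed the directory block first (working from a block-sized copy) and built the shortform entries directly into the inode literal area; the new code formats the complete shortform image into a temporary buffer first, and only then shrinks the inode, flips the fork to local format, and copies the finished image in. The temporary allocation also drops from m_dirblksize to sb_inodesize, which any shortform directory must fit inside. The shape, per the hunk:

	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
	/* 1. format the shortform image into dst */
	/* 2. xfs_dir2_shrink_inode(): the block is now gone either way */
	/* 3. flip the fork to XFS_DINODE_FMT_LOCAL */
	memcpy(dp->i_df.if_u1.if_data, dst, size);	/* 4. install the image */
	kmem_free(dst);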
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 6b1e695caf0e..7aeb4c895b32 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -469,16 +469,17 @@ xfs_qm_dqtobp(
469 struct xfs_mount *mp = dqp->q_mount; 469 struct xfs_mount *mp = dqp->q_mount;
470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
471 struct xfs_trans *tp = (tpp ? *tpp : NULL); 471 struct xfs_trans *tp = (tpp ? *tpp : NULL);
472 uint lock_mode;
472 473
473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 474 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
474 475
475 xfs_ilock(quotip, XFS_ILOCK_SHARED); 476 lock_mode = xfs_ilock_data_map_shared(quotip);
476 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 477 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
477 /* 478 /*
478 * Return if this type of quotas is turned off while we 479 * Return if this type of quotas is turned off while we
479 * didn't have the quota inode lock. 480 * didn't have the quota inode lock.
480 */ 481 */
481 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 482 xfs_iunlock(quotip, lock_mode);
482 return ESRCH; 483 return ESRCH;
483 } 484 }
484 485
@@ -488,7 +489,7 @@ xfs_qm_dqtobp(
488 error = xfs_bmapi_read(quotip, dqp->q_fileoffset, 489 error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); 490 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
490 491
491 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 492 xfs_iunlock(quotip, lock_mode);
492 if (error) 493 if (error)
493 return error; 494 return error;
494 495
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 92e5f62eefc6..f33fbaaa4d8a 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size(
57STATIC void 57STATIC void
58xfs_qm_dquot_logitem_format( 58xfs_qm_dquot_logitem_format(
59 struct xfs_log_item *lip, 59 struct xfs_log_item *lip,
60 struct xfs_log_iovec *logvec) 60 struct xfs_log_vec *lv)
61{ 61{
62 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 62 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
63 63 struct xfs_log_iovec *vecp = NULL;
64 logvec->i_addr = &qlip->qli_format; 64 struct xfs_dq_logformat *qlf;
65 logvec->i_len = sizeof(xfs_dq_logformat_t); 65
66 logvec->i_type = XLOG_REG_TYPE_QFORMAT; 66 qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
67 logvec++; 67 qlf->qlf_type = XFS_LI_DQUOT;
68 logvec->i_addr = &qlip->qli_dquot->q_core; 68 qlf->qlf_size = 2;
69 logvec->i_len = sizeof(xfs_disk_dquot_t); 69 qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
70 logvec->i_type = XLOG_REG_TYPE_DQUOT; 70 qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
71 71 qlf->qlf_len = 1;
72 qlip->qli_format.qlf_size = 2; 72 qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
73 73 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
74
75 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
76 &qlip->qli_dquot->q_core,
77 sizeof(struct xfs_disk_dquot));
74} 78}
75 79
76/* 80/*
@@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init(
257 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, 261 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
258 &xfs_dquot_item_ops); 262 &xfs_dquot_item_ops);
259 lp->qli_dquot = dqp; 263 lp->qli_dquot = dqp;
260 lp->qli_format.qlf_type = XFS_LI_DQUOT;
261 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
262 lp->qli_format.qlf_blkno = dqp->q_blkno;
263 lp->qli_format.qlf_len = 1;
264 /*
265 * This is just the offset of this dquot within its buffer
266 * (which is currently 1 FSB and probably won't change).
267 * Hence 32 bits for this offset should be just fine.
268 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
269 * here, and recompute it at recovery time.
270 */
271 lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
272} 264}
273 265
274/*------------------ QUOTAOFF LOG ITEMS -------------------*/ 266/*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size(
294 *nbytes += sizeof(struct xfs_qoff_logitem); 286 *nbytes += sizeof(struct xfs_qoff_logitem);
295} 287}
296 288
297/*
298 * This is called to fill in the vector of log iovecs for the
299 * given quotaoff log item. We use only 1 iovec, and we point that
300 * at the quotaoff_log_format structure embedded in the quotaoff item.
301 * It is at this point that we assert that all of the extent
302 * slots in the quotaoff item have been filled.
303 */
304STATIC void 289STATIC void
305xfs_qm_qoff_logitem_format( 290xfs_qm_qoff_logitem_format(
306 struct xfs_log_item *lip, 291 struct xfs_log_item *lip,
307 struct xfs_log_iovec *log_vector) 292 struct xfs_log_vec *lv)
308{ 293{
309 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); 294 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
310 295 struct xfs_log_iovec *vecp = NULL;
311 ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); 296 struct xfs_qoff_logformat *qlf;
312 297
313 log_vector->i_addr = &qflip->qql_format; 298 qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
314 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 299 qlf->qf_type = XFS_LI_QUOTAOFF;
315 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; 300 qlf->qf_size = 1;
316 qflip->qql_format.qf_size = 1; 301 qlf->qf_flags = qflip->qql_flags;
302 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
317} 303}
318 304
319/* 305/*
@@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init(
453 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 439 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
454 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 440 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
455 qf->qql_item.li_mountp = mp; 441 qf->qql_item.li_mountp = mp;
456 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
457 qf->qql_format.qf_flags = flags;
458 qf->qql_start_lip = start; 442 qf->qql_start_lip = start;
443 qf->qql_flags = flags;
459 return qf; 444 return qf;
460} 445}
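Same conversion again, with one structural twist: the dquot and quotaoff items used to carry a pre-built log-format structure inside the log item (filled in once at init time) and merely point an iovec at it. With the new API the format record is assembled directly inside the log vector at format time, so the embedded copies — qli_format and qql_format, see the header hunk below — can be deleted; only the flags survive, as qql_flags. xlog_prepare_iovec()/xlog_finish_iovec() exist for exactly this build-in-place case, versus xlog_copy_iovec() for data that already lives somewhere:

	struct xfs_log_iovec *vecp = NULL;
	struct xfs_dq_logformat *qlf;

	/* reserve an iovec and get a pointer into the log vector's buffer */
	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
	qlf->qlf_type = XFS_LI_DQUOT;	/* ... build the record in place ... */
	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));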
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..502e9464634a 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 xfs_dq_logformat_t qli_format; /* logged structure */
31} xfs_dq_logitem_t; 30} xfs_dq_logitem_t;
32 31
33typedef struct xfs_qoff_logitem { 32typedef struct xfs_qoff_logitem {
34 xfs_log_item_t qql_item; /* common portion */ 33 xfs_log_item_t qql_item; /* common portion */
35 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ 34 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
36 xfs_qoff_logformat_t qql_format; /* logged structure */ 35 unsigned int qql_flags;
37} xfs_qoff_logitem_t; 36} xfs_qoff_logitem_t;
38 37
39 38
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3680d04f973f..fb7a4c1ce1c5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -26,6 +26,7 @@
26#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
28#include "xfs_extfree_item.h" 28#include "xfs_extfree_item.h"
29#include "xfs_log.h"
29 30
30 31
31kmem_zone_t *xfs_efi_zone; 32kmem_zone_t *xfs_efi_zone;
@@ -101,9 +102,10 @@ xfs_efi_item_size(
101STATIC void 102STATIC void
102xfs_efi_item_format( 103xfs_efi_item_format(
103 struct xfs_log_item *lip, 104 struct xfs_log_item *lip,
104 struct xfs_log_iovec *log_vector) 105 struct xfs_log_vec *lv)
105{ 106{
106 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 107 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
108 struct xfs_log_iovec *vecp = NULL;
107 109
108 ASSERT(atomic_read(&efip->efi_next_extent) == 110 ASSERT(atomic_read(&efip->efi_next_extent) ==
109 efip->efi_format.efi_nextents); 111 efip->efi_format.efi_nextents);
@@ -111,10 +113,9 @@ xfs_efi_item_format(
111 efip->efi_format.efi_type = XFS_LI_EFI; 113 efip->efi_format.efi_type = XFS_LI_EFI;
112 efip->efi_format.efi_size = 1; 114 efip->efi_format.efi_size = 1;
113 115
114 log_vector->i_addr = &efip->efi_format; 116 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
115 log_vector->i_len = xfs_efi_item_sizeof(efip); 117 &efip->efi_format,
116 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; 118 xfs_efi_item_sizeof(efip));
117 ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
118} 119}
119 120
120 121
@@ -368,19 +369,19 @@ xfs_efd_item_size(
368STATIC void 369STATIC void
369xfs_efd_item_format( 370xfs_efd_item_format(
370 struct xfs_log_item *lip, 371 struct xfs_log_item *lip,
371 struct xfs_log_iovec *log_vector) 372 struct xfs_log_vec *lv)
372{ 373{
373 struct xfs_efd_log_item *efdp = EFD_ITEM(lip); 374 struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
375 struct xfs_log_iovec *vecp = NULL;
374 376
375 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); 377 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
376 378
377 efdp->efd_format.efd_type = XFS_LI_EFD; 379 efdp->efd_format.efd_type = XFS_LI_EFD;
378 efdp->efd_format.efd_size = 1; 380 efdp->efd_format.efd_size = 1;
379 381
380 log_vector->i_addr = &efdp->efd_format; 382 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
381 log_vector->i_len = xfs_efd_item_sizeof(efdp); 383 &efdp->efd_format,
382 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; 384 xfs_efd_item_sizeof(efdp));
383 ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
384} 385}
385 386
386/* 387/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52c91e143725..64b48eade91d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -261,7 +261,8 @@ xfs_file_aio_read(
261 xfs_buftarg_t *target = 261 xfs_buftarg_t *target =
262 XFS_IS_REALTIME_INODE(ip) ? 262 XFS_IS_REALTIME_INODE(ip) ?
263 mp->m_rtdev_targp : mp->m_ddev_targp; 263 mp->m_rtdev_targp : mp->m_ddev_targp;
264 if ((pos & target->bt_smask) || (size & target->bt_smask)) { 264 /* DIO must be aligned to device logical sector size */
265 if ((pos | size) & target->bt_logical_sectormask) {
265 if (pos == i_size_read(inode)) 266 if (pos == i_size_read(inode))
266 return 0; 267 return 0;
267 return -XFS_ERROR(EINVAL); 268 return -XFS_ERROR(EINVAL);
@@ -641,9 +642,11 @@ xfs_file_dio_aio_write(
641 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 642 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
642 mp->m_rtdev_targp : mp->m_ddev_targp; 643 mp->m_rtdev_targp : mp->m_ddev_targp;
643 644
644 if ((pos & target->bt_smask) || (count & target->bt_smask)) 645 /* DIO must be aligned to device logical sector size */
646 if ((pos | count) & target->bt_logical_sectormask)
645 return -XFS_ERROR(EINVAL); 647 return -XFS_ERROR(EINVAL);
646 648
649 /* "unaligned" here means not aligned to a filesystem block */
647 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) 650 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
648 unaligned_io = 1; 651 unaligned_io = 1;
649 652
@@ -796,7 +799,7 @@ xfs_file_aio_write(
796 XFS_STATS_ADD(xs_write_bytes, ret); 799 XFS_STATS_ADD(xs_write_bytes, ret);
797 800
798 /* Handle various SYNC-type writes */ 801 /* Handle various SYNC-type writes */
799 err = generic_write_sync(file, pos, ret); 802 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
800 if (err < 0) 803 if (err < 0)
801 ret = err; 804 ret = err;
802 } 805 }
@@ -912,7 +915,7 @@ xfs_dir_open(
912 * If there are any blocks, read-ahead block 0 as we're almost 915 * If there are any blocks, read-ahead block 0 as we're almost
913 * certain to have the next operation be a read there. 916 * certain to have the next operation be a read there.
914 */ 917 */
915 mode = xfs_ilock_map_shared(ip); 918 mode = xfs_ilock_data_map_shared(ip);
916 if (ip->i_d.di_nextents > 0) 919 if (ip->i_d.di_nextents > 0)
917 xfs_dir3_data_readahead(NULL, ip, 0, -1); 920 xfs_dir3_data_readahead(NULL, ip, 0, -1);
918 xfs_iunlock(ip, mode); 921 xfs_iunlock(ip, mode);
@@ -1215,7 +1218,7 @@ xfs_seek_data(
1215 uint lock; 1218 uint lock;
1216 int error; 1219 int error;
1217 1220
1218 lock = xfs_ilock_map_shared(ip); 1221 lock = xfs_ilock_data_map_shared(ip);
1219 1222
1220 isize = i_size_read(inode); 1223 isize = i_size_read(inode);
1221 if (start >= isize) { 1224 if (start >= isize) {
@@ -1294,7 +1297,7 @@ out:
1294 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1297 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1295 1298
1296out_unlock: 1299out_unlock:
1297 xfs_iunlock_map_shared(ip, lock); 1300 xfs_iunlock(ip, lock);
1298 1301
1299 if (error) 1302 if (error)
1300 return -error; 1303 return -error;
@@ -1319,7 +1322,7 @@ xfs_seek_hole(
1319 if (XFS_FORCED_SHUTDOWN(mp)) 1322 if (XFS_FORCED_SHUTDOWN(mp))
1320 return -XFS_ERROR(EIO); 1323 return -XFS_ERROR(EIO);
1321 1324
1322 lock = xfs_ilock_map_shared(ip); 1325 lock = xfs_ilock_data_map_shared(ip);
1323 1326
1324 isize = i_size_read(inode); 1327 isize = i_size_read(inode);
1325 if (start >= isize) { 1328 if (start >= isize) {
@@ -1402,7 +1405,7 @@ out:
1402 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1405 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1403 1406
1404out_unlock: 1407out_unlock:
1405 xfs_iunlock_map_shared(ip, lock); 1408 xfs_iunlock(ip, lock);
1406 1409
1407 if (error) 1410 if (error)
1408 return -error; 1411 return -error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e87719c5bebe..5d7f105a1c82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment(
52{ 52{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 53 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 54 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) 55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 56 return args->mp->m_sb.sb_inoalignmt;
57 return 1; 57 return 1;
58} 58}
@@ -170,27 +170,20 @@ xfs_ialloc_inode_init(
170{ 170{
171 struct xfs_buf *fbuf; 171 struct xfs_buf *fbuf;
172 struct xfs_dinode *free; 172 struct xfs_dinode *free;
173 int blks_per_cluster, nbufs, ninodes; 173 int nbufs, blks_per_cluster, inodes_per_cluster;
174 int version; 174 int version;
175 int i, j; 175 int i, j;
176 xfs_daddr_t d; 176 xfs_daddr_t d;
177 xfs_ino_t ino = 0; 177 xfs_ino_t ino = 0;
178 178
179 /* 179 /*
180 * Loop over the new block(s), filling in the inodes. 180 * Loop over the new block(s), filling in the inodes. For small block
181 * For small block sizes, manipulate the inodes in buffers 181 * sizes, manipulate the inodes in buffers which are multiples of the
182 * which are multiples of the blocks size. 182 * blocks size.
183 */ 183 */
184 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 184 blks_per_cluster = xfs_icluster_size_fsb(mp);
185 blks_per_cluster = 1; 185 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
186 nbufs = length; 186 nbufs = length / blks_per_cluster;
187 ninodes = mp->m_sb.sb_inopblock;
188 } else {
189 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
190 mp->m_sb.sb_blocksize;
191 nbufs = length / blks_per_cluster;
192 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
193 }
194 187
195 /* 188 /*
196 * Figure out what version number to use in the inodes we create. If 189 * Figure out what version number to use in the inodes we create. If
@@ -225,7 +218,7 @@ xfs_ialloc_inode_init(
225 * they track in the AIL as if they were physically logged. 218 * they track in the AIL as if they were physically logged.
226 */ 219 */
227 if (tp) 220 if (tp)
228 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), 221 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
229 mp->m_sb.sb_inodesize, length, gen); 222 mp->m_sb.sb_inodesize, length, gen);
230 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 223 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
231 version = 2; 224 version = 2;
@@ -246,7 +239,7 @@ xfs_ialloc_inode_init(
246 /* Initialize the inode buffers and log them appropriately. */ 239 /* Initialize the inode buffers and log them appropriately. */
247 fbuf->b_ops = &xfs_inode_buf_ops; 240 fbuf->b_ops = &xfs_inode_buf_ops;
248 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 241 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
249 for (i = 0; i < ninodes; i++) { 242 for (i = 0; i < inodes_per_cluster; i++) {
250 int ioffset = i << mp->m_sb.sb_inodelog; 243 int ioffset = i << mp->m_sb.sb_inodelog;
251 uint isize = xfs_dinode_size(version); 244 uint isize = xfs_dinode_size(version);
252 245
@@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc(
329 * Locking will ensure that we don't have two callers in here 322 * Locking will ensure that we don't have two callers in here
330 * at one time. 323 * at one time.
331 */ 324 */
332 newlen = XFS_IALLOC_INODES(args.mp); 325 newlen = args.mp->m_ialloc_inos;
333 if (args.mp->m_maxicount && 326 if (args.mp->m_maxicount &&
334 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 327 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
335 return XFS_ERROR(ENOSPC); 328 return XFS_ERROR(ENOSPC);
336 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); 329 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
337 /* 330 /*
338 * First try to allocate inodes contiguous with the last-allocated 331 * First try to allocate inodes contiguous with the last-allocated
339 * chunk of inodes. If the filesystem is striped, this will fill 332 * chunk of inodes. If the filesystem is striped, this will fill
@@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc(
343 newino = be32_to_cpu(agi->agi_newino); 336 newino = be32_to_cpu(agi->agi_newino);
344 agno = be32_to_cpu(agi->agi_seqno); 337 agno = be32_to_cpu(agi->agi_seqno);
345 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 338 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
346 XFS_IALLOC_BLOCKS(args.mp); 339 args.mp->m_ialloc_blks;
347 if (likely(newino != NULLAGINO && 340 if (likely(newino != NULLAGINO &&
348 (args.agbno < be32_to_cpu(agi->agi_length)))) { 341 (args.agbno < be32_to_cpu(agi->agi_length)))) {
349 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 342 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -585,7 +578,7 @@ xfs_ialloc_ag_select(
585 * Is there enough free space for the file plus a block of 578 * Is there enough free space for the file plus a block of
586 * inodes (if we need to allocate some)? 579 * inodes (if we need to allocate some)?
587 */ 580 */
588 ineed = XFS_IALLOC_BLOCKS(mp); 581 ineed = mp->m_ialloc_blks;
589 longest = pag->pagf_longest; 582 longest = pag->pagf_longest;
590 if (!longest) 583 if (!longest)
591 longest = pag->pagf_flcount > 0; 584 longest = pag->pagf_flcount > 0;
@@ -999,7 +992,7 @@ xfs_dialloc(
999 * inode. 992 * inode.
1000 */ 993 */
1001 if (mp->m_maxicount && 994 if (mp->m_maxicount &&
1002 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { 995 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
1003 noroom = 1; 996 noroom = 1;
1004 okalloc = 0; 997 okalloc = 0;
1005 } 998 }
@@ -1202,7 +1195,7 @@ xfs_difree(
1202 * When an inode cluster is free, it becomes eligible for removal 1195 * When an inode cluster is free, it becomes eligible for removal
1203 */ 1196 */
1204 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1197 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1205 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { 1198 (rec.ir_freecount == mp->m_ialloc_inos)) {
1206 1199
1207 *delete = 1; 1200 *delete = 1;
1208 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1201 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
@@ -1212,7 +1205,7 @@ xfs_difree(
1212 * AGI and Superblock inode counts, and mark the disk space 1205 * AGI and Superblock inode counts, and mark the disk space
1213 * to be freed when the transaction is committed. 1206 * to be freed when the transaction is committed.
1214 */ 1207 */
1215 ilen = XFS_IALLOC_INODES(mp); 1208 ilen = mp->m_ialloc_inos;
1216 be32_add_cpu(&agi->agi_count, -ilen); 1209 be32_add_cpu(&agi->agi_count, -ilen);
1217 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1210 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1218 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1211 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1228,9 +1221,9 @@ xfs_difree(
1228 goto error0; 1221 goto error0;
1229 } 1222 }
1230 1223
1231 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, 1224 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1232 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), 1225 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1233 XFS_IALLOC_BLOCKS(mp), flist, mp); 1226 mp->m_ialloc_blks, flist, mp);
1234 } else { 1227 } else {
1235 *delete = 0; 1228 *delete = 0;
1236 1229
@@ -1311,7 +1304,7 @@ xfs_imap_lookup(
1311 1304
1312 /* check that the returned record contains the required inode */ 1305 /* check that the returned record contains the required inode */
1313 if (rec.ir_startino > agino || 1306 if (rec.ir_startino > agino ||
1314 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) 1307 rec.ir_startino + mp->m_ialloc_inos <= agino)
1315 return EINVAL; 1308 return EINVAL;
1316 1309
1317 /* for untrusted inodes check it is allocated first */ 1310 /* for untrusted inodes check it is allocated first */
@@ -1384,7 +1377,7 @@ xfs_imap(
1384 return XFS_ERROR(EINVAL); 1377 return XFS_ERROR(EINVAL);
1385 } 1378 }
1386 1379
1387 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1380 blks_per_cluster = xfs_icluster_size_fsb(mp);
1388 1381
1389 /* 1382 /*
1390 * For bulkstat and handle lookups, we have an untrusted inode number 1383 * For bulkstat and handle lookups, we have an untrusted inode number
@@ -1405,7 +1398,7 @@ xfs_imap(
1405 * If the inode cluster size is the same as the blocksize or 1398 * If the inode cluster size is the same as the blocksize or
1406 * smaller, we get to the buffer by simple arithmetic. 1399 * smaller, we get to the buffer by simple arithmetic.
1407 */ 1400 */
1408 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { 1401 if (blks_per_cluster == 1) {
1409 offset = XFS_INO_TO_OFFSET(mp, ino); 1402 offset = XFS_INO_TO_OFFSET(mp, ino);
1410 ASSERT(offset < mp->m_sb.sb_inopblock); 1403 ASSERT(offset < mp->m_sb.sb_inopblock);
1411 1404
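A quick equivalence note on the xfs_imap() change above: the new "blks_per_cluster == 1" test means exactly what the old cluster-size-versus-blocksize comparison meant. A minimal, self-contained sketch of that equivalence (the struct and field names below are simplified stand-ins, not the kernel types):

#include <assert.h>

struct demo_mount {
	unsigned int	blocksize;		/* stands in for sb_blocksize */
	unsigned int	blocklog;		/* log2(blocksize), sb_blocklog */
	unsigned int	inode_cluster_size;	/* m_inode_cluster_size */
};

/* mirrors the xfs_icluster_size_fsb() helper introduced below */
static int demo_icluster_size_fsb(const struct demo_mount *mp)
{
	if (mp->blocksize >= mp->inode_cluster_size)
		return 1;
	return mp->inode_cluster_size >> mp->blocklog;
}

int main(void)
{
	struct demo_mount mp = { 4096, 12, 8192 };

	/* old test: cluster <= blocksize; new test: one block per cluster */
	assert((mp.inode_cluster_size <= mp.blocksize) ==
	       (demo_icluster_size_fsb(&mp) == 1));

	mp.inode_cluster_size = 2048;	/* cluster smaller than a block */
	assert(demo_icluster_size_fsb(&mp) == 1);
	return 0;
}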
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index a8f76a5ff418..812365d17e67 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -25,17 +25,18 @@ struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27 27
28/* 28/* Move inodes in clusters of this size */
29 * Allocation parameters for inode allocation.
30 */
31#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
32#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
33
34/*
35 * Move inodes in clusters of this size.
36 */
37#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
38#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size 30
31/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int
33xfs_icluster_size_fsb(
34 struct xfs_mount *mp)
35{
36 if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
37 return 1;
38 return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
39}
39 40
40/* 41/*
41 * Make an inode pointer out of the buffer/offset. 42 * Make an inode pointer out of the buffer/offset.
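The new helper centralizes geometry that several call sites previously open-coded. A short sketch of the derived quantities as the converted callers in this series compute them (fragment, not a standalone program; sb_inopblog is log2 of inodes per filesystem block):

	/* blocks per inode cluster, from the new helper */
	int blks_per_cluster = xfs_icluster_size_fsb(mp);

	/* inodes per cluster: blocks per cluster times inodes per block */
	int inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;

	/* buffers needed to cover a chunk of "length" filesystem blocks */
	int nbufs = length / blks_per_cluster;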
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d2eaccfa73f4..7e4549233251 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -28,6 +28,7 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_icreate_item.h" 30#include "xfs_icreate_item.h"
31#include "xfs_log.h"
31 32
32kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ 33kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
33 34
@@ -58,13 +59,14 @@ xfs_icreate_item_size(
58STATIC void 59STATIC void
59xfs_icreate_item_format( 60xfs_icreate_item_format(
60 struct xfs_log_item *lip, 61 struct xfs_log_item *lip,
61 struct xfs_log_iovec *log_vector) 62 struct xfs_log_vec *lv)
62{ 63{
63 struct xfs_icreate_item *icp = ICR_ITEM(lip); 64 struct xfs_icreate_item *icp = ICR_ITEM(lip);
65 struct xfs_log_iovec *vecp = NULL;
64 66
65 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; 67 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
66 log_vector->i_len = sizeof(struct xfs_icreate_log); 68 &icp->ic_format,
67 log_vector->i_type = XLOG_REG_TYPE_ICREATE; 69 sizeof(struct xfs_icreate_log));
68} 70}
69 71
70 72
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 001aa893ed59..3a137e9f9a7d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -77,48 +77,44 @@ xfs_get_extsz_hint(
77} 77}
78 78
79/* 79/*
80 * This is a wrapper routine around the xfs_ilock() routine used to centralize 80 * These two are wrapper routines around the xfs_ilock() routine used to
81 * some grungy code. It is used in places that wish to lock the inode solely 81 * centralize some grungy code. They are used in places that wish to lock the
82 * for reading the extents. The reason these places can't just call 82 * inode solely for reading the extents. The reason these places can't just
83 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the 83 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
84 * extents from disk for a file in b-tree format. If the inode is in b-tree 84 * bringing in of the extents from disk for a file in b-tree format. If the
85 * format, then we need to lock the inode exclusively until the extents are read 85 * inode is in b-tree format, then we need to lock the inode exclusively until
86 * in. Locking it exclusively all the time would limit our parallelism 86 * the extents are read in. Locking it exclusively all the time would limit
87 * unnecessarily, though. What we do instead is check to see if the extents 87 * our parallelism unnecessarily, though. What we do instead is check to see
88 * have been read in yet, and only lock the inode exclusively if they have not. 88 * if the extents have been read in yet, and only lock the inode exclusively
89 * if they have not.
89 * 90 *
90 * The function returns a value which should be given to the corresponding 91 * The functions return a value which should be given to the corresponding
91 * xfs_iunlock_map_shared(). This value is the mode in which the lock was 92 * xfs_iunlock() call.
92 * actually taken.
93 */ 93 */
94uint 94uint
95xfs_ilock_map_shared( 95xfs_ilock_data_map_shared(
96 xfs_inode_t *ip) 96 struct xfs_inode *ip)
97{ 97{
98 uint lock_mode; 98 uint lock_mode = XFS_ILOCK_SHARED;
99 99
100 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && 100 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
101 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { 101 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
102 lock_mode = XFS_ILOCK_EXCL; 102 lock_mode = XFS_ILOCK_EXCL;
103 } else {
104 lock_mode = XFS_ILOCK_SHARED;
105 }
106
107 xfs_ilock(ip, lock_mode); 103 xfs_ilock(ip, lock_mode);
108
109 return lock_mode; 104 return lock_mode;
110} 105}
111 106
112/* 107uint
113 * This is simply the unlock routine to go with xfs_ilock_map_shared(). 108xfs_ilock_attr_map_shared(
114 * All it does is call xfs_iunlock() with the given lock_mode. 109 struct xfs_inode *ip)
115 */
116void
117xfs_iunlock_map_shared(
118 xfs_inode_t *ip,
119 unsigned int lock_mode)
120{ 110{
121 xfs_iunlock(ip, lock_mode); 111 uint lock_mode = XFS_ILOCK_SHARED;
112
113 if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
114 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
115 lock_mode = XFS_ILOCK_EXCL;
116 xfs_ilock(ip, lock_mode);
117 return lock_mode;
122} 118}
123 119
124/* 120/*
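The practical upshot of this rework is the calling convention: there is no longer a dedicated unlock wrapper, so callers hand whatever mode the lock helper returned straight back to xfs_iunlock(). A sketch of the pattern, taken from the xfs_lookup() hunk that follows:

	uint	lock_mode;

	lock_mode = xfs_ilock_data_map_shared(dp);
	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	xfs_iunlock(dp, lock_mode);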
@@ -588,9 +584,9 @@ xfs_lookup(
588 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 584 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
589 return XFS_ERROR(EIO); 585 return XFS_ERROR(EIO);
590 586
591 lock_mode = xfs_ilock_map_shared(dp); 587 lock_mode = xfs_ilock_data_map_shared(dp);
592 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 588 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
593 xfs_iunlock_map_shared(dp, lock_mode); 589 xfs_iunlock(dp, lock_mode);
594 590
595 if (error) 591 if (error)
596 goto out; 592 goto out;
@@ -2141,8 +2137,8 @@ xfs_ifree_cluster(
2141{ 2137{
2142 xfs_mount_t *mp = free_ip->i_mount; 2138 xfs_mount_t *mp = free_ip->i_mount;
2143 int blks_per_cluster; 2139 int blks_per_cluster;
2140 int inodes_per_cluster;
2144 int nbufs; 2141 int nbufs;
2145 int ninodes;
2146 int i, j; 2142 int i, j;
2147 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2148 xfs_buf_t *bp; 2144 xfs_buf_t *bp;
@@ -2152,18 +2148,11 @@ xfs_ifree_cluster(
2152 struct xfs_perag *pag; 2148 struct xfs_perag *pag;
2153 2149
2154 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2150 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2155 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2151 blks_per_cluster = xfs_icluster_size_fsb(mp);
2156 blks_per_cluster = 1; 2152 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2157 ninodes = mp->m_sb.sb_inopblock; 2153 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2158 nbufs = XFS_IALLOC_BLOCKS(mp);
2159 } else {
2160 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2161 mp->m_sb.sb_blocksize;
2162 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2163 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2164 }
2165 2154
2166 for (j = 0; j < nbufs; j++, inum += ninodes) { 2155 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2167 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2156 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2168 XFS_INO_TO_AGBNO(mp, inum)); 2157 XFS_INO_TO_AGBNO(mp, inum));
2169 2158
@@ -2225,7 +2214,7 @@ xfs_ifree_cluster(
2225 * transaction stale above, which means there is no point in 2214 * transaction stale above, which means there is no point in
2226 * even trying to lock them. 2215 * even trying to lock them.
2227 */ 2216 */
2228 for (i = 0; i < ninodes; i++) { 2217 for (i = 0; i < inodes_per_cluster; i++) {
2229retry: 2218retry:
2230 rcu_read_lock(); 2219 rcu_read_lock();
2231 ip = radix_tree_lookup(&pag->pag_ici_root, 2220 ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2906,13 +2895,13 @@ xfs_iflush_cluster(
2906 2895
2907 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2896 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2908 2897
2909 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2898 inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
2910 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2899 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2911 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2900 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2912 if (!ilist) 2901 if (!ilist)
2913 goto out_put; 2902 goto out_put;
2914 2903
2915 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2904 mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
2916 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2905 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2917 rcu_read_lock(); 2906 rcu_read_lock();
2918 /* really need a gang lookup range call here */ 2907 /* really need a gang lookup range call here */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9e6efccbae04..65e2350f449c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
337void xfs_iunlock(xfs_inode_t *, uint); 337void xfs_iunlock(xfs_inode_t *, uint);
338void xfs_ilock_demote(xfs_inode_t *, uint); 338void xfs_ilock_demote(xfs_inode_t *, uint);
339int xfs_isilocked(xfs_inode_t *, uint); 339int xfs_isilocked(xfs_inode_t *, uint);
340uint xfs_ilock_map_shared(xfs_inode_t *); 340uint xfs_ilock_data_map_shared(struct xfs_inode *);
341void xfs_iunlock_map_shared(xfs_inode_t *, uint); 341uint xfs_ilock_attr_map_shared(struct xfs_inode *);
342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
343 xfs_nlink_t, xfs_dev_t, prid_t, int, 343 xfs_nlink_t, xfs_dev_t, prid_t, int,
344 struct xfs_buf **, xfs_inode_t **); 344 struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index cfee14a83cfe..73514c0486b7 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -431,6 +431,8 @@ xfs_iread_extents(
431 xfs_ifork_t *ifp; 431 xfs_ifork_t *ifp;
432 xfs_extnum_t nextents; 432 xfs_extnum_t nextents;
433 433
434 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
435
434 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
435 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
436 ip->i_mount); 438 ip->i_mount);
@@ -721,15 +723,16 @@ xfs_idestroy_fork(
721} 723}
722 724
723/* 725/*
724 * xfs_iextents_copy() 726 * Convert in-core extents to on-disk form
725 * 727 *
726 * This is called to copy the REAL extents (as opposed to the delayed 728 * For either the data or attr fork in extent format, we need to endian convert
727 * allocation extents) from the inode into the given buffer. It 729 * the in-core extent as we place them into the on-disk inode.
728 * returns the number of bytes copied into the buffer.
729 * 730 *
730 * If there are no delayed allocation extents, then we can just 731 * In the case of the data fork, the in-core and on-disk fork sizes can be
731 * memcpy() the extents into the buffer. Otherwise, we need to 732 * different due to delayed allocation extents. We only copy on-disk extents
732 * examine each extent in turn and skip those which are delayed. 733 * here, so callers must always use the physical fork size to determine the
734 * size of the buffer passed to this routine. We will return the size actually
735 * used.
733 */ 736 */
734int 737int
735xfs_iextents_copy( 738xfs_iextents_copy(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c0d391f9a6e..686889b4a1e5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -30,6 +30,7 @@
30#include "xfs_trace.h" 30#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 31#include "xfs_trans_priv.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_log.h"
33 34
34 35
35kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 36kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
39 return container_of(lip, struct xfs_inode_log_item, ili_item); 40 return container_of(lip, struct xfs_inode_log_item, ili_item);
40} 41}
41 42
42
43/*
44 * This returns the number of iovecs needed to log the given inode item.
45 *
46 * We need one iovec for the inode log format structure, one for the
47 * inode core, and possibly one for the inode data/extents/b-tree root
48 * and one for the inode attribute data/extents/b-tree root.
49 */
50STATIC void 43STATIC void
51xfs_inode_item_size( 44xfs_inode_item_data_fork_size(
52 struct xfs_log_item *lip, 45 struct xfs_inode_log_item *iip,
53 int *nvecs, 46 int *nvecs,
54 int *nbytes) 47 int *nbytes)
55{ 48{
56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
57 struct xfs_inode *ip = iip->ili_inode; 49 struct xfs_inode *ip = iip->ili_inode;
58 50
59 *nvecs += 2;
60 *nbytes += sizeof(struct xfs_inode_log_format) +
61 xfs_icdinode_size(ip->i_d.di_version);
62
63 switch (ip->i_d.di_format) { 51 switch (ip->i_d.di_format) {
64 case XFS_DINODE_FMT_EXTENTS: 52 case XFS_DINODE_FMT_EXTENTS:
65 if ((iip->ili_fields & XFS_ILOG_DEXT) && 53 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
@@ -70,7 +58,6 @@ xfs_inode_item_size(
70 *nvecs += 1; 58 *nvecs += 1;
71 } 59 }
72 break; 60 break;
73
74 case XFS_DINODE_FMT_BTREE: 61 case XFS_DINODE_FMT_BTREE:
75 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 62 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
76 ip->i_df.if_broot_bytes > 0) { 63 ip->i_df.if_broot_bytes > 0) {
@@ -78,7 +65,6 @@ xfs_inode_item_size(
78 *nvecs += 1; 65 *nvecs += 1;
79 } 66 }
80 break; 67 break;
81
82 case XFS_DINODE_FMT_LOCAL: 68 case XFS_DINODE_FMT_LOCAL:
83 if ((iip->ili_fields & XFS_ILOG_DDATA) && 69 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
84 ip->i_df.if_bytes > 0) { 70 ip->i_df.if_bytes > 0) {
@@ -90,19 +76,20 @@ xfs_inode_item_size(
90 case XFS_DINODE_FMT_DEV: 76 case XFS_DINODE_FMT_DEV:
91 case XFS_DINODE_FMT_UUID: 77 case XFS_DINODE_FMT_UUID:
92 break; 78 break;
93
94 default: 79 default:
95 ASSERT(0); 80 ASSERT(0);
96 break; 81 break;
97 } 82 }
83}
98 84
99 if (!XFS_IFORK_Q(ip)) 85STATIC void
100 return; 86xfs_inode_item_attr_fork_size(
101 87 struct xfs_inode_log_item *iip,
88 int *nvecs,
89 int *nbytes)
90{
91 struct xfs_inode *ip = iip->ili_inode;
102 92
103 /*
104 * Log any necessary attribute data.
105 */
106 switch (ip->i_d.di_aformat) { 93 switch (ip->i_d.di_aformat) {
107 case XFS_DINODE_FMT_EXTENTS: 94 case XFS_DINODE_FMT_EXTENTS:
108 if ((iip->ili_fields & XFS_ILOG_AEXT) && 95 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
@@ -113,7 +100,6 @@ xfs_inode_item_size(
113 *nvecs += 1; 100 *nvecs += 1;
114 } 101 }
115 break; 102 break;
116
117 case XFS_DINODE_FMT_BTREE: 103 case XFS_DINODE_FMT_BTREE:
118 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 104 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
119 ip->i_afp->if_broot_bytes > 0) { 105 ip->i_afp->if_broot_bytes > 0) {
@@ -121,7 +107,6 @@ xfs_inode_item_size(
121 *nvecs += 1; 107 *nvecs += 1;
122 } 108 }
123 break; 109 break;
124
125 case XFS_DINODE_FMT_LOCAL: 110 case XFS_DINODE_FMT_LOCAL:
126 if ((iip->ili_fields & XFS_ILOG_ADATA) && 111 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
127 ip->i_afp->if_bytes > 0) { 112 ip->i_afp->if_bytes > 0) {
@@ -129,7 +114,6 @@ xfs_inode_item_size(
129 *nvecs += 1; 114 *nvecs += 1;
130 } 115 }
131 break; 116 break;
132
133 default: 117 default:
134 ASSERT(0); 118 ASSERT(0);
135 break; 119 break;
@@ -137,98 +121,67 @@ xfs_inode_item_size(
137} 121}
138 122
139/* 123/*
140 * xfs_inode_item_format_extents - convert in-core extents to on-disk form 124 * This returns the number of iovecs needed to log the given inode item.
141 *
142 * For either the data or attr fork in extent format, we need to endian convert
143 * the in-core extent as we place them into the on-disk inode. In this case, we
144 * need to do this conversion before we write the extents into the log. Because
145 * we don't have the disk inode to write into here, we allocate a buffer and
146 * format the extents into it via xfs_iextents_copy(). We free the buffer in
147 * the unlock routine after the copy for the log has been made.
148 * 125 *
149 * In the case of the data fork, the in-core and on-disk fork sizes can be 126 * We need one iovec for the inode log format structure, one for the
150 * different due to delayed allocation extents. We only log on-disk extents 127 * inode core, and possibly one for the inode data/extents/b-tree root
151 * here, so always use the physical fork size to determine the size of the 128 * and one for the inode attribute data/extents/b-tree root.
152 * buffer we need to allocate.
153 */ 129 */
154STATIC void 130STATIC void
155xfs_inode_item_format_extents( 131xfs_inode_item_size(
156 struct xfs_inode *ip, 132 struct xfs_log_item *lip,
157 struct xfs_log_iovec *vecp, 133 int *nvecs,
158 int whichfork, 134 int *nbytes)
159 int type)
160{ 135{
161 xfs_bmbt_rec_t *ext_buffer; 136 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
137 struct xfs_inode *ip = iip->ili_inode;
162 138
163 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); 139 *nvecs += 2;
164 if (whichfork == XFS_DATA_FORK) 140 *nbytes += sizeof(struct xfs_inode_log_format) +
165 ip->i_itemp->ili_extents_buf = ext_buffer; 141 xfs_icdinode_size(ip->i_d.di_version);
166 else
167 ip->i_itemp->ili_aextents_buf = ext_buffer;
168 142
169 vecp->i_addr = ext_buffer; 143 xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
170 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); 144 if (XFS_IFORK_Q(ip))
171 vecp->i_type = type; 145 xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
172} 146}
173 147
174/* 148/*
175 * This is called to fill in the vector of log iovecs for the 149 * If this is a v1 format inode, then we need to log it as such. This means
176 * given inode log item. It fills the first item with an inode 150 * that we have to copy the link count from the new field to the old. We
177 * log format structure, the second with the on-disk inode structure, 151 * don't have to worry about the new fields, because nothing trusts them as
178 * and a possible third and/or fourth with the inode data/extents/b-tree 152 * long as the old inode version number is there.
179 * root and inode attributes data/extents/b-tree root.
180 */ 153 */
181STATIC void 154STATIC void
182xfs_inode_item_format( 155xfs_inode_item_format_v1_inode(
183 struct xfs_log_item *lip, 156 struct xfs_inode *ip)
184 struct xfs_log_iovec *vecp) 157{
158 if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) {
159 /*
160 * Convert it back.
161 */
162 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
163 ip->i_d.di_onlink = ip->i_d.di_nlink;
164 } else {
165 /*
166 * The superblock version has already been bumped,
167 * so just make the conversion to the new inode
168 * format permanent.
169 */
170 ip->i_d.di_version = 2;
171 ip->i_d.di_onlink = 0;
172 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
173 }
174}
175
176STATIC void
177xfs_inode_item_format_data_fork(
178 struct xfs_inode_log_item *iip,
179 struct xfs_inode_log_format *ilf,
180 struct xfs_log_vec *lv,
181 struct xfs_log_iovec **vecp)
185{ 182{
186 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
187 struct xfs_inode *ip = iip->ili_inode; 183 struct xfs_inode *ip = iip->ili_inode;
188 uint nvecs;
189 size_t data_bytes; 184 size_t data_bytes;
190 xfs_mount_t *mp;
191
192 vecp->i_addr = &iip->ili_format;
193 vecp->i_len = sizeof(xfs_inode_log_format_t);
194 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
195 vecp++;
196 nvecs = 1;
197
198 vecp->i_addr = &ip->i_d;
199 vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
200 vecp->i_type = XLOG_REG_TYPE_ICORE;
201 vecp++;
202 nvecs++;
203
204 /*
205 * If this is really an old format inode, then we need to
206 * log it as such. This means that we have to copy the link
207 * count from the new field to the old. We don't have to worry
208 * about the new fields, because nothing trusts them as long as
209 * the old inode version number is there. If the superblock already
210 * has a new version number, then we don't bother converting back.
211 */
212 mp = ip->i_mount;
213 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
214 if (ip->i_d.di_version == 1) {
215 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
216 /*
217 * Convert it back.
218 */
219 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
220 ip->i_d.di_onlink = ip->i_d.di_nlink;
221 } else {
222 /*
223 * The superblock version has already been bumped,
224 * so just make the conversion to the new inode
225 * format permanent.
226 */
227 ip->i_d.di_version = 2;
228 ip->i_d.di_onlink = 0;
229 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
230 }
231 }
232 185
233 switch (ip->i_d.di_format) { 186 switch (ip->i_d.di_format) {
234 case XFS_DINODE_FMT_EXTENTS: 187 case XFS_DINODE_FMT_EXTENTS:
@@ -239,36 +192,23 @@ xfs_inode_item_format(
239 if ((iip->ili_fields & XFS_ILOG_DEXT) && 192 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
240 ip->i_d.di_nextents > 0 && 193 ip->i_d.di_nextents > 0 &&
241 ip->i_df.if_bytes > 0) { 194 ip->i_df.if_bytes > 0) {
195 struct xfs_bmbt_rec *p;
196
242 ASSERT(ip->i_df.if_u1.if_extents != NULL); 197 ASSERT(ip->i_df.if_u1.if_extents != NULL);
243 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); 198 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
244 ASSERT(iip->ili_extents_buf == NULL); 199
245 200 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
246#ifdef XFS_NATIVE_HOST 201 data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
247 if (ip->i_d.di_nextents == ip->i_df.if_bytes / 202 xlog_finish_iovec(lv, *vecp, data_bytes);
248 (uint)sizeof(xfs_bmbt_rec_t)) { 203
249 /* 204 ASSERT(data_bytes <= ip->i_df.if_bytes);
250 * There are no delayed allocation 205
251 * extents, so just point to the 206 ilf->ilf_dsize = data_bytes;
252 * real extents array. 207 ilf->ilf_size++;
253 */
254 vecp->i_addr = ip->i_df.if_u1.if_extents;
255 vecp->i_len = ip->i_df.if_bytes;
256 vecp->i_type = XLOG_REG_TYPE_IEXT;
257 } else
258#endif
259 {
260 xfs_inode_item_format_extents(ip, vecp,
261 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
262 }
263 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
264 iip->ili_format.ilf_dsize = vecp->i_len;
265 vecp++;
266 nvecs++;
267 } else { 208 } else {
268 iip->ili_fields &= ~XFS_ILOG_DEXT; 209 iip->ili_fields &= ~XFS_ILOG_DEXT;
269 } 210 }
270 break; 211 break;
271
272 case XFS_DINODE_FMT_BTREE: 212 case XFS_DINODE_FMT_BTREE:
273 iip->ili_fields &= 213 iip->ili_fields &=
274 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 214 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -277,80 +217,70 @@ xfs_inode_item_format(
277 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 217 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
278 ip->i_df.if_broot_bytes > 0) { 218 ip->i_df.if_broot_bytes > 0) {
279 ASSERT(ip->i_df.if_broot != NULL); 219 ASSERT(ip->i_df.if_broot != NULL);
280 vecp->i_addr = ip->i_df.if_broot; 220 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
281 vecp->i_len = ip->i_df.if_broot_bytes; 221 ip->i_df.if_broot,
282 vecp->i_type = XLOG_REG_TYPE_IBROOT; 222 ip->i_df.if_broot_bytes);
283 vecp++; 223 ilf->ilf_dsize = ip->i_df.if_broot_bytes;
284 nvecs++; 224 ilf->ilf_size++;
285 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
286 } else { 225 } else {
287 ASSERT(!(iip->ili_fields & 226 ASSERT(!(iip->ili_fields &
288 XFS_ILOG_DBROOT)); 227 XFS_ILOG_DBROOT));
289 iip->ili_fields &= ~XFS_ILOG_DBROOT; 228 iip->ili_fields &= ~XFS_ILOG_DBROOT;
290 } 229 }
291 break; 230 break;
292
293 case XFS_DINODE_FMT_LOCAL: 231 case XFS_DINODE_FMT_LOCAL:
294 iip->ili_fields &= 232 iip->ili_fields &=
295 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 233 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
296 XFS_ILOG_DEV | XFS_ILOG_UUID); 234 XFS_ILOG_DEV | XFS_ILOG_UUID);
297 if ((iip->ili_fields & XFS_ILOG_DDATA) && 235 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
298 ip->i_df.if_bytes > 0) { 236 ip->i_df.if_bytes > 0) {
299 ASSERT(ip->i_df.if_u1.if_data != NULL);
300 ASSERT(ip->i_d.di_size > 0);
301
302 vecp->i_addr = ip->i_df.if_u1.if_data;
303 /* 237 /*
304 * Round i_bytes up to a word boundary. 238 * Round i_bytes up to a word boundary.
305 * The underlying memory is guaranteed 239 * The underlying memory is guaranteed
306 * to be there by xfs_idata_realloc(). 240 * to be there by xfs_idata_realloc().
307 */ 241 */
308 data_bytes = roundup(ip->i_df.if_bytes, 4); 242 data_bytes = roundup(ip->i_df.if_bytes, 4);
309 ASSERT((ip->i_df.if_real_bytes == 0) || 243 ASSERT(ip->i_df.if_real_bytes == 0 ||
310 (ip->i_df.if_real_bytes == data_bytes)); 244 ip->i_df.if_real_bytes == data_bytes);
311 vecp->i_len = (int)data_bytes; 245 ASSERT(ip->i_df.if_u1.if_data != NULL);
312 vecp->i_type = XLOG_REG_TYPE_ILOCAL; 246 ASSERT(ip->i_d.di_size > 0);
313 vecp++; 247 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
314 nvecs++; 248 ip->i_df.if_u1.if_data, data_bytes);
315 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 249 ilf->ilf_dsize = (unsigned)data_bytes;
250 ilf->ilf_size++;
316 } else { 251 } else {
317 iip->ili_fields &= ~XFS_ILOG_DDATA; 252 iip->ili_fields &= ~XFS_ILOG_DDATA;
318 } 253 }
319 break; 254 break;
320
321 case XFS_DINODE_FMT_DEV: 255 case XFS_DINODE_FMT_DEV:
322 iip->ili_fields &= 256 iip->ili_fields &=
323 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 257 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
324 XFS_ILOG_DEXT | XFS_ILOG_UUID); 258 XFS_ILOG_DEXT | XFS_ILOG_UUID);
325 if (iip->ili_fields & XFS_ILOG_DEV) { 259 if (iip->ili_fields & XFS_ILOG_DEV)
326 iip->ili_format.ilf_u.ilfu_rdev = 260 ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
327 ip->i_df.if_u2.if_rdev;
328 }
329 break; 261 break;
330
331 case XFS_DINODE_FMT_UUID: 262 case XFS_DINODE_FMT_UUID:
332 iip->ili_fields &= 263 iip->ili_fields &=
333 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 264 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
334 XFS_ILOG_DEXT | XFS_ILOG_DEV); 265 XFS_ILOG_DEXT | XFS_ILOG_DEV);
335 if (iip->ili_fields & XFS_ILOG_UUID) { 266 if (iip->ili_fields & XFS_ILOG_UUID)
336 iip->ili_format.ilf_u.ilfu_uuid = 267 ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
337 ip->i_df.if_u2.if_uuid;
338 }
339 break; 268 break;
340
341 default: 269 default:
342 ASSERT(0); 270 ASSERT(0);
343 break; 271 break;
344 } 272 }
273}
345 274
346 /* 275STATIC void
347 * If there are no attributes associated with the file, then we're done. 276xfs_inode_item_format_attr_fork(
348 */ 277 struct xfs_inode_log_item *iip,
349 if (!XFS_IFORK_Q(ip)) { 278 struct xfs_inode_log_format *ilf,
350 iip->ili_fields &= 279 struct xfs_log_vec *lv,
351 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 280 struct xfs_log_iovec **vecp)
352 goto out; 281{
353 } 282 struct xfs_inode *ip = iip->ili_inode;
283 size_t data_bytes;
354 284
355 switch (ip->i_d.di_aformat) { 285 switch (ip->i_d.di_aformat) {
356 case XFS_DINODE_FMT_EXTENTS: 286 case XFS_DINODE_FMT_EXTENTS:
@@ -360,30 +290,22 @@ xfs_inode_item_format(
360 if ((iip->ili_fields & XFS_ILOG_AEXT) && 290 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
361 ip->i_d.di_anextents > 0 && 291 ip->i_d.di_anextents > 0 &&
362 ip->i_afp->if_bytes > 0) { 292 ip->i_afp->if_bytes > 0) {
293 struct xfs_bmbt_rec *p;
294
363 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == 295 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
364 ip->i_d.di_anextents); 296 ip->i_d.di_anextents);
365 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 297 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
366#ifdef XFS_NATIVE_HOST 298
367 /* 299 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
368 * There are not delayed allocation extents 300 data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
369 * for attributes, so just point at the array. 301 xlog_finish_iovec(lv, *vecp, data_bytes);
370 */ 302
371 vecp->i_addr = ip->i_afp->if_u1.if_extents; 303 ilf->ilf_asize = data_bytes;
372 vecp->i_len = ip->i_afp->if_bytes; 304 ilf->ilf_size++;
373 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
374#else
375 ASSERT(iip->ili_aextents_buf == NULL);
376 xfs_inode_item_format_extents(ip, vecp,
377 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
378#endif
379 iip->ili_format.ilf_asize = vecp->i_len;
380 vecp++;
381 nvecs++;
382 } else { 305 } else {
383 iip->ili_fields &= ~XFS_ILOG_AEXT; 306 iip->ili_fields &= ~XFS_ILOG_AEXT;
384 } 307 }
385 break; 308 break;
386
387 case XFS_DINODE_FMT_BTREE: 309 case XFS_DINODE_FMT_BTREE:
388 iip->ili_fields &= 310 iip->ili_fields &=
389 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 311 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -392,61 +314,89 @@ xfs_inode_item_format(
392 ip->i_afp->if_broot_bytes > 0) { 314 ip->i_afp->if_broot_bytes > 0) {
393 ASSERT(ip->i_afp->if_broot != NULL); 315 ASSERT(ip->i_afp->if_broot != NULL);
394 316
395 vecp->i_addr = ip->i_afp->if_broot; 317 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
396 vecp->i_len = ip->i_afp->if_broot_bytes; 318 ip->i_afp->if_broot,
397 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 319 ip->i_afp->if_broot_bytes);
398 vecp++; 320 ilf->ilf_asize = ip->i_afp->if_broot_bytes;
399 nvecs++; 321 ilf->ilf_size++;
400 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
401 } else { 322 } else {
402 iip->ili_fields &= ~XFS_ILOG_ABROOT; 323 iip->ili_fields &= ~XFS_ILOG_ABROOT;
403 } 324 }
404 break; 325 break;
405
406 case XFS_DINODE_FMT_LOCAL: 326 case XFS_DINODE_FMT_LOCAL:
407 iip->ili_fields &= 327 iip->ili_fields &=
408 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 328 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
409 329
410 if ((iip->ili_fields & XFS_ILOG_ADATA) && 330 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
411 ip->i_afp->if_bytes > 0) { 331 ip->i_afp->if_bytes > 0) {
412 ASSERT(ip->i_afp->if_u1.if_data != NULL);
413
414 vecp->i_addr = ip->i_afp->if_u1.if_data;
415 /* 332 /*
416 * Round i_bytes up to a word boundary. 333 * Round i_bytes up to a word boundary.
417 * The underlying memory is guaranteed 334 * The underlying memory is guaranteed
418 * to be there by xfs_idata_realloc(). 335 * to be there by xfs_idata_realloc().
419 */ 336 */
420 data_bytes = roundup(ip->i_afp->if_bytes, 4); 337 data_bytes = roundup(ip->i_afp->if_bytes, 4);
421 ASSERT((ip->i_afp->if_real_bytes == 0) || 338 ASSERT(ip->i_afp->if_real_bytes == 0 ||
422 (ip->i_afp->if_real_bytes == data_bytes)); 339 ip->i_afp->if_real_bytes == data_bytes);
423 vecp->i_len = (int)data_bytes; 340 ASSERT(ip->i_afp->if_u1.if_data != NULL);
424 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; 341 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
425 vecp++; 342 ip->i_afp->if_u1.if_data,
426 nvecs++; 343 data_bytes);
427 iip->ili_format.ilf_asize = (unsigned)data_bytes; 344 ilf->ilf_asize = (unsigned)data_bytes;
345 ilf->ilf_size++;
428 } else { 346 } else {
429 iip->ili_fields &= ~XFS_ILOG_ADATA; 347 iip->ili_fields &= ~XFS_ILOG_ADATA;
430 } 348 }
431 break; 349 break;
432
433 default: 350 default:
434 ASSERT(0); 351 ASSERT(0);
435 break; 352 break;
436 } 353 }
437
438out:
439 /*
440 * Now update the log format that goes out to disk from the in-core
441 * values. We always write the inode core to make the arithmetic
442 * games in recovery easier, which isn't a big deal as just about any
443 * transaction would dirty it anyway.
444 */
445 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
446 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
447 iip->ili_format.ilf_size = nvecs;
448} 354}
449 355
356/*
357 * This is called to fill in the vector of log iovecs for the given inode
358 * log item. It fills the first item with an inode log format structure,
359 * the second with the on-disk inode structure, and a possible third and/or
360 * fourth with the inode data/extents/b-tree root and inode attributes
361 * data/extents/b-tree root.
362 */
363STATIC void
364xfs_inode_item_format(
365 struct xfs_log_item *lip,
366 struct xfs_log_vec *lv)
367{
368 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
369 struct xfs_inode *ip = iip->ili_inode;
370 struct xfs_inode_log_format *ilf;
371 struct xfs_log_iovec *vecp = NULL;
372
373 ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
374 ilf->ilf_type = XFS_LI_INODE;
375 ilf->ilf_ino = ip->i_ino;
376 ilf->ilf_blkno = ip->i_imap.im_blkno;
377 ilf->ilf_len = ip->i_imap.im_len;
378 ilf->ilf_boffset = ip->i_imap.im_boffset;
379 ilf->ilf_fields = XFS_ILOG_CORE;
380 ilf->ilf_size = 2; /* format + core */
381 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
382
383 if (ip->i_d.di_version == 1)
384 xfs_inode_item_format_v1_inode(ip);
385 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
386 &ip->i_d,
387 xfs_icdinode_size(ip->i_d.di_version));
388
389 xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
390 if (XFS_IFORK_Q(ip)) {
391 xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
392 } else {
393 iip->ili_fields &=
394 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
395 }
396
397 /* update the format with the exact fields we actually logged */
398 ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
399}
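The refactored format routine keeps its bookkeeping in the log-format header it prepares as the first iovec. Reduced to a skeleton (an outline of the hunk above, not compilable on its own):

	ilf->ilf_fields = XFS_ILOG_CORE;
	ilf->ilf_size = 2;		/* format header + inode core */

	/* each fork helper that emits an extra iovec then does: */
	ilf->ilf_dsize = data_bytes;	/* ilf_asize for the attr fork */
	ilf->ilf_size++;

	/* and the final step records only what was actually logged: */
	ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);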
450 400
451/* 401/*
452 * This is called to pin the inode associated with the inode log 402 * This is called to pin the inode associated with the inode log
@@ -563,27 +513,6 @@ xfs_inode_item_unlock(
563 ASSERT(ip->i_itemp != NULL); 513 ASSERT(ip->i_itemp != NULL);
564 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
565 515
566 /*
567 * If the inode needed a separate buffer with which to log
568 * its extents, then free it now.
569 */
570 if (iip->ili_extents_buf != NULL) {
571 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
572 ASSERT(ip->i_d.di_nextents > 0);
573 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
574 ASSERT(ip->i_df.if_bytes > 0);
575 kmem_free(iip->ili_extents_buf);
576 iip->ili_extents_buf = NULL;
577 }
578 if (iip->ili_aextents_buf != NULL) {
579 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
580 ASSERT(ip->i_d.di_anextents > 0);
581 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
582 ASSERT(ip->i_afp->if_bytes > 0);
583 kmem_free(iip->ili_aextents_buf);
584 iip->ili_aextents_buf = NULL;
585 }
586
587 lock_flags = iip->ili_lock_flags; 516 lock_flags = iip->ili_lock_flags;
588 iip->ili_lock_flags = 0; 517 iip->ili_lock_flags = 0;
589 if (lock_flags) 518 if (lock_flags)
@@ -670,11 +599,6 @@ xfs_inode_item_init(
670 iip->ili_inode = ip; 599 iip->ili_inode = ip;
671 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 600 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
672 &xfs_inode_item_ops); 601 &xfs_inode_item_ops);
673 iip->ili_format.ilf_type = XFS_LI_INODE;
674 iip->ili_format.ilf_ino = ip->i_ino;
675 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
676 iip->ili_format.ilf_len = ip->i_imap.im_len;
677 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
678} 602}
679 603
680/* 604/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index dce4d656768c..488d81254e28 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item {
34 unsigned short ili_logged; /* flushed logged data */ 34 unsigned short ili_logged; /* flushed logged data */
35 unsigned int ili_last_fields; /* fields when flushed */ 35 unsigned int ili_last_fields; /* fields when flushed */
36 unsigned int ili_fields; /* fields to be logged */ 36 unsigned int ili_fields; /* fields to be logged */
37 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
38 data exts */
39 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
40 attr exts */
41 xfs_inode_log_format_t ili_format; /* logged structure */
42} xfs_inode_log_item_t; 37} xfs_inode_log_item_t;
43 38
44static inline int xfs_inode_clean(xfs_inode_t *ip) 39static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 33ad9a77791f..bcfe61202115 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -112,15 +112,11 @@ xfs_find_handle(
112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); 112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
113 hsize = sizeof(xfs_fsid_t); 113 hsize = sizeof(xfs_fsid_t);
114 } else { 114 } else {
115 int lock_mode;
116
117 lock_mode = xfs_ilock_map_shared(ip);
118 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 115 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
119 sizeof(handle.ha_fid.fid_len); 116 sizeof(handle.ha_fid.fid_len);
120 handle.ha_fid.fid_pad = 0; 117 handle.ha_fid.fid_pad = 0;
121 handle.ha_fid.fid_gen = ip->i_d.di_gen; 118 handle.ha_fid.fid_gen = ip->i_d.di_gen;
122 handle.ha_fid.fid_ino = ip->i_ino; 119 handle.ha_fid.fid_ino = ip->i_ino;
123 xfs_iunlock_map_shared(ip, lock_mode);
124 120
125 hsize = XFS_HSIZE(handle); 121 hsize = XFS_HSIZE(handle);
126 } 122 }
@@ -1587,7 +1583,7 @@ xfs_file_ioctl(
1587 XFS_IS_REALTIME_INODE(ip) ? 1583 XFS_IS_REALTIME_INODE(ip) ?
1588 mp->m_rtdev_targp : mp->m_ddev_targp; 1584 mp->m_rtdev_targp : mp->m_ddev_targp;
1589 1585
1590 da.d_mem = da.d_miniosz = 1 << target->bt_sshift; 1586 da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
1591 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); 1587 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
1592 1588
1593 if (copy_to_user(arg, &da, sizeof(da))) 1589 if (copy_to_user(arg, &da, sizeof(da)))
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 104455b8046c..9ddfb8190ca1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -123,7 +123,7 @@ xfs_vn_mknod(
123{ 123{
124 struct inode *inode; 124 struct inode *inode;
125 struct xfs_inode *ip = NULL; 125 struct xfs_inode *ip = NULL;
126 struct posix_acl *default_acl = NULL; 126 struct posix_acl *default_acl, *acl;
127 struct xfs_name name; 127 struct xfs_name name;
128 int error; 128 int error;
129 129
@@ -139,14 +139,9 @@ xfs_vn_mknod(
139 rdev = 0; 139 rdev = 0;
140 } 140 }
141 141
142 if (IS_POSIXACL(dir)) { 142 error = posix_acl_create(dir, &mode, &default_acl, &acl);
143 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); 143 if (error)
144 if (IS_ERR(default_acl)) 144 return error;
145 return PTR_ERR(default_acl);
146
147 if (!default_acl)
148 mode &= ~current_umask();
149 }
150 145
151 xfs_dentry_to_name(&name, dentry, mode); 146 xfs_dentry_to_name(&name, dentry, mode);
152 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 147 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
@@ -159,22 +154,30 @@ xfs_vn_mknod(
159 if (unlikely(error)) 154 if (unlikely(error))
160 goto out_cleanup_inode; 155 goto out_cleanup_inode;
161 156
157#ifdef CONFIG_XFS_POSIX_ACL
162 if (default_acl) { 158 if (default_acl) {
163 error = -xfs_inherit_acl(inode, default_acl); 159 error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
164 default_acl = NULL; 160 if (error)
165 if (unlikely(error))
166 goto out_cleanup_inode; 161 goto out_cleanup_inode;
167 } 162 }
168 163 if (acl) {
164 error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
165 if (error)
166 goto out_cleanup_inode;
167 }
168#endif
169 169
170 d_instantiate(dentry, inode); 170 d_instantiate(dentry, inode);
171 out_free_acl:
172 if (default_acl)
173 posix_acl_release(default_acl);
174 if (acl)
175 posix_acl_release(acl);
171 return -error; 176 return -error;
172 177
173 out_cleanup_inode: 178 out_cleanup_inode:
174 xfs_cleanup_inode(dir, inode, dentry); 179 xfs_cleanup_inode(dir, inode, dentry);
175 out_free_acl: 180 goto out_free_acl;
176 posix_acl_release(default_acl);
177 return -error;
178} 181}
179 182
180STATIC int 183STATIC int
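For context, the hunk above moves XFS onto the generic VFS ACL helpers. A minimal sketch of the posix_acl_create() pattern it adopts (error paths trimmed; posix_acl_release() tolerates NULL, so both pointers can be released unconditionally):

	struct posix_acl *default_acl, *acl;
	int error;

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		return error;

	/* ... create the inode with the possibly umask-adjusted mode ... */

	if (default_acl)
		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	if (!error && acl)
		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);

	posix_acl_release(default_acl);
	posix_acl_release(acl);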
@@ -391,18 +394,6 @@ xfs_vn_follow_link(
391 return NULL; 394 return NULL;
392} 395}
393 396
394STATIC void
395xfs_vn_put_link(
396 struct dentry *dentry,
397 struct nameidata *nd,
398 void *p)
399{
400 char *s = nd_get_link(nd);
401
402 if (!IS_ERR(s))
403 kfree(s);
404}
405
406STATIC int 397STATIC int
407xfs_vn_getattr( 398xfs_vn_getattr(
408 struct vfsmount *mnt, 399 struct vfsmount *mnt,
@@ -459,14 +450,12 @@ xfs_vn_getattr(
459 450
460static void 451static void
461xfs_setattr_mode( 452xfs_setattr_mode(
462 struct xfs_trans *tp,
463 struct xfs_inode *ip, 453 struct xfs_inode *ip,
464 struct iattr *iattr) 454 struct iattr *iattr)
465{ 455{
466 struct inode *inode = VFS_I(ip); 456 struct inode *inode = VFS_I(ip);
467 umode_t mode = iattr->ia_mode; 457 umode_t mode = iattr->ia_mode;
468 458
469 ASSERT(tp);
470 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 459 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
471 460
472 ip->i_d.di_mode &= S_IFMT; 461 ip->i_d.di_mode &= S_IFMT;
@@ -476,6 +465,32 @@ xfs_setattr_mode(
476 inode->i_mode |= mode & ~S_IFMT; 465 inode->i_mode |= mode & ~S_IFMT;
477} 466}
478 467
468static void
469xfs_setattr_time(
470 struct xfs_inode *ip,
471 struct iattr *iattr)
472{
473 struct inode *inode = VFS_I(ip);
474
475 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
476
477 if (iattr->ia_valid & ATTR_ATIME) {
478 inode->i_atime = iattr->ia_atime;
479 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
480 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
481 }
482 if (iattr->ia_valid & ATTR_CTIME) {
483 inode->i_ctime = iattr->ia_ctime;
484 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
485 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
486 }
487 if (iattr->ia_valid & ATTR_MTIME) {
488 inode->i_mtime = iattr->ia_mtime;
489 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
490 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
491 }
492}
493
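With xfs_setattr_time() factored out, both setattr paths collapse their mode and timestamp handling to the same two calls; a sketch of the consolidated call site as it appears in the hunks below:

	if (mask & ATTR_MODE)
		xfs_setattr_mode(ip, iattr);
	if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
		xfs_setattr_time(ip, iattr);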
479int 494int
480xfs_setattr_nonsize( 495xfs_setattr_nonsize(
481 struct xfs_inode *ip, 496 struct xfs_inode *ip,
@@ -630,30 +645,10 @@ xfs_setattr_nonsize(
630 } 645 }
631 } 646 }
632 647
633 /*
634 * Change file access modes.
635 */
636 if (mask & ATTR_MODE) 648 if (mask & ATTR_MODE)
637 xfs_setattr_mode(tp, ip, iattr); 649 xfs_setattr_mode(ip, iattr);
638 650 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
639 /* 651 xfs_setattr_time(ip, iattr);
640 * Change file access or modified times.
641 */
642 if (mask & ATTR_ATIME) {
643 inode->i_atime = iattr->ia_atime;
644 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
645 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
646 }
647 if (mask & ATTR_CTIME) {
648 inode->i_ctime = iattr->ia_ctime;
649 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
650 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
651 }
652 if (mask & ATTR_MTIME) {
653 inode->i_mtime = iattr->ia_mtime;
654 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
655 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
656 }
657 652
658 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 653 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
659 654
@@ -684,7 +679,7 @@ xfs_setattr_nonsize(
684 * Posix ACL code seems to care about this issue either. 679 * Posix ACL code seems to care about this issue either.
685 */ 680 */
686 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { 681 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
687 error = -xfs_acl_chmod(inode); 682 error = -posix_acl_chmod(inode, inode->i_mode);
688 if (error) 683 if (error)
689 return XFS_ERROR(error); 684 return XFS_ERROR(error);
690 } 685 }
@@ -710,7 +705,6 @@ xfs_setattr_size(
710{ 705{
711 struct xfs_mount *mp = ip->i_mount; 706 struct xfs_mount *mp = ip->i_mount;
712 struct inode *inode = VFS_I(ip); 707 struct inode *inode = VFS_I(ip);
713 int mask = iattr->ia_valid;
714 xfs_off_t oldsize, newsize; 708 xfs_off_t oldsize, newsize;
715 struct xfs_trans *tp; 709 struct xfs_trans *tp;
716 int error; 710 int error;
@@ -731,8 +725,8 @@ xfs_setattr_size(
731 725
732 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 726 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
733 ASSERT(S_ISREG(ip->i_d.di_mode)); 727 ASSERT(S_ISREG(ip->i_d.di_mode));
734 ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 728 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
735 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 729 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
736 730
737 oldsize = inode->i_size; 731 oldsize = inode->i_size;
738 newsize = iattr->ia_size; 732 newsize = iattr->ia_size;
@@ -741,7 +735,7 @@ xfs_setattr_size(
741 * Short circuit the truncate case for zero length files. 735 * Short circuit the truncate case for zero length files.
742 */ 736 */
743 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { 737 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
744 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 738 if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
745 return 0; 739 return 0;
746 740
747 /* 741 /*
@@ -829,10 +823,11 @@ xfs_setattr_size(
829 * these flags set. For all other operations the VFS set these flags 823 * these flags set. For all other operations the VFS set these flags
830 * explicitly if it wants a timestamp update. 824 * explicitly if it wants a timestamp update.
831 */ 825 */
832 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { 826 if (newsize != oldsize &&
827 !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
833 iattr->ia_ctime = iattr->ia_mtime = 828 iattr->ia_ctime = iattr->ia_mtime =
834 current_fs_time(inode->i_sb); 829 current_fs_time(inode->i_sb);
835 mask |= ATTR_CTIME | ATTR_MTIME; 830 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
836 } 831 }
837 832
838 /* 833 /*
@@ -868,22 +863,10 @@ xfs_setattr_size(
868 xfs_inode_clear_eofblocks_tag(ip); 863 xfs_inode_clear_eofblocks_tag(ip);
869 } 864 }
870 865
871 /* 866 if (iattr->ia_valid & ATTR_MODE)
872 * Change file access modes. 867 xfs_setattr_mode(ip, iattr);
873 */ 868 if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
874 if (mask & ATTR_MODE) 869 xfs_setattr_time(ip, iattr);
875 xfs_setattr_mode(tp, ip, iattr);
876
877 if (mask & ATTR_CTIME) {
878 inode->i_ctime = iattr->ia_ctime;
879 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
880 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
881 }
882 if (mask & ATTR_MTIME) {
883 inode->i_mtime = iattr->ia_mtime;
884 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
885 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
886 }
887 870
888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 871 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889 872
@@ -1053,6 +1036,7 @@ xfs_vn_fiemap(
1053 1036
1054static const struct inode_operations xfs_inode_operations = { 1037static const struct inode_operations xfs_inode_operations = {
1055 .get_acl = xfs_get_acl, 1038 .get_acl = xfs_get_acl,
1039 .set_acl = xfs_set_acl,
1056 .getattr = xfs_vn_getattr, 1040 .getattr = xfs_vn_getattr,
1057 .setattr = xfs_vn_setattr, 1041 .setattr = xfs_vn_setattr,
1058 .setxattr = generic_setxattr, 1042 .setxattr = generic_setxattr,
@@ -1080,6 +1064,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1080 .mknod = xfs_vn_mknod, 1064 .mknod = xfs_vn_mknod,
1081 .rename = xfs_vn_rename, 1065 .rename = xfs_vn_rename,
1082 .get_acl = xfs_get_acl, 1066 .get_acl = xfs_get_acl,
1067 .set_acl = xfs_set_acl,
1083 .getattr = xfs_vn_getattr, 1068 .getattr = xfs_vn_getattr,
1084 .setattr = xfs_vn_setattr, 1069 .setattr = xfs_vn_setattr,
1085 .setxattr = generic_setxattr, 1070 .setxattr = generic_setxattr,
@@ -1106,6 +1091,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1106 .mknod = xfs_vn_mknod, 1091 .mknod = xfs_vn_mknod,
1107 .rename = xfs_vn_rename, 1092 .rename = xfs_vn_rename,
1108 .get_acl = xfs_get_acl, 1093 .get_acl = xfs_get_acl,
1094 .set_acl = xfs_set_acl,
1109 .getattr = xfs_vn_getattr, 1095 .getattr = xfs_vn_getattr,
1110 .setattr = xfs_vn_setattr, 1096 .setattr = xfs_vn_setattr,
1111 .setxattr = generic_setxattr, 1097 .setxattr = generic_setxattr,
@@ -1118,8 +1104,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1118static const struct inode_operations xfs_symlink_inode_operations = { 1104static const struct inode_operations xfs_symlink_inode_operations = {
1119 .readlink = generic_readlink, 1105 .readlink = generic_readlink,
1120 .follow_link = xfs_vn_follow_link, 1106 .follow_link = xfs_vn_follow_link,
1121 .put_link = xfs_vn_put_link, 1107 .put_link = kfree_put_link,
1122 .get_acl = xfs_get_acl,
1123 .getattr = xfs_vn_getattr, 1108 .getattr = xfs_vn_getattr,
1124 .setattr = xfs_vn_setattr, 1109 .setattr = xfs_vn_setattr,
1125 .setxattr = generic_setxattr, 1110 .setxattr = generic_setxattr,
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index d2c5057b5cc4..1c34e4335920 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -30,7 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
30/* 30/*
31 * Internal setattr interfaces. 31 * Internal setattr interfaces.
32 */ 32 */
33#define XFS_ATTR_NOACL 0x01 /* Don't call xfs_acl_chmod */ 33#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
34 34
35extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, 35extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
36 int flags); 36 int flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c237ad15d500..f46338285152 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -209,9 +209,8 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
-	int			nbcluster; /* # of blocks in a cluster */
-	int			nicluster; /* # of inodes in a cluster */
-	int			nimask;	/* mask for inode clusters */
+	int			blks_per_cluster; /* # of blocks per cluster */
+	int			inodes_per_cluster;/* # of inodes per cluster */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			tmp;	/* result value from btree calls */
@@ -243,11 +242,8 @@ xfs_bulkstat(
 	*done = 0;
 	fmterror = 0;
 	ubufp = ubuffer;
-	nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
-		mp->m_sb.sb_inopblock :
-		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
-	nimask = ~(nicluster - 1);
-	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
 		return ENOMEM;
@@ -390,12 +386,12 @@ xfs_bulkstat(
 			agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
 			for (chunkidx = 0;
 			     chunkidx < XFS_INODES_PER_CHUNK;
-			     chunkidx += nicluster,
-			     agbno += nbcluster) {
-				if (xfs_inobt_maskn(chunkidx, nicluster)
-				    & ~r.ir_free)
+			     chunkidx += inodes_per_cluster,
+			     agbno += blks_per_cluster) {
+				if (xfs_inobt_maskn(chunkidx,
+						inodes_per_cluster) & ~r.ir_free)
 					xfs_btree_reada_bufs(mp, agno,
-						agbno, nbcluster,
+						agbno, blks_per_cluster,
 						&xfs_inode_buf_ops);
 			}
 			blk_finish_plug(&plug);
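
A worked example of the cluster geometry arithmetic used above, as a minimal standalone C sketch. The struct and helper names here are invented stand-ins for the xfs_mount fields and xfs_icluster_size_fsb(); the numbers (4 KiB blocks, 8 KiB inode clusters, 512-byte inodes) are illustrative only:

	#include <stdio.h>

	/* Invented stand-in for the handful of xfs_mount fields used above. */
	struct geom {
		unsigned int blocksize;		/* sb_blocksize */
		unsigned int blocklog;		/* log2(blocksize) */
		unsigned int inopblog;		/* log2(inodes per block) */
		unsigned int inode_cluster;	/* m_inode_cluster_size */
	};

	/* Mirrors what xfs_icluster_size_fsb() computes: the inode cluster
	 * size in filesystem blocks, never less than a single block. */
	static unsigned int icluster_size_fsb(const struct geom *g)
	{
		if (g->blocksize >= g->inode_cluster)
			return 1;
		return g->inode_cluster >> g->blocklog;
	}

	int main(void)
	{
		/* 4 KiB blocks, 8 KiB clusters, 512 B inodes -> 8 inodes/block. */
		struct geom g = { 4096, 12, 3, 8192 };
		unsigned int blks_per_cluster = icluster_size_fsb(&g);
		unsigned int inodes_per_cluster = blks_per_cluster << g.inopblog;

		/* Prints blks_per_cluster=2 inodes_per_cluster=16. */
		printf("blks_per_cluster=%u inodes_per_cluster=%u\n",
		       blks_per_cluster, inodes_per_cluster);
		return 0;
	}
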
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index e148719e0a5d..b0f4ef77fa70 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,6 +30,52 @@ struct xfs_log_vec {
 
 #define XFS_LOG_VEC_ORDERED	(-1)
 
+static inline void *
+xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type)
+{
+	struct xfs_log_iovec *vec = *vecp;
+
+	if (vec) {
+		ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+		vec++;
+	} else {
+		vec = &lv->lv_iovecp[0];
+	}
+
+	vec->i_type = type;
+	vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+	ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+
+	*vecp = vec;
+	return vec->i_addr;
+}
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
+{
+	/*
+	 * We need to make sure the next buffer is naturally aligned for the
+	 * biggest basic data type we put into it. We already accounted for
+	 * this when sizing the buffer.
+	 */
+	lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+	vec->i_len = len;
+}
+
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type, void *data, int len)
+{
+	void *buf;
+
+	buf = xlog_prepare_iovec(lv, vecp, type);
+	memcpy(buf, data, len);
+	xlog_finish_iovec(lv, *vecp, len);
+	return buf;
+}
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
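
For context, a hedged sketch of how a converted log item's ->iop_format might use these helpers. The item type, the FOO_ITEM() accessor, the fli_* fields and the XLOG_REG_TYPE_FOO_* region types are all invented for illustration; only the helper calls themselves match the signatures added above:

	/* Hypothetical two-region item: a fixed header plus a payload. */
	static void
	xfs_foo_item_format(
		struct xfs_log_item	*lip,
		struct xfs_log_vec	*lv)
	{
		struct xfs_foo_log_item	*fip = FOO_ITEM(lip);	/* invented */
		struct xfs_log_iovec	*vecp = NULL;
		void			*buf;

		/* Copy the fixed in-core format structure into region one. */
		xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_FOO_FORMAT,
				&fip->fli_format, sizeof(fip->fli_format));

		/*
		 * Format the variable-length payload in place. Note that
		 * xlog_finish_iovec() records the real length but rounds the
		 * buffer usage up to 8 bytes, so the next region starts
		 * naturally aligned.
		 */
		buf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_FOO_DATA);
		memcpy(buf, fip->fli_data, fip->fli_len);
		xlog_finish_iovec(lv, vecp, fip->fli_len);
	}
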
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5eb51fc5eb84..4ef6fdbced78 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -82,36 +82,6 @@ xlog_cil_init_post_recovery(
 			log->l_curr_block);
 }
 
-STATIC int
-xlog_cil_lv_item_format(
-	struct xfs_log_item	*lip,
-	struct xfs_log_vec	*lv)
-{
-	int	index;
-	char	*ptr;
-
-	/* format new vectors into array */
-	lip->li_ops->iop_format(lip, lv->lv_iovecp);
-
-	/* copy data into existing array */
-	ptr = lv->lv_buf;
-	for (index = 0; index < lv->lv_niovecs; index++) {
-		struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
-
-		memcpy(ptr, vec->i_addr, vec->i_len);
-		vec->i_addr = ptr;
-		ptr += vec->i_len;
-	}
-
-	/*
-	 * some size calculations for log vectors over-estimate, so the caller
-	 * doesn't know the amount of space actually used by the item. Return
-	 * the byte count to the caller so they can check and store it
-	 * appropriately.
-	 */
-	return ptr - lv->lv_buf;
-}
-
 /*
  * Prepare the log item for insertion into the CIL. Calculate the difference in
  * log space and vectors it will consume, and if it is a new item pin it as
@@ -232,12 +202,28 @@ xlog_cil_insert_format_items(
 			nbytes = 0;
 		}
 
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned. We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+
 		/* grab the old item if it exists for reservation accounting */
 		old_lv = lip->li_lv;
 
-		/* calc buffer size */
-		buf_size = sizeof(struct xfs_log_vec) + nbytes +
-				niovecs * sizeof(struct xfs_log_iovec);
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes +
+			round_up((sizeof(struct xfs_log_vec) +
+				  niovecs * sizeof(struct xfs_log_iovec)),
+				 sizeof(uint64_t));
 
 		/* compare to existing item size */
 		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
@@ -254,34 +240,29 @@ xlog_cil_insert_format_items(
 		 */
 		*diff_iovecs -= lv->lv_niovecs;
 		*diff_len -= lv->lv_buf_len;
-
-		/* Ensure the lv is set up according to ->iop_size */
-		lv->lv_niovecs = niovecs;
-		lv->lv_buf = (char *)lv + buf_size - nbytes;
-
-		lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
-		goto insert;
+	} else {
+		/* allocate new data chunk */
+		lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+		lv->lv_item = lip;
+		lv->lv_size = buf_size;
+		if (ordered) {
+			/* track as an ordered logvec */
+			ASSERT(lip->li_lv == NULL);
+			lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			goto insert;
+		}
+		lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 	}
 
-	/* allocate new data chunk */
-	lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
-	lv->lv_item = lip;
-	lv->lv_size = buf_size;
+	/* Ensure the lv is set up according to ->iop_size */
 	lv->lv_niovecs = niovecs;
-	if (ordered) {
-		/* track as an ordered logvec */
-		ASSERT(lip->li_lv == NULL);
-		lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
-		goto insert;
-	}
-
-	/* The allocated iovec region lies beyond the log vector. */
-	lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 
 	/* The allocated data region lies beyond the iovec region */
+	lv->lv_buf_len = 0;
 	lv->lv_buf = (char *)lv + buf_size - nbytes;
+	ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
 
-	lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+	lip->li_ops->iop_format(lip, lv);
 insert:
 	ASSERT(lv->lv_buf_len <= nbytes);
 	xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
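
The new size accounting can be checked with a small standalone C sketch. The struct layouts are simplified stand-ins for the real XFS headers, and the example numbers (two iovecs, 52 payload bytes) are invented:

	#include <stdio.h>
	#include <stdint.h>

	#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

	/* Simplified stand-ins for the real structures in the XFS headers. */
	struct iovec_hdr { void *i_addr; int i_len; uint32_t i_type; };
	struct vec_hdr	 { void *next; struct iovec_hdr *iovecp; char *buf;
			   int niovecs; int size; int buf_len; void *item; };

	int main(void)
	{
		int niovecs = 2;
		int nbytes = 52;	/* raw bytes reported by ->iop_size, say */
		int buf_size;

		/* Slack for rounding each iovec, then align the total:
		 * 52 + 2*8 = 68, rounded up to 72. */
		nbytes += niovecs * sizeof(uint64_t);
		nbytes = round_up(nbytes, sizeof(uint64_t));

		/* Header area rounded up so the data buffer that follows it
		 * starts 64-bit aligned. */
		buf_size = nbytes +
			round_up(sizeof(struct vec_hdr) +
				 niovecs * sizeof(struct iovec_hdr),
				 sizeof(uint64_t));

		printf("nbytes=%d buf_size=%d\n", nbytes, buf_size);
		return 0;
	}
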
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index eae16920655b..bce53ac81096 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1654,6 +1654,7 @@ xlog_recover_reorder_trans(
 	int			pass)
 {
 	xlog_recover_item_t	*item, *n;
+	int			error = 0;
 	LIST_HEAD(sort_list);
 	LIST_HEAD(cancel_list);
 	LIST_HEAD(buffer_list);
@@ -1695,9 +1696,17 @@ xlog_recover_reorder_trans(
 				"%s: unrecognized type of log operation",
 				__func__);
 			ASSERT(0);
-			return XFS_ERROR(EIO);
+			/*
+			 * return the remaining items back to the transaction
+			 * item list so they can be freed in caller.
+			 */
+			if (!list_empty(&sort_list))
+				list_splice_init(&sort_list, &trans->r_itemq);
+			error = XFS_ERROR(EIO);
+			goto out;
 		}
 	}
+out:
 	ASSERT(list_empty(&sort_list));
 	if (!list_empty(&buffer_list))
 		list_splice(&buffer_list, &trans->r_itemq);
@@ -1707,7 +1716,7 @@ xlog_recover_reorder_trans(
 		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
 	if (!list_empty(&cancel_list))
 		list_splice_tail(&cancel_list, &trans->r_itemq);
-	return 0;
+	return error;
 }
 
 /*
@@ -2517,19 +2526,19 @@ xlog_recover_buffer_pass2(
 	 *
 	 * Also make sure that only inode buffers with good sizes stay in
 	 * the buffer cache. The kernel moves inodes in buffers of 1 block
-	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+	 * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
 	 * buffers in the log can be a different size if the log was generated
 	 * by an older kernel using unclustered inode buffers or a newer kernel
 	 * running with a different inode cluster size. Regardless, if the
-	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
-	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
 	 * the buffer out of the buffer cache so that the buffer won't
 	 * overlap with future reads of those inodes.
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
 	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
-			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
@@ -3202,10 +3211,10 @@ xlog_recover_do_icreate_pass2(
 	}
 
 	/* existing allocation is fixed value */
-	ASSERT(count == XFS_IALLOC_INODES(mp));
-	ASSERT(length == XFS_IALLOC_BLOCKS(mp));
-	if (count != XFS_IALLOC_INODES(mp) ||
-	     length != XFS_IALLOC_BLOCKS(mp)) {
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
 		return EINVAL;
 	}
@@ -3611,8 +3620,10 @@ xlog_recover_process_data(
 				error = XFS_ERROR(EIO);
 				break;
 			}
-			if (error)
+			if (error) {
+				xlog_recover_free_trans(trans);
 				return error;
+			}
 		}
 		dp += be32_to_cpu(ohead->oh_len);
 		num_logops--;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 02df7b408a26..f96c05669a9e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -282,22 +282,29 @@ xfs_readsb(
 	struct xfs_sb	*sbp = &mp->m_sb;
 	int		error;
 	int		loud = !(flags & XFS_MFSI_QUIET);
+	const struct xfs_buf_ops *buf_ops;
 
 	ASSERT(mp->m_sb_bp == NULL);
 	ASSERT(mp->m_ddev_targp != NULL);
 
 	/*
+	 * For the initial read, we must guess at the sector
+	 * size based on the block device. It's enough to
+	 * get the sb_sectsize out of the superblock and
+	 * then reread with the proper length.
+	 * We don't verify it yet, because it may not be complete.
+	 */
+	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+	buf_ops = NULL;
+
+	/*
 	 * Allocate a (locked) buffer to hold the superblock.
 	 * This will be kept around at all times to optimize
 	 * access to the superblock.
 	 */
-	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-
 reread:
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				   BTOBB(sector_size), 0,
-				   loud ? &xfs_sb_buf_ops
-					: &xfs_sb_quiet_buf_ops);
+				   BTOBB(sector_size), 0, buf_ops);
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
@@ -328,12 +335,13 @@ reread:
 	}
 
 	/*
-	 * If device sector size is smaller than the superblock size,
-	 * re-read the superblock so the buffer is correctly sized.
+	 * Re-read the superblock so the buffer is correctly sized,
+	 * and properly verified.
 	 */
-	if (sector_size < sbp->sb_sectsize) {
+	if (buf_ops == NULL) {
 		xfs_buf_relse(bp);
 		sector_size = sbp->sb_sectsize;
+		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
 		goto reread;
 	}
 
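
The reread loop above now uses buf_ops as the "second pass" flag: pass one reads with a guessed sector size and no verifier, pass two rereads with the real sb_sectsize and a verifier attached. A minimal standalone C sketch of that pattern; every type and helper here (guess_sector_size(), read_sb(), the *_sb_ops tables) is a hypothetical stand-in for the real XFS buffer interfaces:

	#include <stdio.h>
	#include <stdlib.h>

	struct buf_ops { const char *name; };
	struct buf { int size; };
	struct mount { int sb_sectsize; };

	static const struct buf_ops verbose_sb_ops = { "verbose" };
	static const struct buf_ops quiet_sb_ops = { "quiet" };

	/* Pretend the device advertises 512-byte sectors... */
	static int guess_sector_size(struct mount *mp) { (void)mp; return 512; }

	/* ...but the superblock was actually written with 4096-byte sectors. */
	static struct buf *read_sb(struct mount *mp, int size,
				   const struct buf_ops *ops)
	{
		struct buf *bp = malloc(sizeof(*bp));
		bp->size = size;
		printf("read %d bytes, verifier=%s\n",
		       size, ops ? ops->name : "none");
		mp->sb_sectsize = 4096;	/* stands in for parsing the sb */
		return bp;
	}

	static int read_superblock(struct mount *mp, int loud)
	{
		const struct buf_ops *buf_ops = NULL;	/* pass 1: no verifier */
		int sector_size = guess_sector_size(mp);
		struct buf *bp;

	reread:
		bp = read_sb(mp, sector_size, buf_ops);
		if (!bp)
			return -5;
		if (buf_ops == NULL) {
			/* Pass 2: correct size, and this time verify it. */
			free(bp);
			sector_size = mp->sb_sectsize;
			buf_ops = loud ? &verbose_sb_ops : &quiet_sb_ops;
			goto reread;
		}
		free(bp);
		return 0;
	}

	int main(void)
	{
		struct mount mp;
		return read_superblock(&mp, 1);
	}
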
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index dd88f0e27bd8..348e4d2ed6e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1222,16 +1222,18 @@ xfs_qm_dqiterate(
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	do {
+		uint		lock_mode;
+
 		nmaps = XFS_DQITER_MAP_SIZE;
 		/*
 		 * We aren't changing the inode itself. Just changing
 		 * some of its data. No new blocks are added here, and
 		 * the inode is never added to the transaction.
 		 */
-		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		lock_mode = xfs_ilock_data_map_shared(qip);
 		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
 				       map, &nmaps, 0);
-		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		xfs_iunlock(qip, lock_mode);
 		if (error)
 			break;
 
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index a788b66a5cb1..797fd4636273 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
 
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
 
 struct xfs_inode;
 
 extern struct kmem_zone	*xfs_qm_dqtrxzone;
 
 /*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	!dqp->q_core.d_blk_hardlimit && \
+	!dqp->q_core.d_blk_softlimit && \
+	!dqp->q_core.d_rtb_hardlimit && \
+	!dqp->q_core.d_rtb_softlimit && \
+	!dqp->q_core.d_ino_hardlimit && \
+	!dqp->q_core.d_ino_softlimit && \
+	!dqp->q_core.d_bcount && \
+	!dqp->q_core.d_rtbcount && \
+	!dqp->q_core.d_icount)
+
+/*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
  * (136 * 30 = 4080) dquots. It's probably not worth trying to make
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 437c9198031a..3daf5ea1eb8d 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = 0, error2 = 0;
+	int		error;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
@@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles(
 		return XFS_ERROR(EINVAL);
 	}
 
-	if (flags & XFS_DQ_USER)
+	if (flags & XFS_DQ_USER) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-	if (flags & XFS_DQ_GROUP)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_GROUP) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
 	if (flags & XFS_DQ_PROJ)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
-	return error ? error : error2;
+	return error;
 }
 
 /*
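
The rewrite above matters because the old version funneled both the group and project truncations through the same error2 variable, so a group failure could be silently overwritten by a later project success. A small standalone C sketch of the fixed flow, with an invented trunc() that fails for the group file:

	#include <stdio.h>

	#define DQ_USER		0x1
	#define DQ_GROUP	0x2
	#define DQ_PROJ		0x4

	/* Hypothetical per-file truncation; here the group file fails. */
	static int trunc(int type)
	{
		return type == DQ_GROUP ? -5 : 0;
	}

	static int trunc_qfiles(unsigned int flags)
	{
		int error;

		if (flags & DQ_USER) {
			error = trunc(DQ_USER);
			if (error)
				return error;
		}
		if (flags & DQ_GROUP) {
			error = trunc(DQ_GROUP);
			if (error)
				return error;	/* old error2 could lose this */
		}
		if (flags & DQ_PROJ)
			return trunc(DQ_PROJ);
		return 0;
	}

	int main(void)
	{
		/* Prints -5; the pre-patch logic would have returned 0 here,
		 * because the project pass overwrote error2 with 0. */
		printf("%d\n", trunc_qfiles(DQ_USER | DQ_GROUP | DQ_PROJ));
		return 0;
	}
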
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93da..000000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE	10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-				  (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index b7c9aea77f8f..1e116794bb66 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -295,8 +295,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_dblocks == 0 ||
 	    sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
 	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-		XFS_CORRUPTION_ERROR("SB sanity check failed",
-				XFS_ERRLEVEL_LOW, mp, sbp);
+		xfs_notice(mp, "SB sanity check failed");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -611,10 +610,10 @@ xfs_sb_read_verify(
 					XFS_SB_VERSION_5) ||
 	     dsb->sb_crc != 0)) {
 
-		if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+		if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
 				      offsetof(struct xfs_sb, sb_crc))) {
 			/* Only fail bad secondaries on a known V5 filesystem */
-			if (bp->b_bn != XFS_SB_DADDR &&
+			if (bp->b_bn == XFS_SB_DADDR ||
 			    xfs_sb_version_hascrc(&mp->m_sb)) {
 				error = EFSCORRUPTED;
 				goto out_error;
@@ -625,7 +624,7 @@ xfs_sb_read_verify(
 
 out_error:
 	if (error) {
-		if (error != EWRONGFS)
+		if (error == EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
 					     mp, bp->b_addr);
 		xfs_buf_ioerror(bp, error);
@@ -644,7 +643,6 @@ xfs_sb_quiet_read_verify(
 {
 	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
 
-
 	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
 		/* XFS filesystem, verify noisily! */
 		xfs_sb_read_verify(bp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f317488263dd..d971f4932b5d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -913,7 +913,7 @@ xfs_flush_inodes(
 	struct super_block	*sb = mp->m_super;
 
 	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb, jiffies);
+		sync_inodes_sb(sb);
 		up_read(&sb->s_umount);
 	}
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9b96d35e483d..b5bc1ab3c4da 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -64,7 +64,7 @@ typedef struct xfs_log_item {
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
-	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
 	void (*iop_pin)(xfs_log_item_t *);
 	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index cd2a10e15d3a..41172861e857 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -295,8 +295,8 @@ xfs_trans_mod_dquot(
 /*
  * Given an array of dqtrx structures, lock all the dquots associated and join
  * them to the transaction, provided they have been modified. We know that the
- * highest number of dquots of one type - usr, grp OR prj - involved in a
- * transaction is 2 so we don't need to make this very generic.
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2fd59c0dae66..2ffd3e331b49 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation(
 	    xfs_calc_buf_res(5, 0) +
 	    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 			     XFS_FSB_TO_B(mp, 1)) +
-	    xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+	    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 			     mp->m_in_maxlevels, 0)));
 }
 
@@ -282,7 +282,7 @@ xfs_calc_create_resv_modify(
  * For create we can allocate some inodes giving:
  * the agi and agf of the ag getting the new inodes: 2 * sectorsize
  * the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode blocks allocated: mp->m_ialloc_blks * blocksize
  * the inode btree: max depth * blocksize
  * the allocation btrees: 2 trees * (max depth - 1) * block size
  */
@@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc(
 {
 	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
@@ -385,9 +385,9 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
 		xfs_calc_buf_res(1, 0) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9c..af5dbe06cb65 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+	((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 3e8e797c6d11..e8a77383c0d5 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -35,15 +35,6 @@ struct attrlist_cursor_kern;
 	{ IO_INVIS,	"INVIS"}
 
 /*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE			0	/* none */
-#define FI_REMAPF		1	/* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
-					   Prevent VM access to the pages until
-					   the operation completes. */
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 9d479073ba41..78ed92a46fdd 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -102,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
 #ifdef CONFIG_XFS_POSIX_ACL
-	&xfs_xattr_acl_access_handler,
-	&xfs_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 	NULL
 };