Diffstat (limited to 'fs')
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/aio.c | 113
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/btrfs/inode.c | 8
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 4
-rw-r--r--  fs/ceph/addr.c | 8
-rw-r--r--  fs/ceph/inode.c | 136
-rw-r--r--  fs/cifs/cifsproto.h | 7
-rw-r--r--  fs/cifs/cifssmb.c | 6
-rw-r--r--  fs/cifs/dir.c | 11
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/link.c | 26
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/dlm/lowcomms.c | 8
-rw-r--r--  fs/eventpoll.c | 4
-rw-r--r--  fs/ext2/super.c | 1
-rw-r--r--  fs/ext4/ext4.h | 10
-rw-r--r--  fs/ext4/ext4_jbd2.c | 9
-rw-r--r--  fs/ext4/extents.c | 45
-rw-r--r--  fs/ext4/inode.c | 16
-rw-r--r--  fs/ext4/mballoc.c | 17
-rw-r--r--  fs/ext4/super.c | 21
-rw-r--r--  fs/f2fs/Makefile | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 195
-rw-r--r--  fs/f2fs/data.c | 621
-rw-r--r--  fs/f2fs/debug.c | 53
-rw-r--r--  fs/f2fs/dir.c | 47
-rw-r--r--  fs/f2fs/f2fs.h | 195
-rw-r--r--  fs/f2fs/file.c | 84
-rw-r--r--  fs/f2fs/gc.c | 22
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/inline.c | 222
-rw-r--r--  fs/f2fs/inode.c | 23
-rw-r--r--  fs/f2fs/namei.c | 5
-rw-r--r--  fs/f2fs/node.c | 272
-rw-r--r--  fs/f2fs/node.h | 8
-rw-r--r--  fs/f2fs/recovery.c | 49
-rw-r--r--  fs/f2fs/segment.c | 584
-rw-r--r--  fs/f2fs/segment.h | 81
-rw-r--r--  fs/f2fs/super.c | 72
-rw-r--r--  fs/f2fs/xattr.c | 2
-rw-r--r--  fs/fs-writeback.c | 15
-rw-r--r--  fs/fuse/dev.c | 25
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 41
-rw-r--r--  fs/fuse/fuse_i.h | 5
-rw-r--r--  fs/gfs2/aops.c | 49
-rw-r--r--  fs/gfs2/dir.c | 90
-rw-r--r--  fs/gfs2/dir.h | 19
-rw-r--r--  fs/gfs2/glock.c | 31
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 36
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 118
-rw-r--r--  fs/gfs2/log.c | 4
-rw-r--r--  fs/gfs2/lops.c | 5
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 8
-rw-r--r--  fs/gfs2/ops_fstype.c | 70
-rw-r--r--  fs/gfs2/quota.c | 342
-rw-r--r--  fs/gfs2/quota.h | 1
-rw-r--r--  fs/gfs2/rgrp.c | 113
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  fs/jbd/journal.c | 8
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/journal.c | 18
-rw-r--r--  fs/jbd2/recovery.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 16
-rw-r--r--  fs/kernfs/Makefile | 5
-rw-r--r--  fs/kernfs/dir.c | 1073
-rw-r--r--  fs/kernfs/file.c | 867
-rw-r--r--  fs/kernfs/inode.c | 377
-rw-r--r--  fs/kernfs/kernfs-internal.h | 122
-rw-r--r--  fs/kernfs/mount.c | 165
-rw-r--r--  fs/kernfs/symlink.c | 151
-rw-r--r--  fs/namespace.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 10
-rw-r--r--  fs/notify/dnotify/dnotify.c | 34
-rw-r--r--  fs/notify/fanotify/fanotify.c | 224
-rw-r--r--  fs/notify/fanotify/fanotify.h | 23
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 41
-rw-r--r--  fs/notify/fsnotify.c | 42
-rw-r--r--  fs/notify/group.c | 1
-rw-r--r--  fs/notify/inotify/inotify.h | 21
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 149
-rw-r--r--  fs/notify/inotify/inotify_user.c | 119
-rw-r--r--  fs/notify/notification.c | 334
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 10
-rw-r--r--  fs/ocfs2/cluster/Makefile | 2
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 4
-rw-r--r--  fs/ocfs2/cluster/ver.c | 42
-rw-r--r--  fs/ocfs2/cluster/ver.h | 31
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmver.h | 31
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c | 42
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h | 31
-rw-r--r--  fs/ocfs2/dlmglue.c | 4
-rw-r--r--  fs/ocfs2/file.c | 3
-rw-r--r--  fs/ocfs2/ioctl.c | 7
-rw-r--r--  fs/ocfs2/move_extents.c | 77
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 3
-rw-r--r--  fs/ocfs2/stack_user.c | 308
-rw-r--r--  fs/ocfs2/stackglue.c | 16
-rw-r--r--  fs/ocfs2/stackglue.h | 15
-rw-r--r--  fs/ocfs2/suballoc.c | 12
-rw-r--r--  fs/ocfs2/suballoc.h | 12
-rw-r--r--  fs/ocfs2/super.c | 20
-rw-r--r--  fs/ocfs2/ver.c | 43
-rw-r--r--  fs/ocfs2/ver.h | 31
-rw-r--r--  fs/posix_acl.c | 84
-rw-r--r--  fs/proc/meminfo.c | 37
-rw-r--r--  fs/pstore/platform.c | 7
-rw-r--r--  fs/ramfs/inode.c | 2
-rw-r--r--  fs/read_write.c | 4
-rw-r--r--  fs/splice.c | 18
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sysfs/Makefile | 2
-rw-r--r--  fs/sysfs/dir.c | 1075
-rw-r--r--  fs/sysfs/file.c | 963
-rw-r--r--  fs/sysfs/group.c | 102
-rw-r--r--  fs/sysfs/inode.c | 331
-rw-r--r--  fs/sysfs/mount.c | 184
-rw-r--r--  fs/sysfs/symlink.c | 219
-rw-r--r--  fs/sysfs/sysfs.h | 236
-rw-r--r--  fs/udf/namei.c | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/xfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/xfs_attr_list.c | 8
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 36
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 50
-rw-r--r--  fs/xfs/xfs_buf.c | 63
-rw-r--r--  fs/xfs/xfs_buf.h | 11
-rw-r--r--  fs/xfs/xfs_buf_item.c | 124
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 26
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 4
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 58
-rw-r--r--  fs/xfs/xfs_dquot.c | 7
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 67
-rw-r--r--  fs/xfs/xfs_dquot_item.h | 3
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 10
-rw-r--r--  fs/xfs/xfs_ialloc.c | 53
-rw-r--r--  fs/xfs/xfs_ialloc.h | 21
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_inode_fork.c | 17
-rw-r--r--  fs/xfs/xfs_inode_item.c | 400
-rw-r--r--  fs/xfs/xfs_inode_item.h | 5
-rw-r--r--  fs/xfs/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/xfs_iops.c | 79
-rw-r--r--  fs/xfs/xfs_itable.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 46
-rw-r--r--  fs/xfs/xfs_log_cil.c | 74
-rw-r--r--  fs/xfs/xfs_log_recover.c | 46
-rw-r--r--  fs/xfs/xfs_qm.c | 86
-rw-r--r--  fs/xfs/xfs_qm.h | 18
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 18
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 42
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 13
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 10
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_vnode.h | 9
174 files changed, 7376 insertions(+), 6140 deletions(-)
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..39a824f44e7c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/aio.c b/fs/aio.c
index 6efb7f6cb22e..062a5f6a1448 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_pages; i++) {
+		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
 			 page_count(ctx->ring_pages[i]));
-		put_page(ctx->ring_pages[i]);
+		page = ctx->ring_pages[i];
+		if (!page)
+			continue;
+		ctx->ring_pages[i] = NULL;
+		put_page(page);
 	}
 
 	put_aio_ring_file(ctx);
@@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	unsigned long flags;
 	int rc;
 
+	rc = 0;
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			if (ctx->ring_pages[idx] != old)
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EINVAL;
+	spin_unlock(&mapping->private_lock);
+
+	if (rc != 0)
+		return rc;
+
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
-	put_page(old);
+	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
-		get_page(old);
+		put_page(new);
 		return rc;
 	}
 
-	get_page(new);
-
 	/* We can potentially race against kioctx teardown here.  Use the
 	 * address_space's private data lock to protect the mapping's
 	 * private_data.
@@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		migrate_page_copy(new, old);
 		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages)
-			ctx->ring_pages[idx] = new;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			/* And only do the move if things haven't changed */
+			if (ctx->ring_pages[idx] == old)
+				ctx->ring_pages[idx] = new;
+			else
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	} else
 		rc = -EBUSY;
 	spin_unlock(&mapping->private_lock);
 
+	if (rc == MIGRATEPAGE_SUCCESS)
+		put_page(old);
+	else
+		put_page(new);
+
 	return rc;
 }
 #endif
@@ -326,7 +362,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
-	unsigned long size, populate;
+	unsigned long size, unused;
 	int nr_pages;
 	int i;
 	struct file *file;
@@ -347,6 +383,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 		return -EAGAIN;
 	}
 
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
+
+	ctx->ring_pages = ctx->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages) {
+			put_aio_ring_file(ctx);
+			return -ENOMEM;
+		}
+	}
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
@@ -358,19 +408,14 @@ static int aio_setup_ring(struct kioctx *ctx)
 		SetPageUptodate(page);
 		SetPageDirty(page);
 		unlock_page(page);
+
+		ctx->ring_pages[i] = page;
 	}
-	ctx->aio_ring_file = file;
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
+	ctx->nr_pages = i;
 
-	ctx->ring_pages = ctx->internal_pages;
-	if (nr_pages > AIO_RING_PAGES) {
-		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
-					  GFP_KERNEL);
-		if (!ctx->ring_pages) {
-			put_aio_ring_file(ctx);
-			return -ENOMEM;
-		}
+	if (unlikely(i != nr_pages)) {
+		aio_free_ring(ctx);
+		return -EAGAIN;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -379,9 +424,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 	down_write(&mm->mmap_sem);
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
-				       MAP_SHARED | MAP_POPULATE, 0, &populate);
+				       MAP_SHARED, 0, &unused);
+	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
-		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -389,27 +434,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 
-	/* We must do this while still holding mmap_sem for write, as we
-	 * need to be protected against userspace attempting to mremap()
-	 * or munmap() the ring buffer.
-	 */
-	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
-				       1, 0, ctx->ring_pages, NULL);
-
-	/* Dropping the reference here is safe as the page cache will hold
-	 * onto the pages for us.  It is also required so that page migration
-	 * can unmap the pages and get the right reference count.
-	 */
-	for (i = 0; i < ctx->nr_pages; i++)
-		put_page(ctx->ring_pages[i]);
-
-	up_write(&mm->mmap_sem);
-
-	if (unlikely(ctx->nr_pages != nr_pages)) {
-		aio_free_ring(ctx);
-		return -EAGAIN;
-	}
-
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events;	/* trusted copy */
 
@@ -652,7 +676,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
 
 	err = ioctx_add_table(ctx, mm);
 	if (err)
diff --git a/fs/attr.c b/fs/attr.c
index 267968d94673..5d4e59d56e85 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		return -EPERM;
 	}
 
-	if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
-		if (attr->ia_size != inode->i_size)
-			inode_inc_iversion(inode);
-	}
-
 	if ((ia_valid & ATTR_MODE)) {
 		umode_t amode = attr->ia_mode;
 		/* Flag setting protected by i_mutex */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..471a4f7f4044 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	 * these flags set.  For all other operations the VFS set these flags
 	 * explicitly if it wants a timestamp update.
 	 */
-	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
+	}
 
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, newsize);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 6fc82010dc15..c8d9ddf84c69 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
 
 	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
 	if (ret) {
-		test_msg("Error removing middle peice %d\n", ret);
+		test_msg("Error removing middle piece %d\n", ret);
 		return ret;
 	}
 
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	}
 
 	if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-		test_msg("Left over peices after removing overlapping\n");
+		test_msg("Left over pieces after removing overlapping\n");
 		return -1;
 	}
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e561c059539..ec3ba43b9faa 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 	if (err < 0) {
 		SetPageError(page);
 		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
+	} else {
+		if (err < PAGE_CACHE_SIZE) {
 		/* zero fill remainder of page */
 		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+		} else {
+			flush_dcache_page(page);
+		}
 	}
 	SetPageUptodate(page);
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9a8e396aed89..278fd2891288 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -978,7 +978,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-	int i = 0;
 	int err = 0;
 
 	dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -1039,6 +1038,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 	}
 
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			goto done;
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in, &rinfo->targeti, NULL,
+				session, req->r_request_started,
+				(le32_to_cpu(rinfo->head->result) == 0) ?
+				req->r_fmode : -1,
+				&req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+				in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
 	/*
 	 * ignore null lease/binding on snapdir ENOENT, or else we
 	 * will have trouble splicing in the virtual snapdir later
@@ -1108,7 +1130,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		     ceph_dentry(req->r_old_dentry)->offset);
 
 		dn = req->r_old_dentry;  /* use old_dentry */
-		in = dn->d_inode;
 	}
 
 	/* null dentry? */
@@ -1130,44 +1151,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 
 		/* attach proper inode */
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = dn->d_inode;
-		if (!in) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				pr_err("fill_trace bad get_inode "
-				       "%llx.%llx\n", vino.ino, vino.snap);
-				err = PTR_ERR(in);
-				d_drop(dn);
-				goto done;
-			}
+		if (!dn->d_inode) {
+			ihold(in);
 			dn = splice_dentry(dn, in, &have_lease, true);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
-			ihold(in);
-		} else if (ceph_ino(in) == vino.ino &&
-			   ceph_snap(in) == vino.snap) {
-			ihold(in);
-		} else {
+		} else if (dn->d_inode && dn->d_inode != in) {
 			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
-			     dn, in, ceph_ino(in), ceph_snap(in),
-			     vino.ino, vino.snap);
+			     dn, dn->d_inode, ceph_vinop(dn->d_inode),
+			     ceph_vinop(in));
 			have_lease = false;
-			in = NULL;
 		}
 
 		if (have_lease)
 			update_dentry_lease(dn, rinfo->dlease, session,
 					    req->r_request_started);
 		dout(" final dn %p\n", dn);
-		i++;
-	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-		    req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
+	} else if (!req->r_aborted &&
+		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		    req->r_op == CEPH_MDS_OP_MKSNAP)) {
 		struct dentry *dn = req->r_dentry;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
@@ -1177,52 +1182,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		ininfo = rinfo->targeti.in;
 		vino.ino = le64_to_cpu(ininfo->ino);
 		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = ceph_get_inode(sb, vino);
-		if (IS_ERR(in)) {
-			pr_err("fill_inode get_inode badness %llx.%llx\n",
-			       vino.ino, vino.snap);
-			err = PTR_ERR(in);
-			d_delete(dn);
-			goto done;
-		}
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		ihold(in);
 		dn = splice_dentry(dn, in, NULL, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
 		}
 		req->r_dentry = dn;  /* may have spliced */
-		ihold(in);
-		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
-	}
-
-	if (rinfo->head->is_target) {
-		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		if (in == NULL || ceph_ino(in) != vino.ino ||
-		    ceph_snap(in) != vino.snap) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				err = PTR_ERR(in);
-				goto done;
-			}
-		}
-		req->r_target_inode = in;
-
-		err = fill_inode(in,
-				 &rinfo->targeti, NULL,
-				 session, req->r_request_started,
-				 (le32_to_cpu(rinfo->head->result) == 0) ?
-				 req->r_fmode : -1,
-				 &req->r_caps_reservation);
-		if (err < 0) {
-			pr_err("fill_inode badness %p %llx.%llx\n",
-			       in, ceph_vinop(in));
-			goto done;
-		}
 	}
-
 done:
 	dout("fill_trace done err=%d\n", err);
 	return err;
@@ -1272,7 +1240,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	struct qstr dname;
 	struct dentry *dn;
 	struct inode *in;
-	int err = 0, i;
+	int err = 0, ret, i;
 	struct inode *snapdir = NULL;
 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
 	struct ceph_dentry_info *di;
@@ -1305,6 +1273,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 		ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
 	}
 
+	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
 		struct ceph_vino vino;
 
@@ -1329,9 +1298,10 @@ retry_lookup:
 				err = -ENOMEM;
 				goto out;
 			}
-			err = ceph_init_dentry(dn);
-			if (err < 0) {
+			ret = ceph_init_dentry(dn);
+			if (ret < 0) {
 				dput(dn);
+				err = ret;
 				goto out;
 			}
 		} else if (dn->d_inode &&
@@ -1351,9 +1321,6 @@ retry_lookup:
 			spin_unlock(&parent->d_lock);
 		}
 
-		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
-
 		/* inode */
 		if (dn->d_inode) {
 			in = dn->d_inode;
@@ -1366,26 +1333,39 @@ retry_lookup:
 				err = PTR_ERR(in);
 				goto out;
 			}
-			dn = splice_dentry(dn, in, NULL, false);
-			if (IS_ERR(dn))
-				dn = NULL;
 		}
 
 		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
 			       req->r_request_started, -1,
 			       &req->r_caps_reservation) < 0) {
 			pr_err("fill_inode badness on %p\n", in);
+			if (!dn->d_inode)
+				iput(in);
+			d_drop(dn);
 			goto next_item;
 		}
-		if (dn)
-			update_dentry_lease(dn, rinfo->dir_dlease[i],
-					    req->r_session,
-					    req->r_request_started);
+
+		if (!dn->d_inode) {
+			dn = splice_dentry(dn, in, NULL, false);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				dn = NULL;
+				goto next_item;
+			}
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+		update_dentry_lease(dn, rinfo->dir_dlease[i],
+				    req->r_session,
+				    req->r_request_started);
next_item:
 		if (dn)
 			dput(dn);
 	}
-	req->r_did_prepopulate = true;
+	if (err == 0)
+		req->r_did_prepopulate = true;
 
 out:
 	if (snapdir) {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index aa3397620342..2c29db6a247e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
 extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		const unsigned char *path,
-		struct cifs_sb_info *cifs_sb, unsigned int xid);
+extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+			      struct cifs_sb_info *cifs_sb,
			      struct cifs_fattr *fattr,
+			      const unsigned char *path);
 extern int mdfour(unsigned char *, unsigned char *, int);
 extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
 			const struct nls_table *codepage);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 124aa0230c1b..d707edb6b852 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4010,7 +4010,7 @@ QFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in QFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4179,7 +4179,7 @@ UnixQFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4263,7 +4263,7 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 11ff5f116b20..a514e0a65f69 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,7 +193,7 @@ check_name(struct dentry *direntry)
 static int
 cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	       struct tcon_link *tlink, unsigned oflags, umode_t mode,
-	       __u32 *oplock, struct cifs_fid *fid, int *created)
+	       __u32 *oplock, struct cifs_fid *fid)
 {
 	int rc = -ENOENT;
 	int create_options = CREATE_NOT_DIR;
@@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 			.device	= 0,
 		};
 
-		*created |= FILE_CREATED;
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 			args.uid = current_fsuid();
 			if (inode->i_mode & S_ISGID)
@@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	cifs_add_pending_open(&fid, tlink, &open);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, opened);
+			    &oplock, &fid);
 
 	if (rc) {
 		cifs_del_pending_open(&open);
 		goto out;
 	}
 
+	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
 	rc = finish_open(file, direntry, generic_file_open, opened);
 	if (rc) {
 		if (server->ops->close)
@@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	struct TCP_Server_Info *server;
 	struct cifs_fid fid;
 	__u32 oplock;
-	int created = FILE_CREATED;
 
 	cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
 		 inode, direntry->d_name.name, direntry);
@@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	server->ops->new_lease_key(&fid);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, &created);
+			    &oplock, &fid);
 	if (!rc && server->ops->close)
 		server->ops->close(xid, tcon, &fid);
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 36f9ebb93ceb..49719b8228e5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					       full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
@@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					   full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cc0234710ddb..92aee08483a5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
 
 
 int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		   const unsigned char *path,
-		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+		   const unsigned char *path)
 {
-	int rc = 0;
+	int rc;
 	u8 *buf = NULL;
 	unsigned int link_len = 0;
 	unsigned int bytes_read = 0;
-	struct cifs_tcon *ptcon;
 
 	if (!CIFSCouldBeMFSymlink(fattr))
 		/* it's not a symlink */
 		return 0;
 
 	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
+	if (!buf)
+		return -ENOMEM;
 
-	ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
-	if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
-		rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
-				&bytes_read, cifs_sb, xid);
+	if (tcon->ses->server->ops->query_mf_symlink)
+		rc = tcon->ses->server->ops->query_mf_symlink(path, buf,
+						&bytes_read, cifs_sb, xid);
 	else
-		goto out;
+		rc = -ENOSYS;
 
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	if (bytes_read == 0) /* not a symlink */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
 	struct i2c_msg			__user *tmsgs;
 	struct i2c_msg32		__user *umsgs;
 	compat_caddr_t			datap;
-	int nmsgs, i;
+	u32 nmsgs;
+	int i;
 
 	if (get_user(nmsgs, &udata->nmsgs))
 		return -EFAULT;
diff --git a/fs/dcache.c b/fs/dcache.c
index 6055d61811d3..cb4a10690868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	 * thus don't need to be hashed.  They also don't need a name until a
 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
 	 * below allows us to generate a name for these objects on demand:
+	 *
+	 * Some pseudo inodes are mountable.  When they are mounted
+	 * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
+	 * and instead have d_path return the mounted path.
 	 */
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+	if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+	    (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	rcu_read_lock();
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..a5e34dd6a32c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
 				     struct msghdr *msg, char *buf)
 {
 	union sctp_notification *sn = (union sctp_notification *)buf;
+	struct linger linger;
 
 	switch (sn->sn_header.sn_type) {
 	case SCTP_SEND_FAILED:
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
 			}
 			add_sock(new_con->sock, new_con);
 
+			linger.l_onoff = 1;
+			linger.l_linger = 0;
+			ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+						(char *)&linger, sizeof(linger));
+			if (ret < 0)
+				log_print("set socket option SO_LINGER failed");
+
 			log_print("connecting to %d sctp association %d",
 				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8b5e2584c840..af903128891c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1907,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			}
 		}
 	}
-	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
-		tep = tf.file->private_data;
-		mutex_lock_nested(&tep->mtx, 1);
-	}
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 288534920fe5..20d6697bd638 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 				sb->s_blocksize - offset : towrite;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 1);
 		if (err < 0)
 			goto out;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e6185031c1cc..ece55565b9cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -268,6 +268,16 @@ struct ext4_io_submit {
 /* Translate # of blks to # of clusters */
 #define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
 				 (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\
+				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
+				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
+				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\
+				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
 
 /*
  * Structure of a blocks group descriptor
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 17ac112ab101..3fe29de832c8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 		if (WARN_ON_ONCE(err)) {
 			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "journal_dirty_metadata failed: "
+					 "handle type %u started at line %u, "
+					 "credits %u/%u, errcode %d",
+					 handle->h_type,
+					 handle->h_line_no,
+					 handle->h_requested_credits,
+					 handle->h_buffer_credits, err);
 		}
 	} else {
 		if (inode)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 35f65cf4f318..3384dc4bed40 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
+	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+	ext4_lblk_t last = lblock + len - 1;
 
-	if (len == 0)
+	if (lblock > last)
 		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+		ext4_fsblk_t pblock = 0;
+		ext4_lblk_t lblock = 0;
+		ext4_lblk_t prev = 0;
+		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
+
+			/* Check for overlapping extents */
+			lblock = le32_to_cpu(ext->ee_block);
+			len = ext4_ext_get_actual_len(ext);
+			if ((lblock <= prev) && prev) {
+				pblock = ext4_ext_pblock(ext);
+				es->s_last_error_block = cpu_to_le64(pblock);
+				return 0;
+			}
 			ext++;
 			entries--;
+			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
@@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
-	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
-	b2 &= ~(sbi->s_cluster_ratio - 1);
+	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
 
 	/*
 	 * get the next allocated block if the extent in the path
@@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 		b2 = ext4_ext_next_allocated_block(path);
 		if (b2 == EXT_MAX_BLOCKS)
 			goto out;
-		b2 &= ~(sbi->s_cluster_ratio - 1);
+		b2 = EXT4_LBLK_CMASK(sbi, b2);
 	}
 
 	/* check for wrap through zero on extent logical start block*/
@@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * extent, we have to mark the cluster as used (store negative
 	 * cluster number in partial_cluster).
 	 */
-	unaligned = pblk & (sbi->s_cluster_ratio - 1);
+	unaligned = EXT4_PBLK_COFF(sbi, pblk);
 	if (unaligned && (ee_len == num) &&
 	    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 		*partial_cluster = EXT4_B2C(sbi, pblk);
@@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			 * accidentally freeing it later on
 			 */
 			pblk = ext4_ext_pblock(ex);
-			if (pblk & (sbi->s_cluster_ratio - 1))
+			if (EXT4_PBLK_COFF(sbi, pblk))
 				*partial_cluster =
 					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
@@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_lblk_t lblk_start, lblk_end;
-	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
 	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
@@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
 
 	/* Check towards left side */
-	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
 	if (c_offset) {
-		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
 		lblk_to = lblk_from + c_offset - 1;
 
 		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	}
 
 	/* Now check towards right. */
-	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
 	if (allocated_clusters && c_offset) {
 		lblk_from = lblk_start + num_blks;
 		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 			struct ext4_ext_path *path)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
 	ext4_lblk_t rr_cluster_start;
 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	    (rr_cluster_start == ex_cluster_start)) {
 		if (rr_cluster_start == ex_cluster_end)
 			ee_start += ee_len - 1;
-		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
-			c_offset;
+		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
 		map->m_len = min(map->m_len,
 				 (unsigned) sbi->s_cluster_ratio - c_offset);
 		/*
@@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
@@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * needed so that future calls to get_implied_cluster_alloc()
 	 * work correctly.
 	 */
-	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
 	ar.goal -= offset;
 	ar.logical -= offset;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 075763474118..31fa964742bc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file,
  */
 static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1236,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		return -ENOSPC;
 	}
 	ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1249,6 @@
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1289,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
@@ -4598,6 +4586,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
+
+		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+			inode_inc_iversion(inode);
+
 		if (S_ISREG(inode->i_mode) &&
 		    (attr->ia_size < inode->i_size)) {
 			if (ext4_should_order_data(inode)) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4d113efa024c..04a5c7504be9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
 	struct ext4_prealloc_space *pa;
 	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	BUG_ON(atomic_read(&pa->pa_count));
+	BUG_ON(pa->pa_deleted == 0);
 	kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	ext4_group_t grp;
 	ext4_fsblk_t grp_blk;
 
-	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-		return;
-
 	/* in this short window concurrent discard can set pa_deleted */
 	spin_lock(&pa->pa_lock);
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
 	if (pa->pa_deleted == 1) {
 		spin_unlock(&pa->pa_lock);
 		return;
@@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
 	/* set up allocation goals */
-	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
 	ac->ac_status = AC_STATUS_CONTINUE;
 	ac->ac_sb = sb;
 	ac->ac_inode = ar->inode;
@@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	 * blocks at the beginning or the end unless we are explicitly
 	 * requested to avoid doing so.
 	 */
-	overflow = block & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_PBLK_COFF(sbi, block);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
 			overflow = sbi->s_cluster_ratio - overflow;
@@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			count += overflow;
 		}
 	}
-	overflow = count & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_LBLK_COFF(sbi, count);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
 			if (count > overflow)
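
The mballoc hunks replace open-coded `& (s_cluster_ratio - 1)` arithmetic with named helpers. Beyond readability, the point is integer width: s_cluster_ratio is a plain unsigned int, so the open-coded mask can be applied at the wrong width against a 64-bit physical block number. A sketch of the helper shapes this series appears to rely on (an assumption of this edit, not a quotation of fs/ext4/ext4.h):

	/* Assumed helper shapes: cast the mask to the width of the block
	 * type before masking, so 64-bit block numbers are not truncated. */
	#define EXT4_LBLK_COFF(s, lblk)		((lblk) & \
						 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
	#define EXT4_LBLK_CMASK(s, lblk)	((lblk) & \
						 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
	#define EXT4_PBLK_COFF(s, pblk)		((pblk) & \
						 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
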
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c977f4e4e63b..1f7784de05b6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb)
 	}
 
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb)
 }
 
 
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
 
 	/*
+	 * There's no need to reserve anything when we aren't using extents.
+	 * The space estimates are exact, there are no unwritten extents,
+	 * hole punching doesn't need new metadata... This is needed especially
+	 * to keep ext2/3 backward compatibility.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return 0;
+	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 	 * This should cover the situations where we can not afford to run
 	 * out of space like for example punch hole, or converting
@@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
 	 * allocation would require 1, or 2 blocks, higher numbers are
 	 * very rare.
 	 */
-	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+			EXT4_SB(sb)->s_cluster_bits;
 
 	do_div(resv_clusters, 50);
 	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
@@ -4071,10 +4080,10 @@ no_journal:
 			 "available");
 	}
 
-	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
-			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+			 "reserved pool", ext4_calculate_resv_clusters(sb));
 		goto failed_mount4a;
 	}
 
@@ -4184,7 +4193,7 @@ failed_mount_wq:
 	}
 failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
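
For concreteness, the reservation rule in ext4_calculate_resv_clusters() reduces to min(2% of all clusters, 4096 clusters), and to zero on filesystems without the extents feature. A standalone restatement (this helper is illustrative, not the kernel function):

	/* Illustrative restatement of the reservation rule above. */
	unsigned long long resv_clusters(unsigned long long blocks,
					 unsigned int cluster_bits,
					 int has_extents)
	{
		unsigned long long resv;

		if (!has_extents)
			return 0;			/* ext2/3-compatible case */
		resv = (blocks >> cluster_bits) / 50;	/* 2% of all clusters */
		return resv < 4096 ? resv : 4096;	/* capped at 4096 */
	}

On a 1 TiB filesystem with 4 KiB blocks and no bigalloc (cluster_bits == 0), 2% is roughly 5.3 million clusters, so the 4096-cluster cap (16 MiB) is what actually gets reserved.
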
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 27a0820340b9..2e35da12d292 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_F2FS_FS) += f2fs.o
 
-f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5716e5eb4e8e..293d0486a40f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab;
  */
 struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page = NULL;
 repeat:
 	page = grab_cache_page(mapping, index);
@@ -50,7 +50,7 @@ repeat:
  */
 struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
 repeat:
 	page = grab_cache_page(mapping, index);
@@ -61,11 +61,12 @@ repeat:
 	if (PageUptodate(page))
 		goto out;
 
-	if (f2fs_readpage(sbi, page, index, READ_SYNC))
+	if (f2fs_submit_page_bio(sbi, page, index,
+				READ_SYNC | REQ_META | REQ_PRIO))
 		goto repeat;
 
 	lock_page(page);
-	if (page->mapping != mapping) {
+	if (unlikely(page->mapping != mapping)) {
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
@@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page,
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	/* Should not write any meta pages, if any IO error occurred */
-	if (wbc->for_reclaim || sbi->por_doing ||
-			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
-		dec_page_count(sbi, F2FS_DIRTY_META);
-		wbc->pages_skipped++;
-		set_page_dirty(page);
-		return AOP_WRITEPAGE_ACTIVATE;
-	}
+	if (unlikely(sbi->por_doing ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto redirty_out;
+
+	if (wbc->for_reclaim)
+		goto redirty_out;
 
 	wait_on_page_writeback(page);
 
@@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page,
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	unlock_page(page);
 	return 0;
+
+redirty_out:
+	dec_page_count(sbi, F2FS_DIRTY_META);
+	wbc->pages_skipped++;
+	set_page_dirty(page);
+	return AOP_WRITEPAGE_ACTIVATE;
 }
 
 static int f2fs_write_meta_pages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-	struct block_device *bdev = sbi->sb->s_bdev;
+	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
 	long written;
 
 	if (wbc->for_kupdate)
 		return 0;
 
-	if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+	/* collect a number of dirty meta pages and write together */
+	if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
 		return 0;
 
 	/* if mounting failed, skip writing node pages */
 	mutex_lock(&sbi->cp_mutex);
-	written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+	written = sync_meta_pages(sbi, META, nrpages);
 	mutex_unlock(&sbi->cp_mutex);
 	wbc->nr_to_write -= written;
 	return 0;
@@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
 long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 						long nr_to_write)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	pgoff_t index = 0, end = LONG_MAX;
 	struct pagevec pvec;
 	long nwritten = 0;
@@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 				PAGECACHE_TAG_DIRTY,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
+		if (unlikely(nr_pages == 0))
 			break;
 
 		for (i = 0; i < nr_pages; i++) {
@@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 				unlock_page(page);
 				break;
 			}
-			if (nwritten++ >= nr_to_write)
+			nwritten++;
+			if (unlikely(nwritten >= nr_to_write))
 				break;
 		}
 		pagevec_release(&pvec);
@@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 	}
 
 	if (nwritten)
-		f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+		f2fs_submit_merged_bio(sbi, type, WRITE);
 
 	return nwritten;
 }
@@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = {
 
 int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 {
-	unsigned int max_orphans;
 	int err = 0;
 
-	/*
-	 * considering 512 blocks in a segment 5 blocks are needed for cp
-	 * and log segment summaries. Remaining blocks are used to keep
-	 * orphan entries with the limitation one reserved segment
-	 * for cp pack we can have max 1020*507 orphan entries
-	 */
-	max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
-	mutex_lock(&sbi->orphan_inode_mutex);
-	if (sbi->n_orphans >= max_orphans)
+	spin_lock(&sbi->orphan_inode_lock);
+	if (unlikely(sbi->n_orphans >= sbi->max_orphans))
 		err = -ENOSPC;
 	else
 		sbi->n_orphans++;
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
+
 	return err;
 }
 
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
-	mutex_lock(&sbi->orphan_inode_mutex);
+	spin_lock(&sbi->orphan_inode_lock);
 	f2fs_bug_on(sbi->n_orphans == 0);
 	sbi->n_orphans--;
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	struct list_head *head, *this;
 	struct orphan_inode_entry *new = NULL, *orphan = NULL;
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+	new->ino = ino;
+
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 	list_for_each(this, head) {
 		orphan = list_entry(this, struct orphan_inode_entry, list);
-		if (orphan->ino == ino)
-			goto out;
+		if (orphan->ino == ino) {
+			spin_unlock(&sbi->orphan_inode_lock);
+			kmem_cache_free(orphan_entry_slab, new);
+			return;
+		}
+
 		if (orphan->ino > ino)
 			break;
 		orphan = NULL;
 	}
 
-	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
-	new->ino = ino;
-
 	/* add new_oentry into list which is sorted by inode number */
 	if (orphan)
 		list_add(&new->list, this->prev);
 	else
 		list_add_tail(&new->list, head);
-out:
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	struct list_head *head;
 	struct orphan_inode_entry *orphan;
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 	list_for_each_entry(orphan, head, list) {
 		if (orphan->ino == ino) {
@@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 			break;
 		}
 	}
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	iput(inode);
 }
 
-int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
 	block_t start_blk, orphan_blkaddr, i, j;
 
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
-		return 0;
+		return;
 
 	sbi->por_doing = true;
 	start_blk = __start_cp_addr(sbi) + 1;
@@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
 	sbi->por_doing = false;
-	return 0;
+	return;
 }
 
 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-	struct list_head *head, *this, *next;
+	struct list_head *head;
 	struct f2fs_orphan_block *orphan_blk = NULL;
-	struct page *page = NULL;
 	unsigned int nentries = 0;
-	unsigned short index = 1;
-	unsigned short orphan_blocks;
-
-	orphan_blocks = (unsigned short)((sbi->n_orphans +
+	unsigned short index;
+	unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
 		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+	struct page *page = NULL;
+	struct orphan_inode_entry *orphan = NULL;
+
+	for (index = 0; index < orphan_blocks; index++)
+		grab_meta_page(sbi, start_blk + index);
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	index = 1;
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 
 	/* loop for each orphan inode entry and write them in Journal block */
-	list_for_each_safe(this, next, head) {
-		struct orphan_inode_entry *orphan;
+	list_for_each_entry(orphan, head, list) {
+		if (!page) {
+			page = find_get_page(META_MAPPING(sbi), start_blk++);
+			f2fs_bug_on(!page);
+			orphan_blk =
+				(struct f2fs_orphan_block *)page_address(page);
+			memset(orphan_blk, 0, sizeof(*orphan_blk));
+			f2fs_put_page(page, 0);
+		}
 
-		orphan = list_entry(this, struct orphan_inode_entry, list);
+		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
 
 		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
 			/*
@@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 			set_page_dirty(page);
 			f2fs_put_page(page, 1);
 			index++;
-			start_blk++;
 			nentries = 0;
 			page = NULL;
 		}
-		if (page)
-			goto page_exist;
+	}
 
-		page = grab_meta_page(sbi, start_blk);
-		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
-		memset(orphan_blk, 0, sizeof(*orphan_blk));
-page_exist:
-		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+	if (page) {
+		orphan_blk->blk_addr = cpu_to_le16(index);
+		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+		orphan_blk->entry_count = cpu_to_le32(nentries);
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
 	}
-	if (!page)
-		goto end;
 
-	orphan_blk->blk_addr = cpu_to_le16(index);
-	orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
-	orphan_blk->entry_count = cpu_to_le32(nentries);
-	set_page_dirty(page);
-	f2fs_put_page(page, 1);
-end:
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
 	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
 
 	/* The second checkpoint pack should start at the next segment */
-	cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+	cp_start_blk_no += ((unsigned long long)1) <<
+				le32_to_cpu(fsb->log_blocks_per_seg);
 	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
 
 	if (cp1 && cp2) {
@@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
-		if (entry->inode == inode)
+		if (unlikely(entry->inode == inode))
 			return -EEXIST;
 	}
 	list_add_tail(&new->list, head);
@@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode)
 void remove_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct list_head *head = &sbi->dir_inode_list;
-	struct list_head *this;
+
+	struct list_head *this, *head;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
@@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode)
 		return;
 	}
 
+	head = &sbi->dir_inode_list;
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
@@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode)
 
 struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	struct list_head *head = &sbi->dir_inode_list;
-	struct list_head *this;
+
+	struct list_head *this, *head;
 	struct inode *inode = NULL;
 
 	spin_lock(&sbi->dir_inode_lock);
+
+	head = &sbi->dir_inode_list;
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
@@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
 {
-	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *head;
 	struct dir_inode_entry *entry;
 	struct inode *inode;
retry:
 	spin_lock(&sbi->dir_inode_lock);
+
+	head = &sbi->dir_inode_list;
 	if (list_empty(head)) {
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
@@ -585,7 +596,7 @@ retry:
 		 * We should submit the bio, since several dentry pages in
 		 * the freeing inode are still under writeback.
 		 */
-		f2fs_submit_bio(sbi, DATA, true);
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	}
 	goto retry;
 }
@@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	/* wait for previously submitted node/meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
 
-	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
-	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
+	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
 
 	/* update user_block_counts */
 	sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	/* Here, we only have one bio having CP pack */
 	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
 
-	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+	if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
 		clear_prefree_segments(sbi);
 		F2FS_RESET_SB_DIRT(sbi);
 	}
@@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
 
-	f2fs_submit_bio(sbi, DATA, true);
-	f2fs_submit_bio(sbi, NODE, true);
-	f2fs_submit_bio(sbi, META, true);
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	f2fs_submit_merged_bio(sbi, NODE, WRITE);
+	f2fs_submit_merged_bio(sbi, META, WRITE);
 
 	/*
 	 * update checkpoint pack index
@@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 void init_orphan_info(struct f2fs_sb_info *sbi)
 {
-	mutex_init(&sbi->orphan_inode_mutex);
+	spin_lock_init(&sbi->orphan_inode_lock);
 	INIT_LIST_HEAD(&sbi->orphan_inode_list);
 	sbi->n_orphans = 0;
+	/*
+	 * considering 512 blocks in a segment, 8 blocks are needed for cp
+	 * and log segment summaries. Remaining blocks are used to keep
+	 * orphan entries; with the limitation of one reserved segment
+	 * for the cp pack, we can have at most 1020*504 orphan entries
+	 */
+	sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
+				* F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init create_checkpoint_caches(void)
 {
 	orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
 			sizeof(struct orphan_inode_entry), NULL);
-	if (unlikely(!orphan_entry_slab))
+	if (!orphan_entry_slab)
 		return -ENOMEM;
 	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
 			sizeof(struct dir_inode_entry), NULL);
-	if (unlikely(!inode_entry_slab)) {
+	if (!inode_entry_slab) {
 		kmem_cache_destroy(orphan_entry_slab);
 		return -ENOMEM;
 	}
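
The relocated limit computation in init_orphan_info() also corrects the arithmetic in the old acquire_orphan_inode() comment: of the blocks_per_seg blocks in the checkpoint segment, 2 go to the cp pack header/footer and NR_CURSEG_TYPE (6) to the current segment summaries, and the remainder hold orphan entries. A worked instance using the values from the patch's own comment (assuming 4 KiB blocks, where one orphan block carries F2FS_ORPHANS_PER_BLOCK == 1020 inode numbers):

	/* Worked instance of the sbi->max_orphans formula above. */
	unsigned int max_orphans(unsigned int blocks_per_seg)
	{
		/* 2 cp pack blocks + 6 (NR_CURSEG_TYPE) summary blocks */
		return (blocks_per_seg - 2 - 6) * 1020;
	}
	/* 512 blocks per segment: (512 - 8) * 1020 = 514,080 entries */
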
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3438c571fa..0ae558723506 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,195 @@
24#include "segment.h" 24#include "segment.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static void f2fs_read_end_io(struct bio *bio, int err)
28{
29 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
30 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
31
32 do {
33 struct page *page = bvec->bv_page;
34
35 if (--bvec >= bio->bi_io_vec)
36 prefetchw(&bvec->bv_page->flags);
37
38 if (unlikely(!uptodate)) {
39 ClearPageUptodate(page);
40 SetPageError(page);
41 } else {
42 SetPageUptodate(page);
43 }
44 unlock_page(page);
45 } while (bvec >= bio->bi_io_vec);
46
47 bio_put(bio);
48}
49
50static void f2fs_write_end_io(struct bio *bio, int err)
51{
52 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
53 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
54 struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb);
55
56 do {
57 struct page *page = bvec->bv_page;
58
59 if (--bvec >= bio->bi_io_vec)
60 prefetchw(&bvec->bv_page->flags);
61
62 if (unlikely(!uptodate)) {
63 SetPageError(page);
64 set_bit(AS_EIO, &page->mapping->flags);
65 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
66 sbi->sb->s_flags |= MS_RDONLY;
67 }
68 end_page_writeback(page);
69 dec_page_count(sbi, F2FS_WRITEBACK);
70 } while (bvec >= bio->bi_io_vec);
71
72 if (bio->bi_private)
73 complete(bio->bi_private);
74
75 if (!get_pages(sbi, F2FS_WRITEBACK) &&
76 !list_empty(&sbi->cp_wait.task_list))
77 wake_up(&sbi->cp_wait);
78
79 bio_put(bio);
80}
81
82/*
83 * Low-level block read/write IO operations.
84 */
85static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
86 int npages, bool is_read)
87{
88 struct bio *bio;
89
90 /* No failure on bio allocation */
91 bio = bio_alloc(GFP_NOIO, npages);
92
93 bio->bi_bdev = sbi->sb->s_bdev;
94 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
95 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
96
97 return bio;
98}
99
100static void __submit_merged_bio(struct f2fs_bio_info *io)
101{
102 struct f2fs_io_info *fio = &io->fio;
103 int rw;
104
105 if (!io->bio)
106 return;
107
108 rw = fio->rw;
109
110 if (is_read_io(rw)) {
111 trace_f2fs_submit_read_bio(io->sbi->sb, rw,
112 fio->type, io->bio);
113 submit_bio(rw, io->bio);
114 } else {
115 trace_f2fs_submit_write_bio(io->sbi->sb, rw,
116 fio->type, io->bio);
117 /*
118 * META_FLUSH is only from the checkpoint procedure, and we
119 * should wait this metadata bio for FS consistency.
120 */
121 if (fio->type == META_FLUSH) {
122 DECLARE_COMPLETION_ONSTACK(wait);
123 io->bio->bi_private = &wait;
124 submit_bio(rw, io->bio);
125 wait_for_completion(&wait);
126 } else {
127 submit_bio(rw, io->bio);
128 }
129 }
130
131 io->bio = NULL;
132}
133
134void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
135 enum page_type type, int rw)
136{
137 enum page_type btype = PAGE_TYPE_OF_BIO(type);
138 struct f2fs_bio_info *io;
139
140 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
141
142 mutex_lock(&io->io_mutex);
143
144 /* change META to META_FLUSH in the checkpoint procedure */
145 if (type >= META_FLUSH) {
146 io->fio.type = META_FLUSH;
147 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
148 }
149 __submit_merged_bio(io);
150 mutex_unlock(&io->io_mutex);
151}
152
153/*
154 * Fill the locked page with data located in the block address.
155 * Return unlocked page.
156 */
157int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
158 block_t blk_addr, int rw)
159{
160 struct bio *bio;
161
162 trace_f2fs_submit_page_bio(page, blk_addr, rw);
163
164 /* Allocate a new bio */
165 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
166
167 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
168 bio_put(bio);
169 f2fs_put_page(page, 1);
170 return -EFAULT;
171 }
172
173 submit_bio(rw, bio);
174 return 0;
175}
176
177void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
178 block_t blk_addr, struct f2fs_io_info *fio)
179{
180 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
181 struct f2fs_bio_info *io;
182 bool is_read = is_read_io(fio->rw);
183
184 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
185
186 verify_block_addr(sbi, blk_addr);
187
188 mutex_lock(&io->io_mutex);
189
190 if (!is_read)
191 inc_page_count(sbi, F2FS_WRITEBACK);
192
193 if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
194 io->fio.rw != fio->rw))
195 __submit_merged_bio(io);
196alloc_new:
197 if (io->bio == NULL) {
198 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
199
200 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
201 io->fio = *fio;
202 }
203
204 if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
205 PAGE_CACHE_SIZE) {
206 __submit_merged_bio(io);
207 goto alloc_new;
208 }
209
210 io->last_block_in_bio = blk_addr;
211
212 mutex_unlock(&io->io_mutex);
213 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
214}
215
27/* 216/*
28 * Lock ordering for the change of data block address: 217 * Lock ordering for the change of data block address:
29 * ->data_page 218 * ->data_page
@@ -37,7 +226,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
37 struct page *node_page = dn->node_page; 226 struct page *node_page = dn->node_page;
38 unsigned int ofs_in_node = dn->ofs_in_node; 227 unsigned int ofs_in_node = dn->ofs_in_node;
39 228
40 f2fs_wait_on_page_writeback(node_page, NODE, false); 229 f2fs_wait_on_page_writeback(node_page, NODE);
41 230
42 rn = F2FS_NODE(node_page); 231 rn = F2FS_NODE(node_page);
43 232
@@ -51,19 +240,39 @@ int reserve_new_block(struct dnode_of_data *dn)
51{ 240{
52 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
53 242
54 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 243 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
55 return -EPERM; 244 return -EPERM;
56 if (!inc_valid_block_count(sbi, dn->inode, 1)) 245 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
57 return -ENOSPC; 246 return -ENOSPC;
58 247
59 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 248 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
60 249
61 __set_data_blkaddr(dn, NEW_ADDR); 250 __set_data_blkaddr(dn, NEW_ADDR);
62 dn->data_blkaddr = NEW_ADDR; 251 dn->data_blkaddr = NEW_ADDR;
252 mark_inode_dirty(dn->inode);
63 sync_inode_page(dn); 253 sync_inode_page(dn);
64 return 0; 254 return 0;
65} 255}
66 256
257int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
258{
259 bool need_put = dn->inode_page ? false : true;
260 int err;
261
262 /* if inode_page exists, index should be zero */
263 f2fs_bug_on(!need_put && index);
264
265 err = get_dnode_of_data(dn, index, ALLOC_NODE);
266 if (err)
267 return err;
268
269 if (dn->data_blkaddr == NULL_ADDR)
270 err = reserve_new_block(dn);
271 if (err || need_put)
272 f2fs_put_dnode(dn);
273 return err;
274}
275
67static int check_extent_cache(struct inode *inode, pgoff_t pgofs, 276static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 277 struct buffer_head *bh_result)
69{ 278{
@@ -71,6 +280,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
71 pgoff_t start_fofs, end_fofs; 280 pgoff_t start_fofs, end_fofs;
72 block_t start_blkaddr; 281 block_t start_blkaddr;
73 282
283 if (is_inode_flag_set(fi, FI_NO_EXTENT))
284 return 0;
285
74 read_lock(&fi->ext.ext_lock); 286 read_lock(&fi->ext.ext_lock);
75 if (fi->ext.len == 0) { 287 if (fi->ext.len == 0) {
76 read_unlock(&fi->ext.ext_lock); 288 read_unlock(&fi->ext.ext_lock);
@@ -109,6 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
109 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 321 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
110 pgoff_t fofs, start_fofs, end_fofs; 322 pgoff_t fofs, start_fofs, end_fofs;
111 block_t start_blkaddr, end_blkaddr; 323 block_t start_blkaddr, end_blkaddr;
324 int need_update = true;
112 325
113 f2fs_bug_on(blk_addr == NEW_ADDR); 326 f2fs_bug_on(blk_addr == NEW_ADDR);
114 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 327 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -117,6 +330,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
117 /* Update the page address in the parent node */ 330 /* Update the page address in the parent node */
118 __set_data_blkaddr(dn, blk_addr); 331 __set_data_blkaddr(dn, blk_addr);
119 332
333 if (is_inode_flag_set(fi, FI_NO_EXTENT))
334 return;
335
120 write_lock(&fi->ext.ext_lock); 336 write_lock(&fi->ext.ext_lock);
121 337
122 start_fofs = fi->ext.fofs; 338 start_fofs = fi->ext.fofs;
@@ -163,14 +379,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
163 fofs - start_fofs + 1; 379 fofs - start_fofs + 1;
164 fi->ext.len -= fofs - start_fofs + 1; 380 fi->ext.len -= fofs - start_fofs + 1;
165 } 381 }
166 goto end_update; 382 } else {
383 need_update = false;
167 } 384 }
168 write_unlock(&fi->ext.ext_lock);
169 return;
170 385
386 /* Finally, if the extent is very fragmented, let's drop the cache. */
387 if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
388 fi->ext.len = 0;
389 set_inode_flag(fi, FI_NO_EXTENT);
390 need_update = true;
391 }
171end_update: 392end_update:
172 write_unlock(&fi->ext.ext_lock); 393 write_unlock(&fi->ext.ext_lock);
173 sync_inode_page(dn); 394 if (need_update)
395 sync_inode_page(dn);
396 return;
174} 397}
175 398
176struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 399struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -196,7 +419,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
196 return ERR_PTR(-ENOENT); 419 return ERR_PTR(-ENOENT);
197 420
198 /* By fallocate(), there is no cached page, but with NEW_ADDR */ 421 /* By fallocate(), there is no cached page, but with NEW_ADDR */
199 if (dn.data_blkaddr == NEW_ADDR) 422 if (unlikely(dn.data_blkaddr == NEW_ADDR))
200 return ERR_PTR(-EINVAL); 423 return ERR_PTR(-EINVAL);
201 424
202 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); 425 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -208,11 +431,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
208 return page; 431 return page;
209 } 432 }
210 433
211 err = f2fs_readpage(sbi, page, dn.data_blkaddr, 434 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
212 sync ? READ_SYNC : READA); 435 sync ? READ_SYNC : READA);
436 if (err)
437 return ERR_PTR(err);
438
213 if (sync) { 439 if (sync) {
214 wait_on_page_locked(page); 440 wait_on_page_locked(page);
215 if (!PageUptodate(page)) { 441 if (unlikely(!PageUptodate(page))) {
216 f2fs_put_page(page, 0); 442 f2fs_put_page(page, 0);
217 return ERR_PTR(-EIO); 443 return ERR_PTR(-EIO);
218 } 444 }
@@ -246,7 +472,7 @@ repeat:
246 } 472 }
247 f2fs_put_dnode(&dn); 473 f2fs_put_dnode(&dn);
248 474
249 if (dn.data_blkaddr == NULL_ADDR) { 475 if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
250 f2fs_put_page(page, 1); 476 f2fs_put_page(page, 1);
251 return ERR_PTR(-ENOENT); 477 return ERR_PTR(-ENOENT);
252 } 478 }
@@ -266,16 +492,16 @@ repeat:
266 return page; 492 return page;
267 } 493 }
268 494
269 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 495 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
270 if (err) 496 if (err)
271 return ERR_PTR(err); 497 return ERR_PTR(err);
272 498
273 lock_page(page); 499 lock_page(page);
274 if (!PageUptodate(page)) { 500 if (unlikely(!PageUptodate(page))) {
275 f2fs_put_page(page, 1); 501 f2fs_put_page(page, 1);
276 return ERR_PTR(-EIO); 502 return ERR_PTR(-EIO);
277 } 503 }
278 if (page->mapping != mapping) { 504 if (unlikely(page->mapping != mapping)) {
279 f2fs_put_page(page, 1); 505 f2fs_put_page(page, 1);
280 goto repeat; 506 goto repeat;
281 } 507 }
@@ -286,12 +512,12 @@ repeat:
286 * Caller ensures that this data page is never allocated. 512 * Caller ensures that this data page is never allocated.
287 * A new zero-filled data page is allocated in the page cache. 513 * A new zero-filled data page is allocated in the page cache.
288 * 514 *
289 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 515 * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
290 * mutex_unlock_op(). 516 * f2fs_unlock_op().
291 * Note that, npage is set only by make_empty_dir. 517 * Note that, ipage is set only by make_empty_dir.
292 */ 518 */
293struct page *get_new_data_page(struct inode *inode, 519struct page *get_new_data_page(struct inode *inode,
294 struct page *npage, pgoff_t index, bool new_i_size) 520 struct page *ipage, pgoff_t index, bool new_i_size)
295{ 521{
296 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 522 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
297 struct address_space *mapping = inode->i_mapping; 523 struct address_space *mapping = inode->i_mapping;
@@ -299,24 +525,16 @@ struct page *get_new_data_page(struct inode *inode,
299 struct dnode_of_data dn; 525 struct dnode_of_data dn;
300 int err; 526 int err;
301 527
302 set_new_dnode(&dn, inode, npage, npage, 0); 528 set_new_dnode(&dn, inode, ipage, NULL, 0);
303 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 529 err = f2fs_reserve_block(&dn, index);
304 if (err) 530 if (err)
305 return ERR_PTR(err); 531 return ERR_PTR(err);
306
307 if (dn.data_blkaddr == NULL_ADDR) {
308 if (reserve_new_block(&dn)) {
309 if (!npage)
310 f2fs_put_dnode(&dn);
311 return ERR_PTR(-ENOSPC);
312 }
313 }
314 if (!npage)
315 f2fs_put_dnode(&dn);
316repeat: 532repeat:
317 page = grab_cache_page(mapping, index); 533 page = grab_cache_page(mapping, index);
318 if (!page) 534 if (!page) {
319 return ERR_PTR(-ENOMEM); 535 err = -ENOMEM;
536 goto put_err;
537 }
320 538
321 if (PageUptodate(page)) 539 if (PageUptodate(page))
322 return page; 540 return page;
@@ -325,15 +543,18 @@ repeat:
325 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 543 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
326 SetPageUptodate(page); 544 SetPageUptodate(page);
327 } else { 545 } else {
328 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 546 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
547 READ_SYNC);
329 if (err) 548 if (err)
330 return ERR_PTR(err); 549 goto put_err;
550
331 lock_page(page); 551 lock_page(page);
332 if (!PageUptodate(page)) { 552 if (unlikely(!PageUptodate(page))) {
333 f2fs_put_page(page, 1); 553 f2fs_put_page(page, 1);
334 return ERR_PTR(-EIO); 554 err = -EIO;
555 goto put_err;
335 } 556 }
336 if (page->mapping != mapping) { 557 if (unlikely(page->mapping != mapping)) {
337 f2fs_put_page(page, 1); 558 f2fs_put_page(page, 1);
338 goto repeat; 559 goto repeat;
339 } 560 }
@@ -344,140 +565,187 @@ repeat:
344 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 565 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
345 /* Only the directory inode sets new_i_size */ 566 /* Only the directory inode sets new_i_size */
346 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); 567 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
347 mark_inode_dirty_sync(inode);
348 } 568 }
349 return page; 569 return page;
350}
351
352static void read_end_io(struct bio *bio, int err)
353{
354 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
355 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
356 570
357 do { 571put_err:
358 struct page *page = bvec->bv_page; 572 f2fs_put_dnode(&dn);
359 573 return ERR_PTR(err);
360 if (--bvec >= bio->bi_io_vec)
361 prefetchw(&bvec->bv_page->flags);
362
363 if (uptodate) {
364 SetPageUptodate(page);
365 } else {
366 ClearPageUptodate(page);
367 SetPageError(page);
368 }
369 unlock_page(page);
370 } while (bvec >= bio->bi_io_vec);
371 bio_put(bio);
372} 574}
373 575
374/* 576static int __allocate_data_block(struct dnode_of_data *dn)
375 * Fill the locked page with data located in the block address.
376 * Return unlocked page.
377 */
378int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
379 block_t blk_addr, int type)
380{ 577{
381 struct block_device *bdev = sbi->sb->s_bdev; 578 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
382 struct bio *bio; 579 struct f2fs_summary sum;
580 block_t new_blkaddr;
581 struct node_info ni;
582 int type;
383 583
384 trace_f2fs_readpage(page, blk_addr, type); 584 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
585 return -EPERM;
586 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
587 return -ENOSPC;
385 588
386 down_read(&sbi->bio_sem); 589 __set_data_blkaddr(dn, NEW_ADDR);
590 dn->data_blkaddr = NEW_ADDR;
387 591
388 /* Allocate a new bio */ 592 get_node_info(sbi, dn->nid, &ni);
389 bio = f2fs_bio_alloc(bdev, 1); 593 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
390 594
391 /* Initialize the bio */ 595 type = CURSEG_WARM_DATA;
392 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
393 bio->bi_end_io = read_end_io;
394 596
395 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 597 allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
396 bio_put(bio);
397 up_read(&sbi->bio_sem);
398 f2fs_put_page(page, 1);
399 return -EFAULT;
400 }
401 598
402 submit_bio(type, bio); 599 /* direct IO doesn't use extent cache to maximize the performance */
403 up_read(&sbi->bio_sem); 600 set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
601 update_extent_cache(new_blkaddr, dn);
602 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
603
604 dn->data_blkaddr = new_blkaddr;
404 return 0; 605 return 0;
405} 606}
406 607
407/* 608/*
408 * This function should be used by the data read flow only where it 609 * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
409 * does not check the "create" flag that indicates block allocation. 610 * If original data blocks are allocated, then give them to blockdev.
410 * The reason for this special functionality is to exploit VFS readahead 611 * Otherwise,
411 * mechanism. 612 * a. preallocate requested block addresses
613 * b. do not use extent cache for better performance
614 * c. give the block addresses to blockdev
412 */ 615 */
413static int get_data_block_ro(struct inode *inode, sector_t iblock, 616static int get_data_block(struct inode *inode, sector_t iblock,
414 struct buffer_head *bh_result, int create) 617 struct buffer_head *bh_result, int create)
415{ 618{
619 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
416 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 620 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
417 unsigned maxblocks = bh_result->b_size >> blkbits; 621 unsigned maxblocks = bh_result->b_size >> blkbits;
418 struct dnode_of_data dn; 622 struct dnode_of_data dn;
419 pgoff_t pgofs; 623 int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
420 int err; 624 pgoff_t pgofs, end_offset;
625 int err = 0, ofs = 1;
626 bool allocated = false;
421 627
422 /* Get the page offset from the block offset(iblock) */ 628 /* Get the page offset from the block offset(iblock) */
423 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); 629 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
424 630
425 if (check_extent_cache(inode, pgofs, bh_result)) { 631 if (check_extent_cache(inode, pgofs, bh_result))
426 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 632 goto out;
427 return 0; 633
428 } 634 if (create)
635 f2fs_lock_op(sbi);
429 636
430 /* When reading holes, we need its node page */ 637 /* When reading holes, we need its node page */
431 set_new_dnode(&dn, inode, NULL, NULL, 0); 638 set_new_dnode(&dn, inode, NULL, NULL, 0);
432 err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); 639 err = get_dnode_of_data(&dn, pgofs, mode);
433 if (err) { 640 if (err) {
434 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 641 if (err == -ENOENT)
435 return (err == -ENOENT) ? 0 : err; 642 err = 0;
643 goto unlock_out;
644 }
645 if (dn.data_blkaddr == NEW_ADDR)
646 goto put_out;
647
648 if (dn.data_blkaddr != NULL_ADDR) {
649 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
650 } else if (create) {
651 err = __allocate_data_block(&dn);
652 if (err)
653 goto put_out;
654 allocated = true;
655 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
656 } else {
657 goto put_out;
436 } 658 }
437 659
438 /* It does not support data allocation */ 660 end_offset = IS_INODE(dn.node_page) ?
439 f2fs_bug_on(create); 661 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
662 bh_result->b_size = (((size_t)1) << blkbits);
663 dn.ofs_in_node++;
664 pgofs++;
665
666get_next:
667 if (dn.ofs_in_node >= end_offset) {
668 if (allocated)
669 sync_inode_page(&dn);
670 allocated = false;
671 f2fs_put_dnode(&dn);
440 672
441 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { 673 set_new_dnode(&dn, inode, NULL, NULL, 0);
442 int i; 674 err = get_dnode_of_data(&dn, pgofs, mode);
443 unsigned int end_offset; 675 if (err) {
676 if (err == -ENOENT)
677 err = 0;
678 goto unlock_out;
679 }
680 if (dn.data_blkaddr == NEW_ADDR)
681 goto put_out;
444 682
445 end_offset = IS_INODE(dn.node_page) ? 683 end_offset = IS_INODE(dn.node_page) ?
446 ADDRS_PER_INODE(F2FS_I(inode)) : 684 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
447 ADDRS_PER_BLOCK; 685 }
448
449 clear_buffer_new(bh_result);
450 686
687 if (maxblocks > (bh_result->b_size >> blkbits)) {
688 block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
689 if (blkaddr == NULL_ADDR && create) {
690 err = __allocate_data_block(&dn);
691 if (err)
692 goto sync_out;
693 allocated = true;
694 blkaddr = dn.data_blkaddr;
695 }
451 /* Give more consecutive addresses for the read ahead */ 696 /* Give more consecutive addresses for the read ahead */
452 for (i = 0; i < end_offset - dn.ofs_in_node; i++) 697 if (blkaddr == (bh_result->b_blocknr + ofs)) {
453 if (((datablock_addr(dn.node_page, 698 ofs++;
454 dn.ofs_in_node + i)) 699 dn.ofs_in_node++;
455 != (dn.data_blkaddr + i)) || maxblocks == i) 700 pgofs++;
456 break; 701 bh_result->b_size += (((size_t)1) << blkbits);
457 map_bh(bh_result, inode->i_sb, dn.data_blkaddr); 702 goto get_next;
458 bh_result->b_size = (i << blkbits); 703 }
459 } 704 }
705sync_out:
706 if (allocated)
707 sync_inode_page(&dn);
708put_out:
460 f2fs_put_dnode(&dn); 709 f2fs_put_dnode(&dn);
461 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 710unlock_out:
462 return 0; 711 if (create)
712 f2fs_unlock_op(sbi);
713out:
714 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
715 return err;
463} 716}
464 717
465static int f2fs_read_data_page(struct file *file, struct page *page) 718static int f2fs_read_data_page(struct file *file, struct page *page)
466{ 719{
467 return mpage_readpage(page, get_data_block_ro); 720 struct inode *inode = page->mapping->host;
721 int ret;
722
723 /* If the file has inline data, try to read it directlly */
724 if (f2fs_has_inline_data(inode))
725 ret = f2fs_read_inline_data(inode, page);
726 else
727 ret = mpage_readpage(page, get_data_block);
728
729 return ret;
468} 730}
469 731
470static int f2fs_read_data_pages(struct file *file, 732static int f2fs_read_data_pages(struct file *file,
471 struct address_space *mapping, 733 struct address_space *mapping,
472 struct list_head *pages, unsigned nr_pages) 734 struct list_head *pages, unsigned nr_pages)
473{ 735{
474 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); 736 struct inode *inode = file->f_mapping->host;
737
738 /* If the file has inline data, skip readpages */
739 if (f2fs_has_inline_data(inode))
740 return 0;
741
742 return mpage_readpages(mapping, pages, nr_pages, get_data_block);
475} 743}
476 744
477int do_write_data_page(struct page *page) 745int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
478{ 746{
479 struct inode *inode = page->mapping->host; 747 struct inode *inode = page->mapping->host;
480 block_t old_blk_addr, new_blk_addr; 748 block_t old_blkaddr, new_blkaddr;
481 struct dnode_of_data dn; 749 struct dnode_of_data dn;
482 int err = 0; 750 int err = 0;
483 751
@@ -486,10 +754,10 @@ int do_write_data_page(struct page *page)
486 if (err) 754 if (err)
487 return err; 755 return err;
488 756
489 old_blk_addr = dn.data_blkaddr; 757 old_blkaddr = dn.data_blkaddr;
490 758
491 /* This page is already truncated */ 759 /* This page is already truncated */
492 if (old_blk_addr == NULL_ADDR) 760 if (old_blkaddr == NULL_ADDR)
493 goto out_writepage; 761 goto out_writepage;
494 762
495 set_page_writeback(page); 763 set_page_writeback(page);
@@ -498,15 +766,13 @@ int do_write_data_page(struct page *page)
498 * If current allocation needs SSR, 766 * If current allocation needs SSR,
499 * it had better in-place writes for updated data. 767 * it had better in-place writes for updated data.
500 */ 768 */
501 if (unlikely(old_blk_addr != NEW_ADDR && 769 if (unlikely(old_blkaddr != NEW_ADDR &&
502 !is_cold_data(page) && 770 !is_cold_data(page) &&
503 need_inplace_update(inode))) { 771 need_inplace_update(inode))) {
504 rewrite_data_page(F2FS_SB(inode->i_sb), page, 772 rewrite_data_page(page, old_blkaddr, fio);
505 old_blk_addr);
506 } else { 773 } else {
507 write_data_page(inode, page, &dn, 774 write_data_page(page, &dn, &new_blkaddr, fio);
508 old_blk_addr, &new_blk_addr); 775 update_extent_cache(new_blkaddr, &dn);
509 update_extent_cache(new_blk_addr, &dn);
510 } 776 }
511out_writepage: 777out_writepage:
512 f2fs_put_dnode(&dn); 778 f2fs_put_dnode(&dn);
@@ -521,9 +787,13 @@ static int f2fs_write_data_page(struct page *page,
521 loff_t i_size = i_size_read(inode); 787 loff_t i_size = i_size_read(inode);
522 const pgoff_t end_index = ((unsigned long long) i_size) 788 const pgoff_t end_index = ((unsigned long long) i_size)
523 >> PAGE_CACHE_SHIFT; 789 >> PAGE_CACHE_SHIFT;
524 unsigned offset; 790 unsigned offset = 0;
525 bool need_balance_fs = false; 791 bool need_balance_fs = false;
526 int err = 0; 792 int err = 0;
793 struct f2fs_io_info fio = {
794 .type = DATA,
795 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
796 };
527 797
528 if (page->index < end_index) 798 if (page->index < end_index)
529 goto write; 799 goto write;
@@ -543,7 +813,7 @@ static int f2fs_write_data_page(struct page *page,
543 813
544 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 814 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
545write: 815write:
546 if (sbi->por_doing) { 816 if (unlikely(sbi->por_doing)) {
547 err = AOP_WRITEPAGE_ACTIVATE; 817 err = AOP_WRITEPAGE_ACTIVATE;
548 goto redirty_out; 818 goto redirty_out;
549 } 819 }
@@ -552,10 +822,18 @@ write:
552 if (S_ISDIR(inode->i_mode)) { 822 if (S_ISDIR(inode->i_mode)) {
553 dec_page_count(sbi, F2FS_DIRTY_DENTS); 823 dec_page_count(sbi, F2FS_DIRTY_DENTS);
554 inode_dec_dirty_dents(inode); 824 inode_dec_dirty_dents(inode);
555 err = do_write_data_page(page); 825 err = do_write_data_page(page, &fio);
556 } else { 826 } else {
557 f2fs_lock_op(sbi); 827 f2fs_lock_op(sbi);
558 err = do_write_data_page(page); 828
829 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
830 err = f2fs_write_inline_data(inode, page, offset);
831 f2fs_unlock_op(sbi);
832 goto out;
833 } else {
834 err = do_write_data_page(page, &fio);
835 }
836
559 f2fs_unlock_op(sbi); 837 f2fs_unlock_op(sbi);
560 need_balance_fs = true; 838 need_balance_fs = true;
561 } 839 }
@@ -564,8 +842,10 @@ write:
564 else if (err) 842 else if (err)
565 goto redirty_out; 843 goto redirty_out;
566 844
567 if (wbc->for_reclaim) 845 if (wbc->for_reclaim) {
568 f2fs_submit_bio(sbi, DATA, true); 846 f2fs_submit_merged_bio(sbi, DATA, WRITE);
847 need_balance_fs = false;
848 }
569 849
570 clear_cold_data(page); 850 clear_cold_data(page);
571out: 851out:
@@ -617,7 +897,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
617 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 897 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
618 if (locked) 898 if (locked)
619 mutex_unlock(&sbi->writepages); 899 mutex_unlock(&sbi->writepages);
620 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); 900
901 f2fs_submit_merged_bio(sbi, DATA, WRITE);
621 902
622 remove_dirty_dir_inode(inode); 903 remove_dirty_dir_inode(inode);
623 904
@@ -638,27 +919,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
638 919
639 f2fs_balance_fs(sbi); 920 f2fs_balance_fs(sbi);
640repeat: 921repeat:
922 err = f2fs_convert_inline_data(inode, pos + len);
923 if (err)
924 return err;
925
641 page = grab_cache_page_write_begin(mapping, index, flags); 926 page = grab_cache_page_write_begin(mapping, index, flags);
642 if (!page) 927 if (!page)
643 return -ENOMEM; 928 return -ENOMEM;
644 *pagep = page; 929 *pagep = page;
645 930
646 f2fs_lock_op(sbi); 931 if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
932 goto inline_data;
647 933
934 f2fs_lock_op(sbi);
648 set_new_dnode(&dn, inode, NULL, NULL, 0); 935 set_new_dnode(&dn, inode, NULL, NULL, 0);
649 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 936 err = f2fs_reserve_block(&dn, index);
650 if (err)
651 goto err;
652
653 if (dn.data_blkaddr == NULL_ADDR)
654 err = reserve_new_block(&dn);
655
656 f2fs_put_dnode(&dn);
657 if (err)
658 goto err;
659
660 f2fs_unlock_op(sbi); 937 f2fs_unlock_op(sbi);
661 938
939 if (err) {
940 f2fs_put_page(page, 1);
941 return err;
942 }
943inline_data:
662 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) 944 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
663 return 0; 945 return 0;
664 946
@@ -674,15 +956,19 @@ repeat:
674 if (dn.data_blkaddr == NEW_ADDR) { 956 if (dn.data_blkaddr == NEW_ADDR) {
675 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 957 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
676 } else { 958 } else {
677 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 959 if (f2fs_has_inline_data(inode))
960 err = f2fs_read_inline_data(inode, page);
961 else
962 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
963 READ_SYNC);
678 if (err) 964 if (err)
679 return err; 965 return err;
680 lock_page(page); 966 lock_page(page);
681 if (!PageUptodate(page)) { 967 if (unlikely(!PageUptodate(page))) {
682 f2fs_put_page(page, 1); 968 f2fs_put_page(page, 1);
683 return -EIO; 969 return -EIO;
684 } 970 }
685 if (page->mapping != mapping) { 971 if (unlikely(page->mapping != mapping)) {
686 f2fs_put_page(page, 1); 972 f2fs_put_page(page, 1);
687 goto repeat; 973 goto repeat;
688 } 974 }
@@ -691,11 +977,6 @@ out:
691 SetPageUptodate(page); 977 SetPageUptodate(page);
692 clear_cold_data(page); 978 clear_cold_data(page);
693 return 0; 979 return 0;
694
695err:
696 f2fs_unlock_op(sbi);
697 f2fs_put_page(page, 1);
698 return err;
699} 980}
700 981
701static int f2fs_write_end(struct file *file, 982static int f2fs_write_end(struct file *file,
@@ -714,23 +995,43 @@ static int f2fs_write_end(struct file *file,
714 update_inode_page(inode); 995 update_inode_page(inode);
715 } 996 }
716 997
717 unlock_page(page); 998 f2fs_put_page(page, 1);
718 page_cache_release(page);
719 return copied; 999 return copied;
720} 1000}
721 1001
1002static int check_direct_IO(struct inode *inode, int rw,
1003 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
1004{
1005 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
1006 int i;
1007
1008 if (rw == READ)
1009 return 0;
1010
1011 if (offset & blocksize_mask)
1012 return -EINVAL;
1013
1014 for (i = 0; i < nr_segs; i++)
1015 if (iov[i].iov_len & blocksize_mask)
1016 return -EINVAL;
1017 return 0;
1018}
1019
722static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 1020static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
723 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1021 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
724{ 1022{
725 struct file *file = iocb->ki_filp; 1023 struct file *file = iocb->ki_filp;
726 struct inode *inode = file->f_mapping->host; 1024 struct inode *inode = file->f_mapping->host;
727 1025
728 if (rw == WRITE) 1026 /* Let buffer I/O handle the inline data case. */
1027 if (f2fs_has_inline_data(inode))
1028 return 0;
1029
1030 if (check_direct_IO(inode, rw, iov, offset, nr_segs))
729 return 0; 1031 return 0;
730 1032
731 /* Needs synchronization with the cleaner */
732 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1033 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
733 get_data_block_ro); 1034 get_data_block);
734} 1035}
735 1036
736static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1037static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@ -759,6 +1060,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
759 trace_f2fs_set_page_dirty(page, DATA); 1060 trace_f2fs_set_page_dirty(page, DATA);
760 1061
761 SetPageUptodate(page); 1062 SetPageUptodate(page);
1063 mark_inode_dirty(inode);
1064
762 if (!PageDirty(page)) { 1065 if (!PageDirty(page)) {
763 __set_page_dirty_nobuffers(page); 1066 __set_page_dirty_nobuffers(page);
764 set_dirty_dir_page(inode, page); 1067 set_dirty_dir_page(inode, page);
@@ -769,7 +1072,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
769 1072
770static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) 1073static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
771{ 1074{
772 return generic_block_bmap(mapping, block, get_data_block_ro); 1075 return generic_block_bmap(mapping, block, get_data_block);
773} 1076}
774 1077
775const struct address_space_operations f2fs_dblock_aops = { 1078const struct address_space_operations f2fs_dblock_aops = {
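
Annotation: the new check_direct_IO() helper gates direct writes on block alignment — both the starting offset and every iovec length must be a multiple of the filesystem block size, or f2fs_direct_IO() returns 0 and the request falls back to buffered I/O. Reads are exempt (check_direct_IO() returns 0 immediately for rw == READ). A minimal user-space sketch of the same test, assuming only that the block size is a power of two:

	#include <sys/types.h>
	#include <sys/uio.h>
	#include <stdbool.h>

	/* True when a write described by (offset, iov[]) satisfies the rule
	 * check_direct_IO() enforces: offset and all segment lengths are
	 * multiples of blocksize (a power of two, so mask = size - 1). */
	static bool dio_write_aligned(unsigned int blocksize,
				      const struct iovec *iov,
				      off_t offset, unsigned long nr_segs)
	{
		unsigned int mask = blocksize - 1;
		unsigned long i;

		if (offset & mask)
			return false;
		for (i = 0; i < nr_segs; i++)
			if (iov[i].iov_len & mask)
				return false;
		return true;
	}
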
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a84b0a8e6854..3de9d20d0c14 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -24,7 +24,7 @@
24#include "gc.h" 24#include "gc.h"
25 25
26static LIST_HEAD(f2fs_stat_list); 26static LIST_HEAD(f2fs_stat_list);
27static struct dentry *debugfs_root; 27static struct dentry *f2fs_debugfs_root;
28static DEFINE_MUTEX(f2fs_stat_mutex); 28static DEFINE_MUTEX(f2fs_stat_mutex);
29 29
30static void update_general_status(struct f2fs_sb_info *sbi) 30static void update_general_status(struct f2fs_sb_info *sbi)
@@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
45 si->valid_count = valid_user_blocks(sbi); 45 si->valid_count = valid_user_blocks(sbi);
46 si->valid_node_count = valid_node_count(sbi); 46 si->valid_node_count = valid_node_count(sbi);
47 si->valid_inode_count = valid_inode_count(sbi); 47 si->valid_inode_count = valid_inode_count(sbi);
48 si->inline_inode = sbi->inline_inode;
48 si->utilization = utilization(sbi); 49 si->utilization = utilization(sbi);
49 50
50 si->free_segs = free_segments(sbi); 51 si->free_segs = free_segments(sbi);
51 si->free_secs = free_sections(sbi); 52 si->free_secs = free_sections(sbi);
52 si->prefree_count = prefree_segments(sbi); 53 si->prefree_count = prefree_segments(sbi);
53 si->dirty_count = dirty_segments(sbi); 54 si->dirty_count = dirty_segments(sbi);
54 si->node_pages = sbi->node_inode->i_mapping->nrpages; 55 si->node_pages = NODE_MAPPING(sbi)->nrpages;
55 si->meta_pages = sbi->meta_inode->i_mapping->nrpages; 56 si->meta_pages = META_MAPPING(sbi)->nrpages;
56 si->nats = NM_I(sbi)->nat_cnt; 57 si->nats = NM_I(sbi)->nat_cnt;
57 si->sits = SIT_I(sbi)->dirty_sentries; 58 si->sits = SIT_I(sbi)->dirty_sentries;
58 si->fnids = NM_I(sbi)->fcnt; 59 si->fnids = NM_I(sbi)->fcnt;
@@ -165,9 +166,9 @@ get_cache:
165 /* free nids */ 166 /* free nids */
166 si->cache_mem = NM_I(sbi)->fcnt; 167 si->cache_mem = NM_I(sbi)->fcnt;
167 si->cache_mem += NM_I(sbi)->nat_cnt; 168 si->cache_mem += NM_I(sbi)->nat_cnt;
168 npages = sbi->node_inode->i_mapping->nrpages; 169 npages = NODE_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 170 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 npages = sbi->meta_inode->i_mapping->nrpages; 171 npages = META_MAPPING(sbi)->nrpages;
171 si->cache_mem += npages << PAGE_CACHE_SHIFT; 172 si->cache_mem += npages << PAGE_CACHE_SHIFT;
172 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 173 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
173 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 174 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
@@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v)
200 seq_printf(s, "Other: %u)\n - Data: %u\n", 201 seq_printf(s, "Other: %u)\n - Data: %u\n",
201 si->valid_node_count - si->valid_inode_count, 202 si->valid_node_count - si->valid_inode_count,
202 si->valid_count - si->valid_node_count); 203 si->valid_count - si->valid_node_count);
204 seq_printf(s, " - Inline_data Inode: %u\n",
205 si->inline_inode);
203 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", 206 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
204 si->main_area_segs, si->main_area_sections, 207 si->main_area_segs, si->main_area_sections,
205 si->main_area_zones); 208 si->main_area_zones);
@@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v)
242 seq_printf(s, " - node blocks : %d\n", si->node_blks); 245 seq_printf(s, " - node blocks : %d\n", si->node_blks);
243 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", 246 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
244 si->hit_ext, si->total_ext); 247 si->hit_ext, si->total_ext);
245 seq_printf(s, "\nBalancing F2FS Async:\n"); 248 seq_puts(s, "\nBalancing F2FS Async:\n");
246 seq_printf(s, " - nodes %4d in %4d\n", 249 seq_printf(s, " - nodes: %4d in %4d\n",
247 si->ndirty_node, si->node_pages); 250 si->ndirty_node, si->node_pages);
248 seq_printf(s, " - dents %4d in dirs:%4d\n", 251 seq_printf(s, " - dents: %4d in dirs:%4d\n",
249 si->ndirty_dent, si->ndirty_dirs); 252 si->ndirty_dent, si->ndirty_dirs);
250 seq_printf(s, " - meta %4d in %4d\n", 253 seq_printf(s, " - meta: %4d in %4d\n",
251 si->ndirty_meta, si->meta_pages); 254 si->ndirty_meta, si->meta_pages);
252 seq_printf(s, " - NATs %5d > %lu\n", 255 seq_printf(s, " - NATs: %5d > %lu\n",
253 si->nats, NM_WOUT_THRESHOLD); 256 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 257 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids); 258 si->sits, si->fnids);
@@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
340 343
341void __init f2fs_create_root_stats(void) 344void __init f2fs_create_root_stats(void)
342{ 345{
343 debugfs_root = debugfs_create_dir("f2fs", NULL); 346 struct dentry *file;
344 if (debugfs_root) 347
345 debugfs_create_file("status", S_IRUGO, debugfs_root, 348 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
346 NULL, &stat_fops); 349 if (!f2fs_debugfs_root)
350 goto bail;
351
352 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
353 NULL, &stat_fops);
354 if (!file)
355 goto free_debugfs_dir;
356
357 return;
358
359free_debugfs_dir:
360 debugfs_remove(f2fs_debugfs_root);
361
362bail:
363 f2fs_debugfs_root = NULL;
364 return;
347} 365}
348 366
349void f2fs_destroy_root_stats(void) 367void f2fs_destroy_root_stats(void)
350{ 368{
351 debugfs_remove_recursive(debugfs_root); 369 if (!f2fs_debugfs_root)
352 debugfs_root = NULL; 370 return;
371
372 debugfs_remove_recursive(f2fs_debugfs_root);
373 f2fs_debugfs_root = NULL;
353} 374}
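
Annotation: the debug.c rework replaces unconditional debugfs creation with a guarded pattern — keep the root dentry pointer NULL on any failure, so f2fs_destroy_root_stats() can bail out early rather than call debugfs_remove_recursive() on a stale pointer. The same pattern in isolation (stat_fops stands in for whatever file_operations back the status file; in this kernel era debugfs_create_file() returns NULL on failure):

	#include <linux/debugfs.h>

	static struct dentry *demo_debugfs_root;

	static void demo_create_root_stats(void)
	{
		struct dentry *file;

		demo_debugfs_root = debugfs_create_dir("demo", NULL);
		if (!demo_debugfs_root)
			return;

		file = debugfs_create_file("status", S_IRUGO,
					   demo_debugfs_root, NULL, &stat_fops);
		if (!file) {
			/* tear down the half-built hierarchy */
			debugfs_remove(demo_debugfs_root);
			demo_debugfs_root = NULL;
		}
	}
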
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 594fc1bb64ef..2b7c255bcbdf 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
190 unsigned int max_depth; 190 unsigned int max_depth;
191 unsigned int level; 191 unsigned int level;
192 192
193 if (namelen > F2FS_NAME_LEN)
194 return NULL;
195
196 if (npages == 0) 193 if (npages == 0)
197 return NULL; 194 return NULL;
198 195
@@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
259 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 256 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
260 mark_inode_dirty(dir); 257 mark_inode_dirty(dir);
261 258
262 /* update parent inode number before releasing dentry page */
263 F2FS_I(inode)->i_pino = dir->i_ino;
264
265 f2fs_put_page(page, 1); 259 f2fs_put_page(page, 1);
266} 260}
267 261
268static void init_dent_inode(const struct qstr *name, struct page *ipage) 262static void init_dent_inode(const struct qstr *name, struct page *ipage)
269{ 263{
270 struct f2fs_node *rn; 264 struct f2fs_inode *ri;
271 265
272 /* copy name info. to this inode page */ 266 /* copy name info. to this inode page */
273 rn = F2FS_NODE(ipage); 267 ri = F2FS_INODE(ipage);
274 rn->i.i_namelen = cpu_to_le32(name->len); 268 ri->i_namelen = cpu_to_le32(name->len);
275 memcpy(rn->i.i_name, name->name, name->len); 269 memcpy(ri->i_name, name->name, name->len);
276 set_page_dirty(ipage); 270 set_page_dirty(ipage);
277} 271}
278 272
@@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode,
348 342
349 err = f2fs_init_acl(inode, dir, page); 343 err = f2fs_init_acl(inode, dir, page);
350 if (err) 344 if (err)
351 goto error; 345 goto put_error;
352 346
353 err = f2fs_init_security(inode, dir, name, page); 347 err = f2fs_init_security(inode, dir, name, page);
354 if (err) 348 if (err)
355 goto error; 349 goto put_error;
356 350
357 wait_on_page_writeback(page); 351 wait_on_page_writeback(page);
358 } else { 352 } else {
@@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode,
376 } 370 }
377 return page; 371 return page;
378 372
379error: 373put_error:
380 f2fs_put_page(page, 1); 374 f2fs_put_page(page, 1);
375error:
381 remove_inode_page(inode); 376 remove_inode_page(inode);
382 return ERR_PTR(err); 377 return ERR_PTR(err);
383} 378}
@@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
393 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 388 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
394 } 389 }
395 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 390 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
391 mark_inode_dirty(dir);
392
396 if (F2FS_I(dir)->i_current_depth != current_depth) { 393 if (F2FS_I(dir)->i_current_depth != current_depth) {
397 F2FS_I(dir)->i_current_depth = current_depth; 394 F2FS_I(dir)->i_current_depth = current_depth;
398 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 395 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
400 397
401 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) 398 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
402 update_inode_page(dir); 399 update_inode_page(dir);
403 else
404 mark_inode_dirty(dir);
405 400
406 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) 401 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
407 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 402 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
@@ -432,10 +427,11 @@ next:
432} 427}
433 428
434/* 429/*
435 * Caller should grab and release a mutex by calling mutex_lock_op() and 430 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
436 * mutex_unlock_op(). 431 * f2fs_unlock_op().
437 */ 432 */
438int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) 433int __f2fs_add_link(struct inode *dir, const struct qstr *name,
434 struct inode *inode)
439{ 435{
440 unsigned int bit_pos; 436 unsigned int bit_pos;
441 unsigned int level; 437 unsigned int level;
@@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
461 } 457 }
462 458
463start: 459start:
464 if (current_depth == MAX_DIR_HASH_DEPTH) 460 if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
465 return -ENOSPC; 461 return -ENOSPC;
466 462
467 /* Increase the depth, if required */ 463 /* Increase the depth, if required */
@@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
554 550
555 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
556 552
557 if (inode && S_ISDIR(inode->i_mode)) {
558 drop_nlink(dir);
559 update_inode_page(dir);
560 } else {
561 mark_inode_dirty(dir);
562 }
563
564 if (inode) { 553 if (inode) {
554 if (S_ISDIR(inode->i_mode)) {
555 drop_nlink(dir);
556 update_inode_page(dir);
557 }
565 inode->i_ctime = CURRENT_TIME; 558 inode->i_ctime = CURRENT_TIME;
566 drop_nlink(inode); 559 drop_nlink(inode);
567 if (S_ISDIR(inode->i_mode)) { 560 if (S_ISDIR(inode->i_mode)) {
@@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
636 629
637 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 630 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
638 631
639 for ( ; n < npages; n++) { 632 for (; n < npages; n++) {
640 dentry_page = get_lock_data_page(inode, n); 633 dentry_page = get_lock_data_page(inode, n);
641 if (IS_ERR(dentry_page)) 634 if (IS_ERR(dentry_page))
642 continue; 635 continue;
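
Annotation: the updated comment on __f2fs_add_link() reflects the lock-type change — the operation lock is now the cp_rwsem taken through f2fs_lock_op()/f2fs_unlock_op() rather than a mutex. A sketch of the expected call site, assuming the usual wrapper shape (the f2fs_add_link() wrapper itself is not part of the quoted hunks):

	f2fs_lock_op(sbi);
	err = __f2fs_add_link(dir, &dentry->d_name, inode);
	f2fs_unlock_op(sbi);
	if (err)
		goto fail;
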
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 89dc7508faf2..af51a0bd2dee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,8 +22,10 @@
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
25#else 26#else
26#define f2fs_bug_on(condition) 27#define f2fs_bug_on(condition)
28#define f2fs_down_write(x, y) down_write(x)
27#endif 29#endif
28 30
29/* 31/*
@@ -37,6 +39,7 @@
37#define F2FS_MOUNT_POSIX_ACL 0x00000020 39#define F2FS_MOUNT_POSIX_ACL 0x00000020
38#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 40#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
39#define F2FS_MOUNT_INLINE_XATTR 0x00000080 41#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100
40 43
41#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 44#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
42#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 45#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -97,6 +100,13 @@ struct dir_inode_entry {
97 struct inode *inode; /* vfs inode pointer */ 100 struct inode *inode; /* vfs inode pointer */
98}; 101};
99 102
103/* for the list of block addresses to be discarded */
104struct discard_entry {
105 struct list_head list; /* list head */
106 block_t blkaddr; /* block address to be discarded */
107 int len; /* # of consecutive blocks of the discard */
108};
109
100/* for the list of fsync inodes, used only during recovery */ 110/* for the list of fsync inodes, used only during recovery */
101struct fsync_inode_entry { 111struct fsync_inode_entry {
102 struct list_head list; /* list head */ 112 struct list_head list; /* list head */
@@ -155,13 +165,15 @@ enum {
155 LOOKUP_NODE, /* look up a node without readahead */ 165 LOOKUP_NODE, /* look up a node without readahead */
156 LOOKUP_NODE_RA, /* 166 LOOKUP_NODE_RA, /*
157 * look up a node with readahead called 167 * look up a node with readahead called
158 * by get_datablock_ro. 168 * by get_data_block.
159 */ 169 */
160}; 170};
161 171
162#define F2FS_LINK_MAX 32000 /* maximum link count per file */ 172#define F2FS_LINK_MAX 32000 /* maximum link count per file */
163 173
164/* for in-memory extent cache entry */ 174/* for in-memory extent cache entry */
175#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
176
165struct extent_info { 177struct extent_info {
166 rwlock_t ext_lock; /* rwlock for consistency */ 178 rwlock_t ext_lock; /* rwlock for consistency */
167 unsigned int fofs; /* start offset in a file */ 179 unsigned int fofs; /* start offset in a file */
@@ -308,6 +320,14 @@ struct f2fs_sm_info {
308 320
309 /* a threshold to reclaim prefree segments */ 321 /* a threshold to reclaim prefree segments */
310 unsigned int rec_prefree_segments; 322 unsigned int rec_prefree_segments;
323
324 /* for small discard management */
325 struct list_head discard_list; /* 4KB discard list */
326 int nr_discards; /* # of discards in the list */
327 int max_discards; /* max. discards to be issued */
328
329 unsigned int ipu_policy; /* in-place-update policy */
330 unsigned int min_ipu_util; /* in-place-update threshold */
311}; 331};
312 332
313/* 333/*
@@ -338,6 +358,7 @@ enum count_type {
338 * with waiting the bio's completion 358 * with waiting the bio's completion
339 * ... Only can be used with META. 359 * ... Only can be used with META.
340 */ 360 */
361#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
341enum page_type { 362enum page_type {
342 DATA, 363 DATA,
343 NODE, 364 NODE,
@@ -346,6 +367,20 @@ enum page_type {
346 META_FLUSH, 367 META_FLUSH,
347}; 368};
348 369
370struct f2fs_io_info {
371 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
372 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
373};
374
375#define is_read_io(rw) (((rw) & 1) == READ)
376struct f2fs_bio_info {
377 struct f2fs_sb_info *sbi; /* f2fs superblock */
378 struct bio *bio; /* bios to merge */
379 sector_t last_block_in_bio; /* last block number */
380 struct f2fs_io_info fio; /* store buffered io info. */
381 struct mutex io_mutex; /* mutex for bio */
382};
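
Annotation: f2fs_io_info bundles the page type and rw flags that previously travelled as separate arguments, while f2fs_bio_info keeps one mergeable bio per page type so consecutive blocks can be submitted together. A sketch of the caller side, assuming the f2fs_submit_page_mbio() declaration added to the data.c prototypes below (blk_addr is a placeholder for the target block address):

	struct f2fs_io_info fio = {
		.type = DATA,		/* selects sbi->write_io[DATA] */
		.rw = WRITE_SYNC,	/* request flags for the merged bio */
	};

	/* queue the page; the accumulated bio is flushed later by
	 * f2fs_submit_merged_bio(sbi, DATA, WRITE) */
	f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
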
383
349struct f2fs_sb_info { 384struct f2fs_sb_info {
350 struct super_block *sb; /* pointer to VFS super block */ 385 struct super_block *sb; /* pointer to VFS super block */
351 struct proc_dir_entry *s_proc; /* proc entry */ 386 struct proc_dir_entry *s_proc; /* proc entry */
@@ -359,9 +394,10 @@ struct f2fs_sb_info {
359 394
360 /* for segment-related operations */ 395 /* for segment-related operations */
361 struct f2fs_sm_info *sm_info; /* segment manager */ 396 struct f2fs_sm_info *sm_info; /* segment manager */
362 struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ 397
363 sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ 398 /* for bio operations */
364 struct rw_semaphore bio_sem; /* IO semaphore */ 399 struct f2fs_bio_info read_io; /* for read bios */
400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
365 401
366 /* for checkpoint */ 402 /* for checkpoint */
367 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 403 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -376,8 +412,9 @@ struct f2fs_sb_info {
376 412
377 /* for orphan inode management */ 413 /* for orphan inode management */
378 struct list_head orphan_inode_list; /* orphan inode list */ 414 struct list_head orphan_inode_list; /* orphan inode list */
379 struct mutex orphan_inode_mutex; /* for orphan inode list */ 415 spinlock_t orphan_inode_lock; /* for orphan inode list */
380 unsigned int n_orphans; /* # of orphan inodes */ 416 unsigned int n_orphans; /* # of orphan inodes */
417 unsigned int max_orphans; /* max orphan inodes */
381 418
382 /* for directory inode management */ 419 /* for directory inode management */
383 struct list_head dir_inode_list; /* dir inode list */ 420 struct list_head dir_inode_list; /* dir inode list */
@@ -414,6 +451,9 @@ struct f2fs_sb_info {
414 struct f2fs_gc_kthread *gc_thread; /* GC thread */ 451 struct f2fs_gc_kthread *gc_thread; /* GC thread */
415 unsigned int cur_victim_sec; /* current victim section num */ 452 unsigned int cur_victim_sec; /* current victim section num */
416 453
454 /* maximum # of trials to find a victim segment for SSR and GC */
455 unsigned int max_victim_search;
456
417 /* 457 /*
418 * for stat information. 458 * for stat information.
419 * one is for the LFS mode, and the other is for the SSR mode. 459 * one is for the LFS mode, and the other is for the SSR mode.
@@ -423,6 +463,7 @@ struct f2fs_sb_info {
423 unsigned int segment_count[2]; /* # of allocated segments */ 463 unsigned int segment_count[2]; /* # of allocated segments */
424 unsigned int block_count[2]; /* # of allocated blocks */ 464 unsigned int block_count[2]; /* # of allocated blocks */
425 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 465 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
466 int inline_inode; /* # of inline_data inodes */
426 int bg_gc; /* background gc calls */ 467 int bg_gc; /* background gc calls */
427 unsigned int n_dirty_dirs; /* # of dir inodes */ 468 unsigned int n_dirty_dirs; /* # of dir inodes */
428#endif 469#endif
@@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page)
462 return (struct f2fs_node *)page_address(page); 503 return (struct f2fs_node *)page_address(page);
463} 504}
464 505
506static inline struct f2fs_inode *F2FS_INODE(struct page *page)
507{
508 return &((struct f2fs_node *)page_address(page))->i;
509}
510
465static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) 511static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
466{ 512{
467 return (struct f2fs_nm_info *)(sbi->nm_info); 513 return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
487 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); 533 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
488} 534}
489 535
536static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
537{
538 return sbi->meta_inode->i_mapping;
539}
540
541static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
542{
543 return sbi->node_inode->i_mapping;
544}
545
490static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) 546static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
491{ 547{
492 sbi->s_dirty = 1; 548 sbi->s_dirty = 1;
@@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
534 590
535static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 591static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
536{ 592{
537 down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); 593 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
538} 594}
539 595
540static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 596static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
548static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) 604static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
549{ 605{
550 WARN_ON((nid >= NM_I(sbi)->max_nid)); 606 WARN_ON((nid >= NM_I(sbi)->max_nid));
551 if (nid >= NM_I(sbi)->max_nid) 607 if (unlikely(nid >= NM_I(sbi)->max_nid))
552 return -EINVAL; 608 return -EINVAL;
553 return 0; 609 return 0;
554} 610}
@@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
561static inline int F2FS_HAS_BLOCKS(struct inode *inode) 617static inline int F2FS_HAS_BLOCKS(struct inode *inode)
562{ 618{
563 if (F2FS_I(inode)->i_xattr_nid) 619 if (F2FS_I(inode)->i_xattr_nid)
564 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); 620 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
565 else 621 else
566 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); 622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
567} 623}
568 624
569static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 625static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
@@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
574 spin_lock(&sbi->stat_lock); 630 spin_lock(&sbi->stat_lock);
575 valid_block_count = 631 valid_block_count =
576 sbi->total_valid_block_count + (block_t)count; 632 sbi->total_valid_block_count + (block_t)count;
577 if (valid_block_count > sbi->user_block_count) { 633 if (unlikely(valid_block_count > sbi->user_block_count)) {
578 spin_unlock(&sbi->stat_lock); 634 spin_unlock(&sbi->stat_lock);
579 return false; 635 return false;
580 } 636 }
@@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
585 return true; 641 return true;
586} 642}
587 643
588static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, 644static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
589 struct inode *inode, 645 struct inode *inode,
590 blkcnt_t count) 646 blkcnt_t count)
591{ 647{
@@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
595 inode->i_blocks -= count; 651 inode->i_blocks -= count;
596 sbi->total_valid_block_count -= (block_t)count; 652 sbi->total_valid_block_count -= (block_t)count;
597 spin_unlock(&sbi->stat_lock); 653 spin_unlock(&sbi->stat_lock);
598 return 0;
599} 654}
600 655
601static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) 656static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
686} 741}
687 742
688static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, 743static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
689 struct inode *inode, 744 struct inode *inode)
690 unsigned int count)
691{ 745{
692 block_t valid_block_count; 746 block_t valid_block_count;
693 unsigned int valid_node_count; 747 unsigned int valid_node_count;
694 748
695 spin_lock(&sbi->stat_lock); 749 spin_lock(&sbi->stat_lock);
696 750
697 valid_block_count = sbi->total_valid_block_count + (block_t)count; 751 valid_block_count = sbi->total_valid_block_count + 1;
698 sbi->alloc_valid_block_count += (block_t)count; 752 if (unlikely(valid_block_count > sbi->user_block_count)) {
699 valid_node_count = sbi->total_valid_node_count + count;
700
701 if (valid_block_count > sbi->user_block_count) {
702 spin_unlock(&sbi->stat_lock); 753 spin_unlock(&sbi->stat_lock);
703 return false; 754 return false;
704 } 755 }
705 756
706 if (valid_node_count > sbi->total_node_count) { 757 valid_node_count = sbi->total_valid_node_count + 1;
758 if (unlikely(valid_node_count > sbi->total_node_count)) {
707 spin_unlock(&sbi->stat_lock); 759 spin_unlock(&sbi->stat_lock);
708 return false; 760 return false;
709 } 761 }
710 762
711 if (inode) 763 if (inode)
712 inode->i_blocks += count; 764 inode->i_blocks++;
713 sbi->total_valid_node_count = valid_node_count; 765
714 sbi->total_valid_block_count = valid_block_count; 766 sbi->alloc_valid_block_count++;
767 sbi->total_valid_node_count++;
768 sbi->total_valid_block_count++;
715 spin_unlock(&sbi->stat_lock); 769 spin_unlock(&sbi->stat_lock);
716 770
717 return true; 771 return true;
718} 772}
719 773
720static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, 774static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
721 struct inode *inode, 775 struct inode *inode)
722 unsigned int count)
723{ 776{
724 spin_lock(&sbi->stat_lock); 777 spin_lock(&sbi->stat_lock);
725 778
726 f2fs_bug_on(sbi->total_valid_block_count < count); 779 f2fs_bug_on(!sbi->total_valid_block_count);
727 f2fs_bug_on(sbi->total_valid_node_count < count); 780 f2fs_bug_on(!sbi->total_valid_node_count);
728 f2fs_bug_on(inode->i_blocks < count); 781 f2fs_bug_on(!inode->i_blocks);
729 782
730 inode->i_blocks -= count; 783 inode->i_blocks--;
731 sbi->total_valid_node_count -= count; 784 sbi->total_valid_node_count--;
732 sbi->total_valid_block_count -= (block_t)count; 785 sbi->total_valid_block_count--;
733 786
734 spin_unlock(&sbi->stat_lock); 787 spin_unlock(&sbi->stat_lock);
735} 788}
@@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
751 spin_unlock(&sbi->stat_lock); 804 spin_unlock(&sbi->stat_lock);
752} 805}
753 806
754static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) 807static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
755{ 808{
756 spin_lock(&sbi->stat_lock); 809 spin_lock(&sbi->stat_lock);
757 f2fs_bug_on(!sbi->total_valid_inode_count); 810 f2fs_bug_on(!sbi->total_valid_inode_count);
758 sbi->total_valid_inode_count--; 811 sbi->total_valid_inode_count--;
759 spin_unlock(&sbi->stat_lock); 812 spin_unlock(&sbi->stat_lock);
760 return 0;
761} 813}
762 814
763static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 815static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
@@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
771 823
772static inline void f2fs_put_page(struct page *page, int unlock) 824static inline void f2fs_put_page(struct page *page, int unlock)
773{ 825{
774 if (!page || IS_ERR(page)) 826 if (!page)
775 return; 827 return;
776 828
777 if (unlock) { 829 if (unlock) {
@@ -876,7 +928,9 @@ enum {
876 FI_NO_ALLOC, /* should not allocate any blocks */ 928 FI_NO_ALLOC, /* should not allocate any blocks */
877 FI_UPDATE_DIR, /* should update inode block for consistency */ 929 FI_UPDATE_DIR, /* should update inode block for consistency */
878 FI_DELAY_IPUT, /* used for the recovery */ 930 FI_DELAY_IPUT, /* used for the recovery */
931 FI_NO_EXTENT, /* not to use the extent cache */
879 FI_INLINE_XATTR, /* used for inline xattr */ 932 FI_INLINE_XATTR, /* used for inline xattr */
 933 FI_INLINE_DATA, /* used for inline data */
880}; 934};
881 935
882static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 936static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
914{ 968{
915 if (ri->i_inline & F2FS_INLINE_XATTR) 969 if (ri->i_inline & F2FS_INLINE_XATTR)
916 set_inode_flag(fi, FI_INLINE_XATTR); 970 set_inode_flag(fi, FI_INLINE_XATTR);
971 if (ri->i_inline & F2FS_INLINE_DATA)
972 set_inode_flag(fi, FI_INLINE_DATA);
917} 973}
918 974
919static inline void set_raw_inline(struct f2fs_inode_info *fi, 975static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
923 979
924 if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 980 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
925 ri->i_inline |= F2FS_INLINE_XATTR; 981 ri->i_inline |= F2FS_INLINE_XATTR;
982 if (is_inode_flag_set(fi, FI_INLINE_DATA))
983 ri->i_inline |= F2FS_INLINE_DATA;
926} 984}
927 985
928static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 986static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
@@ -948,6 +1006,18 @@ static inline int inline_xattr_size(struct inode *inode)
948 return 0; 1006 return 0;
949} 1007}
950 1008
1009static inline int f2fs_has_inline_data(struct inode *inode)
1010{
1011 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1012}
1013
1014static inline void *inline_data_addr(struct page *page)
1015{
1016 struct f2fs_inode *ri;
1017 ri = (struct f2fs_inode *)page_address(page);
1018 return (void *)&(ri->i_addr[1]);
1019}
1020
951static inline int f2fs_readonly(struct super_block *sb) 1021static inline int f2fs_readonly(struct super_block *sb)
952{ 1022{
953 return sb->s_flags & MS_RDONLY; 1023 return sb->s_flags & MS_RDONLY;
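
Annotation: inline_data_addr() pins down the on-disk layout — the inline payload occupies the inode's data-pointer area starting at i_addr[1], and i_addr[0] is deliberately left unused so the conversion path in inline.c (further below) can reserve a real block without clobbering the payload. Reading it back is a plain copy, assuming the caller holds the locked inode page:

	void *src = inline_data_addr(ipage);	/* i.e. &ri->i_addr[1] */
	memcpy(buf, src, MAX_INLINE_DATA);	/* at most MAX_INLINE_DATA bytes */
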
@@ -958,6 +1028,7 @@ static inline int f2fs_readonly(struct super_block *sb)
958 */ 1028 */
959int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1029int f2fs_sync_file(struct file *, loff_t, loff_t, int);
960void truncate_data_blocks(struct dnode_of_data *); 1030void truncate_data_blocks(struct dnode_of_data *);
1031int truncate_blocks(struct inode *, u64);
961void f2fs_truncate(struct inode *); 1032void f2fs_truncate(struct inode *);
962int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1033int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
963int f2fs_setattr(struct dentry *, struct iattr *); 1034int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1027,7 +1098,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1027int truncate_inode_blocks(struct inode *, pgoff_t); 1098int truncate_inode_blocks(struct inode *, pgoff_t);
1028int truncate_xattr_node(struct inode *, struct page *); 1099int truncate_xattr_node(struct inode *, struct page *);
1029int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1100int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1030int remove_inode_page(struct inode *); 1101void remove_inode_page(struct inode *);
1031struct page *new_inode_page(struct inode *, const struct qstr *); 1102struct page *new_inode_page(struct inode *, const struct qstr *);
1032struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1103struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1033void ra_node_page(struct f2fs_sb_info *, nid_t); 1104void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1059,19 +1130,19 @@ void clear_prefree_segments(struct f2fs_sb_info *);
1059int npages_for_summary_flush(struct f2fs_sb_info *); 1130int npages_for_summary_flush(struct f2fs_sb_info *);
1060void allocate_new_segments(struct f2fs_sb_info *); 1131void allocate_new_segments(struct f2fs_sb_info *);
1061struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1132struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1062struct bio *f2fs_bio_alloc(struct block_device *, int);
1063void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
1064void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1065void write_meta_page(struct f2fs_sb_info *, struct page *); 1133void write_meta_page(struct f2fs_sb_info *, struct page *);
1066void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, 1134void write_node_page(struct f2fs_sb_info *, struct page *,
1067 block_t, block_t *); 1135 struct f2fs_io_info *, unsigned int, block_t, block_t *);
1068void write_data_page(struct inode *, struct page *, struct dnode_of_data*, 1136void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1069 block_t, block_t *); 1137 struct f2fs_io_info *);
1070void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); 1138void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1071void recover_data_page(struct f2fs_sb_info *, struct page *, 1139void recover_data_page(struct f2fs_sb_info *, struct page *,
1072 struct f2fs_summary *, block_t, block_t); 1140 struct f2fs_summary *, block_t, block_t);
1073void rewrite_node_page(struct f2fs_sb_info *, struct page *, 1141void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1074 struct f2fs_summary *, block_t, block_t); 1142 struct f2fs_summary *, block_t, block_t);
1143void allocate_data_block(struct f2fs_sb_info *, struct page *,
1144 block_t, block_t *, struct f2fs_summary *, int);
1145void f2fs_wait_on_page_writeback(struct page *, enum page_type);
1075void write_data_summaries(struct f2fs_sb_info *, block_t); 1146void write_data_summaries(struct f2fs_sb_info *, block_t);
1076void write_node_summaries(struct f2fs_sb_info *, block_t); 1147void write_node_summaries(struct f2fs_sb_info *, block_t);
1077int lookup_journal_in_cursum(struct f2fs_summary_block *, 1148int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1079,6 +1150,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
1079void flush_sit_entries(struct f2fs_sb_info *); 1150void flush_sit_entries(struct f2fs_sb_info *);
1080int build_segment_manager(struct f2fs_sb_info *); 1151int build_segment_manager(struct f2fs_sb_info *);
1081void destroy_segment_manager(struct f2fs_sb_info *); 1152void destroy_segment_manager(struct f2fs_sb_info *);
1153int __init create_segment_manager_caches(void);
1154void destroy_segment_manager_caches(void);
1082 1155
1083/* 1156/*
1084 * checkpoint.c 1157 * checkpoint.c
@@ -1090,7 +1163,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *);
1090void release_orphan_inode(struct f2fs_sb_info *); 1163void release_orphan_inode(struct f2fs_sb_info *);
1091void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1164void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1092void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1165void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1093int recover_orphan_inodes(struct f2fs_sb_info *); 1166void recover_orphan_inodes(struct f2fs_sb_info *);
1094int get_valid_checkpoint(struct f2fs_sb_info *); 1167int get_valid_checkpoint(struct f2fs_sb_info *);
1095void set_dirty_dir_page(struct inode *, struct page *); 1168void set_dirty_dir_page(struct inode *, struct page *);
1096void add_dirty_dir_inode(struct inode *); 1169void add_dirty_dir_inode(struct inode *);
@@ -1105,13 +1178,17 @@ void destroy_checkpoint_caches(void);
1105/* 1178/*
1106 * data.c 1179 * data.c
1107 */ 1180 */
1181void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1182int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
1183void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
1184 struct f2fs_io_info *);
1108int reserve_new_block(struct dnode_of_data *); 1185int reserve_new_block(struct dnode_of_data *);
1186int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1109void update_extent_cache(block_t, struct dnode_of_data *); 1187void update_extent_cache(block_t, struct dnode_of_data *);
1110struct page *find_data_page(struct inode *, pgoff_t, bool); 1188struct page *find_data_page(struct inode *, pgoff_t, bool);
1111struct page *get_lock_data_page(struct inode *, pgoff_t); 1189struct page *get_lock_data_page(struct inode *, pgoff_t);
1112struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); 1190struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1113int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1191int do_write_data_page(struct page *, struct f2fs_io_info *);
1114int do_write_data_page(struct page *);
1115 1192
1116/* 1193/*
1117 * gc.c 1194 * gc.c
@@ -1144,7 +1221,7 @@ struct f2fs_stat_info {
1144 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1221 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1145 int nats, sits, fnids; 1222 int nats, sits, fnids;
1146 int total_count, utilization; 1223 int total_count, utilization;
1147 int bg_gc; 1224 int bg_gc, inline_inode;
1148 unsigned int valid_count, valid_node_count, valid_inode_count; 1225 unsigned int valid_count, valid_node_count, valid_inode_count;
1149 unsigned int bimodal, avg_vblocks; 1226 unsigned int bimodal, avg_vblocks;
1150 int util_free, util_valid, util_invalid; 1227 int util_free, util_valid, util_invalid;
@@ -1164,7 +1241,7 @@ struct f2fs_stat_info {
1164 1241
1165static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1242static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1166{ 1243{
1167 return (struct f2fs_stat_info*)sbi->stat_info; 1244 return (struct f2fs_stat_info *)sbi->stat_info;
1168} 1245}
1169 1246
1170#define stat_inc_call_count(si) ((si)->call_count++) 1247#define stat_inc_call_count(si) ((si)->call_count++)
@@ -1173,6 +1250,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1173#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) 1250#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
1174#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) 1251#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
1175#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) 1252#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
1253#define stat_inc_inline_inode(inode) \
1254 do { \
1255 if (f2fs_has_inline_data(inode)) \
1256 ((F2FS_SB(inode->i_sb))->inline_inode++); \
1257 } while (0)
1258#define stat_dec_inline_inode(inode) \
1259 do { \
1260 if (f2fs_has_inline_data(inode)) \
1261 ((F2FS_SB(inode->i_sb))->inline_inode--); \
1262 } while (0)
1263
1176#define stat_inc_seg_type(sbi, curseg) \ 1264#define stat_inc_seg_type(sbi, curseg) \
1177 ((sbi)->segment_count[(curseg)->alloc_type]++) 1265 ((sbi)->segment_count[(curseg)->alloc_type]++)
1178#define stat_inc_block_count(sbi, curseg) \ 1266#define stat_inc_block_count(sbi, curseg) \
@@ -1216,6 +1304,8 @@ void f2fs_destroy_root_stats(void);
1216#define stat_dec_dirty_dir(sbi) 1304#define stat_dec_dirty_dir(sbi)
1217#define stat_inc_total_hit(sb) 1305#define stat_inc_total_hit(sb)
1218#define stat_inc_read_hit(sb) 1306#define stat_inc_read_hit(sb)
1307#define stat_inc_inline_inode(inode)
1308#define stat_dec_inline_inode(inode)
1219#define stat_inc_seg_type(sbi, curseg) 1309#define stat_inc_seg_type(sbi, curseg)
1220#define stat_inc_block_count(sbi, curseg) 1310#define stat_inc_block_count(sbi, curseg)
1221#define stat_inc_seg_count(si, type) 1311#define stat_inc_seg_count(si, type)
@@ -1238,4 +1328,13 @@ extern const struct address_space_operations f2fs_meta_aops;
1238extern const struct inode_operations f2fs_dir_inode_operations; 1328extern const struct inode_operations f2fs_dir_inode_operations;
1239extern const struct inode_operations f2fs_symlink_inode_operations; 1329extern const struct inode_operations f2fs_symlink_inode_operations;
1240extern const struct inode_operations f2fs_special_inode_operations; 1330extern const struct inode_operations f2fs_special_inode_operations;
1331
1332/*
1333 * inline.c
1334 */
1335bool f2fs_may_inline(struct inode *);
1336int f2fs_read_inline_data(struct inode *, struct page *);
1337int f2fs_convert_inline_data(struct inode *, pgoff_t);
1338int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1339int recover_inline_data(struct inode *, struct page *);
1241#endif 1340#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d714f4972d5..85e91ca88d57 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33 struct page *page = vmf->page; 33 struct page *page = vmf->page;
34 struct inode *inode = file_inode(vma->vm_file); 34 struct inode *inode = file_inode(vma->vm_file);
35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
36 block_t old_blk_addr;
37 struct dnode_of_data dn; 36 struct dnode_of_data dn;
38 int err; 37 int err;
39 38
@@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
44 /* block allocation */ 43 /* block allocation */
45 f2fs_lock_op(sbi); 44 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 45 set_new_dnode(&dn, inode, NULL, NULL, 0);
47 err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); 46 err = f2fs_reserve_block(&dn, page->index);
48 if (err) {
49 f2fs_unlock_op(sbi);
50 goto out;
51 }
52
53 old_blk_addr = dn.data_blkaddr;
54
55 if (old_blk_addr == NULL_ADDR) {
56 err = reserve_new_block(&dn);
57 if (err) {
58 f2fs_put_dnode(&dn);
59 f2fs_unlock_op(sbi);
60 goto out;
61 }
62 }
63 f2fs_put_dnode(&dn);
64 f2fs_unlock_op(sbi); 47 f2fs_unlock_op(sbi);
48 if (err)
49 goto out;
65 50
66 file_update_time(vma->vm_file); 51 file_update_time(vma->vm_file);
67 lock_page(page); 52 lock_page(page);
68 if (page->mapping != inode->i_mapping || 53 if (unlikely(page->mapping != inode->i_mapping ||
69 page_offset(page) > i_size_read(inode) || 54 page_offset(page) > i_size_read(inode) ||
70 !PageUptodate(page)) { 55 !PageUptodate(page))) {
71 unlock_page(page); 56 unlock_page(page);
72 err = -EFAULT; 57 err = -EFAULT;
73 goto out; 58 goto out;
@@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
130 int ret = 0; 115 int ret = 0;
131 bool need_cp = false; 116 bool need_cp = false;
132 struct writeback_control wbc = { 117 struct writeback_control wbc = {
133 .sync_mode = WB_SYNC_ALL, 118 .sync_mode = WB_SYNC_NONE,
134 .nr_to_write = LONG_MAX, 119 .nr_to_write = LONG_MAX,
135 .for_reclaim = 0, 120 .for_reclaim = 0,
136 }; 121 };
137 122
138 if (f2fs_readonly(inode->i_sb)) 123 if (unlikely(f2fs_readonly(inode->i_sb)))
139 return 0; 124 return 0;
140 125
141 trace_f2fs_sync_file_enter(inode); 126 trace_f2fs_sync_file_enter(inode);
@@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
217 raw_node = F2FS_NODE(dn->node_page); 202 raw_node = F2FS_NODE(dn->node_page);
218 addr = blkaddr_in_node(raw_node) + ofs; 203 addr = blkaddr_in_node(raw_node) + ofs;
219 204
220 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { 205 for (; count > 0; count--, addr++, dn->ofs_in_node++) {
221 block_t blkaddr = le32_to_cpu(*addr); 206 block_t blkaddr = le32_to_cpu(*addr);
222 if (blkaddr == NULL_ADDR) 207 if (blkaddr == NULL_ADDR)
223 continue; 208 continue;
@@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
256 return; 241 return;
257 242
258 lock_page(page); 243 lock_page(page);
259 if (page->mapping != inode->i_mapping) { 244 if (unlikely(page->mapping != inode->i_mapping)) {
260 f2fs_put_page(page, 1); 245 f2fs_put_page(page, 1);
261 return; 246 return;
262 } 247 }
@@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
266 f2fs_put_page(page, 1); 251 f2fs_put_page(page, 1);
267} 252}
268 253
269static int truncate_blocks(struct inode *inode, u64 from) 254int truncate_blocks(struct inode *inode, u64 from)
270{ 255{
271 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 256 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
272 unsigned int blocksize = inode->i_sb->s_blocksize; 257 unsigned int blocksize = inode->i_sb->s_blocksize;
273 struct dnode_of_data dn; 258 struct dnode_of_data dn;
274 pgoff_t free_from; 259 pgoff_t free_from;
275 int count = 0; 260 int count = 0, err = 0;
276 int err;
277 261
278 trace_f2fs_truncate_blocks_enter(inode, from); 262 trace_f2fs_truncate_blocks_enter(inode, from);
279 263
264 if (f2fs_has_inline_data(inode))
265 goto done;
266
280 free_from = (pgoff_t) 267 free_from = (pgoff_t)
281 ((from + blocksize - 1) >> (sbi->log_blocksize)); 268 ((from + blocksize - 1) >> (sbi->log_blocksize));
282 269
283 f2fs_lock_op(sbi); 270 f2fs_lock_op(sbi);
271
284 set_new_dnode(&dn, inode, NULL, NULL, 0); 272 set_new_dnode(&dn, inode, NULL, NULL, 0);
285 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 273 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
286 if (err) { 274 if (err) {
@@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
308free_next: 296free_next:
309 err = truncate_inode_blocks(inode, free_from); 297 err = truncate_inode_blocks(inode, free_from);
310 f2fs_unlock_op(sbi); 298 f2fs_unlock_op(sbi);
311 299done:
312 /* lastly zero out the first data page */ 300 /* lastly zero out the first data page */
313 truncate_partial_data_page(inode, from); 301 truncate_partial_data_page(inode, from);
314 302
@@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
382 370
383 if ((attr->ia_valid & ATTR_SIZE) && 371 if ((attr->ia_valid & ATTR_SIZE) &&
384 attr->ia_size != i_size_read(inode)) { 372 attr->ia_size != i_size_read(inode)) {
373 err = f2fs_convert_inline_data(inode, attr->ia_size);
374 if (err)
375 return err;
376
385 truncate_setsize(inode, attr->ia_size); 377 truncate_setsize(inode, attr->ia_size);
386 f2fs_truncate(inode); 378 f2fs_truncate(inode);
387 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 379 f2fs_balance_fs(F2FS_SB(inode->i_sb));
@@ -459,12 +451,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
459 return 0; 451 return 0;
460} 452}
461 453
462static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) 454static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
463{ 455{
464 pgoff_t pg_start, pg_end; 456 pgoff_t pg_start, pg_end;
465 loff_t off_start, off_end; 457 loff_t off_start, off_end;
466 int ret = 0; 458 int ret = 0;
467 459
460 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
461 if (ret)
462 return ret;
463
468 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 464 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
469 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 465 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
470 466
@@ -499,12 +495,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
499 } 495 }
500 } 496 }
501 497
502 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
503 i_size_read(inode) <= (offset + len)) {
504 i_size_write(inode, offset);
505 mark_inode_dirty(inode);
506 }
507
508 return ret; 498 return ret;
509} 499}
510 500
@@ -521,6 +511,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
521 if (ret) 511 if (ret)
522 return ret; 512 return ret;
523 513
514 ret = f2fs_convert_inline_data(inode, offset + len);
515 if (ret)
516 return ret;
517
524 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 518 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
525 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 519 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
526 520
@@ -532,22 +526,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
532 526
533 f2fs_lock_op(sbi); 527 f2fs_lock_op(sbi);
534 set_new_dnode(&dn, inode, NULL, NULL, 0); 528 set_new_dnode(&dn, inode, NULL, NULL, 0);
535 ret = get_dnode_of_data(&dn, index, ALLOC_NODE); 529 ret = f2fs_reserve_block(&dn, index);
536 if (ret) {
537 f2fs_unlock_op(sbi);
538 break;
539 }
540
541 if (dn.data_blkaddr == NULL_ADDR) {
542 ret = reserve_new_block(&dn);
543 if (ret) {
544 f2fs_put_dnode(&dn);
545 f2fs_unlock_op(sbi);
546 break;
547 }
548 }
549 f2fs_put_dnode(&dn);
550 f2fs_unlock_op(sbi); 530 f2fs_unlock_op(sbi);
531 if (ret)
532 break;
551 533
552 if (pg_start == pg_end) 534 if (pg_start == pg_end)
553 new_size = offset + len; 535 new_size = offset + len;
@@ -578,7 +560,7 @@ static long f2fs_fallocate(struct file *file, int mode,
578 return -EOPNOTSUPP; 560 return -EOPNOTSUPP;
579 561
580 if (mode & FALLOC_FL_PUNCH_HOLE) 562 if (mode & FALLOC_FL_PUNCH_HOLE)
581 ret = punch_hole(inode, offset, len, mode); 563 ret = punch_hole(inode, offset, len);
582 else 564 else
583 ret = expand_inode_data(inode, offset, len, mode); 565 ret = expand_inode_data(inode, offset, len, mode);
584 566
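Annotation: each file.c hunk above collapses the same open-coded sequence — get_dnode_of_data(), an optional reserve_new_block() when the block address is NULL_ADDR, then f2fs_put_dnode() — into a single f2fs_reserve_block() call. A minimal sketch of what that helper plausibly looks like, reconstructed from the deleted call-site code (the actual body lives in data.c, outside this diff):

	int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
	{
		int err;

		err = get_dnode_of_data(dn, index, ALLOC_NODE);
		if (err)
			return err;

		if (dn->data_blkaddr == NULL_ADDR)
			err = reserve_new_block(dn);

		f2fs_put_dnode(dn);
		return err;
	}
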
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b7ad1ec7e4cc..ea0371e854b4 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
119 kfree(gc_th); 119 kfree(gc_th);
120 sbi->gc_thread = NULL; 120 sbi->gc_thread = NULL;
121 } 121 }
122
123out: 122out:
124 return err; 123 return err;
125} 124}
@@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
164 p->ofs_unit = sbi->segs_per_sec; 163 p->ofs_unit = sbi->segs_per_sec;
165 } 164 }
166 165
167 if (p->max_search > MAX_VICTIM_SEARCH) 166 if (p->max_search > sbi->max_victim_search)
168 p->max_search = MAX_VICTIM_SEARCH; 167 p->max_search = sbi->max_victim_search;
169 168
170 p->offset = sbi->last_victim[p->gc_mode]; 169 p->offset = sbi->last_victim[p->gc_mode];
171} 170}
@@ -429,7 +428,7 @@ next_step:
429 428
430 /* set page dirty and write it */ 429 /* set page dirty and write it */
431 if (gc_type == FG_GC) { 430 if (gc_type == FG_GC) {
432 f2fs_wait_on_page_writeback(node_page, NODE, true); 431 f2fs_wait_on_page_writeback(node_page, NODE);
433 set_page_dirty(node_page); 432 set_page_dirty(node_page);
434 } else { 433 } else {
435 if (!PageWriteback(node_page)) 434 if (!PageWriteback(node_page))
@@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
521 520
522static void move_data_page(struct inode *inode, struct page *page, int gc_type) 521static void move_data_page(struct inode *inode, struct page *page, int gc_type)
523{ 522{
523 struct f2fs_io_info fio = {
524 .type = DATA,
525 .rw = WRITE_SYNC,
526 };
527
524 if (gc_type == BG_GC) { 528 if (gc_type == BG_GC) {
525 if (PageWriteback(page)) 529 if (PageWriteback(page))
526 goto out; 530 goto out;
@@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
529 } else { 533 } else {
530 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 534 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
531 535
532 f2fs_wait_on_page_writeback(page, DATA, true); 536 f2fs_wait_on_page_writeback(page, DATA);
533 537
534 if (clear_page_dirty_for_io(page) && 538 if (clear_page_dirty_for_io(page) &&
535 S_ISDIR(inode->i_mode)) { 539 S_ISDIR(inode->i_mode)) {
@@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
537 inode_dec_dirty_dents(inode); 541 inode_dec_dirty_dents(inode);
538 } 542 }
539 set_cold_data(page); 543 set_cold_data(page);
540 do_write_data_page(page); 544 do_write_data_page(page, &fio);
541 clear_cold_data(page); 545 clear_cold_data(page);
542 } 546 }
543out: 547out:
@@ -631,7 +635,7 @@ next_iput:
631 goto next_step; 635 goto next_step;
632 636
633 if (gc_type == FG_GC) { 637 if (gc_type == FG_GC) {
634 f2fs_submit_bio(sbi, DATA, true); 638 f2fs_submit_merged_bio(sbi, DATA, WRITE);
635 639
636 /* 640 /*
637 * In the case of FG_GC, it'd be better to reclaim this victim 641 * In the case of FG_GC, it'd be better to reclaim this victim
@@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
664 668
665 /* read segment summary of victim */ 669 /* read segment summary of victim */
666 sum_page = get_sum_page(sbi, segno); 670 sum_page = get_sum_page(sbi, segno);
667 if (IS_ERR(sum_page))
668 return;
669 671
670 blk_start_plug(&plug); 672 blk_start_plug(&plug);
671 673
@@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
697 699
698 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
699gc_more: 701gc_more:
700 if (!(sbi->sb->s_flags & MS_ACTIVE)) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
701 goto stop; 703 goto stop;
702 704
703 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 507056d22205..5d5eb6047bf4 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -20,7 +20,7 @@
20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ 20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
21 21
22/* Search max. number of dirty segments to select a victim segment */ 22/* Search max. number of dirty segments to select a victim segment */
23#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ 23#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
24 24
25struct f2fs_gc_kthread { 25struct f2fs_gc_kthread {
26 struct task_struct *f2fs_gc_task; 26 struct task_struct *f2fs_gc_task;
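
Annotation: renaming MAX_VICTIM_SEARCH to DEF_MAX_VICTIM_SEARCH turns the compile-time search bound into a default for the new per-superblock sbi->max_victim_search field, which select_policy() now clamps against. Presumably the field is seeded at mount time, outside the quoted hunks, along the lines of:

	/* hypothetical mount-time default, mirroring the rename above */
	sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
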
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
new file mode 100644
index 000000000000..31ee5b164ff9
--- /dev/null
+++ b/fs/f2fs/inline.c
@@ -0,0 +1,222 @@
1/*
2 * fs/f2fs/inline.c
3 * Copyright (c) 2013, Intel Corporation
4 * Authors: Huajun Li <huajun.li@intel.com>
5 * Haicheng Li <haicheng.li@intel.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13
14#include "f2fs.h"
15
16bool f2fs_may_inline(struct inode *inode)
17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks;
20 loff_t i_size;
21
22 if (!test_opt(sbi, INLINE_DATA))
23 return false;
24
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
26 if (inode->i_blocks > nr_blocks)
27 return false;
28
29 i_size = i_size_read(inode);
30 if (i_size > MAX_INLINE_DATA)
31 return false;
32
33 return true;
34}
35
36int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage;
40 void *src_addr, *dst_addr;
41
42 if (page->index) {
43 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
44 goto out;
45 }
46
47 ipage = get_node_page(sbi, inode->i_ino);
48 if (IS_ERR(ipage))
49 return PTR_ERR(ipage);
50
51 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
52
53 /* Copy the whole inline data block */
54 src_addr = inline_data_addr(ipage);
55 dst_addr = kmap(page);
56 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
57 kunmap(page);
58 f2fs_put_page(ipage, 1);
59
60out:
61 SetPageUptodate(page);
62 unlock_page(page);
63
64 return 0;
65}
66
67static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
68{
69 int err;
70 struct page *ipage;
71 struct dnode_of_data dn;
72 void *src_addr, *dst_addr;
73 block_t new_blk_addr;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 struct f2fs_io_info fio = {
76 .type = DATA,
77 .rw = WRITE_SYNC | REQ_PRIO,
78 };
79
80 f2fs_lock_op(sbi);
81 ipage = get_node_page(sbi, inode->i_ino);
82 if (IS_ERR(ipage))
83 return PTR_ERR(ipage);
84
85 /*
86 * i_addr[0] is not used for inline data,
87 * so reserving a new block there will not destroy the inline data
88 */
89 set_new_dnode(&dn, inode, ipage, NULL, 0);
90 err = f2fs_reserve_block(&dn, 0);
91 if (err) {
92 f2fs_unlock_op(sbi);
93 return err;
94 }
95
96 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
97
98 /* Copy the whole inline data block */
99 src_addr = inline_data_addr(ipage);
100 dst_addr = kmap(page);
101 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
102 kunmap(page);
103 SetPageUptodate(page);
104
105 /* write data page to try to make data consistent */
106 set_page_writeback(page);
107 write_data_page(page, &dn, &new_blk_addr, &fio);
108 update_extent_cache(new_blk_addr, &dn);
109 f2fs_wait_on_page_writeback(page, DATA);
110
111 /* clear inline data and flag after data writeback */
112 zero_user_segment(ipage, INLINE_DATA_OFFSET,
113 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
114 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
115 stat_dec_inline_inode(inode);
116
117 sync_inode_page(&dn);
118 f2fs_put_dnode(&dn);
119 f2fs_unlock_op(sbi);
120 return err;
121}
122
123int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
124{
125 struct page *page;
126 int err;
127
128 if (!f2fs_has_inline_data(inode))
129 return 0;
130 else if (to_size <= MAX_INLINE_DATA)
131 return 0;
132
133 page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS);
134 if (!page)
135 return -ENOMEM;
136
137 err = __f2fs_convert_inline_data(inode, page);
138 f2fs_put_page(page, 1);
139 return err;
140}
141
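
For the conversion threshold above: assuming the usual 4 KB-block constants (not part of this patch) of 923 block-address slots in the inode, 50 reserved for inline xattrs, and i_addr[0] kept free as noted in __f2fs_convert_inline_data(), MAX_INLINE_DATA works out to 4 * (923 - 50 - 1) = 3488 bytes, so a write extending the file past roughly 3.4 KB forces the conversion.
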
142int f2fs_write_inline_data(struct inode *inode,
143 struct page *page, unsigned size)
144{
145 void *src_addr, *dst_addr;
146 struct page *ipage;
147 struct dnode_of_data dn;
148 int err;
149
150 set_new_dnode(&dn, inode, NULL, NULL, 0);
151 err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
152 if (err)
153 return err;
154 ipage = dn.inode_page;
155
156 zero_user_segment(ipage, INLINE_DATA_OFFSET,
157 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
158 src_addr = kmap(page);
159 dst_addr = inline_data_addr(ipage);
160 memcpy(dst_addr, src_addr, size);
161 kunmap(page);
162
163 /* Release the first data block if it is allocated */
164 if (!f2fs_has_inline_data(inode)) {
165 truncate_data_blocks_range(&dn, 1);
166 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
167 stat_inc_inline_inode(inode);
168 }
169
170 sync_inode_page(&dn);
171 f2fs_put_dnode(&dn);
172
173 return 0;
174}
175
176int recover_inline_data(struct inode *inode, struct page *npage)
177{
178 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
179 struct f2fs_inode *ri = NULL;
180 void *src_addr, *dst_addr;
181 struct page *ipage;
182
183 /*
184 * The inline_data recovery policy is as follows.
185 * [prev.] [next] of inline_data flag
186 * o o -> recover inline_data
187 * o x -> remove inline_data, and then recover data blocks
188 * x o -> remove data blocks, and then recover inline_data
189 * x x -> recover data blocks
190 */
191 if (IS_INODE(npage))
192 ri = F2FS_INODE(npage);
193
194 if (f2fs_has_inline_data(inode) &&
195 ri && ri->i_inline & F2FS_INLINE_DATA) {
196process_inline:
197 ipage = get_node_page(sbi, inode->i_ino);
198 f2fs_bug_on(IS_ERR(ipage));
199
200 src_addr = inline_data_addr(npage);
201 dst_addr = inline_data_addr(ipage);
202 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
203 update_inode(inode, ipage);
204 f2fs_put_page(ipage, 1);
205 return -1;
206 }
207
208 if (f2fs_has_inline_data(inode)) {
209 ipage = get_node_page(sbi, inode->i_ino);
210 f2fs_bug_on(IS_ERR(ipage));
211 zero_user_segment(ipage, INLINE_DATA_OFFSET,
212 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
213 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
214 update_inode(inode, ipage);
215 f2fs_put_page(ipage, 1);
216 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
217 truncate_blocks(inode, 0);
218 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
219 goto process_inline;
220 }
221 return 0;
222}
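
The recovery table in recover_inline_data() reads as a pure decision over two flags: whether the on-disk inode currently has FI_INLINE_DATA (prev) and whether the fsync'ed node page carries F2FS_INLINE_DATA (next). A minimal user-space sketch of that table follows; the enum and function names are illustrative, not f2fs API:

    #include <stdbool.h>
    #include <stdio.h>

    enum recovery_action {
            RECOVER_INLINE,           /* o o: copy inline payload from the node page */
            DROP_INLINE_THEN_BLOCKS,  /* o x: clear inline flag, recover data blocks */
            DROP_BLOCKS_THEN_INLINE,  /* x o: truncate blocks, recover inline payload */
            RECOVER_BLOCKS,           /* x x: plain data-block recovery */
    };

    static enum recovery_action pick_action(bool prev_inline, bool next_inline)
    {
            if (prev_inline && next_inline)
                    return RECOVER_INLINE;
            if (prev_inline)
                    return DROP_INLINE_THEN_BLOCKS;
            if (next_inline)
                    return DROP_BLOCKS_THEN_INLINE;
            return RECOVER_BLOCKS;
    }

    int main(void)
    {
            /* x o: flag lost on the inode but present on the fsync'ed node */
            printf("%d\n", pick_action(false, true));
            return 0;
    }

Note that the inline-recovery paths return nonzero from recover_inline_data(), which is what lets do_recover_data() (see the recovery.c hunk below) skip block recovery entirely.
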
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d0eaa9faeca0..4d67ed736dca 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
44 if (ri->i_addr[0]) 44 if (ri->i_addr[0])
45 inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); 45 inode->i_rdev =
46 old_decode_dev(le32_to_cpu(ri->i_addr[0]));
46 else 47 else
47 inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); 48 inode->i_rdev =
49 new_decode_dev(le32_to_cpu(ri->i_addr[1]));
48 } 50 }
49} 51}
50 52
@@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
52{ 54{
53 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 55 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
54 if (old_valid_dev(inode->i_rdev)) { 56 if (old_valid_dev(inode->i_rdev)) {
55 ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); 57 ri->i_addr[0] =
58 cpu_to_le32(old_encode_dev(inode->i_rdev));
56 ri->i_addr[1] = 0; 59 ri->i_addr[1] = 0;
57 } else { 60 } else {
58 ri->i_addr[0] = 0; 61 ri->i_addr[0] = 0;
59 ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); 62 ri->i_addr[1] =
63 cpu_to_le32(new_encode_dev(inode->i_rdev));
60 ri->i_addr[2] = 0; 64 ri->i_addr[2] = 0;
61 } 65 }
62 } 66 }
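
Context for the i_addr[0]/i_addr[1] split being reflowed here: the old device-number format packs major:minor into 8+8 bits and only fits small numbers, while the new format uses 12+20 bits. A stand-alone sketch of the same packing rules (local re-implementations for illustration, mirroring include/linux/kdev_t.h):

    #include <stdio.h>

    /* Local stand-ins for the kdev_t.h packing rules, illustration only. */
    #define MINORBITS 20
    #define MKDEV(ma, mi) (((ma) << MINORBITS) | (mi))
    #define MAJOR(dev) ((unsigned)((dev) >> MINORBITS))
    #define MINOR(dev) ((unsigned)((dev) & ((1U << MINORBITS) - 1)))

    static int old_valid(unsigned dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; }
    static unsigned short old_enc(unsigned dev) { return (MAJOR(dev) << 8) | MINOR(dev); }
    static unsigned new_enc(unsigned dev)
    {
            unsigned major = MAJOR(dev), minor = MINOR(dev);
            return (minor & 0xff) | (major << 8) | ((minor & ~0xffU) << 12);
    }

    int main(void)
    {
            unsigned small = MKDEV(8, 1);     /* fits old 8:8 format -> i_addr[0] */
            unsigned big = MKDEV(259, 70000); /* needs new 12:20 format -> i_addr[1] */

            printf("8:1       old_valid=%d old_enc=0x%04x\n", old_valid(small), old_enc(small));
            printf("259:70000 old_valid=%d new_enc=0x%08x\n", old_valid(big), new_enc(big));
            return 0;
    }
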
@@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode)
67 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
68 struct f2fs_inode_info *fi = F2FS_I(inode); 72 struct f2fs_inode_info *fi = F2FS_I(inode);
69 struct page *node_page; 73 struct page *node_page;
70 struct f2fs_node *rn;
71 struct f2fs_inode *ri; 74 struct f2fs_inode *ri;
72 75
73 /* Check if ino is within scope */ 76 /* Check if ino is within scope */
@@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode)
81 if (IS_ERR(node_page)) 84 if (IS_ERR(node_page))
82 return PTR_ERR(node_page); 85 return PTR_ERR(node_page);
83 86
84 rn = F2FS_NODE(node_page); 87 ri = F2FS_INODE(node_page);
85 ri = &(rn->i);
86 88
87 inode->i_mode = le16_to_cpu(ri->i_mode); 89 inode->i_mode = le16_to_cpu(ri->i_mode);
88 i_uid_write(inode, le32_to_cpu(ri->i_uid)); 90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -175,13 +177,11 @@ bad_inode:
175 177
176void update_inode(struct inode *inode, struct page *node_page) 178void update_inode(struct inode *inode, struct page *node_page)
177{ 179{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri; 180 struct f2fs_inode *ri;
180 181
181 f2fs_wait_on_page_writeback(node_page, NODE, false); 182 f2fs_wait_on_page_writeback(node_page, NODE);
182 183
183 rn = F2FS_NODE(node_page); 184 ri = F2FS_INODE(node_page);
184 ri = &(rn->i);
185 185
186 ri->i_mode = cpu_to_le16(inode->i_mode); 186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise; 187 ri->i_advise = F2FS_I(inode)->i_advise;
@@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode)
281 281
282 f2fs_lock_op(sbi); 282 f2fs_lock_op(sbi);
283 remove_inode_page(inode); 283 remove_inode_page(inode);
284 stat_dec_inline_inode(inode);
284 f2fs_unlock_op(sbi); 285 f2fs_unlock_op(sbi);
285 286
286 sb_end_intwrite(inode->i_sb); 287 sb_end_intwrite(inode->i_sb);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 575adac17f8b..3d32f2969c5e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
424 } 424 }
425 425
426 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 426 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
427 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
427 428
428 new_inode->i_ctime = CURRENT_TIME; 429 new_inode->i_ctime = CURRENT_TIME;
429 if (old_dir_entry) 430 if (old_dir_entry)
430 drop_nlink(new_inode); 431 drop_nlink(new_inode);
431 drop_nlink(new_inode); 432 drop_nlink(new_inode);
433 mark_inode_dirty(new_inode);
432 434
433 if (!new_inode->i_nlink) 435 if (!new_inode->i_nlink)
434 add_orphan_inode(sbi, new_inode->i_ino); 436 add_orphan_inode(sbi, new_inode->i_ino);
@@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
457 if (old_dir != new_dir) { 459 if (old_dir != new_dir) {
458 f2fs_set_link(old_inode, old_dir_entry, 460 f2fs_set_link(old_inode, old_dir_entry,
459 old_dir_page, new_dir); 461 old_dir_page, new_dir);
462 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
463 update_inode_page(old_inode);
460 } else { 464 } else {
461 kunmap(old_dir_page); 465 kunmap(old_dir_page);
462 f2fs_put_page(old_dir_page, 0); 466 f2fs_put_page(old_dir_page, 0);
463 } 467 }
464 drop_nlink(old_dir); 468 drop_nlink(old_dir);
469 mark_inode_dirty(old_dir);
465 update_inode_page(old_dir); 470 update_inode_page(old_dir);
466 } 471 }
467 472
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4ac4150d421d..b0649b76eb4f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
87 */ 87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) 88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{ 89{
90 struct address_space *mapping = sbi->meta_inode->i_mapping; 90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi); 91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct blk_plug plug;
93 struct page *page; 92 struct page *page;
94 pgoff_t index; 93 pgoff_t index;
95 int i; 94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
96 99
97 blk_start_plug(&plug);
98 100
99 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { 101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
100 if (nid >= nm_i->max_nid) 102 if (unlikely(nid >= nm_i->max_nid))
101 nid = 0; 103 nid = 0;
102 index = current_nat_addr(sbi, nid); 104 index = current_nat_addr(sbi, nid);
103 105
@@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
105 if (!page) 107 if (!page)
106 continue; 108 continue;
107 if (PageUptodate(page)) { 109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
108 f2fs_put_page(page, 1); 111 f2fs_put_page(page, 1);
109 continue; 112 continue;
110 } 113 }
111 if (f2fs_readpage(sbi, page, index, READ)) 114 f2fs_submit_page_mbio(sbi, page, index, &fio);
112 continue; 115 mark_page_accessed(page);
113
114 f2fs_put_page(page, 0); 116 f2fs_put_page(page, 0);
115 } 117 }
116 blk_finish_plug(&plug); 118 f2fs_submit_merged_bio(sbi, META, READ);
117} 119}
118 120
119static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
@@ -391,8 +393,8 @@ got:
391 393
392/* 394/*
393 * Caller should call f2fs_put_dnode(dn). 395 * Caller should call f2fs_put_dnode(dn).
394 * Also, it should grab and release a mutex by calling mutex_lock_op() and 396 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
395 * mutex_unlock_op() only if ro is not set RDONLY_NODE. 397 * f2fs_unlock_op() only if mode is not set to RDONLY_NODE.
396 * In the case of RDONLY_NODE, we don't need to care about mutex. 398 * In the case of RDONLY_NODE, we don't need to care about mutex.
397 */ 399 */
398int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 400int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
@@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn)
502 504
503 /* Deallocate node address */ 505 /* Deallocate node address */
504 invalidate_blocks(sbi, ni.blk_addr); 506 invalidate_blocks(sbi, ni.blk_addr);
505 dec_valid_node_count(sbi, dn->inode, 1); 507 dec_valid_node_count(sbi, dn->inode);
506 set_node_addr(sbi, &ni, NULL_ADDR); 508 set_node_addr(sbi, &ni, NULL_ADDR);
507 509
508 if (dn->nid == dn->inode->i_ino) { 510 if (dn->nid == dn->inode->i_ino) {
@@ -516,6 +518,10 @@ invalidate:
516 F2FS_SET_SB_DIRT(sbi); 518 F2FS_SET_SB_DIRT(sbi);
517 519
518 f2fs_put_page(dn->node_page, 1); 520 f2fs_put_page(dn->node_page, 1);
521
522 invalidate_mapping_pages(NODE_MAPPING(sbi),
523 dn->node_page->index, dn->node_page->index);
524
519 dn->node_page = NULL; 525 dn->node_page = NULL;
520 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); 526 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
521} 527}
@@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
631 return 0; 637 return 0;
632 638
633 /* get indirect nodes in the path */ 639 /* get indirect nodes in the path */
634 for (i = 0; i < depth - 1; i++) { 640 for (i = 0; i < idx + 1; i++) {
635 /* reference count will be increased */ 641 /* reference count will be increased */
636 pages[i] = get_node_page(sbi, nid[i]); 642 pages[i] = get_node_page(sbi, nid[i]);
637 if (IS_ERR(pages[i])) { 643 if (IS_ERR(pages[i])) {
638 depth = i + 1;
639 err = PTR_ERR(pages[i]); 644 err = PTR_ERR(pages[i]);
645 idx = i - 1;
640 goto fail; 646 goto fail;
641 } 647 }
642 nid[i + 1] = get_nid(pages[i], offset[i + 1], false); 648 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
643 } 649 }
644 650
645 /* free direct nodes linked to a partial indirect node */ 651 /* free direct nodes linked to a partial indirect node */
646 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { 652 for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
647 child_nid = get_nid(pages[idx], i, false); 653 child_nid = get_nid(pages[idx], i, false);
648 if (!child_nid) 654 if (!child_nid)
649 continue; 655 continue;
@@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
654 set_nid(pages[idx], i, 0, false); 660 set_nid(pages[idx], i, 0, false);
655 } 661 }
656 662
657 if (offset[depth - 1] == 0) { 663 if (offset[idx + 1] == 0) {
658 dn->node_page = pages[idx]; 664 dn->node_page = pages[idx];
659 dn->nid = nid[idx]; 665 dn->nid = nid[idx];
660 truncate_node(dn); 666 truncate_node(dn);
@@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
662 f2fs_put_page(pages[idx], 1); 668 f2fs_put_page(pages[idx], 1);
663 } 669 }
664 offset[idx]++; 670 offset[idx]++;
665 offset[depth - 1] = 0; 671 offset[idx + 1] = 0;
672 idx--;
666fail: 673fail:
667 for (i = depth - 3; i >= 0; i--) 674 for (i = idx; i >= 0; i--)
668 f2fs_put_page(pages[i], 1); 675 f2fs_put_page(pages[i], 1);
669 676
670 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); 677 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -678,11 +685,10 @@ fail:
678int truncate_inode_blocks(struct inode *inode, pgoff_t from) 685int truncate_inode_blocks(struct inode *inode, pgoff_t from)
679{ 686{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 687 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
681 struct address_space *node_mapping = sbi->node_inode->i_mapping;
682 int err = 0, cont = 1; 688 int err = 0, cont = 1;
683 int level, offset[4], noffset[4]; 689 int level, offset[4], noffset[4];
684 unsigned int nofs = 0; 690 unsigned int nofs = 0;
685 struct f2fs_node *rn; 691 struct f2fs_inode *ri;
686 struct dnode_of_data dn; 692 struct dnode_of_data dn;
687 struct page *page; 693 struct page *page;
688 694
@@ -699,7 +705,7 @@ restart:
699 set_new_dnode(&dn, inode, page, NULL, 0); 705 set_new_dnode(&dn, inode, page, NULL, 0);
700 unlock_page(page); 706 unlock_page(page);
701 707
702 rn = F2FS_NODE(page); 708 ri = F2FS_INODE(page);
703 switch (level) { 709 switch (level) {
704 case 0: 710 case 0:
705 case 1: 711 case 1:
@@ -709,7 +715,7 @@ restart:
709 nofs = noffset[1]; 715 nofs = noffset[1];
710 if (!offset[level - 1]) 716 if (!offset[level - 1])
711 goto skip_partial; 717 goto skip_partial;
712 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 718 err = truncate_partial_nodes(&dn, ri, offset, level);
713 if (err < 0 && err != -ENOENT) 719 if (err < 0 && err != -ENOENT)
714 goto fail; 720 goto fail;
715 nofs += 1 + NIDS_PER_BLOCK; 721 nofs += 1 + NIDS_PER_BLOCK;
@@ -718,7 +724,7 @@ restart:
718 nofs = 5 + 2 * NIDS_PER_BLOCK; 724 nofs = 5 + 2 * NIDS_PER_BLOCK;
719 if (!offset[level - 1]) 725 if (!offset[level - 1])
720 goto skip_partial; 726 goto skip_partial;
721 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 727 err = truncate_partial_nodes(&dn, ri, offset, level);
722 if (err < 0 && err != -ENOENT) 728 if (err < 0 && err != -ENOENT)
723 goto fail; 729 goto fail;
724 break; 730 break;
@@ -728,7 +734,7 @@ restart:
728 734
729skip_partial: 735skip_partial:
730 while (cont) { 736 while (cont) {
731 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); 737 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
732 switch (offset[0]) { 738 switch (offset[0]) {
733 case NODE_DIR1_BLOCK: 739 case NODE_DIR1_BLOCK:
734 case NODE_DIR2_BLOCK: 740 case NODE_DIR2_BLOCK:
@@ -751,14 +757,14 @@ skip_partial:
751 if (err < 0 && err != -ENOENT) 757 if (err < 0 && err != -ENOENT)
752 goto fail; 758 goto fail;
753 if (offset[1] == 0 && 759 if (offset[1] == 0 &&
754 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { 760 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
755 lock_page(page); 761 lock_page(page);
756 if (page->mapping != node_mapping) { 762 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
757 f2fs_put_page(page, 1); 763 f2fs_put_page(page, 1);
758 goto restart; 764 goto restart;
759 } 765 }
760 wait_on_page_writeback(page); 766 wait_on_page_writeback(page);
761 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
762 set_page_dirty(page); 768 set_page_dirty(page);
763 unlock_page(page); 769 unlock_page(page);
764 } 770 }
@@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
794 set_new_dnode(&dn, inode, page, npage, nid); 800 set_new_dnode(&dn, inode, page, npage, nid);
795 801
796 if (page) 802 if (page)
797 dn.inode_page_locked = 1; 803 dn.inode_page_locked = true;
798 truncate_node(&dn); 804 truncate_node(&dn);
799 return 0; 805 return 0;
800} 806}
801 807
802/* 808/*
803 * Caller should grab and release a mutex by calling mutex_lock_op() and 809 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
804 * mutex_unlock_op(). 810 * f2fs_unlock_op().
805 */ 811 */
806int remove_inode_page(struct inode *inode) 812void remove_inode_page(struct inode *inode)
807{ 813{
808 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 814 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
809 struct page *page; 815 struct page *page;
810 nid_t ino = inode->i_ino; 816 nid_t ino = inode->i_ino;
811 struct dnode_of_data dn; 817 struct dnode_of_data dn;
812 int err;
813 818
814 page = get_node_page(sbi, ino); 819 page = get_node_page(sbi, ino);
815 if (IS_ERR(page)) 820 if (IS_ERR(page))
816 return PTR_ERR(page); 821 return;
817 822
818 err = truncate_xattr_node(inode, page); 823 if (truncate_xattr_node(inode, page)) {
819 if (err) {
820 f2fs_put_page(page, 1); 824 f2fs_put_page(page, 1);
821 return err; 825 return;
822 } 826 }
823
824 /* 0 is possible, if f2fs_new_inode() failed */ 827 /* 0 is possible, if f2fs_new_inode() failed */
825 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 828 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
826 set_new_dnode(&dn, inode, page, page, ino); 829 set_new_dnode(&dn, inode, page, page, ino);
827 truncate_node(&dn); 830 truncate_node(&dn);
828 return 0;
829} 831}
830 832
831struct page *new_inode_page(struct inode *inode, const struct qstr *name) 833struct page *new_inode_page(struct inode *inode, const struct qstr *name)
@@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
843 unsigned int ofs, struct page *ipage) 845 unsigned int ofs, struct page *ipage)
844{ 846{
845 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 847 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
846 struct address_space *mapping = sbi->node_inode->i_mapping;
847 struct node_info old_ni, new_ni; 848 struct node_info old_ni, new_ni;
848 struct page *page; 849 struct page *page;
849 int err; 850 int err;
850 851
851 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
852 return ERR_PTR(-EPERM); 853 return ERR_PTR(-EPERM);
853 854
854 page = grab_cache_page(mapping, dn->nid); 855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
855 if (!page) 856 if (!page)
856 return ERR_PTR(-ENOMEM); 857 return ERR_PTR(-ENOMEM);
857 858
858 if (!inc_valid_node_count(sbi, dn->inode, 1)) { 859 if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
859 err = -ENOSPC; 860 err = -ENOSPC;
860 goto fail; 861 goto fail;
861 } 862 }
@@ -898,14 +899,14 @@ fail:
898 * LOCKED_PAGE: f2fs_put_page(page, 1) 899 * LOCKED_PAGE: f2fs_put_page(page, 1)
899 * error: nothing 900 * error: nothing
900 */ 901 */
901static int read_node_page(struct page *page, int type) 902static int read_node_page(struct page *page, int rw)
902{ 903{
903 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 904 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
904 struct node_info ni; 905 struct node_info ni;
905 906
906 get_node_info(sbi, page->index, &ni); 907 get_node_info(sbi, page->index, &ni);
907 908
908 if (ni.blk_addr == NULL_ADDR) { 909 if (unlikely(ni.blk_addr == NULL_ADDR)) {
909 f2fs_put_page(page, 1); 910 f2fs_put_page(page, 1);
910 return -ENOENT; 911 return -ENOENT;
911 } 912 }
@@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type)
913 if (PageUptodate(page)) 914 if (PageUptodate(page))
914 return LOCKED_PAGE; 915 return LOCKED_PAGE;
915 916
916 return f2fs_readpage(sbi, page, ni.blk_addr, type); 917 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
917} 918}
918 919
919/* 920/*
@@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type)
921 */ 922 */
922void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) 923void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
923{ 924{
924 struct address_space *mapping = sbi->node_inode->i_mapping;
925 struct page *apage; 925 struct page *apage;
926 int err; 926 int err;
927 927
928 apage = find_get_page(mapping, nid); 928 apage = find_get_page(NODE_MAPPING(sbi), nid);
929 if (apage && PageUptodate(apage)) { 929 if (apage && PageUptodate(apage)) {
930 f2fs_put_page(apage, 0); 930 f2fs_put_page(apage, 0);
931 return; 931 return;
932 } 932 }
933 f2fs_put_page(apage, 0); 933 f2fs_put_page(apage, 0);
934 934
935 apage = grab_cache_page(mapping, nid); 935 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
936 if (!apage) 936 if (!apage)
937 return; 937 return;
938 938
@@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
945 945
946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) 946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
947{ 947{
948 struct address_space *mapping = sbi->node_inode->i_mapping;
949 struct page *page; 948 struct page *page;
950 int err; 949 int err;
951repeat: 950repeat:
952 page = grab_cache_page(mapping, nid); 951 page = grab_cache_page(NODE_MAPPING(sbi), nid);
953 if (!page) 952 if (!page)
954 return ERR_PTR(-ENOMEM); 953 return ERR_PTR(-ENOMEM);
955 954
@@ -960,11 +959,11 @@ repeat:
960 goto got_it; 959 goto got_it;
961 960
962 lock_page(page); 961 lock_page(page);
963 if (!PageUptodate(page)) { 962 if (unlikely(!PageUptodate(page))) {
964 f2fs_put_page(page, 1); 963 f2fs_put_page(page, 1);
965 return ERR_PTR(-EIO); 964 return ERR_PTR(-EIO);
966 } 965 }
967 if (page->mapping != mapping) { 966 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
968 f2fs_put_page(page, 1); 967 f2fs_put_page(page, 1);
969 goto repeat; 968 goto repeat;
970 } 969 }
@@ -981,7 +980,6 @@ got_it:
981struct page *get_node_page_ra(struct page *parent, int start) 980struct page *get_node_page_ra(struct page *parent, int start)
982{ 981{
983 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 982 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
984 struct address_space *mapping = sbi->node_inode->i_mapping;
985 struct blk_plug plug; 983 struct blk_plug plug;
986 struct page *page; 984 struct page *page;
987 int err, i, end; 985 int err, i, end;
@@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start)
992 if (!nid) 990 if (!nid)
993 return ERR_PTR(-ENOENT); 991 return ERR_PTR(-ENOENT);
994repeat: 992repeat:
995 page = grab_cache_page(mapping, nid); 993 page = grab_cache_page(NODE_MAPPING(sbi), nid);
996 if (!page) 994 if (!page)
997 return ERR_PTR(-ENOMEM); 995 return ERR_PTR(-ENOMEM);
998 996
@@ -1017,12 +1015,12 @@ repeat:
1017 blk_finish_plug(&plug); 1015 blk_finish_plug(&plug);
1018 1016
1019 lock_page(page); 1017 lock_page(page);
1020 if (page->mapping != mapping) { 1018 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1021 f2fs_put_page(page, 1); 1019 f2fs_put_page(page, 1);
1022 goto repeat; 1020 goto repeat;
1023 } 1021 }
1024page_hit: 1022page_hit:
1025 if (!PageUptodate(page)) { 1023 if (unlikely(!PageUptodate(page))) {
1026 f2fs_put_page(page, 1); 1024 f2fs_put_page(page, 1);
1027 return ERR_PTR(-EIO); 1025 return ERR_PTR(-EIO);
1028 } 1026 }
@@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn)
1048int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1046int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1049 struct writeback_control *wbc) 1047 struct writeback_control *wbc)
1050{ 1048{
1051 struct address_space *mapping = sbi->node_inode->i_mapping;
1052 pgoff_t index, end; 1049 pgoff_t index, end;
1053 struct pagevec pvec; 1050 struct pagevec pvec;
1054 int step = ino ? 2 : 0; 1051 int step = ino ? 2 : 0;
@@ -1062,7 +1059,7 @@ next_step:
1062 1059
1063 while (index <= end) { 1060 while (index <= end) {
1064 int i, nr_pages; 1061 int i, nr_pages;
1065 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1062 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1066 PAGECACHE_TAG_DIRTY, 1063 PAGECACHE_TAG_DIRTY,
1067 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1064 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1068 if (nr_pages == 0) 1065 if (nr_pages == 0)
@@ -1095,7 +1092,7 @@ next_step:
1095 else if (!trylock_page(page)) 1092 else if (!trylock_page(page))
1096 continue; 1093 continue;
1097 1094
1098 if (unlikely(page->mapping != mapping)) { 1095 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1099continue_unlock: 1096continue_unlock:
1100 unlock_page(page); 1097 unlock_page(page);
1101 continue; 1098 continue;
@@ -1122,7 +1119,7 @@ continue_unlock:
1122 set_fsync_mark(page, 0); 1119 set_fsync_mark(page, 0);
1123 set_dentry_mark(page, 0); 1120 set_dentry_mark(page, 0);
1124 } 1121 }
1125 mapping->a_ops->writepage(page, wbc); 1122 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1126 wrote++; 1123 wrote++;
1127 1124
1128 if (--wbc->nr_to_write == 0) 1125 if (--wbc->nr_to_write == 0)
@@ -1143,31 +1140,31 @@ continue_unlock:
1143 } 1140 }
1144 1141
1145 if (wrote) 1142 if (wrote)
1146 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); 1143 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1147
1148 return nwritten; 1144 return nwritten;
1149} 1145}
1150 1146
1151int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1147int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1152{ 1148{
1153 struct address_space *mapping = sbi->node_inode->i_mapping;
1154 pgoff_t index = 0, end = LONG_MAX; 1149 pgoff_t index = 0, end = LONG_MAX;
1155 struct pagevec pvec; 1150 struct pagevec pvec;
1156 int nr_pages;
1157 int ret2 = 0, ret = 0; 1151 int ret2 = 0, ret = 0;
1158 1152
1159 pagevec_init(&pvec, 0); 1153 pagevec_init(&pvec, 0);
1160 while ((index <= end) && 1154
1161 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1155 while (index <= end) {
1162 PAGECACHE_TAG_WRITEBACK, 1156 int i, nr_pages;
1163 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 1157 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1164 unsigned i; 1158 PAGECACHE_TAG_WRITEBACK,
1159 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1160 if (nr_pages == 0)
1161 break;
1165 1162
1166 for (i = 0; i < nr_pages; i++) { 1163 for (i = 0; i < nr_pages; i++) {
1167 struct page *page = pvec.pages[i]; 1164 struct page *page = pvec.pages[i];
1168 1165
1169 /* until radix tree lookup accepts end_index */ 1166 /* until radix tree lookup accepts end_index */
1170 if (page->index > end) 1167 if (unlikely(page->index > end))
1171 continue; 1168 continue;
1172 1169
1173 if (ino && ino_of_node(page) == ino) { 1170 if (ino && ino_of_node(page) == ino) {
@@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1180 cond_resched(); 1177 cond_resched();
1181 } 1178 }
1182 1179
1183 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 1180 if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1184 ret2 = -ENOSPC; 1181 ret2 = -ENOSPC;
1185 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 1182 if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1186 ret2 = -EIO; 1183 ret2 = -EIO;
1187 if (!ret) 1184 if (!ret)
1188 ret = ret2; 1185 ret = ret2;
@@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page,
1196 nid_t nid; 1193 nid_t nid;
1197 block_t new_addr; 1194 block_t new_addr;
1198 struct node_info ni; 1195 struct node_info ni;
1196 struct f2fs_io_info fio = {
1197 .type = NODE,
1198 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
1199 };
1199 1200
1200 if (sbi->por_doing) 1201 if (unlikely(sbi->por_doing))
1201 goto redirty_out; 1202 goto redirty_out;
1202 1203
1203 wait_on_page_writeback(page); 1204 wait_on_page_writeback(page);
@@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page,
1209 get_node_info(sbi, nid, &ni); 1210 get_node_info(sbi, nid, &ni);
1210 1211
1211 /* This page is already truncated */ 1212 /* This page is already truncated */
1212 if (ni.blk_addr == NULL_ADDR) { 1213 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1213 dec_page_count(sbi, F2FS_DIRTY_NODES); 1214 dec_page_count(sbi, F2FS_DIRTY_NODES);
1214 unlock_page(page); 1215 unlock_page(page);
1215 return 0; 1216 return 0;
@@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page,
1220 1221
1221 mutex_lock(&sbi->node_write); 1222 mutex_lock(&sbi->node_write);
1222 set_page_writeback(page); 1223 set_page_writeback(page);
1223 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); 1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1224 set_node_addr(sbi, &ni, new_addr); 1225 set_node_addr(sbi, &ni, new_addr);
1225 dec_page_count(sbi, F2FS_DIRTY_NODES); 1226 dec_page_count(sbi, F2FS_DIRTY_NODES);
1226 mutex_unlock(&sbi->node_write); 1227 mutex_unlock(&sbi->node_write);
@@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1255 1256
1256 /* if mounting is failed, skip writing node pages */ 1257 /* if mounting is failed, skip writing node pages */
1257 wbc->nr_to_write = 3 * max_hw_blocks(sbi); 1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE;
1258 sync_node_pages(sbi, 0, wbc); 1260 sync_node_pages(sbi, 0, wbc);
1259 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1260 wbc->nr_to_write); 1262 wbc->nr_to_write);
@@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1333 return -1; 1335 return -1;
1334 1336
1335 /* 0 nid should not be used */ 1337 /* 0 nid should not be used */
1336 if (nid == 0) 1338 if (unlikely(nid == 0))
1337 return 0; 1339 return 0;
1338 1340
1339 if (build) { 1341 if (build) {
@@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
1386 1388
1387 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { 1389 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1388 1390
1389 if (start_nid >= nm_i->max_nid) 1391 if (unlikely(start_nid >= nm_i->max_nid))
1390 break; 1392 break;
1391 1393
1392 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1394 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
@@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1420 f2fs_put_page(page, 1); 1422 f2fs_put_page(page, 1);
1421 1423
1422 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); 1424 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1423 if (nid >= nm_i->max_nid) 1425 if (unlikely(nid >= nm_i->max_nid))
1424 nid = 0; 1426 nid = 0;
1425 1427
1426 if (i++ == FREE_NID_PAGES) 1428 if (i++ == FREE_NID_PAGES)
@@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454 struct free_nid *i = NULL; 1456 struct free_nid *i = NULL;
1455 struct list_head *this; 1457 struct list_head *this;
1456retry: 1458retry:
1457 if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) 1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1458 return false; 1460 return false;
1459 1461
1460 spin_lock(&nm_i->free_nid_list_lock); 1462 spin_lock(&nm_i->free_nid_list_lock);
@@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1535 1537
1536int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1537{ 1539{
1538 struct address_space *mapping = sbi->node_inode->i_mapping; 1540 struct f2fs_inode *src, *dst;
1539 struct f2fs_node *src, *dst;
1540 nid_t ino = ino_of_node(page); 1541 nid_t ino = ino_of_node(page);
1541 struct node_info old_ni, new_ni; 1542 struct node_info old_ni, new_ni;
1542 struct page *ipage; 1543 struct page *ipage;
1543 1544
1544 ipage = grab_cache_page(mapping, ino); 1545 ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
1545 if (!ipage) 1546 if (!ipage)
1546 return -ENOMEM; 1547 return -ENOMEM;
1547 1548
@@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1552 SetPageUptodate(ipage); 1553 SetPageUptodate(ipage);
1553 fill_node_footer(ipage, ino, ino, 0, true); 1554 fill_node_footer(ipage, ino, ino, 0, true);
1554 1555
1555 src = F2FS_NODE(page); 1556 src = F2FS_INODE(page);
1556 dst = F2FS_NODE(ipage); 1557 dst = F2FS_INODE(ipage);
1557 1558
1558 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); 1559 memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
1559 dst->i.i_size = 0; 1560 dst->i_size = 0;
1560 dst->i.i_blocks = cpu_to_le64(1); 1561 dst->i_blocks = cpu_to_le64(1);
1561 dst->i.i_links = cpu_to_le32(1); 1562 dst->i_links = cpu_to_le32(1);
1562 dst->i.i_xattr_nid = 0; 1563 dst->i_xattr_nid = 0;
1563 1564
1564 new_ni = old_ni; 1565 new_ni = old_ni;
1565 new_ni.ino = ino; 1566 new_ni.ino = ino;
1566 1567
1567 if (!inc_valid_node_count(sbi, NULL, 1)) 1568 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1568 WARN_ON(1); 1569 WARN_ON(1);
1569 set_node_addr(sbi, &new_ni, NEW_ADDR); 1570 set_node_addr(sbi, &new_ni, NEW_ADDR);
1570 inc_valid_inode_count(sbi); 1571 inc_valid_inode_count(sbi);
@@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1572 return 0; 1573 return 0;
1573} 1574}
1574 1575
1576/*
1577 * ra_sum_pages() merges contiguous pages into one bio and submits it.
1578 * These pre-read pages are linked in the pages list.
1579 */
1580static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1581 int start, int nrpages)
1582{
1583 struct page *page;
1584 int page_idx = start;
1585 struct f2fs_io_info fio = {
1586 .type = META,
1587 .rw = READ_SYNC | REQ_META | REQ_PRIO
1588 };
1589
1590 for (; page_idx < start + nrpages; page_idx++) {
1591 /* allocate a temporary page to read the node summary info */
1592 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) {
1594 struct page *tmp;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602
1603 lock_page(page);
1604 page->index = page_idx;
1605 list_add_tail(&page->lru, pages);
1606 }
1607
1608 list_for_each_entry(page, pages, lru)
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610
1611 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0;
1613}
1614
1575int restore_node_summary(struct f2fs_sb_info *sbi, 1615int restore_node_summary(struct f2fs_sb_info *sbi,
1576 unsigned int segno, struct f2fs_summary_block *sum) 1616 unsigned int segno, struct f2fs_summary_block *sum)
1577{ 1617{
1578 struct f2fs_node *rn; 1618 struct f2fs_node *rn;
1579 struct f2fs_summary *sum_entry; 1619 struct f2fs_summary *sum_entry;
1580 struct page *page; 1620 struct page *page, *tmp;
1581 block_t addr; 1621 block_t addr;
1582 int i, last_offset; 1622 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1583 1623 int i, last_offset, nrpages, err = 0;
1584 /* alloc temporal page for read node */ 1624 LIST_HEAD(page_list);
1585 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1586 if (!page)
1587 return -ENOMEM;
1588 lock_page(page);
1589 1625
1590 /* scan the node segment */ 1626 /* scan the node segment */
1591 last_offset = sbi->blocks_per_seg; 1627 last_offset = sbi->blocks_per_seg;
1592 addr = START_BLOCK(sbi, segno); 1628 addr = START_BLOCK(sbi, segno);
1593 sum_entry = &sum->entries[0]; 1629 sum_entry = &sum->entries[0];
1594 1630
1595 for (i = 0; i < last_offset; i++, sum_entry++) { 1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1596 /* 1632 nrpages = min(last_offset - i, bio_blocks);
1597 * In order to read next node page,
1598 * we must clear PageUptodate flag.
1599 */
1600 ClearPageUptodate(page);
1601 1633
1602 if (f2fs_readpage(sbi, page, addr, READ_SYNC)) 1634 /* read ahead node pages */
1603 goto out; 1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err)
1637 return err;
1604 1638
1605 lock_page(page); 1639 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1606 rn = F2FS_NODE(page); 1640
1607 sum_entry->nid = rn->footer.nid; 1641 lock_page(page);
1608 sum_entry->version = 0; 1642 if (unlikely(!PageUptodate(page))) {
1609 sum_entry->ofs_in_node = 0; 1643 err = -EIO;
1610 addr++; 1644 } else {
1645 rn = F2FS_NODE(page);
1646 sum_entry->nid = rn->footer.nid;
1647 sum_entry->version = 0;
1648 sum_entry->ofs_in_node = 0;
1649 sum_entry++;
1650 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page);
1654 __free_pages(page, 0);
1655 }
1611 } 1656 }
1612 unlock_page(page); 1657 return err;
1613out:
1614 __free_pages(page, 0);
1615 return 0;
1616} 1658}
1617 1659
1618static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1660static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
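
Stepping back from the restore_node_summary() rewrite above: it replaces one synchronous read per summary block with read-ahead in bio-sized windows (nrpages = min(last_offset - i, bio_blocks)). The batching pattern itself is generic; a minimal sketch with illustrative names, not the f2fs API:

    #include <stdio.h>

    /* Walk [0, total) in windows of at most batch items, the way
     * restore_node_summary() now walks a segment in MAX_BIO_BLOCKS-sized
     * read-ahead windows. */
    static void process_in_batches(int total, int batch)
    {
            for (int i = 0; i < total; i += batch) {
                    int n = (total - i < batch) ? total - i : batch;
                    printf("read ahead blocks [%d, %d)\n", i, i + n);
                    /* ...submit one merged bio for these n blocks, then consume them... */
            }
    }

    int main(void)
    {
            process_in_batches(512, 16); /* e.g. 512 blocks per segment, 16-block bios */
            return 0;
    }
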
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3496bb3e15dc..c4c79885c993 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
224 * | `- direct node (5 + N => 5 + 2N - 1) 224 * | `- direct node (5 + N => 5 + 2N - 1)
225 * `- double indirect node (5 + 2N) 225 * `- double indirect node (5 + 2N)
226 * `- indirect node (6 + 2N) 226 * `- indirect node (6 + 2N)
227 * `- direct node (x(N + 1)) 227 * `- direct node
228 * ......
229 * `- indirect node ((6 + 2N) + x(N + 1))
230 * `- direct node
231 * ......
232 * `- indirect node ((6 + 2N) + (N - 1)(N + 1))
233 * `- direct node
228 */ 234 */
229static inline bool IS_DNODE(struct page *node_page) 235static inline bool IS_DNODE(struct page *node_page)
230{ 236{
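
To make the extended offset formulas in the comment concrete, assuming the common 4 KB-block constants of N = NIDS_PER_BLOCK = 1018 slots per node block and 923 direct pointers in the inode (neither shown in this hunk), a back-of-the-envelope sketch of the tree's capacity:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed defaults for a 4 KB block: 923 direct pointers in the
             * inode (DEF_ADDRS_PER_INODE), N = 1018 (NIDS_PER_BLOCK). */
            long long n = 1018, inode_ptrs = 923;

            long long blocks = inode_ptrs   /* direct, in the inode */
                    + 2 * n                 /* two direct node blocks */
                    + 2 * n * n             /* two indirect nodes */
                    + n * n * n;            /* one double-indirect node */

            printf("max addressable blocks: %lld (~%.2f TiB at 4 KiB/block)\n",
                   blocks, blocks * 4096.0 / (1024.0 * 1024 * 1024 * 1024));
            return 0;
    }
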
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fdc81161f254..976a7a934db5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = F2FS_NODE(ipage); 43 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 nid_t pino = le32_to_cpu(raw_inode->i_pino); 44 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 45 struct f2fs_dir_entry *de;
47 struct qstr name; 46 struct qstr name;
@@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 61
63 name.len = le32_to_cpu(raw_inode->i_namelen); 62 name.len = le32_to_cpu(raw_inode->i_namelen);
64 name.name = raw_inode->i_name; 63 name.name = raw_inode->i_name;
64
65 if (unlikely(name.len > F2FS_NAME_LEN)) {
66 WARN_ON(1);
67 err = -ENAMETOOLONG;
68 goto out;
69 }
65retry: 70retry:
66 de = f2fs_find_entry(dir, &name, &page); 71 de = f2fs_find_entry(dir, &name, &page);
67 if (de && inode->i_ino == le32_to_cpu(de->ino)) 72 if (de && inode->i_ino == le32_to_cpu(de->ino))
@@ -90,17 +95,16 @@ out_unmap_put:
90 kunmap(page); 95 kunmap(page);
91 f2fs_put_page(page, 0); 96 f2fs_put_page(page, 0);
92out: 97out:
93 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " 98 f2fs_msg(inode->i_sb, KERN_NOTICE,
94 "ino = %x, name = %s, dir = %lx, err = %d", 99 "%s: ino = %x, name = %s, dir = %lx, err = %d",
95 ino_of_node(ipage), raw_inode->i_name, 100 __func__, ino_of_node(ipage), raw_inode->i_name,
96 IS_ERR(dir) ? 0 : dir->i_ino, err); 101 IS_ERR(dir) ? 0 : dir->i_ino, err);
97 return err; 102 return err;
98} 103}
99 104
100static int recover_inode(struct inode *inode, struct page *node_page) 105static int recover_inode(struct inode *inode, struct page *node_page)
101{ 106{
102 struct f2fs_node *raw_node = F2FS_NODE(node_page); 107 struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
103 struct f2fs_inode *raw_inode = &(raw_node->i);
104 108
105 if (!IS_INODE(node_page)) 109 if (!IS_INODE(node_page))
106 return 0; 110 return 0;
@@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
143 while (1) { 147 while (1) {
144 struct fsync_inode_entry *entry; 148 struct fsync_inode_entry *entry;
145 149
146 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 150 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
147 if (err) 151 if (err)
148 goto out; 152 return err;
149 153
150 lock_page(page); 154 lock_page(page);
151 155
@@ -191,9 +195,10 @@ next:
191 /* check next segment */ 195 /* check next segment */
192 blkaddr = next_blkaddr_of_node(page); 196 blkaddr = next_blkaddr_of_node(page);
193 } 197 }
198
194 unlock_page(page); 199 unlock_page(page);
195out:
196 __free_pages(page, 0); 200 __free_pages(page, 0);
201
197 return err; 202 return err;
198} 203}
199 204
@@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
293 struct node_info ni; 298 struct node_info ni;
294 int err = 0, recovered = 0; 299 int err = 0, recovered = 0;
295 300
301 if (recover_inline_data(inode, page))
302 goto out;
303
296 start = start_bidx_of_node(ofs_of_node(page), fi); 304 start = start_bidx_of_node(ofs_of_node(page), fi);
297 if (IS_INODE(page)) 305 if (IS_INODE(page))
298 end = start + ADDRS_PER_INODE(fi); 306 end = start + ADDRS_PER_INODE(fi);
@@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 end = start + ADDRS_PER_BLOCK; 308 end = start + ADDRS_PER_BLOCK;
301 309
302 f2fs_lock_op(sbi); 310 f2fs_lock_op(sbi);
311
303 set_new_dnode(&dn, inode, NULL, NULL, 0); 312 set_new_dnode(&dn, inode, NULL, NULL, 0);
304 313
305 err = get_dnode_of_data(&dn, start, ALLOC_NODE); 314 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
306 if (err) { 315 if (err) {
307 f2fs_unlock_op(sbi); 316 f2fs_unlock_op(sbi);
308 return err; 317 goto out;
309 } 318 }
310 319
311 wait_on_page_writeback(dn.node_page); 320 wait_on_page_writeback(dn.node_page);
@@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
356err: 365err:
357 f2fs_put_dnode(&dn); 366 f2fs_put_dnode(&dn);
358 f2fs_unlock_op(sbi); 367 f2fs_unlock_op(sbi);
359 368out:
360 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " 369 f2fs_msg(sbi->sb, KERN_NOTICE,
361 "recovered_data = %d blocks, err = %d", 370 "recover_data: ino = %lx, recovered = %d blocks, err = %d",
362 inode->i_ino, recovered, err); 371 inode->i_ino, recovered, err);
363 return err; 372 return err;
364} 373}
365 374
@@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
377 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 386 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
378 387
379 /* read node page */ 388 /* read node page */
380 page = alloc_page(GFP_NOFS | __GFP_ZERO); 389 page = alloc_page(GFP_F2FS_ZERO);
381 if (!page) 390 if (!page)
382 return -ENOMEM; 391 return -ENOMEM;
383 392
@@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
386 while (1) { 395 while (1) {
387 struct fsync_inode_entry *entry; 396 struct fsync_inode_entry *entry;
388 397
389 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 398 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
390 if (err) 399 if (err)
391 goto out; 400 return err;
392 401
393 lock_page(page); 402 lock_page(page);
394 403
@@ -412,8 +421,8 @@ next:
412 /* check next segment */ 421 /* check next segment */
413 blkaddr = next_blkaddr_of_node(page); 422 blkaddr = next_blkaddr_of_node(page);
414 } 423 }
424
415 unlock_page(page); 425 unlock_page(page);
416out:
417 __free_pages(page, 0); 426 __free_pages(page, 0);
418 427
419 if (!err) 428 if (!err)
@@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
429 438
430 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
431 sizeof(struct fsync_inode_entry), NULL); 440 sizeof(struct fsync_inode_entry), NULL);
432 if (unlikely(!fsync_entry_slab)) 441 if (!fsync_entry_slab)
433 return -ENOMEM; 442 return -ENOMEM;
434 443
435 INIT_LIST_HEAD(&inode_list); 444 INIT_LIST_HEAD(&inode_list);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fa284d397199..7caac5f2ca9e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,12 +14,163 @@
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/swap.h>
17 18
18#include "f2fs.h" 19#include "f2fs.h"
19#include "segment.h" 20#include "segment.h"
20#include "node.h" 21#include "node.h"
21#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
22 23
24#define __reverse_ffz(x) __reverse_ffs(~(x))
25
26static struct kmem_cache *discard_entry_slab;
27
28/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
30 * MSB and LSB are reversed in a byte by f2fs_set_bit.
31 */
32static inline unsigned long __reverse_ffs(unsigned long word)
33{
34 int num = 0;
35
36#if BITS_PER_LONG == 64
37 if ((word & 0xffffffff) == 0) {
38 num += 32;
39 word >>= 32;
40 }
41#endif
42 if ((word & 0xffff) == 0) {
43 num += 16;
44 word >>= 16;
45 }
46 if ((word & 0xff) == 0) {
47 num += 8;
48 word >>= 8;
49 }
50 if ((word & 0xf0) == 0)
51 num += 4;
52 else
53 word >>= 4;
54 if ((word & 0xc) == 0)
55 num += 2;
56 else
57 word >>= 2;
58 if ((word & 0x2) == 0)
59 num += 1;
60 return num;
61}
62
63/*
64 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
65 * f2fs_set_bit makes MSB and LSB reversed in a byte.
66 * Example:
67 * LSB <--> MSB
68 * f2fs_set_bit(0, bitmap) => 0000 0001
69 * f2fs_set_bit(7, bitmap) => 1000 0000
70 */
71static unsigned long __find_rev_next_bit(const unsigned long *addr,
72 unsigned long size, unsigned long offset)
73{
74 const unsigned long *p = addr + BIT_WORD(offset);
75 unsigned long result = offset & ~(BITS_PER_LONG - 1);
76 unsigned long tmp;
77 unsigned long mask, submask;
78 unsigned long quot, rest;
79
80 if (offset >= size)
81 return size;
82
83 size -= result;
84 offset %= BITS_PER_LONG;
85 if (!offset)
86 goto aligned;
87
88 tmp = *(p++);
89 quot = (offset >> 3) << 3;
90 rest = offset & 0x7;
91 mask = ~0UL << quot;
92 submask = (unsigned char)(0xff << rest) >> rest;
93 submask <<= quot;
94 mask &= submask;
95 tmp &= mask;
96 if (size < BITS_PER_LONG)
97 goto found_first;
98 if (tmp)
99 goto found_middle;
100
101 size -= BITS_PER_LONG;
102 result += BITS_PER_LONG;
103aligned:
104 while (size & ~(BITS_PER_LONG-1)) {
105 tmp = *(p++);
106 if (tmp)
107 goto found_middle;
108 result += BITS_PER_LONG;
109 size -= BITS_PER_LONG;
110 }
111 if (!size)
112 return result;
113 tmp = *p;
114found_first:
115 tmp &= (~0UL >> (BITS_PER_LONG - size));
116 if (tmp == 0UL) /* Are any bits set? */
117 return result + size; /* Nope. */
118found_middle:
119 return result + __reverse_ffs(tmp);
120}
121
122static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
123 unsigned long size, unsigned long offset)
124{
125 const unsigned long *p = addr + BIT_WORD(offset);
126 unsigned long result = offset & ~(BITS_PER_LONG - 1);
127 unsigned long tmp;
128 unsigned long mask, submask;
129 unsigned long quot, rest;
130
131 if (offset >= size)
132 return size;
133
134 size -= result;
135 offset %= BITS_PER_LONG;
136 if (!offset)
137 goto aligned;
138
139 tmp = *(p++);
140 quot = (offset >> 3) << 3;
141 rest = offset & 0x7;
142 mask = ~(~0UL << quot);
143 submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
144 submask <<= quot;
145 mask += submask;
146 tmp |= mask;
147 if (size < BITS_PER_LONG)
148 goto found_first;
149 if (~tmp)
150 goto found_middle;
151
152 size -= BITS_PER_LONG;
153 result += BITS_PER_LONG;
154aligned:
155 while (size & ~(BITS_PER_LONG - 1)) {
156 tmp = *(p++);
157 if (~tmp)
158 goto found_middle;
159 result += BITS_PER_LONG;
160 size -= BITS_PER_LONG;
161 }
162 if (!size)
163 return result;
164 tmp = *p;
165
166found_first:
167 tmp |= ~0UL << size;
168 if (tmp == ~0UL) /* Are any bits zero? */
169 return result + size; /* Nope. */
170found_middle:
171 return result + __reverse_ffz(tmp);
172}
173
23/* 174/*
24 * This function balances dirty node and dentry pages. 175 * This function balances dirty node and dentry pages.
25 * In addition, it controls garbage collection. 176 * In addition, it controls garbage collection.
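
Why segment.c now needs private __reverse_ffs()/__find_rev_next_bit() helpers at all: f2fs_set_bit() numbers bits MSB-first within each byte, so the generic LSB-first ffs()/find_next_bit() would report wrong positions. A small stand-alone demo of that convention (f2fs_set_bit re-implemented locally for illustration):

    #include <stdio.h>

    /* f2fs's bit convention: bit 0 is the MSB of byte 0. */
    static void f2fs_set_bit(unsigned nr, unsigned char *addr)
    {
            addr[nr >> 3] |= 1 << (7 - (nr & 7));
    }

    int main(void)
    {
            unsigned char map[2] = { 0, 0 };

            f2fs_set_bit(0, map); /* sets 0x80 in byte 0, not 0x01 */
            f2fs_set_bit(7, map); /* sets 0x01 in byte 0 */
            f2fs_set_bit(9, map); /* sets 0x40 in byte 1 */

            printf("byte0=0x%02x byte1=0x%02x\n", map[0], map[1]);
            /* A generic LSB-first ffs() on byte0 would report bit 0 for 0x01
             * (f2fs bit 7), hence the byte-reversed find_next_bit variants. */
            return 0;
    }
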
@@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
116 mutex_unlock(&dirty_i->seglist_lock); 267 mutex_unlock(&dirty_i->seglist_lock);
117} 268}
118 269
270static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
271 block_t blkstart, block_t blklen)
272{
273 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
274 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
275 blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
276 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
277}
278
279static void add_discard_addrs(struct f2fs_sb_info *sbi,
280 unsigned int segno, struct seg_entry *se)
281{
282 struct list_head *head = &SM_I(sbi)->discard_list;
283 struct discard_entry *new;
284 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
285 int max_blocks = sbi->blocks_per_seg;
286 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
287 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
288 unsigned long dmap[entries];
289 unsigned int start = 0, end = -1;
290 int i;
291
292 if (!test_opt(sbi, DISCARD))
293 return;
294
295 /* zero block will be discarded through the prefree list */
296 if (!se->valid_blocks || se->valid_blocks == max_blocks)
297 return;
298
299 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
300 for (i = 0; i < entries; i++)
301 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
302
303 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
304 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
305 if (start >= max_blocks)
306 break;
307
308 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
309
310 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
311 INIT_LIST_HEAD(&new->list);
312 new->blkaddr = START_BLOCK(sbi, segno) + start;
313 new->len = end - start;
314
315 list_add_tail(&new->list, head);
316 SM_I(sbi)->nr_discards += end - start;
317 }
318}
319
119/* 320/*
120 * Should call clear_prefree_segments after checkpoint is done. 321 * Should call clear_prefree_segments after checkpoint is done.
121 */ 322 */
@@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
138 339
139void clear_prefree_segments(struct f2fs_sb_info *sbi) 340void clear_prefree_segments(struct f2fs_sb_info *sbi)
140{ 341{
342 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next;
344 struct discard_entry *entry;
141 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
142 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
143 unsigned int total_segs = TOTAL_SEGS(sbi); 347 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
160 if (!test_opt(sbi, DISCARD)) 364 if (!test_opt(sbi, DISCARD))
161 continue; 365 continue;
162 366
163 blkdev_issue_discard(sbi->sb->s_bdev, 367 f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
164 START_BLOCK(sbi, start) << 368 (end - start) << sbi->log_blocks_per_seg);
165 sbi->log_sectors_per_block,
166 (1 << (sbi->log_sectors_per_block +
167 sbi->log_blocks_per_seg)) * (end - start),
168 GFP_NOFS, 0);
169 } 369 }
170 mutex_unlock(&dirty_i->seglist_lock); 370 mutex_unlock(&dirty_i->seglist_lock);
371
372 /* send small discards */
373 list_for_each_safe(this, next, head) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len;
378 kmem_cache_free(discard_entry_slab, entry);
379 }
171} 380}
172 381
173static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 382static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
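
Assuming the common 4 KB-block, 512-byte-sector layout (log_sectors_per_block = 3), the open-coded shifts removed above and the new f2fs_issue_discard() helper compute the same thing: start sector = block << 3, length = (end - start) segments * blocks_per_seg blocks * 8 sectors each; the helper just centralizes the block-to-sector conversion behind SECTOR_FROM_BLOCK().
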
@@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
459 struct curseg_info *seg, block_t start) 668 struct curseg_info *seg, block_t start)
460{ 669{
461 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 670 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
462 block_t ofs; 671 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
463 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { 672 unsigned long target_map[entries];
464 if (!f2fs_test_bit(ofs, se->ckpt_valid_map) 673 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
465 && !f2fs_test_bit(ofs, se->cur_valid_map)) 674 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
466 break; 675 int i, pos;
467 } 676
468 seg->next_blkoff = ofs; 677 for (i = 0; i < entries; i++)
678 target_map[i] = ckpt_map[i] | cur_map[i];
679
680 pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
681
682 seg->next_blkoff = pos;
469} 683}
470 684
471/* 685/*
@@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = {
573 .allocate_segment = allocate_segment_by_default, 787 .allocate_segment = allocate_segment_by_default,
574}; 788};
575 789
576static void f2fs_end_io_write(struct bio *bio, int err)
577{
578 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
579 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
580 struct bio_private *p = bio->bi_private;
581
582 do {
583 struct page *page = bvec->bv_page;
584
585 if (--bvec >= bio->bi_io_vec)
586 prefetchw(&bvec->bv_page->flags);
587 if (!uptodate) {
588 SetPageError(page);
589 if (page->mapping)
590 set_bit(AS_EIO, &page->mapping->flags);
591 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
592 p->sbi->sb->s_flags |= MS_RDONLY;
593 }
594 end_page_writeback(page);
595 dec_page_count(p->sbi, F2FS_WRITEBACK);
596 } while (bvec >= bio->bi_io_vec);
597
598 if (p->is_sync)
599 complete(p->wait);
600
601 if (!get_pages(p->sbi, F2FS_WRITEBACK) &&
602 !list_empty(&p->sbi->cp_wait.task_list))
603 wake_up(&p->sbi->cp_wait);
604
605 kfree(p);
606 bio_put(bio);
607}
608
609struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
610{
611 struct bio *bio;
612
613 /* No failure on bio allocation */
614 bio = bio_alloc(GFP_NOIO, npages);
615 bio->bi_bdev = bdev;
616 bio->bi_private = NULL;
617
618 return bio;
619}
620
621static void do_submit_bio(struct f2fs_sb_info *sbi,
622 enum page_type type, bool sync)
623{
624 int rw = sync ? WRITE_SYNC : WRITE;
625 enum page_type btype = type > META ? META : type;
626
627 if (type >= META_FLUSH)
628 rw = WRITE_FLUSH_FUA;
629
630 if (btype == META)
631 rw |= REQ_META;
632
633 if (sbi->bio[btype]) {
634 struct bio_private *p = sbi->bio[btype]->bi_private;
635 p->sbi = sbi;
636 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
637
638 trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
639
640 if (type == META_FLUSH) {
641 DECLARE_COMPLETION_ONSTACK(wait);
642 p->is_sync = true;
643 p->wait = &wait;
644 submit_bio(rw, sbi->bio[btype]);
645 wait_for_completion(&wait);
646 } else {
647 p->is_sync = false;
648 submit_bio(rw, sbi->bio[btype]);
649 }
650 sbi->bio[btype] = NULL;
651 }
652}
653
654void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
655{
656 down_write(&sbi->bio_sem);
657 do_submit_bio(sbi, type, sync);
658 up_write(&sbi->bio_sem);
659}
660
661static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
662 block_t blk_addr, enum page_type type)
663{
664 struct block_device *bdev = sbi->sb->s_bdev;
665 int bio_blocks;
666
667 verify_block_addr(sbi, blk_addr);
668
669 down_write(&sbi->bio_sem);
670
671 inc_page_count(sbi, F2FS_WRITEBACK);
672
673 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
674 do_submit_bio(sbi, type, false);
675alloc_new:
676 if (sbi->bio[type] == NULL) {
677 struct bio_private *priv;
678retry:
679 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
680 if (!priv) {
681 cond_resched();
682 goto retry;
683 }
684
685 bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
686 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks);
687 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
688 sbi->bio[type]->bi_private = priv;
689 /*
690 * The end_io will be assigned at the sumbission phase.
690 * The end_io will be assigned at the submission phase.
691 * Until then, let bio_add_page() merge consecutive IOs as much
692 * as possible.
693 */
694 }
695
696 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
697 PAGE_CACHE_SIZE) {
698 do_submit_bio(sbi, type, false);
699 goto alloc_new;
700 }
701
702 sbi->last_block_in_bio[type] = blk_addr;
703
704 up_write(&sbi->bio_sem);
705 trace_f2fs_submit_write_page(page, blk_addr, type);
706}
707
708void f2fs_wait_on_page_writeback(struct page *page,
709 enum page_type type, bool sync)
710{
711 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
712 if (PageWriteback(page)) {
713 f2fs_submit_bio(sbi, type, sync);
714 wait_on_page_writeback(page);
715 }
716}
717
718static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 790static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
719{ 791{
720 struct curseg_info *curseg = CURSEG_I(sbi, type); 792 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
782 return __get_segment_type_6(page, p_type); 854 return __get_segment_type_6(page, p_type);
783} 855}
784 856
785static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 857void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
786 block_t old_blkaddr, block_t *new_blkaddr, 858 block_t old_blkaddr, block_t *new_blkaddr,
787 struct f2fs_summary *sum, enum page_type p_type) 859 struct f2fs_summary *sum, int type)
788{ 860{
789 struct sit_info *sit_i = SIT_I(sbi); 861 struct sit_info *sit_i = SIT_I(sbi);
790 struct curseg_info *curseg; 862 struct curseg_info *curseg;
791 unsigned int old_cursegno; 863 unsigned int old_cursegno;
792 int type;
793 864
794 type = __get_segment_type(page, p_type);
795 curseg = CURSEG_I(sbi, type); 865 curseg = CURSEG_I(sbi, type);
796 866
797 mutex_lock(&curseg->curseg_mutex); 867 mutex_lock(&curseg->curseg_mutex);
@@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
824 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
825 mutex_unlock(&sit_i->sentry_lock); 895 mutex_unlock(&sit_i->sentry_lock);
826 896
827 if (p_type == NODE) 897 if (page && IS_NODESEG(type))
828 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); 898 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
829 899
830 /* writeout dirty page into bdev */
831 submit_write_page(sbi, page, *new_blkaddr, p_type);
832
833 mutex_unlock(&curseg->curseg_mutex); 900 mutex_unlock(&curseg->curseg_mutex);
834} 901}
835 902
903static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
904 block_t old_blkaddr, block_t *new_blkaddr,
905 struct f2fs_summary *sum, struct f2fs_io_info *fio)
906{
907 int type = __get_segment_type(page, fio->type);
908
909 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
910
911 /* writeout dirty page into bdev */
912 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
913}
914
836void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 915void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
837{ 916{
917 struct f2fs_io_info fio = {
918 .type = META,
919 .rw = WRITE_SYNC | REQ_META | REQ_PRIO
920 };
921
838 set_page_writeback(page); 922 set_page_writeback(page);
839 submit_write_page(sbi, page, page->index, META); 923 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
840} 924}
841 925
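write_meta_page() above shows the new calling convention: the page type and the block-layer rw flags travel together in one f2fs_io_info instead of separate parameters. A standalone sketch of that descriptor pattern; the flag values are placeholders, not the real REQ_* bits:

#include <stdio.h>

enum page_type { DATA, NODE, META };

struct io_info {
        enum page_type type;
        unsigned rw;
};

static void submit_page(const char *what, const struct io_info *io)
{
        printf("%s: type=%d rw=%#x\n", what, io->type, io->rw);
}

int main(void)
{
        /* Stand-ins for WRITE_SYNC | REQ_META | REQ_PRIO. */
        struct io_info fio = { .type = META, .rw = 0x1 | 0x8 | 0x10 };

        submit_page("meta page", &fio);
        return 0;
}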
842void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 926void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
927 struct f2fs_io_info *fio,
843 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) 928 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
844{ 929{
845 struct f2fs_summary sum; 930 struct f2fs_summary sum;
846 set_summary(&sum, nid, 0, 0); 931 set_summary(&sum, nid, 0, 0);
847 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); 932 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
848} 933}
849 934
850void write_data_page(struct inode *inode, struct page *page, 935void write_data_page(struct page *page, struct dnode_of_data *dn,
851 struct dnode_of_data *dn, block_t old_blkaddr, 936 block_t *new_blkaddr, struct f2fs_io_info *fio)
852 block_t *new_blkaddr)
853{ 937{
854 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 938 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
855 struct f2fs_summary sum; 939 struct f2fs_summary sum;
856 struct node_info ni; 940 struct node_info ni;
857 941
858 f2fs_bug_on(old_blkaddr == NULL_ADDR); 942 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
859 get_node_info(sbi, dn->nid, &ni); 943 get_node_info(sbi, dn->nid, &ni);
860 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 944 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
861 945
862 do_write_page(sbi, page, old_blkaddr, 946 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
863 new_blkaddr, &sum, DATA);
864} 947}
865 948
866void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, 949void rewrite_data_page(struct page *page, block_t old_blkaddr,
867 block_t old_blk_addr) 950 struct f2fs_io_info *fio)
868{ 951{
869 submit_write_page(sbi, page, old_blk_addr, DATA); 952 struct inode *inode = page->mapping->host;
953 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
954 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
870} 955}
871 956
872void recover_data_page(struct f2fs_sb_info *sbi, 957void recover_data_page(struct f2fs_sb_info *sbi,
@@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
925 unsigned int segno, old_cursegno; 1010 unsigned int segno, old_cursegno;
926 block_t next_blkaddr = next_blkaddr_of_node(page); 1011 block_t next_blkaddr = next_blkaddr_of_node(page);
927 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); 1012 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1013 struct f2fs_io_info fio = {
1014 .type = NODE,
1015 .rw = WRITE_SYNC,
1016 };
928 1017
929 curseg = CURSEG_I(sbi, type); 1018 curseg = CURSEG_I(sbi, type);
930 1019
@@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
953 1042
954 /* rewrite node page */ 1043 /* rewrite node page */
955 set_page_writeback(page); 1044 set_page_writeback(page);
956 submit_write_page(sbi, page, new_blkaddr, NODE); 1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
957 f2fs_submit_bio(sbi, NODE, true); 1046 f2fs_submit_merged_bio(sbi, NODE, WRITE);
958 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
959 1048
960 locate_dirty_segment(sbi, old_cursegno); 1049 locate_dirty_segment(sbi, old_cursegno);
@@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
964 mutex_unlock(&curseg->curseg_mutex); 1053 mutex_unlock(&curseg->curseg_mutex);
965} 1054}
966 1055
1056void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type)
1058{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page);
1063 }
1064}
1065
967static int read_compacted_summaries(struct f2fs_sb_info *sbi) 1066static int read_compacted_summaries(struct f2fs_sb_info *sbi)
968{ 1067{
969 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1068 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1314 1413
1315 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1414 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1316 1415
1416 /* add discard candidates */
1417 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
1418 add_discard_addrs(sbi, segno, se);
1419
1317 if (flushed) 1420 if (flushed)
1318 goto to_sit_page; 1421 goto to_sit_page;
1319 1422
@@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1480 return restore_curseg_summaries(sbi); 1583 return restore_curseg_summaries(sbi);
1481} 1584}
1482 1585
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
1626
1483static void build_sit_entries(struct f2fs_sb_info *sbi) 1627static void build_sit_entries(struct f2fs_sb_info *sbi)
1484{ 1628{
1485 struct sit_info *sit_i = SIT_I(sbi); 1629 struct sit_info *sit_i = SIT_I(sbi);
1486 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1630 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1487 struct f2fs_summary_block *sum = curseg->sum_blk; 1631 struct f2fs_summary_block *sum = curseg->sum_blk;
1488 unsigned int start; 1632 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1633 unsigned int i, start, end;
1634 unsigned int readed, start_blk = 0;
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1489 1636
1490 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1637 do {
1491 struct seg_entry *se = &sit_i->sentries[start]; 1638 readed = ra_sit_pages(sbi, start_blk, nrpages);
1492 struct f2fs_sit_block *sit_blk; 1639
1493 struct f2fs_sit_entry sit; 1640 start = start_blk * sit_i->sents_per_block;
1494 struct page *page; 1641 end = (start_blk + readed) * sit_i->sents_per_block;
1495 int i; 1642
1496 1643 for (; start < end && start < TOTAL_SEGS(sbi); start++) {
1497 mutex_lock(&curseg->curseg_mutex); 1644 struct seg_entry *se = &sit_i->sentries[start];
1498 for (i = 0; i < sits_in_cursum(sum); i++) { 1645 struct f2fs_sit_block *sit_blk;
1499 if (le32_to_cpu(segno_in_journal(sum, i)) == start) { 1646 struct f2fs_sit_entry sit;
1500 sit = sit_in_journal(sum, i); 1647 struct page *page;
1501 mutex_unlock(&curseg->curseg_mutex); 1648
1502 goto got_it; 1649 mutex_lock(&curseg->curseg_mutex);
1650 for (i = 0; i < sits_in_cursum(sum); i++) {
1651 if (le32_to_cpu(segno_in_journal(sum, i))
1652 == start) {
1653 sit = sit_in_journal(sum, i);
1654 mutex_unlock(&curseg->curseg_mutex);
1655 goto got_it;
1656 }
1503 } 1657 }
1504 } 1658 mutex_unlock(&curseg->curseg_mutex);
1505 mutex_unlock(&curseg->curseg_mutex); 1659
1506 page = get_current_sit_page(sbi, start); 1660 page = get_current_sit_page(sbi, start);
1507 sit_blk = (struct f2fs_sit_block *)page_address(page); 1661 sit_blk = (struct f2fs_sit_block *)page_address(page);
1508 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; 1662 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1509 f2fs_put_page(page, 1); 1663 f2fs_put_page(page, 1);
1510got_it: 1664got_it:
1511 check_block_count(sbi, start, &sit); 1665 check_block_count(sbi, start, &sit);
1512 seg_info_from_raw_sit(se, &sit); 1666 seg_info_from_raw_sit(se, &sit);
1513 if (sbi->segs_per_sec > 1) { 1667 if (sbi->segs_per_sec > 1) {
1514 struct sec_entry *e = get_sec_entry(sbi, start); 1668 struct sec_entry *e = get_sec_entry(sbi, start);
1515 e->valid_blocks += se->valid_blocks; 1669 e->valid_blocks += se->valid_blocks;
1670 }
1516 } 1671 }
1517 } 1672 start_blk += readed;
1673 } while (start_blk < sit_blk_cnt);
1518} 1674}
1519 1675
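The reworked build_sit_entries() follows a readahead-then-consume shape: prefetch a batch of consecutive SIT blocks, walk the segment entries they cover, repeat until all blocks are done. A schematic standalone sketch; all sizes are invented, and ra() stands in for ra_sit_pages():

#include <stdio.h>

#define SIT_BLOCKS      10      /* SIT_BLK_CNT analogue       */
#define SENTS_PER_BLK   55      /* sents_per_block analogue   */

static int ra(int start, int nrpages)
{
        int n = nrpages;

        if (start + n > SIT_BLOCKS)
                n = SIT_BLOCKS - start;
        /* the real helper submits merged READ bios here */
        return n;
}

int main(void)
{
        int start_blk = 0;
        const int nrpages = 4;          /* MAX_BIO_BLOCKS analogue */

        do {
                int readed = ra(start_blk, nrpages);
                int start = start_blk * SENTS_PER_BLK;
                int end = (start_blk + readed) * SENTS_PER_BLK;

                printf("consume segments %d..%d\n", start, end - 1);
                start_blk += readed;
        } while (start_blk < SIT_BLOCKS);
        return 0;
}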
1520static void init_free_segmap(struct f2fs_sb_info *sbi) 1676static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1644 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1645 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1646 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805
1806 INIT_LIST_HEAD(&sm_info->discard_list);
1807 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0;
1647 1809
1648 err = build_sit_info(sbi); 1810 err = build_sit_info(sbi);
1649 if (err) 1811 if (err)
@@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1760 sbi->sm_info = NULL; 1922 sbi->sm_info = NULL;
1761 kfree(sm_info); 1923 kfree(sm_info);
1762} 1924}
1925
1926int __init create_segment_manager_caches(void)
1927{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL);
1930 if (!discard_entry_slab)
1931 return -ENOMEM;
1932 return 0;
1933}
1934
1935void destroy_segment_manager_caches(void)
1936{
1937 kmem_cache_destroy(discard_entry_slab);
1938}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 269f690b4e24..5731682d7516 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -20,13 +20,8 @@
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) 21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
22 22
23#define IS_DATASEG(t) \ 23#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
24 ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ 24#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
25 (t == CURSEG_WARM_DATA))
26
27#define IS_NODESEG(t) \
28 ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
29 (t == CURSEG_WARM_NODE))
30 25
31#define IS_CURSEG(sbi, seg) \ 26#define IS_CURSEG(sbi, seg) \
32 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ 27 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
@@ -83,25 +78,20 @@
83 (segno / SIT_ENTRY_PER_BLOCK) 78 (segno / SIT_ENTRY_PER_BLOCK)
84#define START_SEGNO(sit_i, segno) \ 79#define START_SEGNO(sit_i, segno) \
85 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 80 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
81#define SIT_BLK_CNT(sbi) \
82 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 83#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 84 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) 85#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections) 86#define TOTAL_SECS(sbi) (sbi->total_sections)
90 87
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 88#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
92 (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 89 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 90#define SECTOR_TO_BLOCK(sbi, sectors) \
94 (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 91 (sectors >> (sbi)->log_sectors_per_block)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 92#define MAX_BIO_BLOCKS(max_hw_blocks) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 93 (min((int)max_hw_blocks, BIO_MAX_PAGES))
97 94
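To see why the new SECTOR_FROM_BLOCK casts to sector_t first: with 4KiB blocks over 512-byte sectors, log_sectors_per_block is 3, and shifting a 32-bit block address left by 3 can overflow before it is widened. A small self-contained demonstration with illustrative constants:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define LOG_SECTORS_PER_BLOCK 3
/* The cast widens BEFORE the shift, as in the fixed macro. */
#define SECTOR_FROM_BLOCK(blk) (((sector_t)(blk)) << LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sec)   ((sec) >> LOG_SECTORS_PER_BLOCK)

int main(void)
{
        uint32_t blk = 0x30000000u;     /* a block ~3TiB into the volume */

        /* 0x180000000: needs more than 32 bits to hold. */
        printf("sector %#llx\n",
               (unsigned long long)SECTOR_FROM_BLOCK(blk));
        printf("block  %#x\n",
               (unsigned)SECTOR_TO_BLOCK(SECTOR_FROM_BLOCK(blk)));
        return 0;
}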
98/* during checkpoint, bio_private is used to synchronize the last bio */
99struct bio_private {
100 struct f2fs_sb_info *sbi;
101 bool is_sync;
102 void *wait;
103};
104
105/* 95/*
106 * indicate a block allocation direction: RIGHT and LEFT. 96 * indicate a block allocation direction: RIGHT and LEFT.
107 * RIGHT means allocating new sections towards the end of volume. 97 * RIGHT means allocating new sections towards the end of volume.
@@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
458 448
459static inline bool need_SSR(struct f2fs_sb_info *sbi) 449static inline bool need_SSR(struct f2fs_sb_info *sbi)
460{ 450{
461 return ((prefree_segments(sbi) / sbi->segs_per_sec) 451 return (prefree_segments(sbi) / sbi->segs_per_sec)
462 + free_sections(sbi) < overprovision_sections(sbi)); 452 + free_sections(sbi) < overprovision_sections(sbi);
463} 453}
464 454
465static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 455static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
467 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 457 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
468 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 458 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
469 459
470 if (sbi->por_doing) 460 if (unlikely(sbi->por_doing))
471 return false; 461 return false;
472 462
473 return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 463 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
474 reserved_sections(sbi))); 464 reserved_sections(sbi));
475} 465}
476 466
477static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) 467static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
478{ 468{
479 return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); 469 return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
480} 470}
481 471
482static inline int utilization(struct f2fs_sb_info *sbi) 472static inline int utilization(struct f2fs_sb_info *sbi)
483{ 473{
484 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); 474 return div_u64((u64)valid_user_blocks(sbi) * 100,
475 sbi->user_block_count);
485} 476}
486 477
487/* 478/*
488 * Sometimes f2fs may be better to drop out-of-place update policy. 479 * Sometimes f2fs may be better to drop out-of-place update policy.
489 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write 480 * And, users can control the policy through sysfs entries.
490 * data in the original place likewise other traditional file systems. 481 * There are five policies with triggering conditions as follows.
491 * But, currently set 100 in percentage, which means it is disabled. 482 * F2FS_IPU_FORCE - all the time,
492 * See below need_inplace_update(). 483 * F2FS_IPU_SSR - if SSR mode is activated,
484 * F2FS_IPU_UTIL - if FS utilization is over threashold,
485 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
486 * threashold,
487 * F2FS_IPUT_DISABLE - disable IPU. (=default option)
493 */ 488 */
494#define MIN_IPU_UTIL 100 489#define DEF_MIN_IPU_UTIL 70
490
491enum {
492 F2FS_IPU_FORCE,
493 F2FS_IPU_SSR,
494 F2FS_IPU_UTIL,
495 F2FS_IPU_SSR_UTIL,
496 F2FS_IPU_DISABLE,
497};
498
495static inline bool need_inplace_update(struct inode *inode) 499static inline bool need_inplace_update(struct inode *inode)
496{ 500{
497 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 501 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
502
503 /* IPU can be done only for the user data */
498 if (S_ISDIR(inode->i_mode)) 504 if (S_ISDIR(inode->i_mode))
499 return false; 505 return false;
500 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) 506
507 switch (SM_I(sbi)->ipu_policy) {
508 case F2FS_IPU_FORCE:
501 return true; 509 return true;
510 case F2FS_IPU_SSR:
511 if (need_SSR(sbi))
512 return true;
513 break;
514 case F2FS_IPU_UTIL:
515 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
516 return true;
517 break;
518 case F2FS_IPU_SSR_UTIL:
519 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
520 return true;
521 break;
522 case F2FS_IPU_DISABLE:
523 break;
524 }
502 return false; 525 return false;
503} 526}
504 527
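The decision matrix above reduces to a pure function of the policy and two inputs. A hedged restatement in standalone C; want_ipu() is a made-up name, and need_SSR()/utilization() are mocked as a boolean and a percentage:

#include <stdbool.h>
#include <stdio.h>

enum ipu_policy { IPU_FORCE, IPU_SSR, IPU_UTIL, IPU_SSR_UTIL, IPU_DISABLE };

static bool want_ipu(enum ipu_policy p, bool ssr, int util, int min_util)
{
        switch (p) {
        case IPU_FORCE:
                return true;
        case IPU_SSR:
                return ssr;
        case IPU_UTIL:
                return util > min_util;
        case IPU_SSR_UTIL:
                return ssr && util > min_util;
        case IPU_DISABLE:
                return false;
        }
        return false;
}

int main(void)
{
        /* With the default 70% threshold, SSR_UTIL fires only when
         * SSR is active AND utilization is above the threshold. */
        printf("%d\n", want_ipu(IPU_SSR_UTIL, true, 80, 70));   /* 1 */
        printf("%d\n", want_ipu(IPU_SSR_UTIL, true, 60, 70));   /* 0 */
        return 0;
}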
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bafff72de8e8..1a85f83abd53 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -50,6 +50,7 @@ enum {
50 Opt_active_logs, 50 Opt_active_logs,
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data,
53 Opt_err, 54 Opt_err,
54}; 55};
55 56
@@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = {
65 {Opt_active_logs, "active_logs=%u"}, 66 {Opt_active_logs, "active_logs=%u"},
66 {Opt_disable_ext_identify, "disable_ext_identify"}, 67 {Opt_disable_ext_identify, "disable_ext_identify"},
67 {Opt_inline_xattr, "inline_xattr"}, 68 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"},
68 {Opt_err, NULL}, 70 {Opt_err, NULL},
69}; 71};
70 72
@@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = {
72enum { 74enum {
73 GC_THREAD, /* struct f2fs_gc_thread */ 75 GC_THREAD, /* struct f2fs_gc_thread */
74 SM_INFO, /* struct f2fs_sm_info */ 76 SM_INFO, /* struct f2fs_sm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */
75}; 78};
76 79
77struct f2fs_attr { 80struct f2fs_attr {
@@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
89 return (unsigned char *)sbi->gc_thread; 92 return (unsigned char *)sbi->gc_thread;
90 else if (struct_type == SM_INFO) 93 else if (struct_type == SM_INFO)
91 return (unsigned char *)SM_I(sbi); 94 return (unsigned char *)SM_I(sbi);
95 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi;
92 return NULL; 97 return NULL;
93} 98}
94 99
@@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
175F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 180F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
176F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 181F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
177F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 182F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
178 187
179#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 188#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
180static struct attribute *f2fs_attrs[] = { 189static struct attribute *f2fs_attrs[] = {
@@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = {
183 ATTR_LIST(gc_no_gc_sleep_time), 192 ATTR_LIST(gc_no_gc_sleep_time),
184 ATTR_LIST(gc_idle), 193 ATTR_LIST(gc_idle),
185 ATTR_LIST(reclaim_segments), 194 ATTR_LIST(reclaim_segments),
195 ATTR_LIST(max_small_discards),
196 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search),
186 NULL, 199 NULL,
187}; 200};
188 201
@@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options)
311 case Opt_disable_ext_identify: 324 case Opt_disable_ext_identify:
312 set_opt(sbi, DISABLE_EXT_IDENTIFY); 325 set_opt(sbi, DISABLE_EXT_IDENTIFY);
313 break; 326 break;
327 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA);
329 break;
314 default: 330 default:
315 f2fs_msg(sb, KERN_ERR, 331 f2fs_msg(sb, KERN_ERR,
316 "Unrecognized mount option \"%s\" or missing value", 332 "Unrecognized mount option \"%s\" or missing value",
@@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
325{ 341{
326 struct f2fs_inode_info *fi; 342 struct f2fs_inode_info *fi;
327 343
328 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); 344 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
329 if (!fi) 345 if (!fi)
330 return NULL; 346 return NULL;
331 347
@@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
508#endif 524#endif
509 if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) 525 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
510 seq_puts(seq, ",disable_ext_identify"); 526 seq_puts(seq, ",disable_ext_identify");
511 527 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data");
512 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 529 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
513 530
514 return 0; 531 return 0;
@@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
518{ 535{
519 struct super_block *sb = seq->private; 536 struct super_block *sb = seq->private;
520 struct f2fs_sb_info *sbi = F2FS_SB(sb); 537 struct f2fs_sb_info *sbi = F2FS_SB(sb);
521 unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); 538 unsigned int total_segs =
539 le32_to_cpu(sbi->raw_super->segment_count_main);
522 int i; 540 int i;
523 541
524 for (i = 0; i < total_segs; i++) { 542 for (i = 0; i < total_segs; i++) {
@@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
618 struct f2fs_sb_info *sbi = F2FS_SB(sb); 636 struct f2fs_sb_info *sbi = F2FS_SB(sb);
619 struct inode *inode; 637 struct inode *inode;
620 638
621 if (ino < F2FS_ROOT_INO(sbi)) 639 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
622 return ERR_PTR(-ESTALE); 640 return ERR_PTR(-ESTALE);
623 641
624 /* 642 /*
@@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
629 inode = f2fs_iget(sb, ino); 647 inode = f2fs_iget(sb, ino);
630 if (IS_ERR(inode)) 648 if (IS_ERR(inode))
631 return ERR_CAST(inode); 649 return ERR_CAST(inode);
632 if (generation && inode->i_generation != generation) { 650 if (unlikely(generation && inode->i_generation != generation)) {
633 /* we didn't find the right inode.. */ 651 /* we didn't find the right inode.. */
634 iput(inode); 652 iput(inode);
635 return ERR_PTR(-ESTALE); 653 return ERR_PTR(-ESTALE);
@@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
732 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); 750 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
733 fsmeta += le32_to_cpu(raw_super->segment_count_ssa); 751 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
734 752
735 if (fsmeta >= total) 753 if (unlikely(fsmeta >= total))
736 return 1; 754 return 1;
737 755
738 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 756 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
739 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 757 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
740 return 1; 758 return 1;
741 } 759 }
@@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
763 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); 781 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
764 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); 782 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
765 sbi->cur_victim_sec = NULL_SECNO; 783 sbi->cur_victim_sec = NULL_SECNO;
784 sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
766 785
767 for (i = 0; i < NR_COUNT_TYPE; i++) 786 for (i = 0; i < NR_COUNT_TYPE; i++)
768 atomic_set(&sbi->nr_pages[i], 0); 787 atomic_set(&sbi->nr_pages[i], 0);
@@ -798,9 +817,10 @@ retry:
798 /* sanity checking of raw super */ 817 /* sanity checking of raw super */
799 if (sanity_check_raw_super(sb, *raw_super)) { 818 if (sanity_check_raw_super(sb, *raw_super)) {
800 brelse(*raw_super_buf); 819 brelse(*raw_super_buf);
801 f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " 820 f2fs_msg(sb, KERN_ERR,
802 "in %dth superblock", block + 1); 821 "Can't find valid F2FS filesystem in %dth superblock",
803 if(block == 0) { 822 block + 1);
823 if (block == 0) {
804 block++; 824 block++;
805 goto retry; 825 goto retry;
806 } else { 826 } else {
@@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
818 struct buffer_head *raw_super_buf; 838 struct buffer_head *raw_super_buf;
819 struct inode *root; 839 struct inode *root;
820 long err = -EINVAL; 840 long err = -EINVAL;
841 int i;
821 842
822 /* allocate memory for f2fs-specific super block info */ 843 /* allocate memory for f2fs-specific super block info */
823 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 844 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
@@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
825 return -ENOMEM; 846 return -ENOMEM;
826 847
827 /* set a block size */ 848 /* set a block size */
828 if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { 849 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
829 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 850 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
830 goto free_sbi; 851 goto free_sbi;
831 } 852 }
@@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
874 mutex_init(&sbi->node_write); 895 mutex_init(&sbi->node_write);
875 sbi->por_doing = false; 896 sbi->por_doing = false;
876 spin_lock_init(&sbi->stat_lock); 897 spin_lock_init(&sbi->stat_lock);
877 init_rwsem(&sbi->bio_sem); 898
899 mutex_init(&sbi->read_io.io_mutex);
900 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex);
904 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL;
906 }
907
878 init_rwsem(&sbi->cp_rwsem); 908 init_rwsem(&sbi->cp_rwsem);
879 init_waitqueue_head(&sbi->cp_wait); 909 init_waitqueue_head(&sbi->cp_wait);
880 init_sb_info(sbi); 910 init_sb_info(sbi);
@@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
939 } 969 }
940 970
941 /* if there are any orphan nodes, free them */ 971 recover_orphan_inodes(sbi);
942 err = -EINVAL; 972 recover_orphan_inodes(sbi);
943 if (recover_orphan_inodes(sbi))
944 goto free_node_inode;
945 973
946 /* read root inode and dentry */ 974 /* read root inode and dentry */
947 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); 975 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
950 err = PTR_ERR(root); 978 err = PTR_ERR(root);
951 goto free_node_inode; 979 goto free_node_inode;
952 } 980 }
953 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) 981 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
982 err = -EINVAL;
954 goto free_root_inode; 983 goto free_root_inode;
984 }
955 985
956 sb->s_root = d_make_root(root); /* allocate root dentry */ 986 sb->s_root = d_make_root(root); /* allocate root dentry */
957 if (!sb->s_root) { 987 if (!sb->s_root) {
@@ -1053,7 +1083,7 @@ static int __init init_inodecache(void)
1053{ 1083{
1054 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1055 sizeof(struct f2fs_inode_info), NULL); 1085 sizeof(struct f2fs_inode_info), NULL);
1056 if (f2fs_inode_cachep == NULL) 1086 if (!f2fs_inode_cachep)
1057 return -ENOMEM; 1087 return -ENOMEM;
1058 return 0; 1088 return 0;
1059} 1089}
@@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void)
1078 err = create_node_manager_caches(); 1108 err = create_node_manager_caches();
1079 if (err) 1109 if (err)
1080 goto free_inodecache; 1110 goto free_inodecache;
1081 err = create_gc_caches(); 1111 err = create_segment_manager_caches();
1082 if (err) 1112 if (err)
1083 goto free_node_manager_caches; 1113 goto free_node_manager_caches;
1114 err = create_gc_caches();
1115 if (err)
1116 goto free_segment_manager_caches;
1084 err = create_checkpoint_caches(); 1117 err = create_checkpoint_caches();
1085 if (err) 1118 if (err)
1086 goto free_gc_caches; 1119 goto free_gc_caches;
@@ -1102,6 +1135,8 @@ free_checkpoint_caches:
1102 destroy_checkpoint_caches(); 1135 destroy_checkpoint_caches();
1103free_gc_caches: 1136free_gc_caches:
1104 destroy_gc_caches(); 1137 destroy_gc_caches();
1138free_segment_manager_caches:
1139 destroy_segment_manager_caches();
1105free_node_manager_caches: 1140free_node_manager_caches:
1106 destroy_node_manager_caches(); 1141 destroy_node_manager_caches();
1107free_inodecache: 1142free_inodecache:
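The reordered init path keeps the usual kernel unwind ladder: each successful create_*_caches() call gains a matching destroy label, executed in reverse order on failure. A compact standalone illustration with stubbed init/destroy pairs:

#include <stdio.h>

static int init_node(void)    { return 0; }
static int init_segment(void) { return 0; }
static int init_gc(void)      { return -1; /* simulate a failure */ }

static void destroy_node(void)    { puts("destroy node caches"); }
static void destroy_segment(void) { puts("destroy segment caches"); }

static int init_all(void)
{
        int err;

        err = init_node();
        if (err)
                goto fail;
        err = init_segment();
        if (err)
                goto free_node;
        err = init_gc();
        if (err)
                goto free_segment;
        return 0;

free_segment:
        destroy_segment();      /* undo in reverse order */
free_node:
        destroy_node();
fail:
        return err;
}

int main(void)
{
        return init_all() ? 1 : 0;
}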
@@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void)
1117 unregister_filesystem(&f2fs_fs_type); 1152 unregister_filesystem(&f2fs_fs_type);
1118 destroy_checkpoint_caches(); 1153 destroy_checkpoint_caches();
1119 destroy_gc_caches(); 1154 destroy_gc_caches();
1155 destroy_segment_manager_caches();
1120 destroy_node_manager_caches(); 1156 destroy_node_manager_caches();
1121 destroy_inodecache(); 1157 destroy_inodecache();
1122 kset_unregister(f2fs_kset); 1158 kset_unregister(f2fs_kset);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index aa7a3f139fe5..b0fb8a27f3da 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -522,7 +522,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index,
522 if (found) 522 if (found)
523 free = free + ENTRY_SIZE(here); 523 free = free + ENTRY_SIZE(here);
524 524
525 if (free < newsize) { 525 if (unlikely(free < newsize)) {
526 error = -ENOSPC; 526 error = -ENOSPC;
527 goto exit; 527 goto exit;
528 } 528 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f4a10ece2f1..e0259a163f98 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
516 } 516 }
517 WARN_ON(inode->i_state & I_SYNC); 517 WARN_ON(inode->i_state & I_SYNC);
518 /* 518 /*
519 * Skip inode if it is clean. We don't want to mess with writeback 519 * Skip inode if it is clean and we have no outstanding writeback in
520 * lists in this function since flusher thread may be doing for example 520 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
521 * sync in parallel and if we move the inode, it could get skipped. So 521 * function since flusher thread may be doing for example sync in
522 * here we make sure inode is on some writeback list and leave it there 522 * parallel and if we move the inode, it could get skipped. So here we
523 * unless we have completely cleaned the inode. 523 * make sure inode is on some writeback list and leave it there unless
524 * we have completely cleaned the inode.
524 */ 525 */
525 if (!(inode->i_state & I_DIRTY)) 526 if (!(inode->i_state & I_DIRTY) &&
527 (wbc->sync_mode != WB_SYNC_ALL ||
528 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
526 goto out; 529 goto out;
527 inode->i_state |= I_SYNC; 530 inode->i_state |= I_SYNC;
528 spin_unlock(&inode->i_lock); 531 spin_unlock(&inode->i_lock);
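The amended condition reads as a small predicate: skip a clean inode unless this is a WB_SYNC_ALL sync that still has pages under writeback to wait for. A sketch with the three tests reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

static bool skip_inode(bool dirty, bool sync_all, bool writeback_pages)
{
        return !dirty && (!sync_all || !writeback_pages);
}

int main(void)
{
        printf("%d\n", skip_inode(false, false, true)); /* 1: skipped   */
        printf("%d\n", skip_inode(false, true,  true)); /* 0: must wait */
        return 0;
}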
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ef74ad5fd362..0a648bb455ae 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); 1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1297} 1297}
1298 1298
1299static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1300 struct pipe_buffer *buf)
1301{
1302 return 1;
1303}
1304
1305static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1306 .can_merge = 0,
1307 .map = generic_pipe_buf_map,
1308 .unmap = generic_pipe_buf_unmap,
1309 .confirm = generic_pipe_buf_confirm,
1310 .release = generic_pipe_buf_release,
1311 .steal = fuse_dev_pipe_buf_steal,
1312 .get = generic_pipe_buf_get,
1313};
1314
1315static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, 1299static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, 1300 struct pipe_inode_info *pipe,
1317 size_t len, unsigned int flags) 1301 size_t len, unsigned int flags)
@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1358 buf->page = bufs[page_nr].page; 1342 buf->page = bufs[page_nr].page;
1359 buf->offset = bufs[page_nr].offset; 1343 buf->offset = bufs[page_nr].offset;
1360 buf->len = bufs[page_nr].len; 1344 buf->len = bufs[page_nr].len;
1361 buf->ops = &fuse_dev_pipe_buf_ops; 1345 /*
1346 * Need to be careful about this. Having buf->ops in module
1347 * code can Oops if the buffer persists after module unload.
1348 */
1349 buf->ops = &nosteal_pipe_buf_ops;
1362 1350
1363 pipe->nrbufs++; 1351 pipe->nrbufs++;
1364 page_nr++; 1352 page_nr++;
@@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1599 1587
1600 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); 1588 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1601 err = fuse_copy_page(cs, &page, offset, this_num, 0); 1589 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1602 if (!err && offset == 0 && (num != 0 || file_size == end)) 1590 if (!err && offset == 0 &&
1591 (this_num == PAGE_CACHE_SIZE || file_size == end))
1603 SetPageUptodate(page); 1592 SetPageUptodate(page);
1604 unlock_page(page); 1593 unlock_page(page);
1605 page_cache_release(page); 1594 page_cache_release(page);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c3eb2c46c8f1..1d1292c581c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)
112 get_fuse_inode(inode)->i_time = 0; 112 get_fuse_inode(inode)->i_time = 0;
113} 113}
114 114
115/**
116 * Mark the attributes as stale due to an atime change. Avoid the invalidate if
117 * atime is not used.
118 */
119void fuse_invalidate_atime(struct inode *inode)
120{
121 if (!IS_RDONLY(inode))
122 fuse_invalidate_attr(inode);
123}
124
115/* 125/*
116 * Just mark the entry as stale, so that a next attempt to look it up 126 * Just mark the entry as stale, so that a next attempt to look it up
117 * will result in a new lookup call to userspace 127 * will result in a new lookup call to userspace
@@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
1371 } 1381 }
1372 1382
1373 __free_page(page); 1383 __free_page(page);
1374 fuse_invalidate_attr(inode); /* atime changed */ 1384 fuse_invalidate_atime(inode);
1375 return err; 1385 return err;
1376} 1386}
1377 1387
@@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry)
1404 link[req->out.args[0].size] = '\0'; 1414 link[req->out.args[0].size] = '\0';
1405 out: 1415 out:
1406 fuse_put_request(fc, req); 1416 fuse_put_request(fc, req);
1407 fuse_invalidate_attr(inode); /* atime changed */ 1417 fuse_invalidate_atime(inode);
1408 return link; 1418 return link;
1409} 1419}
1410 1420
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e70506297bc..74f6ca500504 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
127 if (atomic_dec_and_test(&ff->count)) { 127 if (atomic_dec_and_test(&ff->count)) {
128 struct fuse_req *req = ff->reserved_req; 128 struct fuse_req *req = ff->reserved_req;
129 129
130 if (sync) { 130 if (ff->fc->no_open) {
131 /*
132 * Drop the release request when client does not
133 * implement 'open'
134 */
135 req->background = 0;
136 path_put(&req->misc.release.path);
137 fuse_put_request(ff->fc, req);
138 } else if (sync) {
131 req->background = 0; 139 req->background = 0;
132 fuse_request_send(ff->fc, req); 140 fuse_request_send(ff->fc, req);
133 path_put(&req->misc.release.path); 141 path_put(&req->misc.release.path);
@@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
144int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 152int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
145 bool isdir) 153 bool isdir)
146{ 154{
147 struct fuse_open_out outarg;
148 struct fuse_file *ff; 155 struct fuse_file *ff;
149 int err;
150 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 156 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
151 157
152 ff = fuse_file_alloc(fc); 158 ff = fuse_file_alloc(fc);
153 if (!ff) 159 if (!ff)
154 return -ENOMEM; 160 return -ENOMEM;
155 161
156 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 162 ff->fh = 0;
157 if (err) { 163 ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
158 fuse_file_free(ff); 164 if (!fc->no_open || isdir) {
159 return err; 165 struct fuse_open_out outarg;
166 int err;
167
168 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
169 if (!err) {
170 ff->fh = outarg.fh;
171 ff->open_flags = outarg.open_flags;
172
173 } else if (err != -ENOSYS || isdir) {
174 fuse_file_free(ff);
175 return err;
176 } else {
177 fc->no_open = 1;
178 }
160 } 179 }
161 180
162 if (isdir) 181 if (isdir)
163 outarg.open_flags &= ~FOPEN_DIRECT_IO; 182 ff->open_flags &= ~FOPEN_DIRECT_IO;
164 183
165 ff->fh = outarg.fh;
166 ff->nodeid = nodeid; 184 ff->nodeid = nodeid;
167 ff->open_flags = outarg.open_flags;
168 file->private_data = fuse_file_get(ff); 185 file->private_data = fuse_file_get(ff);
169 186
170 return 0; 187 return 0;
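The change above latches the first -ENOSYS reply to FUSE_OPEN into fc->no_open, so later opens skip the round trip and fall back to defaults. A minimal sketch of that latch pattern; send_open() and the default flag value are hypothetical stand-ins, not the real FUSE request or FOPEN_KEEP_CACHE:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool no_open;

static int send_open(unsigned *server_flags)
{
        (void)server_flags;
        return -ENOSYS;                 /* server lacks open() */
}

static int do_open(unsigned *open_flags)
{
        *open_flags = 0x2;              /* default when open is skipped */
        if (!no_open) {
                unsigned server_flags = 0;
                int err = send_open(&server_flags);

                if (!err)
                        *open_flags = server_flags;
                else if (err != -ENOSYS)
                        return err;     /* a real error: fail the open  */
                else
                        no_open = true; /* remember: don't ask again    */
        }
        return 0;
}

int main(void)
{
        unsigned flags;

        do_open(&flags);                /* first call hits -ENOSYS      */
        do_open(&flags);                /* second call skips the request */
        printf("no_open=%d flags=%#x\n", no_open, flags);
        return 0;
}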
@@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page)
687 SetPageUptodate(page); 704 SetPageUptodate(page);
688 } 705 }
689 706
690 fuse_invalidate_attr(inode); /* atime changed */ 707 fuse_invalidate_atime(inode);
691 out: 708 out:
692 unlock_page(page); 709 unlock_page(page);
693 return err; 710 return err;
@@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
716 fuse_read_update_size(inode, pos, 733 fuse_read_update_size(inode, pos,
717 req->misc.read.attr_ver); 734 req->misc.read.attr_ver);
718 } 735 }
719 fuse_invalidate_attr(inode); /* atime changed */ 736 fuse_invalidate_atime(inode);
720 } 737 }
721 738
722 for (i = 0; i < req->num_pages; i++) { 739 for (i = 0; i < req->num_pages; i++) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7d2730912667..2da5db2c8bdb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -485,6 +485,9 @@ struct fuse_conn {
485 * and hence races in setting them will not cause malfunction 485 * and hence races in setting them will not cause malfunction
486 */ 486 */
487 487
488 /** Is open/release not implemented by fs? */
489 unsigned no_open:1;
490
488 /** Is fsync not implemented by fs? */ 491 /** Is fsync not implemented by fs? */
489 unsigned no_fsync:1; 492 unsigned no_fsync:1;
490 493
@@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode);
788 791
789void fuse_invalidate_entry_cache(struct dentry *entry); 792void fuse_invalidate_entry_cache(struct dentry *entry);
790 793
794void fuse_invalidate_atime(struct inode *inode);
795
791/** 796/**
792 * Acquire reference to fuse_conn 797 * Acquire reference to fuse_conn
793 */ 798 */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index b7fc035a6943..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
986{ 986{
987 struct file *file = iocb->ki_filp; 987 struct file *file = iocb->ki_filp;
988 struct inode *inode = file->f_mapping->host; 988 struct inode *inode = file->f_mapping->host;
989 struct address_space *mapping = inode->i_mapping;
989 struct gfs2_inode *ip = GFS2_I(inode); 990 struct gfs2_inode *ip = GFS2_I(inode);
990 struct gfs2_holder gh; 991 struct gfs2_holder gh;
991 int rv; 992 int rv;
@@ -1006,6 +1007,36 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1006 if (rv != 1) 1007 if (rv != 1)
1007 goto out; /* dio not valid, fall back to buffered i/o */ 1008 goto out; /* dio not valid, fall back to buffered i/o */
1008 1009
1010 /*
1011 * Now since we are holding a deferred (CW) lock at this point, you
1012 * might be wondering why this is ever needed. There is a case however
1013 * where we've granted a deferred local lock against a cached exclusive
1014 * glock. That is ok provided all granted local locks are deferred, but
1015 * it also means that it is possible to encounter pages which are
1016 * cached and possibly also mapped. So here we check for that and sort
1017 * them out ahead of the dio. The glock state machine will take care of
1018 * everything else.
1019 *
1020 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
1021 * the first place, mapping->nrpages will always be zero.
1022 */
1023 if (mapping->nrpages) {
1024 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
1025 loff_t len = iov_length(iov, nr_segs);
1026 loff_t end = PAGE_ALIGN(offset + len) - 1;
1027
1028 rv = 0;
1029 if (len == 0)
1030 goto out;
1031 if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1033 rv = filemap_write_and_wait_range(mapping, lstart, end);
1034 if (rv)
1035 goto out;
1036 if (rw == WRITE)
1037 truncate_inode_pages_range(mapping, lstart, end);
1038 }
1039
1009 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1010 offset, nr_segs, gfs2_get_block_direct, 1041 offset, nr_segs, gfs2_get_block_direct,
1011 NULL, NULL, 0); 1042 NULL, NULL, 0);
@@ -1050,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1050 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1051 } while(bh != head); 1082 } while(bh != head);
1052 spin_unlock(&sdp->sd_ail_lock); 1083 spin_unlock(&sdp->sd_ail_lock);
1053 gfs2_log_unlock(sdp);
1054 1084
1055 head = bh = page_buffers(page); 1085 head = bh = page_buffers(page);
1056 do { 1086 do {
1057 gfs2_log_lock(sdp);
1058 bd = bh->b_private; 1087 bd = bh->b_private;
1059 if (bd) { 1088 if (bd) {
1060 gfs2_assert_warn(sdp, bd->bd_bh == bh); 1089 gfs2_assert_warn(sdp, bd->bd_bh == bh);
1061 if (!list_empty(&bd->bd_list)) { 1090 if (!list_empty(&bd->bd_list))
1062 if (!buffer_pinned(bh)) 1091 list_del_init(&bd->bd_list);
1063 list_del_init(&bd->bd_list); 1092 bd->bd_bh = NULL;
1064 else
1065 bd = NULL;
1066 }
1067 if (bd)
1068 bd->bd_bh = NULL;
1069 bh->b_private = NULL; 1093 bh->b_private = NULL;
1070 }
1071 gfs2_log_unlock(sdp);
1072 if (bd)
1073 kmem_cache_free(gfs2_bufdata_cachep, bd); 1094 kmem_cache_free(gfs2_bufdata_cachep, bd);
1095 }
1074 1096
1075 bh = bh->b_this_page; 1097 bh = bh->b_this_page;
1076 } while (bh != head); 1098 } while (bh != head);
1099 gfs2_log_unlock(sdp);
1077 1100
1078 return try_to_free_buffers(page); 1101 return try_to_free_buffers(page);
1079 1102
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
834 struct gfs2_leaf *leaf; 834 struct gfs2_leaf *leaf;
835 struct gfs2_dirent *dent; 835 struct gfs2_dirent *dent;
836 struct qstr name = { .name = "" }; 836 struct qstr name = { .name = "" };
837 struct timespec tv = CURRENT_TIME;
837 838
838 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); 839 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
839 if (error) 840 if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
850 leaf->lf_entries = 0; 851 leaf->lf_entries = 0;
851 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); 852 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
852 leaf->lf_next = 0; 853 leaf->lf_next = 0;
853 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); 854 leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
855 leaf->lf_dist = cpu_to_be32(1);
856 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
857 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
858 memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
854 dent = (struct gfs2_dirent *)(leaf+1); 859 dent = (struct gfs2_dirent *)(leaf+1);
855 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); 860 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
856 *pbh = bh; 861 *pbh = bh;
@@ -1612,11 +1617,31 @@ out:
1612 return ret; 1617 return ret;
1613} 1618}
1614 1619
1620/**
1621 * dir_new_leaf - Add a new leaf onto hash chain
1622 * @inode: The directory
1623 * @name: The name we are adding
1624 *
1625 * This adds a new dir leaf onto an existing leaf when there is not
1626 * enough space to add a new dir entry. This is a last resort after
1627 * we've expanded the hash table to max size and also split existing
1628 * leaf blocks, so it will only occur for very large directories.
1629 *
1630 * The dist parameter is set to 1 for leaf blocks directly attached
1631 * to the hash table, 2 for one layer of indirection, 3 for two layers
1632 * etc. We are thus able to tell the difference between an old leaf
1633 * with dist set to zero (i.e. "don't know") and a new one where we
1634 * set this information for debug/fsck purposes.
1635 *
1636 * Returns: 0 on success, or -ve on error
1637 */
1638
1615static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1639static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1616{ 1640{
1617 struct buffer_head *bh, *obh; 1641 struct buffer_head *bh, *obh;
1618 struct gfs2_inode *ip = GFS2_I(inode); 1642 struct gfs2_inode *ip = GFS2_I(inode);
1619 struct gfs2_leaf *leaf, *oleaf; 1643 struct gfs2_leaf *leaf, *oleaf;
1644 u32 dist = 1;
1620 int error; 1645 int error;
1621 u32 index; 1646 u32 index;
1622 u64 bn; 1647 u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1626 if (error) 1651 if (error)
1627 return error; 1652 return error;
1628 do { 1653 do {
1654 dist++;
1629 oleaf = (struct gfs2_leaf *)obh->b_data; 1655 oleaf = (struct gfs2_leaf *)obh->b_data;
1630 bn = be64_to_cpu(oleaf->lf_next); 1656 bn = be64_to_cpu(oleaf->lf_next);
1631 if (!bn) 1657 if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1643 brelse(obh); 1669 brelse(obh);
1644 return -ENOSPC; 1670 return -ENOSPC;
1645 } 1671 }
1672 leaf->lf_dist = cpu_to_be32(dist);
1646 oleaf->lf_next = cpu_to_be64(bh->b_blocknr); 1673 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1647 brelse(bh); 1674 brelse(bh);
1648 brelse(obh); 1675 brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1659 1686
1660/** 1687/**
1661 * gfs2_dir_add - Add new filename into directory 1688 * gfs2_dir_add - Add new filename into directory
1662 * @dip: The GFS2 inode 1689 * @inode: The directory inode
1663 * @filename: The new name 1690 * @name: The new name
1664 * @inode: The inode number of the entry 1691 * @nip: The GFS2 inode to be linked in to the directory
1665 * @type: The type of the entry 1692 * @da: The directory addition info
1693 *
1694 * If the call to gfs2_diradd_alloc_required resulted in there being
1695 * no need to allocate any new directory blocks, then it will contain
1696 * a pointer to the directory entry and the bh in which it resides. We
1697 * can use that without having to repeat the search. If there was no
1698 * free space, then we must now create more space.
1666 * 1699 *
1667 * Returns: 0 on success, error code on failure 1700 * Returns: 0 on success, error code on failure
1668 */ 1701 */
1669 1702
1670int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1703int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1671 const struct gfs2_inode *nip) 1704 const struct gfs2_inode *nip, struct gfs2_diradd *da)
1672{ 1705{
1673 struct gfs2_inode *ip = GFS2_I(inode); 1706 struct gfs2_inode *ip = GFS2_I(inode);
1674 struct buffer_head *bh; 1707 struct buffer_head *bh = da->bh;
1675 struct gfs2_dirent *dent; 1708 struct gfs2_dirent *dent = da->dent;
1709 struct timespec tv;
1676 struct gfs2_leaf *leaf; 1710 struct gfs2_leaf *leaf;
1677 int error; 1711 int error;
1678 1712
1679 while(1) { 1713 while(1) {
1680 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 1714 if (da->bh == NULL) {
1681 &bh); 1715 dent = gfs2_dirent_search(inode, name,
1716 gfs2_dirent_find_space, &bh);
1717 }
1682 if (dent) { 1718 if (dent) {
1683 if (IS_ERR(dent)) 1719 if (IS_ERR(dent))
1684 return PTR_ERR(dent); 1720 return PTR_ERR(dent);
1685 dent = gfs2_init_dirent(inode, dent, name, bh); 1721 dent = gfs2_init_dirent(inode, dent, name, bh);
1686 gfs2_inum_out(nip, dent); 1722 gfs2_inum_out(nip, dent);
1687 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1724 tv = CURRENT_TIME;
1688 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1689 leaf = (struct gfs2_leaf *)bh->b_data; 1726 leaf = (struct gfs2_leaf *)bh->b_data;
1690 be16_add_cpu(&leaf->lf_entries, 1); 1727 be16_add_cpu(&leaf->lf_entries, 1);
1728 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1729 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1691 } 1730 }
1731 da->dent = NULL;
1732 da->bh = NULL;
1692 brelse(bh); 1733 brelse(bh);
1693 ip->i_entries++; 1734 ip->i_entries++;
1694 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1735 ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
1695 if (S_ISDIR(nip->i_inode.i_mode)) 1736 if (S_ISDIR(nip->i_inode.i_mode))
1696 inc_nlink(&ip->i_inode); 1737 inc_nlink(&ip->i_inode);
1697 mark_inode_dirty(inode); 1738 mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1742 const struct qstr *name = &dentry->d_name; 1783 const struct qstr *name = &dentry->d_name;
1743 struct gfs2_dirent *dent, *prev = NULL; 1784 struct gfs2_dirent *dent, *prev = NULL;
1744 struct buffer_head *bh; 1785 struct buffer_head *bh;
1786 struct timespec tv = CURRENT_TIME;
1745 1787
1746 /* Returns _either_ the entry (if its first in block) or the 1788 /* Returns _either_ the entry (if its first in block) or the
1747 previous entry otherwise */ 1789 previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1767 if (!entries) 1809 if (!entries)
1768 gfs2_consist_inode(dip); 1810 gfs2_consist_inode(dip);
1769 leaf->lf_entries = cpu_to_be16(--entries); 1811 leaf->lf_entries = cpu_to_be16(--entries);
1812 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1813 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1770 } 1814 }
1771 brelse(bh); 1815 brelse(bh);
1772 1816
1773 if (!dip->i_entries) 1817 if (!dip->i_entries)
1774 gfs2_consist_inode(dip); 1818 gfs2_consist_inode(dip);
1775 dip->i_entries--; 1819 dip->i_entries--;
1776 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1820 dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
1777 if (S_ISDIR(dentry->d_inode->i_mode)) 1821 if (S_ISDIR(dentry->d_inode->i_mode))
1778 drop_nlink(&dip->i_inode); 1822 drop_nlink(&dip->i_inode);
1779 mark_inode_dirty(&dip->i_inode); 1823 mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
2017 * gfs2_diradd_alloc_required - determine whether adding an entry requires an allocation 2061 * gfs2_diradd_alloc_required - determine whether adding an entry requires an allocation
2018 * @ip: the file being written to 2062 * @ip: the file being written to
2019 * @filename: the filename that's going to be added 2063 * @filename: the filename that's going to be added
2064 * @da: The structure to return dir alloc info
2020 * 2065 *
2021 * Returns: 1 if alloc required, 0 if not, -ve on error 2066 * Returns: 0 if ok, -ve on error
2022 */ 2067 */
2023 2068
2024int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) 2069int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2070 struct gfs2_diradd *da)
2025{ 2071{
2072 struct gfs2_inode *ip = GFS2_I(inode);
2073 struct gfs2_sbd *sdp = GFS2_SB(inode);
2074 const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
2026 struct gfs2_dirent *dent; 2075 struct gfs2_dirent *dent;
2027 struct buffer_head *bh; 2076 struct buffer_head *bh;
2028 2077
2078 da->nr_blocks = 0;
2079 da->bh = NULL;
2080 da->dent = NULL;
2081
2029 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); 2082 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2030 if (!dent) { 2083 if (!dent) {
2031 return 1; 2084 da->nr_blocks = sdp->sd_max_dirres;
2085 if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
2086 (GFS2_DIRENT_SIZE(name->len) < extra))
2087 da->nr_blocks = 1;
2088 return 0;
2032 } 2089 }
2033 if (IS_ERR(dent)) 2090 if (IS_ERR(dent))
2034 return PTR_ERR(dent); 2091 return PTR_ERR(dent);
2035 brelse(bh); 2092 da->bh = bh;
2093 da->dent = dent;
2036 return 0; 2094 return 0;
2037} 2095}
2038 2096
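The two dir.c hunks above change gfs2_diradd_alloc_required() from a boolean probe into one that caches its findings: when free space is found, the buffer head and dirent pointer are parked in struct gfs2_diradd so that gfs2_dir_add() can reuse them instead of repeating the search. A minimal user-space sketch of the probe-then-consume pattern (hypothetical types and names, not the GFS2 API):

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for struct gfs2_diradd: the probe's cached findings. */
    struct diradd {
        unsigned nr_blocks;   /* blocks to reserve if no free slot exists */
        char *slot;           /* cached pointer into the directory block */
    };

    static char dirblock[64]; /* pretend directory block; '\0' means free */

    /* Probe once: either cache the free slot or record the allocation need. */
    static int alloc_required(struct diradd *da)
    {
        da->nr_blocks = 0;
        da->slot = memchr(dirblock, '\0', sizeof(dirblock));
        if (da->slot == NULL)
            da->nr_blocks = 1;        /* caller must reserve a new block */
        return 0;
    }

    /* Add: consume the cached slot rather than searching a second time. */
    static int dir_add(const char *name, struct diradd *da)
    {
        if (da->slot == NULL)
            return -1;                /* would grow the directory here */
        *da->slot = name[0];
        da->slot = NULL;              /* mirrors da->bh/da->dent = NULL */
        return 0;
    }

    int main(void)
    {
        struct diradd da;

        alloc_required(&da);
        printf("need %u new block(s)\n", da.nr_blocks);
        return dir_add("foo", &da);
    }

The error paths matter: a caller that fills the structure but never performs the add must release the cached buffer, which is what the new gfs2_dir_no_add() helper in dir.h below is for.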
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
16struct inode; 16struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19struct buffer_head;
20struct gfs2_dirent;
21
22struct gfs2_diradd {
23 unsigned nr_blocks;
24 struct gfs2_dirent *dent;
25 struct buffer_head *bh;
26};
19 27
20extern struct inode *gfs2_dir_search(struct inode *dir, 28extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename, 29 const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 31extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
24 const struct gfs2_inode *ip); 32 const struct gfs2_inode *ip);
25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 33extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
26 const struct gfs2_inode *ip); 34 const struct gfs2_inode *ip, struct gfs2_diradd *da);
35static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
36{
37 if (da->bh)
38 brelse(da->bh);
39 da->bh = NULL;
40}
27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 41extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, 42extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
29 struct file_ra_state *f_ra); 43 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
33extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 47extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
34 48
35extern int gfs2_diradd_alloc_required(struct inode *dir, 49extern int gfs2_diradd_alloc_required(struct inode *dir,
36 const struct qstr *filename); 50 const struct qstr *filename,
51 struct gfs2_diradd *da);
37extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 52extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
38 struct buffer_head **bhp); 53 struct buffer_head **bhp);
39extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); 54extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c8420f7e4db6..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1552 glock_hash_walk(thaw_glock, sdp); 1552 glock_hash_walk(thaw_glock, sdp);
1553} 1553}
1554 1554
1555static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) 1555static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1556{ 1556{
1557 int ret;
1558 spin_lock(&gl->gl_spin); 1557 spin_lock(&gl->gl_spin);
1559 ret = gfs2_dump_glock(seq, gl); 1558 gfs2_dump_glock(seq, gl);
1560 spin_unlock(&gl->gl_spin); 1559 spin_unlock(&gl->gl_spin);
1561 return ret;
1562} 1560}
1563 1561
1564static void dump_glock_func(struct gfs2_glock *gl) 1562static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,14 +1645,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1647 * @seq: the seq_file struct 1645 * @seq: the seq_file struct
1648 * @gh: the glock holder 1646 * @gh: the glock holder
1649 * 1647 *
1650 * Returns: 0 on success, -ENOBUFS when we run out of space
1651 */ 1648 */
1652 1649
1653static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1650static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1654{ 1651{
1655 struct task_struct *gh_owner = NULL; 1652 struct task_struct *gh_owner = NULL;
1656 char flags_buf[32]; 1653 char flags_buf[32];
1657 1654
1655 rcu_read_lock();
1658 if (gh->gh_owner_pid) 1656 if (gh->gh_owner_pid)
1659 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1657 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1660 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", 1658 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
@@ -1664,7 +1662,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1664 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1662 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1665 gh_owner ? gh_owner->comm : "(ended)", 1663 gh_owner ? gh_owner->comm : "(ended)",
1666 (void *)gh->gh_ip); 1664 (void *)gh->gh_ip);
1667 return 0; 1665 rcu_read_unlock();
1668} 1666}
1669 1667
1670static const char *gflags2str(char *buf, const struct gfs2_glock *gl) 1668static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1719,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1719 * example. The fields are n = number (id of the object), f = flags, 1717 * example. The fields are n = number (id of the object), f = flags,
1720 * t = type, s = state, r = refcount, e = error, p = pid. 1718 * t = type, s = state, r = refcount, e = error, p = pid.
1721 * 1719 *
1722 * Returns: 0 on success, -ENOBUFS when we run out of space
1723 */ 1720 */
1724 1721
1725int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1722void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1726{ 1723{
1727 const struct gfs2_glock_operations *glops = gl->gl_ops; 1724 const struct gfs2_glock_operations *glops = gl->gl_ops;
1728 unsigned long long dtime; 1725 unsigned long long dtime;
1729 const struct gfs2_holder *gh; 1726 const struct gfs2_holder *gh;
1730 char gflags_buf[32]; 1727 char gflags_buf[32];
1731 int error = 0;
1732 1728
1733 dtime = jiffies - gl->gl_demote_time; 1729 dtime = jiffies - gl->gl_demote_time;
1734 dtime *= 1000000/HZ; /* demote time in uSec */ 1730 dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1745,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1745 atomic_read(&gl->gl_revokes), 1741 atomic_read(&gl->gl_revokes),
1746 (int)gl->gl_lockref.count, gl->gl_hold_time); 1742 (int)gl->gl_lockref.count, gl->gl_hold_time);
1747 1743
1748 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1744 list_for_each_entry(gh, &gl->gl_holders, gh_list)
1749 error = dump_holder(seq, gh); 1745 dump_holder(seq, gh);
1750 if (error) 1746
1751 goto out;
1752 }
1753 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) 1747 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1754 error = glops->go_dump(seq, gl); 1748 glops->go_dump(seq, gl);
1755out:
1756 return error;
1757} 1749}
1758 1750
1759static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) 1751static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1951,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1951 1943
1952static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1944static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
1953{ 1945{
1954 return dump_glock(seq, iter_ptr); 1946 dump_glock(seq, iter_ptr);
1947 return 0;
1955} 1948}
1956 1949
1957static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) 1950static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
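The int-to-void conversion running through dump_glock(), dump_holder(), gfs2_dump_glock() and the go_dump method reflects how seq_file output works: seq_printf() notes overflow internally and the seq_file core simply re-runs ->show() with a larger buffer, so the per-line error returns (and the -ENOBUFS contract they advertised) were dead weight. A user-space sketch of that retry contract (simplified; not the seq_file API):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct emitter {
        char *buf;
        size_t size, used;
        int overflow;         /* sticky flag, checked once by the driver */
    };

    /* Printers return void; overflow is recorded, not propagated. */
    static void emit(struct emitter *em, const char *s)
    {
        size_t len = strlen(s);

        if (em->used + len > em->size) {
            em->overflow = 1;
            return;
        }
        memcpy(em->buf + em->used, s, len);
        em->used += len;
    }

    static void show(struct emitter *em)      /* analogous to ->show() */
    {
        emit(em, "G: glock line\n");
        emit(em, "H: holder line\n");
    }

    int main(void)
    {
        struct emitter em = { .size = 8 };

        do {                  /* the driver grows the buffer and retries */
            char *nb = realloc(em.buf, em.size *= 2);
            if (nb == NULL)
                return 1;
            em.buf = nb;
            em.used = 0;
            em.overflow = 0;
            show(&em);
        } while (em.overflow);
        fwrite(em.buf, 1, em.used, stdout);
        free(em.buf);
        return 0;
    }

The added rcu_read_lock() around pid_task() in dump_holder() is a separate fix: pid_task() must be called under RCU protection, and the old code called it unprotected.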
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 202extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) 203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
204extern __printf(2, 3) 204extern __printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index db908f697139..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
133 133
134static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct gfs2_sbd *sdp = gl->gl_sbd;
137 struct address_space *mapping = &sdp->sd_aspace;
137 struct gfs2_rgrpd *rgd; 138 struct gfs2_rgrpd *rgd;
138 int error; 139 int error;
139 140
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
141 return; 142 return;
142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); 143 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 144
144 gfs2_log_flush(gl->gl_sbd, gl); 145 gfs2_log_flush(sdp, gl);
145 filemap_fdatawrite(metamapping); 146 filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
146 error = filemap_fdatawait(metamapping); 147 error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
147 mapping_set_error(metamapping, error); 148 mapping_set_error(mapping, error);
148 gfs2_ail_empty_gl(gl); 149 gfs2_ail_empty_gl(gl);
149 150
150 spin_lock(&gl->gl_spin); 151 spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
166 167
167static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 168static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 169{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 170 struct gfs2_sbd *sdp = gl->gl_sbd;
171 struct address_space *mapping = &sdp->sd_aspace;
170 172
171 WARN_ON_ONCE(!(flags & DIO_METADATA)); 173 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 174 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 175 truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
174 176
175 if (gl->gl_object) { 177 if (gl->gl_object) {
176 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; 178 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -192,8 +194,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
192 194
193 if (ip && !S_ISREG(ip->i_inode.i_mode)) 195 if (ip && !S_ISREG(ip->i_inode.i_mode))
194 ip = NULL; 196 ip = NULL;
195 if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) 197 if (ip) {
196 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); 198 if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
199 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
200 inode_dio_wait(&ip->i_inode);
201 }
197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 202 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
198 return; 203 return;
199 204
@@ -410,6 +415,9 @@ static int inode_go_lock(struct gfs2_holder *gh)
410 return error; 415 return error;
411 } 416 }
412 417
418 if (gh->gh_state != LM_ST_DEFERRED)
419 inode_dio_wait(&ip->i_inode);
420
413 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && 421 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
414 (gl->gl_state == LM_ST_EXCLUSIVE) && 422 (gl->gl_state == LM_ST_EXCLUSIVE) &&
415 (gh->gh_state == LM_ST_EXCLUSIVE)) { 423 (gh->gh_state == LM_ST_EXCLUSIVE)) {
@@ -429,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
429 * @seq: The iterator 437 * @seq: The iterator
430 * @ip: the inode 438 * @ip: the inode
431 * 439 *
432 * Returns: 0 on success, -ENOBUFS when we run out of space
433 */ 440 */
434 441
435static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) 442static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
436{ 443{
437 const struct gfs2_inode *ip = gl->gl_object; 444 const struct gfs2_inode *ip = gl->gl_object;
438 if (ip == NULL) 445 if (ip == NULL)
439 return 0; 446 return;
440 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", 447 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
441 (unsigned long long)ip->i_no_formal_ino, 448 (unsigned long long)ip->i_no_formal_ino,
442 (unsigned long long)ip->i_no_addr, 449 (unsigned long long)ip->i_no_addr,
443 IF2DT(ip->i_inode.i_mode), ip->i_flags, 450 IF2DT(ip->i_inode.i_mode), ip->i_flags,
444 (unsigned int)ip->i_diskflags, 451 (unsigned int)ip->i_diskflags,
445 (unsigned long long)i_size_read(&ip->i_inode)); 452 (unsigned long long)i_size_read(&ip->i_inode));
446 return 0;
447} 453}
448 454
449/** 455/**
@@ -552,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
552 .go_unlock = gfs2_rgrp_go_unlock, 558 .go_unlock = gfs2_rgrp_go_unlock,
553 .go_dump = gfs2_rgrp_dump, 559 .go_dump = gfs2_rgrp_dump,
554 .go_type = LM_TYPE_RGRP, 560 .go_type = LM_TYPE_RGRP,
555 .go_flags = GLOF_ASPACE | GLOF_LVB, 561 .go_flags = GLOF_LVB,
556}; 562};
557 563
558const struct gfs2_glock_operations gfs2_trans_glops = { 564const struct gfs2_glock_operations gfs2_trans_glops = {
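With GLOF_ASPACE removed from gfs2_rgrp_glops, rgrp glocks no longer carry a private address space; they all share the one sd_aspace mapping added to the superblock, and each glock syncs or invalidates only its own byte range via gl_vm.start/gl_vm.end. Presumably that range is just the rgrp's metadata extent expressed in bytes; a sketch of the arithmetic under that assumption (rd_addr/rd_length loosely mirror gfs2_rgrpd fields, bsize is the block size):

    #include <stdio.h>

    /* Byte range an rgrp's metadata occupies in the shared mapping. */
    static void rgrp_vm_range(unsigned long long rd_addr, unsigned rd_length,
                              unsigned bsize,
                              unsigned long long *start, unsigned long long *end)
    {
        *start = rd_addr * bsize;
        *end   = *start + (unsigned long long)rd_length * bsize - 1;
    }

    int main(void)
    {
        unsigned long long start, end;

        rgrp_vm_range(17, 4, 4096, &start, &end);
        printf("writeback/truncate bytes [%llu, %llu]\n", start, end);
        return 0;
    }

The ranged calls (filemap_fdatawrite_range(), truncate_inode_pages_range()) are what make a single shared mapping workable: each rgrp glock touches only its own slice instead of flushing the whole device mapping.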
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
93 struct gfs2_rgrp_lvb *rd_rgl; 93 struct gfs2_rgrp_lvb *rd_rgl;
94 u32 rd_last_alloc; 94 u32 rd_last_alloc;
95 u32 rd_flags; 95 u32 rd_flags;
96 u32 rd_extfail_pt; /* extent failure point */
96#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 98#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 99#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
217 int (*go_demote_ok) (const struct gfs2_glock *gl); 218 int (*go_demote_ok) (const struct gfs2_glock *gl);
218 int (*go_lock) (struct gfs2_holder *gh); 219 int (*go_lock) (struct gfs2_holder *gh);
219 void (*go_unlock) (struct gfs2_holder *gh); 220 void (*go_unlock) (struct gfs2_holder *gh);
220 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 221 void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
221 void (*go_callback)(struct gfs2_glock *gl, bool remote); 222 void (*go_callback)(struct gfs2_glock *gl, bool remote);
222 const int go_type; 223 const int go_type;
223 const unsigned long go_flags; 224 const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
350 atomic_t gl_ail_count; 351 atomic_t gl_ail_count;
351 atomic_t gl_revokes; 352 atomic_t gl_revokes;
352 struct delayed_work gl_work; 353 struct delayed_work gl_work;
353 struct work_struct gl_delete; 354 union {
355 /* For inode and iopen glocks only */
356 struct work_struct gl_delete;
357 /* For rgrp glocks only */
358 struct {
359 loff_t start;
360 loff_t end;
361 } gl_vm;
362 };
354 struct rcu_head gl_rcu; 363 struct rcu_head gl_rcu;
355}; 364};
356 365
@@ -419,10 +428,13 @@ enum {
419}; 428};
420 429
421struct gfs2_quota_data { 430struct gfs2_quota_data {
431 struct hlist_bl_node qd_hlist;
422 struct list_head qd_list; 432 struct list_head qd_list;
423 struct kqid qd_id; 433 struct kqid qd_id;
434 struct gfs2_sbd *qd_sbd;
424 struct lockref qd_lockref; 435 struct lockref qd_lockref;
425 struct list_head qd_lru; 436 struct list_head qd_lru;
437 unsigned qd_hash;
426 438
427 unsigned long qd_flags; /* QDF_... */ 439 unsigned long qd_flags; /* QDF_... */
428 440
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
441 453
442 u64 qd_sync_gen; 454 u64 qd_sync_gen;
443 unsigned long qd_last_warn; 455 unsigned long qd_last_warn;
456 struct rcu_head qd_rcu;
444}; 457};
445 458
446struct gfs2_trans { 459struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
720 spinlock_t sd_trunc_lock; 733 spinlock_t sd_trunc_lock;
721 734
722 unsigned int sd_quota_slots; 735 unsigned int sd_quota_slots;
723 unsigned int sd_quota_chunks; 736 unsigned long *sd_quota_bitmap;
724 unsigned char **sd_quota_bitmap; 737 spinlock_t sd_bitmap_lock;
725 738
726 u64 sd_quota_sync_gen; 739 u64 sd_quota_sync_gen;
727 740
728 /* Log stuff */ 741 /* Log stuff */
729 742
743 struct address_space sd_aspace;
744
730 spinlock_t sd_log_lock; 745 spinlock_t sd_log_lock;
731 746
732 struct gfs2_trans *sd_log_tr; 747 struct gfs2_trans *sd_log_tr;
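The gl_delete/gl_vm union in struct gfs2_glock is a space optimisation: a glock is either an inode/iopen glock (which may need delete work queued) or an rgrp glock (which needs a VM range), never both at once, so the two fields can overlay. The same idiom in miniature, with a type tag playing the role the glock type does in the kernel:

    #include <stdio.h>

    enum glock_kind { KIND_INODE, KIND_RGRP };

    struct glock_like {
        enum glock_kind kind;       /* discriminates the union below */
        union {
            int delete_pending;     /* meaningful for KIND_INODE only */
            struct {                /* meaningful for KIND_RGRP only */
                long long start, end;
            } vm;
        };
    };

    int main(void)
    {
        struct glock_like g = { .kind = KIND_RGRP, .vm = { 0, 4095 } };

        if (g.kind == KIND_RGRP)
            printf("range [%lld, %lld]\n", g.vm.start, g.vm.end);
        printf("object size: %zu bytes\n", sizeof(g));
        return 0;
    }

The other incore.h additions (qd_hlist, qd_sbd, qd_hash, qd_rcu, the flat sd_quota_bitmap with sd_bitmap_lock, and sd_aspace itself) are the data-structure half of the quota and address-space reworks whose code follows below.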
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..890588c7fb33 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
149 ip = GFS2_I(inode); 149 ip = GFS2_I(inode);
150 150
151 if (!inode) 151 if (!inode)
152 return ERR_PTR(-ENOBUFS); 152 return ERR_PTR(-ENOMEM);
153 153
154 if (inode->i_state & I_NEW) { 154 if (inode->i_state & I_NEW) {
155 struct gfs2_sbd *sdp = GFS2_SB(inode); 155 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
469 brelse(dibh); 469 brelse(dibh);
470} 470}
471 471
472/**
473 * gfs2_trans_da_blks - Calculate the number of blocks needed to link an inode
474 * @dip: The directory we are linking into
475 * @da: The dir add information
476 * @nr_inodes: The number of inodes involved
477 *
478 * This calculates the number of blocks we need to reserve in a
479 * transaction to link @nr_inodes into a directory. In most cases
480 * @nr_inodes will be 2 (the directory plus the inode being linked in),
481 * but in the case of a rename, 4 may be required.
482 *
483 * Returns: Number of blocks
484 */
485
486static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
487 const struct gfs2_diradd *da,
488 unsigned nr_inodes)
489{
490 return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
491 (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
492}
493
472static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, 494static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
473 struct gfs2_inode *ip, int arq) 495 struct gfs2_inode *ip, struct gfs2_diradd *da)
474{ 496{
475 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 497 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
476 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 498 struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
477 int error; 499 int error;
478 500
479 if (arq) { 501 if (da->nr_blocks) {
480 error = gfs2_quota_lock_check(dip); 502 error = gfs2_quota_lock_check(dip);
481 if (error) 503 if (error)
482 goto fail_quota_locks; 504 goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
485 if (error) 507 if (error)
486 goto fail_quota_locks; 508 goto fail_quota_locks;
487 509
488 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 510 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
489 dip->i_rgd->rd_length +
490 2 * RES_DINODE +
491 RES_STATFS + RES_QUOTA, 0);
492 if (error) 511 if (error)
493 goto fail_ipreserv; 512 goto fail_ipreserv;
494 } else { 513 } else {
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
497 goto fail_quota_locks; 516 goto fail_quota_locks;
498 } 517 }
499 518
500 error = gfs2_dir_add(&dip->i_inode, name, ip); 519 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
501 if (error) 520 if (error)
502 goto fail_end_trans; 521 goto fail_end_trans;
503 522
@@ -560,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
560 struct dentry *d; 579 struct dentry *d;
561 int error; 580 int error;
562 u32 aflags = 0; 581 u32 aflags = 0;
563 int arq; 582 struct gfs2_diradd da = { .bh = NULL, };
564 583
565 if (!name->len || name->len > GFS2_FNAMESIZE) 584 if (!name->len || name->len > GFS2_FNAMESIZE)
566 return -ENAMETOOLONG; 585 return -ENAMETOOLONG;
@@ -585,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
585 error = PTR_ERR(inode); 604 error = PTR_ERR(inode);
586 if (!IS_ERR(inode)) { 605 if (!IS_ERR(inode)) {
587 d = d_splice_alias(inode, dentry); 606 d = d_splice_alias(inode, dentry);
607 error = PTR_ERR(d);
608 if (IS_ERR(d))
609 goto fail_gunlock;
588 error = 0; 610 error = 0;
589 if (file) { 611 if (file) {
590 if (S_ISREG(inode->i_mode)) { 612 if (S_ISREG(inode->i_mode)) {
@@ -602,7 +624,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
602 goto fail_gunlock; 624 goto fail_gunlock;
603 } 625 }
604 626
605 arq = error = gfs2_diradd_alloc_required(dir, name); 627 error = gfs2_diradd_alloc_required(dir, name, &da);
606 if (error < 0) 628 if (error < 0)
607 goto fail_gunlock; 629 goto fail_gunlock;
608 630
@@ -690,7 +712,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
690 if (error) 712 if (error)
691 goto fail_gunlock3; 713 goto fail_gunlock3;
692 714
693 error = link_dinode(dip, name, ip, arq); 715 error = link_dinode(dip, name, ip, &da);
694 if (error) 716 if (error)
695 goto fail_gunlock3; 717 goto fail_gunlock3;
696 718
@@ -719,6 +741,7 @@ fail_free_inode:
719 free_inode_nonrcu(inode); 741 free_inode_nonrcu(inode);
720 inode = NULL; 742 inode = NULL;
721fail_gunlock: 743fail_gunlock:
744 gfs2_dir_no_add(&da);
722 gfs2_glock_dq_uninit(ghs); 745 gfs2_glock_dq_uninit(ghs);
723 if (inode && !IS_ERR(inode)) { 746 if (inode && !IS_ERR(inode)) {
724 clear_nlink(inode); 747 clear_nlink(inode);
@@ -779,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
779 } 802 }
780 803
781 d = d_splice_alias(inode, dentry); 804 d = d_splice_alias(inode, dentry);
805 if (IS_ERR(d)) {
806 iput(inode);
807 gfs2_glock_dq_uninit(&gh);
808 return d;
809 }
782 if (file && S_ISREG(inode->i_mode)) 810 if (file && S_ISREG(inode->i_mode))
783 error = finish_open(file, dentry, gfs2_open_common, opened); 811 error = finish_open(file, dentry, gfs2_open_common, opened);
784 812
@@ -817,7 +845,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
817 struct gfs2_inode *ip = GFS2_I(inode); 845 struct gfs2_inode *ip = GFS2_I(inode);
818 struct gfs2_holder ghs[2]; 846 struct gfs2_holder ghs[2];
819 struct buffer_head *dibh; 847 struct buffer_head *dibh;
820 int alloc_required; 848 struct gfs2_diradd da = { .bh = NULL, };
821 int error; 849 int error;
822 850
823 if (S_ISDIR(inode->i_mode)) 851 if (S_ISDIR(inode->i_mode))
@@ -872,13 +900,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
872 if (ip->i_inode.i_nlink == (u32)-1) 900 if (ip->i_inode.i_nlink == (u32)-1)
873 goto out_gunlock; 901 goto out_gunlock;
874 902
875 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); 903 error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
876 if (error < 0) 904 if (error < 0)
877 goto out_gunlock; 905 goto out_gunlock;
878 error = 0;
879 906
880 if (alloc_required) { 907 if (da.nr_blocks) {
881 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 908 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
882 error = gfs2_quota_lock_check(dip); 909 error = gfs2_quota_lock_check(dip);
883 if (error) 910 if (error)
884 goto out_gunlock; 911 goto out_gunlock;
@@ -887,10 +914,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
887 if (error) 914 if (error)
888 goto out_gunlock_q; 915 goto out_gunlock_q;
889 916
890 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 917 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
891 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
892 2 * RES_DINODE + RES_STATFS +
893 RES_QUOTA, 0);
894 if (error) 918 if (error)
895 goto out_ipres; 919 goto out_ipres;
896 } else { 920 } else {
@@ -903,7 +927,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
903 if (error) 927 if (error)
904 goto out_end_trans; 928 goto out_end_trans;
905 929
906 error = gfs2_dir_add(dir, &dentry->d_name, ip); 930 error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
907 if (error) 931 if (error)
908 goto out_brelse; 932 goto out_brelse;
909 933
@@ -919,12 +943,13 @@ out_brelse:
919out_end_trans: 943out_end_trans:
920 gfs2_trans_end(sdp); 944 gfs2_trans_end(sdp);
921out_ipres: 945out_ipres:
922 if (alloc_required) 946 if (da.nr_blocks)
923 gfs2_inplace_release(dip); 947 gfs2_inplace_release(dip);
924out_gunlock_q: 948out_gunlock_q:
925 if (alloc_required) 949 if (da.nr_blocks)
926 gfs2_quota_unlock(dip); 950 gfs2_quota_unlock(dip);
927out_gunlock: 951out_gunlock:
952 gfs2_dir_no_add(&da);
928 gfs2_glock_dq(ghs + 1); 953 gfs2_glock_dq(ghs + 1);
929out_child: 954out_child:
930 gfs2_glock_dq(ghs); 955 gfs2_glock_dq(ghs);
@@ -1254,7 +1279,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1254 struct gfs2_rgrpd *nrgd; 1279 struct gfs2_rgrpd *nrgd;
1255 unsigned int num_gh; 1280 unsigned int num_gh;
1256 int dir_rename = 0; 1281 int dir_rename = 0;
1257 int alloc_required = 0; 1282 struct gfs2_diradd da = { .nr_blocks = 0, };
1258 unsigned int x; 1283 unsigned int x;
1259 int error; 1284 int error;
1260 1285
@@ -1388,14 +1413,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 goto out_gunlock; 1413 goto out_gunlock;
1389 } 1414 }
1390 1415
1391 if (nip == NULL) 1416 if (nip == NULL) {
1392 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 1417 error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
1393 error = alloc_required; 1418 if (error)
1394 if (error < 0) 1419 goto out_gunlock;
1395 goto out_gunlock; 1420 }
1396 1421
1397 if (alloc_required) { 1422 if (da.nr_blocks) {
1398 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 1423 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1399 error = gfs2_quota_lock_check(ndip); 1424 error = gfs2_quota_lock_check(ndip);
1400 if (error) 1425 if (error)
1401 goto out_gunlock; 1426 goto out_gunlock;
@@ -1404,10 +1429,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1404 if (error) 1429 if (error)
1405 goto out_gunlock_q; 1430 goto out_gunlock_q;
1406 1431
1407 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1432 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
1408 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 1433 4 * RES_LEAF + 4, 0);
1409 4 * RES_DINODE + 4 * RES_LEAF +
1410 RES_STATFS + RES_QUOTA + 4, 0);
1411 if (error) 1434 if (error)
1412 goto out_ipreserv; 1435 goto out_ipreserv;
1413 } else { 1436 } else {
@@ -1441,19 +1464,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1441 if (error) 1464 if (error)
1442 goto out_end_trans; 1465 goto out_end_trans;
1443 1466
1444 error = gfs2_dir_add(ndir, &ndentry->d_name, ip); 1467 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
1445 if (error) 1468 if (error)
1446 goto out_end_trans; 1469 goto out_end_trans;
1447 1470
1448out_end_trans: 1471out_end_trans:
1449 gfs2_trans_end(sdp); 1472 gfs2_trans_end(sdp);
1450out_ipreserv: 1473out_ipreserv:
1451 if (alloc_required) 1474 if (da.nr_blocks)
1452 gfs2_inplace_release(ndip); 1475 gfs2_inplace_release(ndip);
1453out_gunlock_q: 1476out_gunlock_q:
1454 if (alloc_required) 1477 if (da.nr_blocks)
1455 gfs2_quota_unlock(ndip); 1478 gfs2_quota_unlock(ndip);
1456out_gunlock: 1479out_gunlock:
1480 gfs2_dir_no_add(&da);
1457 while (x--) { 1481 while (x--) {
1458 gfs2_glock_dq(ghs + x); 1482 gfs2_glock_dq(ghs + x);
1459 gfs2_holder_uninit(ghs + x); 1483 gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1631,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1607 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) 1631 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
1608 ogid = ngid = NO_GID_QUOTA_CHANGE; 1632 ogid = ngid = NO_GID_QUOTA_CHANGE;
1609 1633
1610 error = gfs2_quota_lock(ip, nuid, ngid); 1634 error = get_write_access(inode);
1611 if (error) 1635 if (error)
1612 return error; 1636 return error;
1613 1637
1638 error = gfs2_rs_alloc(ip);
1639 if (error)
1640 goto out;
1641
1642 error = gfs2_rindex_update(sdp);
1643 if (error)
1644 goto out;
1645
1646 error = gfs2_quota_lock(ip, nuid, ngid);
1647 if (error)
1648 goto out;
1649
1614 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1650 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1615 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1651 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1616 error = gfs2_quota_check(ip, nuid, ngid); 1652 error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1673,8 @@ out_end_trans:
1637 gfs2_trans_end(sdp); 1673 gfs2_trans_end(sdp);
1638out_gunlock_q: 1674out_gunlock_q:
1639 gfs2_quota_unlock(ip); 1675 gfs2_quota_unlock(ip);
1676out:
1677 put_write_access(inode);
1640 return error; 1678 return error;
1641} 1679}
1642 1680
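gfs2_trans_da_blks() centralises a reservation that link_dinode(), gfs2_link() and gfs2_rename() previously each open-coded, and it scales the dinode term by how many inodes the operation touches. Worked through with illustrative numbers (the RES_* values and the rg_blocks() result below are placeholders, not the real GFS2 constants):

    #include <stdio.h>

    /* Placeholder reservation constants, for illustration only. */
    enum { RES_DINODE = 1, RES_QUOTA = 1, RES_STATFS = 1 };

    /* Stand-in for gfs2_rg_blocks(): rgrp bitmap blocks touched. */
    static unsigned rg_blocks(unsigned requested)
    {
        return requested ? 2 : 0;
    }

    static unsigned trans_da_blks(unsigned da_nr_blocks, unsigned nr_inodes)
    {
        return da_nr_blocks + rg_blocks(da_nr_blocks) +
               nr_inodes * RES_DINODE + RES_QUOTA + RES_STATFS;
    }

    int main(void)
    {
        /* link/create touch 2 inodes; a rename can involve 4 */
        printf("link:   %u blocks\n", trans_da_blks(3, 2)); /* 3+2+2+1+1 */
        printf("rename: %u blocks\n", trans_da_blks(3, 4)); /* 3+2+4+1+1 */
        return 0;
    }

Note also that da.nr_blocks doubles as the old alloc_required flag: zero means the cached dirent can be used with no allocation, while a non-zero value sizes both the in-place reservation and the transaction.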
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613fb65b5..9dcb9777a5f8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
551 struct buffer_head *bh = bd->bd_bh; 551 struct buffer_head *bh = bd->bd_bh;
552 struct gfs2_glock *gl = bd->bd_gl; 552 struct gfs2_glock *gl = bd->bd_gl;
553 553
554 gfs2_remove_from_ail(bd);
555 bd->bd_bh = NULL;
556 bh->b_private = NULL; 554 bh->b_private = NULL;
557 bd->bd_blkno = bh->b_blocknr; 555 bd->bd_blkno = bh->b_blocknr;
556 gfs2_remove_from_ail(bd); /* drops ref on bh */
557 bd->bd_bh = NULL;
558 bd->bd_ops = &gfs2_revoke_lops; 558 bd->bd_ops = &gfs2_revoke_lops;
559 sdp->sd_log_num_revoke++; 559 sdp->sd_log_num_revoke++;
560 atomic_inc(&gl->gl_revokes); 560 atomic_inc(&gl->gl_revokes);
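The gfs2_add_revoke() hunk is purely an ordering fix: per the new comment, gfs2_remove_from_ail() drops a reference on the buffer head, so bd_blkno must be copied out of bh->b_blocknr before that call rather than after. The general shape of the bug and its fix:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { int refcount; long long blocknr; };

    static void put_buf(struct buf *b)    /* may free on the last ref */
    {
        if (--b->refcount == 0)
            free(b);
    }

    int main(void)
    {
        struct buf *bh = malloc(sizeof(*bh));
        long long saved;

        if (bh == NULL)
            return 1;
        bh->refcount = 1;
        bh->blocknr = 42;

        saved = bh->blocknr;   /* copy what we need first... */
        put_buf(bh);           /* ...then drop the ref that may free it */
        /* reading bh->blocknr past this point would be a use-after-free */

        printf("revoke recorded for block %lld\n", saved);
        return 0;
    }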
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..58f06400b7b8 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
84 clear_bit(GBF_FULL, &bi->bi_flags); 84 clear_bit(GBF_FULL, &bi->bi_flags);
85 rgd->rd_free_clone = rgd->rd_free; 85 rgd->rd_free_clone = rgd->rd_free;
86 rgd->rd_extfail_pt = rgd->rd_free;
86} 87}
87 88
88/** 89/**
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
588static void gfs2_meta_sync(struct gfs2_glock *gl) 589static void gfs2_meta_sync(struct gfs2_glock *gl)
589{ 590{
590 struct address_space *mapping = gfs2_glock2aspace(gl); 591 struct address_space *mapping = gfs2_glock2aspace(gl);
592 struct gfs2_sbd *sdp = gl->gl_sbd;
591 int error; 593 int error;
592 594
595 if (mapping == NULL)
596 mapping = &sdp->sd_aspace;
597
593 filemap_fdatawrite(mapping); 598 filemap_fdatawrite(mapping);
594 error = filemap_fdatawait(mapping); 599 error = filemap_fdatawait(mapping);
595 600
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
76 76
77 gfs2_str2qstr(&gfs2_qdot, "."); 77 gfs2_str2qstr(&gfs2_qdot, ".");
78 gfs2_str2qstr(&gfs2_qdotdot, ".."); 78 gfs2_str2qstr(&gfs2_qdotdot, "..");
79 gfs2_quota_hash_init();
79 80
80 error = gfs2_sys_init(); 81 error = gfs2_sys_init();
81 if (error) 82 if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 932415050540..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
116 unsigned long index; 116 unsigned long index;
117 unsigned int bufnum; 117 unsigned int bufnum;
118 118
119 if (mapping == NULL)
120 mapping = &sdp->sd_aspace;
121
119 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; 122 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
120 index = blkno >> shift; /* convert block to page */ 123 index = blkno >> shift; /* convert block to page */
121 bufnum = blkno - (index << shift); /* block buf index within page */ 124 bufnum = blkno - (index << shift); /* block buf index within page */
@@ -258,6 +261,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
258 struct address_space *mapping = bh->b_page->mapping; 261 struct address_space *mapping = bh->b_page->mapping;
259 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); 262 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
260 struct gfs2_bufdata *bd = bh->b_private; 263 struct gfs2_bufdata *bd = bh->b_private;
264 int was_pinned = 0;
261 265
262 if (test_clear_buffer_pinned(bh)) { 266 if (test_clear_buffer_pinned(bh)) {
263 trace_gfs2_pin(bd, 0); 267 trace_gfs2_pin(bd, 0);
@@ -273,12 +277,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
273 tr->tr_num_databuf_rm++; 277 tr->tr_num_databuf_rm++;
274 } 278 }
275 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1;
276 brelse(bh); 281 brelse(bh);
277 } 282 }
278 if (bd) { 283 if (bd) {
279 spin_lock(&sdp->sd_ail_lock); 284 spin_lock(&sdp->sd_ail_lock);
280 if (bd->bd_tr) { 285 if (bd->bd_tr) {
281 gfs2_trans_add_revoke(sdp, bd); 286 gfs2_trans_add_revoke(sdp, bd);
287 } else if (was_pinned) {
288 bh->b_private = NULL;
289 kmem_cache_free(gfs2_bufdata_cachep, bd);
282 } 290 }
283 spin_unlock(&sdp->sd_ail_lock); 291 spin_unlock(&sdp->sd_ail_lock);
284 } 292 }
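Both gfs2_meta_sync() in lops.c and gfs2_getbuf() here now substitute the superblock-wide sd_aspace whenever a glock has no private address space, which after the glops.c change means every rgrp glock. The pattern is a plain default at each entry point; a sketch with hypothetical names:

    #include <stdio.h>

    struct mapping { const char *name; };

    static struct mapping sb_wide = { "sd_aspace" };

    /* Hypothetical accessor: NULL for glocks without a private mapping. */
    static struct mapping *glock2aspace(int has_private)
    {
        static struct mapping priv = { "per-glock aspace" };
        return has_private ? &priv : NULL;
    }

    static void meta_sync(int has_private)
    {
        struct mapping *m = glock2aspace(has_private);

        if (m == NULL)
            m = &sb_wide;      /* fall back to the shared mapping */
        printf("syncing via %s\n", m->name);
    }

    int main(void)
    {
        meta_sync(1);          /* inode glock: private mapping */
        meta_sync(0);          /* rgrp glock: shared sd_aspace */
        return 0;
    }

The was_pinned change in gfs2_remove_from_journal() is independent: a pinned buffer with no transaction would previously leak its gfs2_bufdata; now the bufdata is freed once it is known that no revoke is needed.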
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 82303b474958..1e712b566d76 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
36#include "log.h" 36#include "log.h"
37#include "quota.h" 37#include "quota.h"
38#include "dir.h" 38#include "dir.h"
39#include "meta_io.h"
39#include "trace_gfs2.h" 40#include "trace_gfs2.h"
40 41
41#define DO 0 42#define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62static struct gfs2_sbd *init_sbd(struct super_block *sb) 63static struct gfs2_sbd *init_sbd(struct super_block *sb)
63{ 64{
64 struct gfs2_sbd *sdp; 65 struct gfs2_sbd *sdp;
66 struct address_space *mapping;
65 67
66 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); 68 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
67 if (!sdp) 69 if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
97 init_waitqueue_head(&sdp->sd_quota_wait); 99 init_waitqueue_head(&sdp->sd_quota_wait);
98 INIT_LIST_HEAD(&sdp->sd_trunc_list); 100 INIT_LIST_HEAD(&sdp->sd_trunc_list);
99 spin_lock_init(&sdp->sd_trunc_lock); 101 spin_lock_init(&sdp->sd_trunc_lock);
102 spin_lock_init(&sdp->sd_bitmap_lock);
103
104 mapping = &sdp->sd_aspace;
105
106 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops;
108 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS);
111 mapping->private_data = NULL;
112 mapping->backing_dev_info = sb->s_bdi;
113 mapping->writeback_index = 0;
100 114
101 spin_lock_init(&sdp->sd_log_lock); 115 spin_lock_init(&sdp->sd_log_lock);
102 atomic_set(&sdp->sd_log_pinned, 0); 116 atomic_set(&sdp->sd_log_pinned, 0);
@@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
217 231
218 page = alloc_page(GFP_NOFS); 232 page = alloc_page(GFP_NOFS);
219 if (unlikely(!page)) 233 if (unlikely(!page))
220 return -ENOBUFS; 234 return -ENOMEM;
221 235
222 ClearPageUptodate(page); 236 ClearPageUptodate(page);
223 ClearPageDirty(page); 237 ClearPageDirty(page);
@@ -956,40 +970,6 @@ fail:
956 return error; 970 return error;
957} 971}
958 972
959static int init_threads(struct gfs2_sbd *sdp, int undo)
960{
961 struct task_struct *p;
962 int error = 0;
963
964 if (undo)
965 goto fail_quotad;
966
967 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
968 if (IS_ERR(p)) {
969 error = PTR_ERR(p);
970 fs_err(sdp, "can't start logd thread: %d\n", error);
971 return error;
972 }
973 sdp->sd_logd_process = p;
974
975 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
976 if (IS_ERR(p)) {
977 error = PTR_ERR(p);
978 fs_err(sdp, "can't start quotad thread: %d\n", error);
979 goto fail;
980 }
981 sdp->sd_quotad_process = p;
982
983 return 0;
984
985
986fail_quotad:
987 kthread_stop(sdp->sd_quotad_process);
988fail:
989 kthread_stop(sdp->sd_logd_process);
990 return error;
991}
992
993static const match_table_t nolock_tokens = { 973static const match_table_t nolock_tokens = {
994 { Opt_jid, "jid=%d\n", }, 974 { Opt_jid, "jid=%d\n", },
995 { Opt_err, NULL }, 975 { Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1254 goto fail_per_node; 1234 goto fail_per_node;
1255 } 1235 }
1256 1236
1257 error = init_threads(sdp, DO);
1258 if (error)
1259 goto fail_per_node;
1260
1261 if (!(sb->s_flags & MS_RDONLY)) { 1237 if (!(sb->s_flags & MS_RDONLY)) {
1262 error = gfs2_make_fs_rw(sdp); 1238 error = gfs2_make_fs_rw(sdp);
1263 if (error) { 1239 if (error) {
1264 fs_err(sdp, "can't make FS RW: %d\n", error); 1240 fs_err(sdp, "can't make FS RW: %d\n", error);
1265 goto fail_threads; 1241 goto fail_per_node;
1266 } 1242 }
1267 } 1243 }
1268 1244
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1270 gfs2_online_uevent(sdp); 1246 gfs2_online_uevent(sdp);
1271 return 0; 1247 return 0;
1272 1248
1273fail_threads:
1274 init_threads(sdp, UNDO);
1275fail_per_node: 1249fail_per_node:
1276 init_per_node(sdp, UNDO); 1250 init_per_node(sdp, UNDO);
1277fail_inodes: 1251fail_inodes:
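init_threads() leaves fill_super() here along with its fail_threads unwind label; the logd/quotad start-up is relocated rather than deleted (in GFS2 of this era it ends up in super.c, called from gfs2_make_fs_rw(), though that hunk falls outside this section). Its structure was the standard start-in-order, stop-in-reverse ladder:

    #include <stdio.h>

    /* Hypothetical handles standing in for the logd/quotad kthreads. */
    static int start(const char *name) { printf("start %s\n", name); return 0; }
    static void stop(const char *name) { printf("stop %s\n", name); }

    static int init_threads(void)
    {
        int err;

        err = start("gfs2_logd");
        if (err)
            return err;
        err = start("gfs2_quotad");
        if (err)
            goto fail_logd;       /* unwind in reverse order of start-up */
        return 0;

    fail_logd:
        stop("gfs2_logd");
        return err;
    }

    int main(void) { return init_threads(); }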
@@ -1366,8 +1340,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1366 if (IS_ERR(s)) 1340 if (IS_ERR(s))
1367 goto error_bdev; 1341 goto error_bdev;
1368 1342
1369 if (s->s_root) 1343 if (s->s_root) {
1344 /*
1345 * s_umount nests inside bd_mutex during
1346 * __invalidate_device(). blkdev_put() acquires
1347 * bd_mutex and can't be called under s_umount. Drop
1348 * s_umount temporarily. This is safe as we're
1349 * holding an active reference.
1350 */
1351 up_write(&s->s_umount);
1370 blkdev_put(bdev, mode); 1352 blkdev_put(bdev, mode);
1353 down_write(&s->s_umount);
1354 }
1371 1355
1372 memset(&args, 0, sizeof(args)); 1356 memset(&args, 0, sizeof(args));
1373 args.ar_quota = GFS2_QUOTA_DEFAULT; 1357 args.ar_quota = GFS2_QUOTA_DEFAULT;
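The comment added in gfs2_mount() records a lock-ordering constraint: elsewhere s_umount nests inside bd_mutex, so calling blkdev_put() (which takes bd_mutex) while holding s_umount would invert the order and risk deadlock. Because an active superblock reference is held, s_umount can safely be dropped around the call and retaken. The manoeuvre in pthread form (simplified; the reference that makes the drop safe appears only as a comment):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t s_umount = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t bd_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Established order elsewhere: bd_mutex -> s_umount. Taking bd_mutex
     * while holding s_umount would invert that, so release s_umount first. */
    static void put_blockdev_safely(void)
    {
        pthread_mutex_unlock(&s_umount); /* safe: object pinned by a ref */
        pthread_mutex_lock(&bd_mutex);
        puts("blkdev_put work");
        pthread_mutex_unlock(&bd_mutex);
        pthread_mutex_lock(&s_umount);   /* reacquire before continuing */
    }

    int main(void)
    {
        pthread_mutex_lock(&s_umount);
        put_blockdev_safely();
        pthread_mutex_unlock(&s_umount);
        return 0;
    }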
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
52#include <linux/dqblk_xfs.h> 52#include <linux/dqblk_xfs.h>
53#include <linux/lockref.h> 53#include <linux/lockref.h>
54#include <linux/list_lru.h> 54#include <linux/list_lru.h>
55#include <linux/rcupdate.h>
56#include <linux/rculist_bl.h>
57#include <linux/bit_spinlock.h>
58#include <linux/jhash.h>
59#include <linux/vmalloc.h>
55 60
56#include "gfs2.h" 61#include "gfs2.h"
57#include "incore.h" 62#include "incore.h"
@@ -67,16 +72,44 @@
67#include "inode.h" 72#include "inode.h"
68#include "util.h" 73#include "util.h"
69 74
70struct gfs2_quota_change_host { 75#define GFS2_QD_HASH_SHIFT 12
71 u64 qc_change; 76#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
72 u32 qc_flags; /* GFS2_QCF_... */ 77#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
73 struct kqid qc_id;
74};
75 78
76/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ 79/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
80/* -> sd_bitmap_lock */
77static DEFINE_SPINLOCK(qd_lock); 81static DEFINE_SPINLOCK(qd_lock);
78struct list_lru gfs2_qd_lru; 82struct list_lru gfs2_qd_lru;
79 83
84static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
85
86static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
87 const struct kqid qid)
88{
89 unsigned int h;
90
91 h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
92 h = jhash(&qid, sizeof(struct kqid), h);
93
94 return h & GFS2_QD_HASH_MASK;
95}
96
97static inline void spin_lock_bucket(unsigned int hash)
98{
99 hlist_bl_lock(&qd_hash_table[hash]);
100}
101
102static inline void spin_unlock_bucket(unsigned int hash)
103{
104 hlist_bl_unlock(&qd_hash_table[hash]);
105}
106
107static void gfs2_qd_dealloc(struct rcu_head *rcu)
108{
109 struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
110 kmem_cache_free(gfs2_quotad_cachep, qd);
111}
112
80static void gfs2_qd_dispose(struct list_head *list) 113static void gfs2_qd_dispose(struct list_head *list)
81{ 114{
82 struct gfs2_quota_data *qd; 115 struct gfs2_quota_data *qd;
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
93 list_del(&qd->qd_list); 126 list_del(&qd->qd_list);
94 spin_unlock(&qd_lock); 127 spin_unlock(&qd_lock);
95 128
129 spin_lock_bucket(qd->qd_hash);
130 hlist_bl_del_rcu(&qd->qd_hlist);
131 spin_unlock_bucket(qd->qd_hash);
132
96 gfs2_assert_warn(sdp, !qd->qd_change); 133 gfs2_assert_warn(sdp, !qd->qd_change);
97 gfs2_assert_warn(sdp, !qd->qd_slot_count); 134 gfs2_assert_warn(sdp, !qd->qd_slot_count);
98 gfs2_assert_warn(sdp, !qd->qd_bh_count); 135 gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
101 atomic_dec(&sdp->sd_quota_count); 138 atomic_dec(&sdp->sd_quota_count);
102 139
103 /* Delete it from the common reclaim list */ 140 /* Delete it from the common reclaim list */
104 kmem_cache_free(gfs2_quotad_cachep, qd); 141 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
105 } 142 }
106} 143}
107 144
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
171 return offset; 208 return offset;
172} 209}
173 210
174static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, 211static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
175 struct gfs2_quota_data **qdp)
176{ 212{
177 struct gfs2_quota_data *qd; 213 struct gfs2_quota_data *qd;
178 int error; 214 int error;
179 215
180 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); 216 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
181 if (!qd) 217 if (!qd)
182 return -ENOMEM; 218 return NULL;
183 219
220 qd->qd_sbd = sdp;
184 qd->qd_lockref.count = 1; 221 qd->qd_lockref.count = 1;
185 spin_lock_init(&qd->qd_lockref.lock); 222 spin_lock_init(&qd->qd_lockref.lock);
186 qd->qd_id = qid; 223 qd->qd_id = qid;
187 qd->qd_slot = -1; 224 qd->qd_slot = -1;
188 INIT_LIST_HEAD(&qd->qd_lru); 225 INIT_LIST_HEAD(&qd->qd_lru);
226 qd->qd_hash = hash;
189 227
190 error = gfs2_glock_get(sdp, qd2index(qd), 228 error = gfs2_glock_get(sdp, qd2index(qd),
191 &gfs2_quota_glops, CREATE, &qd->qd_gl); 229 &gfs2_quota_glops, CREATE, &qd->qd_gl);
192 if (error) 230 if (error)
193 goto fail; 231 goto fail;
194 232
195 *qdp = qd; 233 return qd;
196
197 return 0;
198 234
199fail: 235fail:
200 kmem_cache_free(gfs2_quotad_cachep, qd); 236 kmem_cache_free(gfs2_quotad_cachep, qd);
201 return error; 237 return NULL;
202} 238}
203 239
204static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, 240static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
205 struct gfs2_quota_data **qdp) 241 const struct gfs2_sbd *sdp,
242 struct kqid qid)
206{ 243{
207 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 244 struct gfs2_quota_data *qd;
208 int error, found; 245 struct hlist_bl_node *h;
209
210 *qdp = NULL;
211 246
212 for (;;) { 247 hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
213 found = 0; 248 if (!qid_eq(qd->qd_id, qid))
214 spin_lock(&qd_lock); 249 continue;
215 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 250 if (qd->qd_sbd != sdp)
216 if (qid_eq(qd->qd_id, qid) && 251 continue;
217 lockref_get_not_dead(&qd->qd_lockref)) { 252 if (lockref_get_not_dead(&qd->qd_lockref)) {
218 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 253 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
219 found = 1; 254 return qd;
220 break;
221 }
222 } 255 }
256 }
223 257
224 if (!found) 258 return NULL;
225 qd = NULL; 259}
226 260
227 if (!qd && new_qd) {
228 qd = new_qd;
229 list_add(&qd->qd_list, &sdp->sd_quota_list);
230 atomic_inc(&sdp->sd_quota_count);
231 new_qd = NULL;
232 }
233 261
234 spin_unlock(&qd_lock); 262static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
263 struct gfs2_quota_data **qdp)
264{
265 struct gfs2_quota_data *qd, *new_qd;
266 unsigned int hash = gfs2_qd_hash(sdp, qid);
235 267
236 if (qd) { 268 rcu_read_lock();
237 if (new_qd) { 269 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
238 gfs2_glock_put(new_qd->qd_gl); 270 rcu_read_unlock();
239 kmem_cache_free(gfs2_quotad_cachep, new_qd);
240 }
241 *qdp = qd;
242 return 0;
243 }
244 271
245 error = qd_alloc(sdp, qid, &new_qd); 272 if (qd)
246 if (error) 273 return 0;
247 return error; 274
275 new_qd = qd_alloc(hash, sdp, qid);
276 if (!new_qd)
277 return -ENOMEM;
278
279 spin_lock(&qd_lock);
280 spin_lock_bucket(hash);
281 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
282 if (qd == NULL) {
283 *qdp = new_qd;
284 list_add(&new_qd->qd_list, &sdp->sd_quota_list);
285 hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
286 atomic_inc(&sdp->sd_quota_count);
248 } 287 }
288 spin_unlock_bucket(hash);
289 spin_unlock(&qd_lock);
290
291 if (qd) {
292 gfs2_glock_put(new_qd->qd_gl);
293 kmem_cache_free(gfs2_quotad_cachep, new_qd);
294 }
295
296 return 0;
249} 297}
250 298
299
251static void qd_hold(struct gfs2_quota_data *qd) 300static void qd_hold(struct gfs2_quota_data *qd)
252{ 301{
253 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 302 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
268 317
269static int slot_get(struct gfs2_quota_data *qd) 318static int slot_get(struct gfs2_quota_data *qd)
270{ 319{
271 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 320 struct gfs2_sbd *sdp = qd->qd_sbd;
272 unsigned int c, o = 0, b; 321 unsigned int bit;
273 unsigned char byte = 0; 322 int error = 0;
274 323
275 spin_lock(&qd_lock); 324 spin_lock(&sdp->sd_bitmap_lock);
325 if (qd->qd_slot_count != 0)
326 goto out;
276 327
277 if (qd->qd_slot_count++) { 328 error = -ENOSPC;
278 spin_unlock(&qd_lock); 329 bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
279 return 0; 330 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit;
333out:
334 qd->qd_slot_count++;
280 } 335 }
336 spin_unlock(&sdp->sd_bitmap_lock);
281 337
282 for (c = 0; c < sdp->sd_quota_chunks; c++) 338 return error;
283 for (o = 0; o < PAGE_SIZE; o++) {
284 byte = sdp->sd_quota_bitmap[c][o];
285 if (byte != 0xFF)
286 goto found;
287 }
288
289 goto fail;
290
291found:
292 for (b = 0; b < 8; b++)
293 if (!(byte & (1 << b)))
294 break;
295 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
296
297 if (qd->qd_slot >= sdp->sd_quota_slots)
298 goto fail;
299
300 sdp->sd_quota_bitmap[c][o] |= 1 << b;
301
302 spin_unlock(&qd_lock);
303
304 return 0;
305
306fail:
307 qd->qd_slot_count--;
308 spin_unlock(&qd_lock);
309 return -ENOSPC;
310} 339}
311 340
312static void slot_hold(struct gfs2_quota_data *qd) 341static void slot_hold(struct gfs2_quota_data *qd)
313{ 342{
314 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 343 struct gfs2_sbd *sdp = qd->qd_sbd;
315 344
316 spin_lock(&qd_lock); 345 spin_lock(&sdp->sd_bitmap_lock);
317 gfs2_assert(sdp, qd->qd_slot_count); 346 gfs2_assert(sdp, qd->qd_slot_count);
318 qd->qd_slot_count++; 347 qd->qd_slot_count++;
319 spin_unlock(&qd_lock); 348 spin_unlock(&sdp->sd_bitmap_lock);
320}
321
322static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
323 unsigned int bit, int new_value)
324{
325 unsigned int c, o, b = bit;
326 int old_value;
327
328 c = b / (8 * PAGE_SIZE);
329 b %= 8 * PAGE_SIZE;
330 o = b / 8;
331 b %= 8;
332
333 old_value = (bitmap[c][o] & (1 << b));
334 gfs2_assert_withdraw(sdp, !old_value != !new_value);
335
336 if (new_value)
337 bitmap[c][o] |= 1 << b;
338 else
339 bitmap[c][o] &= ~(1 << b);
340} 349}
341 350
342static void slot_put(struct gfs2_quota_data *qd) 351static void slot_put(struct gfs2_quota_data *qd)
343{ 352{
344 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 353 struct gfs2_sbd *sdp = qd->qd_sbd;
345 354
346 spin_lock(&qd_lock); 355 spin_lock(&sdp->sd_bitmap_lock);
347 gfs2_assert(sdp, qd->qd_slot_count); 356 gfs2_assert(sdp, qd->qd_slot_count);
348 if (!--qd->qd_slot_count) { 357 if (!--qd->qd_slot_count) {
349 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 358 BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
350 qd->qd_slot = -1; 359 qd->qd_slot = -1;
351 } 360 }
352 spin_unlock(&qd_lock); 361 spin_unlock(&sdp->sd_bitmap_lock);
353} 362}
354 363
355static int bh_get(struct gfs2_quota_data *qd) 364static int bh_get(struct gfs2_quota_data *qd)
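The slot allocator above shrinks from a hand-rolled scan over a two-level array of bytes to a flat bitmap searched with find_first_zero_bit(), all under the new dedicated sd_bitmap_lock instead of the global qd_lock. A user-space model of the same slot_get()/slot_put() logic (a linear scan stands in for find_first_zero_bit(); the lock is elided):

    #include <stdio.h>

    #define SLOTS 64
    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    static unsigned long bitmap[(SLOTS + BITS_PER_LONG - 1) / BITS_PER_LONG];

    /* Linear scan standing in for find_first_zero_bit(). */
    static int first_zero_bit(void)
    {
        for (int b = 0; b < SLOTS; b++)
            if (!(bitmap[b / BITS_PER_LONG] & (1UL << (b % BITS_PER_LONG))))
                return b;
        return SLOTS;             /* "not found", like the kernel helper */
    }

    static int slot_get(int *slot)
    {
        int bit = first_zero_bit();

        if (bit >= SLOTS)
            return -1;            /* -ENOSPC */
        bitmap[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
        *slot = bit;
        return 0;
    }

    static void slot_put(int slot)
    {
        bitmap[slot / BITS_PER_LONG] &= ~(1UL << (slot % BITS_PER_LONG));
    }

    int main(void)
    {
        int s;

        if (slot_get(&s) == 0) {
            printf("got slot %d\n", s);
            slot_put(s);
        }
        return 0;
    }

Note how the new slot_get() keys off qd_slot_count: a qd that already owns a slot just bumps the count, so the bitmap search runs only for the first reference.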
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
427 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 436 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
428 set_bit(QDF_LOCKED, &qd->qd_flags); 437 set_bit(QDF_LOCKED, &qd->qd_flags);
429 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
430 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 slot_hold(qd);
431 qd->qd_slot_count++;
432 return 1; 440 return 1;
433} 441}
434 442
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1214 return error; 1222 return error;
1215} 1223}
1216 1224
1217static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1218{
1219 const struct gfs2_quota_change *str = buf;
1220
1221 qc->qc_change = be64_to_cpu(str->qc_change);
1222 qc->qc_flags = be32_to_cpu(str->qc_flags);
1223 qc->qc_id = make_kqid(&init_user_ns,
1224 (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
1225 be32_to_cpu(str->qc_id));
1226}
1227
1228int gfs2_quota_init(struct gfs2_sbd *sdp) 1225int gfs2_quota_init(struct gfs2_sbd *sdp)
1229{ 1226{
1230 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1227 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1232 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; 1229 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1233 unsigned int x, slot = 0; 1230 unsigned int x, slot = 0;
1234 unsigned int found = 0; 1231 unsigned int found = 0;
1232 unsigned int hash;
1233 unsigned int bm_size;
1235 u64 dblock; 1234 u64 dblock;
1236 u32 extlen = 0; 1235 u32 extlen = 0;
1237 int error; 1236 int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1240 return -EIO; 1239 return -EIO;
1241 1240
1242 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1241 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1243 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1244 1243 bm_size *= sizeof(unsigned long);
1245 error = -ENOMEM; 1244 error = -ENOMEM;
1246 1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
1247 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1246 if (sdp->sd_quota_bitmap == NULL)
1248 sizeof(unsigned char *), GFP_NOFS); 1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
1249 if (!sdp->sd_quota_bitmap) 1248 if (!sdp->sd_quota_bitmap)
1250 return error; 1249 return error;
1251 1250
1252 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1253 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1254 if (!sdp->sd_quota_bitmap[x])
1255 goto fail;
1256 }
1257 1252
1258 for (x = 0; x < blocks; x++) { 1253 for (x = 0; x < blocks; x++) {
1259 struct buffer_head *bh; 1254 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc;
1260 unsigned int y; 1256 unsigned int y;
1261 1257
1262 if (!extlen) { 1258 if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1274 goto fail; 1270 goto fail;
1275 } 1271 }
1276 1272
1273 qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
1277 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; 1274 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1278 y++, slot++) { 1275 y++, slot++) {
1279 struct gfs2_quota_change_host qc;
1280 struct gfs2_quota_data *qd; 1276 struct gfs2_quota_data *qd;
1281 1277 s64 qc_change = be64_to_cpu(qc->qc_change);
1282 gfs2_quota_change_in(&qc, bh->b_data + 1278 u32 qc_flags = be32_to_cpu(qc->qc_flags);
1283 sizeof(struct gfs2_meta_header) + 1279 enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
1284 y * sizeof(struct gfs2_quota_change)); 1280 USRQUOTA : GRPQUOTA;
1285 if (!qc.qc_change) 1281 struct kqid qc_id = make_kqid(&init_user_ns, qtype,
1282 be32_to_cpu(qc->qc_id));
1283 qc++;
1284 if (!qc_change)
1286 continue; 1285 continue;
1287 1286
1288 error = qd_alloc(sdp, qc.qc_id, &qd); 1287 hash = gfs2_qd_hash(sdp, qc_id);
1289 if (error) { 1288 qd = qd_alloc(hash, sdp, qc_id);
1289 if (qd == NULL) {
1290 brelse(bh); 1290 brelse(bh);
1291 goto fail; 1291 goto fail;
1292 } 1292 }
1293 1293
1294 set_bit(QDF_CHANGE, &qd->qd_flags); 1294 set_bit(QDF_CHANGE, &qd->qd_flags);
1295 qd->qd_change = qc.qc_change; 1295 qd->qd_change = qc_change;
1296 qd->qd_slot = slot; 1296 qd->qd_slot = slot;
1297 qd->qd_slot_count = 1; 1297 qd->qd_slot_count = 1;
1298 1298
1299 spin_lock(&qd_lock); 1299 spin_lock(&qd_lock);
1300 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1300 BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
1301 list_add(&qd->qd_list, &sdp->sd_quota_list); 1301 list_add(&qd->qd_list, &sdp->sd_quota_list);
1302 atomic_inc(&sdp->sd_quota_count); 1302 atomic_inc(&sdp->sd_quota_count);
1303 spin_unlock(&qd_lock); 1303 spin_unlock(&qd_lock);
1304 1304
1305 spin_lock_bucket(hash);
1306 hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
1307 spin_unlock_bucket(hash);
1308
1305 found++; 1309 found++;
1306 } 1310 }
1307 1311
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1324{ 1328{
1325 struct list_head *head = &sdp->sd_quota_list; 1329 struct list_head *head = &sdp->sd_quota_list;
1326 struct gfs2_quota_data *qd; 1330 struct gfs2_quota_data *qd;
1327 unsigned int x;
1328 1331
1329 spin_lock(&qd_lock); 1332 spin_lock(&qd_lock);
1330 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1331 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1334 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1332 1335
1333 /*
1334 * To be removed in due course... we should be able to
1335 * ensure that all refs to the qd have done by this point
1336 * so that this rather odd test is not required
1337 */
1338 spin_lock(&qd->qd_lockref.lock);
1339 if (qd->qd_lockref.count > 1 ||
1340 (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1341 spin_unlock(&qd->qd_lockref.lock);
1342 list_move(&qd->qd_list, head);
1343 spin_unlock(&qd_lock);
1344 schedule();
1345 spin_lock(&qd_lock);
1346 continue;
1347 }
1348 spin_unlock(&qd->qd_lockref.lock);
1349
1350 list_del(&qd->qd_list); 1336 list_del(&qd->qd_list);
1337
1351 /* Also remove if this qd exists in the reclaim list */ 1338 /* Also remove if this qd exists in the reclaim list */
1352 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 1339 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
1353 atomic_dec(&sdp->sd_quota_count); 1340 atomic_dec(&sdp->sd_quota_count);
1354 spin_unlock(&qd_lock); 1341 spin_unlock(&qd_lock);
1355 1342
1356 if (!qd->qd_lockref.count) { 1343 spin_lock_bucket(qd->qd_hash);
1357 gfs2_assert_warn(sdp, !qd->qd_change); 1344 hlist_bl_del_rcu(&qd->qd_hlist);
1358 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1345 spin_unlock_bucket(qd->qd_hash);
1359 } else 1346
1360 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1347 gfs2_assert_warn(sdp, !qd->qd_change);
1348 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1361 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1349 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1362 1350
1363 gfs2_glock_put(qd->qd_gl); 1351 gfs2_glock_put(qd->qd_gl);
1364 kmem_cache_free(gfs2_quotad_cachep, qd); 1352 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
1365 1353
1366 spin_lock(&qd_lock); 1354 spin_lock(&qd_lock);
1367 } 1355 }
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1370 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1358 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1371 1359
1372 if (sdp->sd_quota_bitmap) { 1360 if (sdp->sd_quota_bitmap) {
1373 for (x = 0; x < sdp->sd_quota_chunks; x++) 1361 if (is_vmalloc_addr(sdp->sd_quota_bitmap))
1374 kfree(sdp->sd_quota_bitmap[x]); 1362 vfree(sdp->sd_quota_bitmap);
1375 kfree(sdp->sd_quota_bitmap); 1363 else
1364 kfree(sdp->sd_quota_bitmap);
1365 sdp->sd_quota_bitmap = NULL;
1376 } 1366 }
1377} 1367}
1378 1368
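Because gfs2_quota_init() now allocates the slot bitmap with kmalloc and falls back to vmalloc for large filesystems, the cleanup path has to pick the matching free routine at runtime. is_vmalloc_addr() is the standard test for such dual-origin buffers; as a standalone sketch (later kernels provide kvfree() for exactly this case):

	/* Sketch: free a buffer that may have come from kmalloc or vmalloc. */
	static void quota_bitmap_free(void *ptr)
	{
		if (is_vmalloc_addr(ptr))
			vfree(ptr);
		else
			kfree(ptr);
	}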
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1656 .get_dqblk = gfs2_get_dqblk, 1646 .get_dqblk = gfs2_get_dqblk,
1657 .set_dqblk = gfs2_set_dqblk, 1647 .set_dqblk = gfs2_set_dqblk,
1658}; 1648};
1649
1650void __init gfs2_quota_hash_init(void)
1651{
1652 unsigned i;
1653
1654 for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
1655 INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
1656}
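The new qd_hash_table pairs RCU lookups with per-bucket bit spinlocks: readers walk a bucket under rcu_read_lock(), while writers, as in the hunks above, take spin_lock_bucket() around hlist_bl_add_head_rcu()/hlist_bl_del_rcu(). A hedged sketch of the matching lookup side, with the field names (qd_hlist, qd_id, qd_sbd) assumed from the same patch series:

	static struct gfs2_quota_data *qd_lookup_bucket(unsigned int hash,
							const struct gfs2_sbd *sdp,
							struct kqid qid)
	{
		struct gfs2_quota_data *qd;
		struct hlist_bl_node *h;

		/* caller holds rcu_read_lock() */
		hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
			if (!qid_eq(qd->qd_id, qid))
				continue;
			if (qd->qd_sbd != sdp)
				continue;
			return qd;	/* caller must still take a reference */
		}
		return NULL;
	}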
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
57extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
58extern struct shrinker gfs2_qd_shrinker; 58extern struct shrinker gfs2_qd_shrinker;
59extern struct list_lru gfs2_qd_lru; 59extern struct list_lru gfs2_qd_lru;
60extern void __init gfs2_quota_hash_init(void);
60 61
61#endif /* __QUOTA_DOT_H__ */ 62#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
57 * 3 = Used (metadata) 57 * 3 = Used (metadata)
58 */ 58 */
59 59
60struct gfs2_extent {
61 struct gfs2_rbm rbm;
62 u32 len;
63};
64
60static const char valid_change[16] = { 65static const char valid_change[16] = {
61 /* current */ 66 /* current */
62 /* n */ 0, 1, 1, 1, 67 /* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
65 1, 0, 0, 0 70 1, 0, 0, 0
66}; 71};
67 72
68static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 73static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
69 const struct gfs2_inode *ip, bool nowrap); 74 const struct gfs2_inode *ip, bool nowrap,
75 const struct gfs2_alloc_parms *ap);
70 76
71 77
72/** 78/**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
635 /* return reserved blocks to the rgrp */ 641 /* return reserved blocks to the rgrp */
636 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); 642 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; 643 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
644 /* The rgrp extent failure point is likely not to increase;
645 it will only do so if the freed blocks are somehow
646 contiguous with a span of free blocks that follows. Still,
647 it will force the number to be recalculated later. */
648 rgd->rd_extfail_pt += rs->rs_free;
638 rs->rs_free = 0; 649 rs->rs_free = 0;
639 clear_bit(GBF_FULL, &bi->bi_flags); 650 clear_bit(GBF_FULL, &bi->bi_flags);
640 smp_mb__after_clear_bit();
641 } 651 }
642} 652}
643 653
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
876static int read_rindex_entry(struct gfs2_inode *ip) 886static int read_rindex_entry(struct gfs2_inode *ip)
877{ 887{
878 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 888 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
889 const unsigned bsize = sdp->sd_sb.sb_bsize;
879 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 890 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
880 struct gfs2_rindex buf; 891 struct gfs2_rindex buf;
881 int error; 892 int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
913 goto fail; 924 goto fail;
914 925
915 rgd->rd_gl->gl_object = rgd; 926 rgd->rd_gl->gl_object = rgd;
927 rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
928 rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
916 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; 929 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
917 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 930 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
918 if (rgd->rd_data > sdp->sd_max_rg_data) 931 if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1126 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 1139 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1140 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1141 rgd->rd_free_clone = rgd->rd_free;
1142 /* max out the rgrp allocation failure point */
1143 rgd->rd_extfail_pt = rgd->rd_free;
1129 } 1144 }
1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1145 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1146 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1184 1199
1185 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) 1200 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1186 return 0; 1201 return 0;
1187 return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); 1202 return gfs2_rgrp_bh_get(rgd);
1188} 1203}
1189 1204
1190/** 1205/**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1455 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) 1470 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1456 return; 1471 return;
1457 1472
1458 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); 1473 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
1459 if (ret == 0) { 1474 if (ret == 0) {
1460 rs->rs_rbm = rbm; 1475 rs->rs_rbm = rbm;
1461 rs->rs_free = extlen; 1476 rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1520 * @rbm: The current position in the resource group 1535 * @rbm: The current position in the resource group
1521 * @ip: The inode for which we are searching for blocks 1536 * @ip: The inode for which we are searching for blocks
1522 * @minext: The minimum extent length 1537 * @minext: The minimum extent length
1538 * @maxext: A pointer to the maximum extent structure
1523 * 1539 *
1524 * This checks the current position in the rgrp to see whether there is 1540 * This checks the current position in the rgrp to see whether there is
1525 * a reservation covering this block. If not then this function is a 1541 * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 1548
1533static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, 1549static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1534 const struct gfs2_inode *ip, 1550 const struct gfs2_inode *ip,
1535 u32 minext) 1551 u32 minext,
1552 struct gfs2_extent *maxext)
1536{ 1553{
1537 u64 block = gfs2_rbm_to_block(rbm); 1554 u64 block = gfs2_rbm_to_block(rbm);
1538 u32 extlen = 1; 1555 u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1545 */ 1562 */
1546 if (minext) { 1563 if (minext) {
1547 extlen = gfs2_free_extlen(rbm, minext); 1564 extlen = gfs2_free_extlen(rbm, minext);
1548 nblock = block + extlen; 1565 if (extlen <= maxext->len)
1549 if (extlen < minext)
1550 goto fail; 1566 goto fail;
1551 } 1567 }
1552 1568
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1555 * and skip if parts of it are already reserved 1571 * and skip if parts of it are already reserved
1556 */ 1572 */
1557 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); 1573 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1558 if (nblock == block) 1574 if (nblock == block) {
1559 return 0; 1575 if (!minext || extlen >= minext)
1576 return 0;
1577
1578 if (extlen > maxext->len) {
1579 maxext->len = extlen;
1580 maxext->rbm = *rbm;
1581 }
1560fail: 1582fail:
1583 nblock = block + extlen;
1584 }
1561 ret = gfs2_rbm_from_block(rbm, nblock); 1585 ret = gfs2_rbm_from_block(rbm, nblock);
1562 if (ret < 0) 1586 if (ret < 0)
1563 return ret; 1587 return ret;
@@ -1568,30 +1592,38 @@ fail:
1568 * gfs2_rbm_find - Look for blocks of a particular state 1592 * gfs2_rbm_find - Look for blocks of a particular state
1569 * @rbm: Value/result starting position and final position 1593 * @rbm: Value/result starting position and final position
1570 * @state: The state which we want to find 1594 * @state: The state which we want to find
1571 * @minext: The requested extent length (0 for a single block) 1595 * @minext: Pointer to the requested extent length (NULL for a single block)
1596 * This is updated to be the actual reservation size.
1572 * @ip: If set, check for reservations 1597 * @ip: If set, check for reservations
1573 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping 1598 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1574 * around until we've reached the starting point. 1599 * around until we've reached the starting point.
1600 * @ap: the allocation parameters
1575 * 1601 *
1576 * Side effects: 1602 * Side effects:
1577 * - If looking for free blocks, we set GBF_FULL on each bitmap which 1603 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1578 * has no free blocks in it. 1604 * has no free blocks in it.
1605 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1606 * has come up short on a free block search.
1579 * 1607 *
1580 * Returns: 0 on success, -ENOSPC if there is no block of the requested state 1608 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1581 */ 1609 */
1582 1610
1583static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 1611static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1584 const struct gfs2_inode *ip, bool nowrap) 1612 const struct gfs2_inode *ip, bool nowrap,
1613 const struct gfs2_alloc_parms *ap)
1585{ 1614{
1586 struct buffer_head *bh; 1615 struct buffer_head *bh;
1587 int initial_bii; 1616 int initial_bii;
1588 u32 initial_offset; 1617 u32 initial_offset;
1618 int first_bii = rbm->bii;
1619 u32 first_offset = rbm->offset;
1589 u32 offset; 1620 u32 offset;
1590 u8 *buffer; 1621 u8 *buffer;
1591 int n = 0; 1622 int n = 0;
1592 int iters = rbm->rgd->rd_length; 1623 int iters = rbm->rgd->rd_length;
1593 int ret; 1624 int ret;
1594 struct gfs2_bitmap *bi; 1625 struct gfs2_bitmap *bi;
1626 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1595 1627
1596 /* If we are not starting at the beginning of a bitmap, then we 1628 /* If we are not starting at the beginning of a bitmap, then we
1597 * need to add one to the bitmap count to ensure that we search 1629 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1620 return 0; 1652 return 0;
1621 1653
1622 initial_bii = rbm->bii; 1654 initial_bii = rbm->bii;
1623 ret = gfs2_reservation_check_and_update(rbm, ip, minext); 1655 ret = gfs2_reservation_check_and_update(rbm, ip,
1656 minext ? *minext : 0,
1657 &maxext);
1624 if (ret == 0) 1658 if (ret == 0)
1625 return 0; 1659 return 0;
1626 if (ret > 0) { 1660 if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
1655 break; 1689 break;
1656 } 1690 }
1657 1691
1692 if (minext == NULL || state != GFS2_BLKST_FREE)
1693 return -ENOSPC;
1694
1695 /* If the extent was too small, and it's smaller than the smallest
1696 to have failed before, remember for future reference that it's
1697 useless to search this rgrp again for this amount or more. */
1698 if ((first_offset == 0) && (first_bii == 0) &&
1699 (*minext < rbm->rgd->rd_extfail_pt))
1700 rbm->rgd->rd_extfail_pt = *minext;
1701
1702 /* If the maximum extent we found is big enough to fulfill the
1703 minimum requirements, use it anyway. */
1704 if (maxext.len) {
1705 *rbm = maxext.rbm;
1706 *minext = maxext.len;
1707 return 0;
1708 }
1709
1658 return -ENOSPC; 1710 return -ENOSPC;
1659} 1711}
1660 1712
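With this change *minext is in/out: the caller asks for a minimum extent length, and if no free run that long exists, gfs2_rbm_find() falls back to the largest extent recorded in maxext and reports its length back through the pointer. The bookkeeping, reduced to a userspace sketch over an array of free-run lengths (names are illustrative):

	/* Return the index of the first run >= *minext; otherwise fall back
	 * to the largest run seen and shrink *minext to its length.
	 * Returns -1 only when every run is empty (the -ENOSPC case). */
	static int find_extent(const unsigned int lens[], unsigned int n,
			       unsigned int *minext)
	{
		unsigned int best_len = 0, best_idx = 0, i;

		for (i = 0; i < n; i++) {
			if (lens[i] >= *minext)
				return (int)i;		/* big enough, take it */
			if (lens[i] > best_len) {	/* remember the best so far */
				best_len = lens[i];
				best_idx = i;
			}
		}
		if (best_len) {
			*minext = best_len;		/* settle for the maximum */
			return (int)best_idx;
		}
		return -1;
	}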
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1680 1732
1681 while (1) { 1733 while (1) {
1682 down_write(&sdp->sd_log_flush_lock); 1734 down_write(&sdp->sd_log_flush_lock);
1683 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); 1735 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1736 true, NULL);
1684 up_write(&sdp->sd_log_flush_lock); 1737 up_write(&sdp->sd_log_flush_lock);
1685 if (error == -ENOSPC) 1738 if (error == -ENOSPC)
1686 break; 1739 break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1891 } 1944 }
1892 1945
1893 /* Skip unuseable resource groups */ 1946 /* Skip unuseable resource groups */
1894 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1947 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
1948 GFS2_RDF_ERROR)) ||
1949 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
1895 goto skip_rgrp; 1950 goto skip_rgrp;
1896 1951
1897 if (sdp->sd_args.ar_rgrplvb) 1952 if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1911 return 0; 1966 return 0;
1912 } 1967 }
1913 1968
1914 /* Drop reservation, if we couldn't use reserved rgrp */
1915 if (gfs2_rs_active(rs))
1916 gfs2_rs_deltree(rs);
1917check_rgrp: 1969check_rgrp:
1918 /* Check for unlinked inodes which can be reclaimed */ 1970 /* Check for unlinked inodes which can be reclaimed */
1919 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 1971 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1920 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, 1972 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1921 ip->i_no_addr); 1973 ip->i_no_addr);
1922skip_rgrp: 1974skip_rgrp:
1975 /* Drop reservation, if we couldn't use reserved rgrp */
1976 if (gfs2_rs_active(rs))
1977 gfs2_rs_deltree(rs);
1978
1923 /* Unlock rgrp if required */ 1979 /* Unlock rgrp if required */
1924 if (!rg_locked) 1980 if (!rg_locked)
1925 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1981 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2064 * 2120 *
2065 */ 2121 */
2066 2122
2067int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 2123void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2068{ 2124{
2069 struct gfs2_rgrpd *rgd = gl->gl_object; 2125 struct gfs2_rgrpd *rgd = gl->gl_object;
2070 struct gfs2_blkreserv *trs; 2126 struct gfs2_blkreserv *trs;
2071 const struct rb_node *n; 2127 const struct rb_node *n;
2072 2128
2073 if (rgd == NULL) 2129 if (rgd == NULL)
2074 return 0; 2130 return;
2075 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", 2131 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2076 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 2132 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2077 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, 2133 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2078 rgd->rd_reserved); 2134 rgd->rd_reserved, rgd->rd_extfail_pt);
2079 spin_lock(&rgd->rd_rsspin); 2135 spin_lock(&rgd->rd_rsspin);
2080 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { 2136 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2081 trs = rb_entry(n, struct gfs2_blkreserv, rs_node); 2137 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2082 dump_rs(seq, trs); 2138 dump_rs(seq, trs);
2083 } 2139 }
2084 spin_unlock(&rgd->rd_rsspin); 2140 spin_unlock(&rgd->rd_rsspin);
2085 return 0;
2086} 2141}
2087 2142
2088static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) 2143static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2184 int error; 2239 int error;
2185 2240
2186 gfs2_set_alloc_start(&rbm, ip, dinode); 2241 gfs2_set_alloc_start(&rbm, ip, dinode);
2187 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); 2242 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
2188 2243
2189 if (error == -ENOSPC) { 2244 if (error == -ENOSPC) {
2190 gfs2_set_alloc_start(&rbm, ip, dinode); 2245 gfs2_set_alloc_start(&rbm, ip, dinode);
2191 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); 2246 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
2247 NULL);
2192 } 2248 }
2193 2249
2194 /* Since all blocks are reserved in advance, this shouldn't happen */ 2250 /* Since all blocks are reserved in advance, this shouldn't happen */
2195 if (error) { 2251 if (error) {
2196 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", 2252 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2197 (unsigned long long)ip->i_no_addr, error, *nblocks, 2253 (unsigned long long)ip->i_no_addr, error, *nblocks,
2198 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); 2254 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2255 rbm.rgd->rd_extfail_pt);
2199 goto rgrp_error; 2256 goto rgrp_error;
2200 } 2257 }
2201 2258
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 71extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 struct buffer_head *bh, 73 struct buffer_head *bh,
74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
369 return 0; 369 return 0;
370} 370}
371 371
372static int init_threads(struct gfs2_sbd *sdp)
373{
374 struct task_struct *p;
375 int error = 0;
376
377 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
378 if (IS_ERR(p)) {
379 error = PTR_ERR(p);
380 fs_err(sdp, "can't start logd thread: %d\n", error);
381 return error;
382 }
383 sdp->sd_logd_process = p;
384
385 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
386 if (IS_ERR(p)) {
387 error = PTR_ERR(p);
388 fs_err(sdp, "can't start quotad thread: %d\n", error);
389 goto fail;
390 }
391 sdp->sd_quotad_process = p;
392 return 0;
393
394fail:
395 kthread_stop(sdp->sd_logd_process);
396 return error;
397}
398
372/** 399/**
373 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one 400 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
374 * @sdp: the filesystem 401 * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
384 struct gfs2_log_header_host head; 411 struct gfs2_log_header_host head;
385 int error; 412 int error;
386 413
387 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); 414 error = init_threads(sdp);
388 if (error) 415 if (error)
389 return error; 416 return error;
390 417
418 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
419 if (error)
420 goto fail_threads;
421
391 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 422 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
392 423
393 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 424 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
417fail: 448fail:
418 t_gh.gh_flags |= GL_NOCACHE; 449 t_gh.gh_flags |= GL_NOCACHE;
419 gfs2_glock_dq_uninit(&t_gh); 450 gfs2_glock_dq_uninit(&t_gh);
420 451fail_threads:
452 kthread_stop(sdp->sd_quotad_process);
453 kthread_stop(sdp->sd_logd_process);
421 return error; 454 return error;
422} 455}
423 456
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
800 struct gfs2_holder t_gh; 833 struct gfs2_holder t_gh;
801 int error; 834 int error;
802 835
836 kthread_stop(sdp->sd_quotad_process);
837 kthread_stop(sdp->sd_logd_process);
838
803 flush_workqueue(gfs2_delete_workqueue); 839 flush_workqueue(gfs2_delete_workqueue);
804 gfs2_quota_sync(sdp->sd_vfs, 0); 840 gfs2_quota_sync(sdp->sd_vfs, 0);
805 gfs2_statfs_sync(sdp->sd_vfs, 0); 841 gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
857 } 893 }
858 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
859 895
860 kthread_stop(sdp->sd_quotad_process);
861 kthread_stop(sdp->sd_logd_process);
862
863 if (!(sb->s_flags & MS_RDONLY)) { 896 if (!(sb->s_flags & MS_RDONLY)) {
864 error = gfs2_make_fs_ro(sdp); 897 error = gfs2_make_fs_ro(sdp);
865 if (error) 898 if (error)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2d04f9afafd7..06fe11e0abfa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
573#ifdef CONFIG_JBD_DEBUG 573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) { 575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_EMERG 576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n", 577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid); 578 __func__, journal->j_commit_request, tid);
579 } 579 }
@@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
604out_unlock: 604out_unlock:
605 spin_unlock(&journal->j_state_lock); 605 spin_unlock(&journal->j_state_lock);
606 606
607 if (unlikely(is_journal_aborted(journal))) { 607 if (unlikely(is_journal_aborted(journal)))
608 printk(KERN_EMERG "journal commit I/O error\n");
609 err = -EIO; 608 err = -EIO;
610 }
611 return err; 609 return err;
612} 610}
613 611
@@ -2136,7 +2134,7 @@ static void __exit journal_exit(void)
2136#ifdef CONFIG_JBD_DEBUG 2134#ifdef CONFIG_JBD_DEBUG
2137 int n = atomic_read(&nr_journal_heads); 2135 int n = atomic_read(&nr_journal_heads);
2138 if (n) 2136 if (n)
2139 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2137 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2140#endif 2138#endif
2141 jbd_remove_debugfs_entry(); 2139 jbd_remove_debugfs_entry();
2142 journal_destroy_caches(); 2140 journal_destroy_caches();
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index aa603e017d22..1695ba8334a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -675,7 +675,7 @@ repeat:
675 jbd_alloc(jh2bh(jh)->b_size, 675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS); 676 GFP_NOFS);
677 if (!frozen_buffer) { 677 if (!frozen_buffer) {
678 printk(KERN_EMERG 678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n", 679 "%s: OOM for frozen_buffer\n",
680 __func__); 680 __func__);
681 JBUFFER_TRACE(jh, "oom!"); 681 JBUFFER_TRACE(jh, "oom!");
@@ -898,7 +898,7 @@ repeat:
898 if (!jh->b_committed_data) { 898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); 899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) { 900 if (!committed_data) {
901 printk(KERN_EMERG "%s: No memory for committed data\n", 901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__); 902 __func__);
903 err = -ENOMEM; 903 err = -ENOMEM;
904 goto out; 904 goto out;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 52032647dd4a..5fa344afb49a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
702 read_lock(&journal->j_state_lock); 702 read_lock(&journal->j_state_lock);
703#ifdef CONFIG_JBD2_DEBUG 703#ifdef CONFIG_JBD2_DEBUG
704 if (!tid_geq(journal->j_commit_request, tid)) { 704 if (!tid_geq(journal->j_commit_request, tid)) {
705 printk(KERN_EMERG 705 printk(KERN_ERR
706 "%s: error: j_commit_request=%d, tid=%d\n", 706 "%s: error: j_commit_request=%d, tid=%d\n",
707 __func__, journal->j_commit_request, tid); 707 __func__, journal->j_commit_request, tid);
708 } 708 }
@@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
718 } 718 }
719 read_unlock(&journal->j_state_lock); 719 read_unlock(&journal->j_state_lock);
720 720
721 if (unlikely(is_journal_aborted(journal))) { 721 if (unlikely(is_journal_aborted(journal)))
722 printk(KERN_EMERG "journal commit I/O error\n");
723 err = -EIO; 722 err = -EIO;
724 }
725 return err; 723 return err;
726} 724}
727 725
@@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal)
1527 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1525 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
1528 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1529 /* Can't have checksum v1 and v2 on at the same time! */ 1527 /* Can't have checksum v1 and v2 on at the same time! */
1530 printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
1531 "at the same time!\n"); 1529 "at the same time!\n");
1532 goto out; 1530 goto out;
1533 } 1531 }
1534 1532
1535 if (!jbd2_verify_csum_type(journal, sb)) { 1533 if (!jbd2_verify_csum_type(journal, sb)) {
1536 printk(KERN_ERR "JBD: Unknown checksum type\n"); 1534 printk(KERN_ERR "JBD2: Unknown checksum type\n");
1537 goto out; 1535 goto out;
1538 } 1536 }
1539 1537
@@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal)
1541 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1539 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1542 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1543 if (IS_ERR(journal->j_chksum_driver)) { 1541 if (IS_ERR(journal->j_chksum_driver)) {
1544 printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); 1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1545 err = PTR_ERR(journal->j_chksum_driver); 1543 err = PTR_ERR(journal->j_chksum_driver);
1546 journal->j_chksum_driver = NULL; 1544 journal->j_chksum_driver = NULL;
1547 goto out; 1545 goto out;
@@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal)
1550 1548
1551 /* Check superblock checksum */ 1549 /* Check superblock checksum */
1552 if (!jbd2_superblock_csum_verify(journal, sb)) { 1550 if (!jbd2_superblock_csum_verify(journal, sb)) {
1553 printk(KERN_ERR "JBD: journal checksum error\n"); 1551 printk(KERN_ERR "JBD2: journal checksum error\n");
1554 goto out; 1552 goto out;
1555 } 1553 }
1556 1554
@@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1836 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 1834 journal->j_chksum_driver = crypto_alloc_shash("crc32c",
1837 0, 0); 1835 0, 0);
1838 if (IS_ERR(journal->j_chksum_driver)) { 1836 if (IS_ERR(journal->j_chksum_driver)) {
1839 printk(KERN_ERR "JBD: Cannot load crc32c " 1837 printk(KERN_ERR "JBD2: Cannot load crc32c "
1840 "driver.\n"); 1838 "driver.\n");
1841 journal->j_chksum_driver = NULL; 1839 journal->j_chksum_driver = NULL;
1842 return 0; 1840 return 0;
@@ -2645,7 +2643,7 @@ static void __exit journal_exit(void)
2645#ifdef CONFIG_JBD2_DEBUG 2643#ifdef CONFIG_JBD2_DEBUG
2646 int n = atomic_read(&nr_journal_heads); 2644 int n = atomic_read(&nr_journal_heads);
2647 if (n) 2645 if (n)
2648 printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); 2646 printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
2649#endif 2647#endif
2650 jbd2_remove_jbd_stats_proc_entry(); 2648 jbd2_remove_jbd_stats_proc_entry();
2651 jbd2_journal_destroy_caches(); 2649 jbd2_journal_destroy_caches();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3929c50428b1..3b6bb19d60b1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal,
594 be32_to_cpu(tmp->h_sequence))) { 594 be32_to_cpu(tmp->h_sequence))) {
595 brelse(obh); 595 brelse(obh);
596 success = -EIO; 596 success = -EIO;
597 printk(KERN_ERR "JBD: Invalid " 597 printk(KERN_ERR "JBD2: Invalid "
598 "checksum recovering " 598 "checksum recovering "
599 "block %llu in log\n", 599 "block %llu in log\n",
600 blocknr); 600 blocknr);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 7aa9a32573bb..8360674c85bc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -932,7 +932,7 @@ repeat:
932 jbd2_alloc(jh2bh(jh)->b_size, 932 jbd2_alloc(jh2bh(jh)->b_size,
933 GFP_NOFS); 933 GFP_NOFS);
934 if (!frozen_buffer) { 934 if (!frozen_buffer) {
935 printk(KERN_EMERG 935 printk(KERN_ERR
936 "%s: OOM for frozen_buffer\n", 936 "%s: OOM for frozen_buffer\n",
937 __func__); 937 __func__);
938 JBUFFER_TRACE(jh, "oom!"); 938 JBUFFER_TRACE(jh, "oom!");
@@ -1166,7 +1166,7 @@ repeat:
1166 if (!jh->b_committed_data) { 1166 if (!jh->b_committed_data) {
1167 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1167 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
1168 if (!committed_data) { 1168 if (!committed_data) {
1169 printk(KERN_EMERG "%s: No memory for committed data\n", 1169 printk(KERN_ERR "%s: No memory for committed data\n",
1170 __func__); 1170 __func__);
1171 err = -ENOMEM; 1171 err = -ENOMEM;
1172 goto out; 1172 goto out;
@@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1290 * once a transaction -bzzz 1290 * once a transaction -bzzz
1291 */ 1291 */
1292 jh->b_modified = 1; 1292 jh->b_modified = 1;
1293 J_ASSERT_JH(jh, handle->h_buffer_credits > 0); 1293 if (handle->h_buffer_credits <= 0) {
1294 ret = -ENOSPC;
1295 goto out_unlock_bh;
1296 }
1294 handle->h_buffer_credits--; 1297 handle->h_buffer_credits--;
1295 } 1298 }
1296 1299
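This hunk, together with the WARN_ON removal at the end of the function, softens what used to be a fatal assertion into an error the caller must handle: running out of buffer credits now yields -ENOSPC instead of a J_ASSERT crash. The conversion as a generic sketch (illustrative, not the jbd2 API):

	#include <errno.h>

	/* Before: assert-and-crash on exhausted credits.
	 *	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
	 * After: validate and propagate an error instead. */
	static int consume_credit(int *credits)
	{
		if (*credits <= 0)
			return -ENOSPC;	/* caller decides how to recover */
		(*credits)--;
		return 0;
	}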
@@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1305 JBUFFER_TRACE(jh, "fastpath"); 1308 JBUFFER_TRACE(jh, "fastpath");
1306 if (unlikely(jh->b_transaction != 1309 if (unlikely(jh->b_transaction !=
1307 journal->j_running_transaction)) { 1310 journal->j_running_transaction)) {
1308 printk(KERN_EMERG "JBD: %s: " 1311 printk(KERN_ERR "JBD2: %s: "
1309 "jh->b_transaction (%llu, %p, %u) != " 1312 "jh->b_transaction (%llu, %p, %u) != "
1310 "journal->j_running_transaction (%p, %u)", 1313 "journal->j_running_transaction (%p, %u)",
1311 journal->j_devname, 1314 journal->j_devname,
@@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1332 JBUFFER_TRACE(jh, "already on other transaction"); 1335 JBUFFER_TRACE(jh, "already on other transaction");
1333 if (unlikely(jh->b_transaction != 1336 if (unlikely(jh->b_transaction !=
1334 journal->j_committing_transaction)) { 1337 journal->j_committing_transaction)) {
1335 printk(KERN_EMERG "JBD: %s: " 1338 printk(KERN_ERR "JBD2: %s: "
1336 "jh->b_transaction (%llu, %p, %u) != " 1339 "jh->b_transaction (%llu, %p, %u) != "
1337 "journal->j_committing_transaction (%p, %u)", 1340 "journal->j_committing_transaction (%p, %u)",
1338 journal->j_devname, 1341 journal->j_devname,
@@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1345 ret = -EINVAL; 1348 ret = -EINVAL;
1346 } 1349 }
1347 if (unlikely(jh->b_next_transaction != transaction)) { 1350 if (unlikely(jh->b_next_transaction != transaction)) {
1348 printk(KERN_EMERG "JBD: %s: " 1351 printk(KERN_ERR "JBD2: %s: "
1349 "jh->b_next_transaction (%llu, %p, %u) != " 1352 "jh->b_next_transaction (%llu, %p, %u) != "
1350 "transaction (%p, %u)", 1353 "transaction (%p, %u)",
1351 journal->j_devname, 1354 journal->j_devname,
@@ -1373,7 +1376,6 @@ out_unlock_bh:
1373 jbd2_journal_put_journal_head(jh); 1376 jbd2_journal_put_journal_head(jh);
1374out: 1377out:
1375 JBUFFER_TRACE(jh, "exit"); 1378 JBUFFER_TRACE(jh, "exit");
1376 WARN_ON(ret); /* All errors are bugs, so dump the stack */
1377 return ret; 1379 return ret;
1378} 1380}
1379 1381
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the kernfs pseudo filesystem
3#
4
5obj-y := mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..5104cf5d25c5
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1073 @@
1/*
2 * fs/kernfs/dir.c - kernfs directory implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/namei.h>
13#include <linux/idr.h>
14#include <linux/slab.h>
15#include <linux/security.h>
16#include <linux/hash.h>
17
18#include "kernfs-internal.h"
19
20DEFINE_MUTEX(kernfs_mutex);
21
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23
24/**
25 * kernfs_name_hash
26 * @name: Null terminated string to hash
27 * @ns: Namespace tag to hash
28 *
 29 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
30 */
31static unsigned int kernfs_name_hash(const char *name, const void *ns)
32{
33 unsigned long hash = init_name_hash();
34 unsigned int len = strlen(name);
35 while (len--)
36 hash = partial_name_hash(*name++, hash);
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1)
41 hash += 2;
42 if (hash >= INT_MAX)
43 hash = INT_MAX - 1;
44 return hash;
45}
46
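kernfs_name_hash() folds the namespace pointer into the name hash, masks the result to 31 bits so it fits in an off_t, and then clamps it so the values reserved for magic readdir offsets stay free (kernfs_dir_pos() further down depends on the same reserved range). Note that the hash < 1 test as written lets a hash of 1 through even though the comment reserves it; later kernels tighten the comparison to hash < 2. A standalone sketch of the clamping, with a trivial djb2 hash standing in for partial_name_hash()/hash_ptr():

	#include <limits.h>
	#include <stdint.h>

	/* Toy stand-in; only the masking and clamping mirror kernfs. */
	static unsigned int toy_name_hash(const char *name, const void *ns)
	{
		uint32_t hash = 5381;			/* djb2, illustrative */

		while (*name)
			hash = hash * 33 + (unsigned char)*name++;
		hash ^= (uint32_t)(uintptr_t)ns;	/* fold in the ns tag */
		hash &= 0x7fffffffU;			/* 31 bits */
		if (hash < 2)				/* keep 0 and 1 reserved */
			hash += 2;
		if (hash >= INT_MAX)			/* keep INT_MAX reserved */
			hash = INT_MAX - 1;
		return hash;
	}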
47static int kernfs_name_compare(unsigned int hash, const char *name,
48 const void *ns, const struct kernfs_node *kn)
49{
50 if (hash != kn->hash)
51 return hash - kn->hash;
52 if (ns != kn->ns)
53 return ns - kn->ns;
54 return strcmp(name, kn->name);
55}
56
57static int kernfs_sd_compare(const struct kernfs_node *left,
58 const struct kernfs_node *right)
59{
60 return kernfs_name_compare(left->hash, left->name, left->ns, right);
61}
62
63/**
64 * kernfs_link_sibling - link kernfs_node into sibling rbtree
65 * @kn: kernfs_node of interest
66 *
67 * Link @kn into its sibling rbtree which starts from
68 * @kn->parent->dir.children.
69 *
70 * Locking:
71 * mutex_lock(kernfs_mutex)
72 *
73 * RETURNS:
 74 * 0 on success, -EEXIST on failure.
75 */
76static int kernfs_link_sibling(struct kernfs_node *kn)
77{
78 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL;
80
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) {
85 struct kernfs_node *pos;
86 int result;
87
88 pos = rb_to_kn(*node);
89 parent = *node;
90 result = kernfs_sd_compare(kn, pos);
91 if (result < 0)
92 node = &pos->rb.rb_left;
93 else if (result > 0)
94 node = &pos->rb.rb_right;
95 else
96 return -EEXIST;
97 }
98 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children);
101 return 0;
102}
103
104/**
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest
107 *
108 * Unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children.
110 *
111 * Locking:
112 * mutex_lock(kernfs_mutex)
113 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn)
115{
116 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--;
118
119 rb_erase(&kn->rb, &kn->parent->dir.children);
120}
121
122/**
123 * kernfs_get_active - get an active reference to kernfs_node
124 * @kn: kernfs_node to get an active reference to
125 *
 126 * Get an active reference to @kn. This function is a no-op if @kn
 127 * is NULL.
128 *
129 * RETURNS:
130 * Pointer to @kn on success, NULL on failure.
131 */
132struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
133{
134 if (unlikely(!kn))
135 return NULL;
136
137 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL;
139
140 if (kn->flags & KERNFS_LOCKDEP)
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn;
143}
144
145/**
146 * kernfs_put_active - put an active reference to kernfs_node
147 * @kn: kernfs_node to put an active reference to
148 *
 149 * Put an active reference to @kn. This function is a no-op if @kn
150 * is NULL.
151 */
152void kernfs_put_active(struct kernfs_node *kn)
153{
154 int v;
155
156 if (unlikely(!kn))
157 return;
158
159 if (kn->flags & KERNFS_LOCKDEP)
160 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS))
163 return;
164
165 /*
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170}
171
172/**
173 * kernfs_deactivate - deactivate kernfs_node
174 * @kn: kernfs_node to deactivate
175 *
176 * Deny new active references and drain existing ones.
177 */
178static void kernfs_deactivate(struct kernfs_node *kn)
179{
180 DECLARE_COMPLETION_ONSTACK(wait);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187
188 kn->u.completion = (void *)&wait;
189
190 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
191 /* atomic_add_return() is a mb(), put_active() will always see
192 * the updated kn->u.completion.
193 */
194 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
195
196 if (v != KN_DEACTIVATED_BIAS) {
197 lock_contended(&kn->dep_map, _RET_IP_);
198 wait_for_completion(&wait);
199 }
200
201 lock_acquired(&kn->dep_map, _RET_IP_);
202 rwsem_release(&kn->dep_map, 1, _RET_IP_);
203}
204
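kernfs_get_active()/kernfs_put_active()/kernfs_deactivate() implement a biased refcount drain: once KN_DEACTIVATED_BIAS (a large negative bias) is added, atomic_inc_unless_negative() refuses new references, and the put that brings the counter back to exactly the bias value wakes the waiter. A compressed C11 sketch of the same idea, with a POSIX semaphore standing in for the kernel completion and all names illustrative:

	#include <limits.h>
	#include <semaphore.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	#define DEACTIVATED_BIAS INT_MIN

	struct node {
		atomic_int active;
		sem_t drained;		/* sem_init(&n->drained, 0, 0) at setup */
	};

	static bool node_get_active(struct node *n)
	{
		int v = atomic_load(&n->active);
		do {
			if (v < 0)	/* bias applied: refuse new refs */
				return false;
		} while (!atomic_compare_exchange_weak(&n->active, &v, v + 1));
		return true;
	}

	static void node_put_active(struct node *n)
	{
		/* the put that lands exactly on the bias is the last one out */
		if (atomic_fetch_sub(&n->active, 1) - 1 == DEACTIVATED_BIAS)
			sem_post(&n->drained);
	}

	static void node_deactivate(struct node *n)
	{
		/* apply the bias; wait unless no refs were outstanding */
		if (atomic_fetch_add(&n->active, DEACTIVATED_BIAS) != 0)
			sem_wait(&n->drained);
	}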
205/**
206 * kernfs_get - get a reference count on a kernfs_node
207 * @kn: the target kernfs_node
208 */
209void kernfs_get(struct kernfs_node *kn)
210{
211 if (kn) {
212 WARN_ON(!atomic_read(&kn->count));
213 atomic_inc(&kn->count);
214 }
215}
216EXPORT_SYMBOL_GPL(kernfs_get);
217
218/**
219 * kernfs_put - put a reference count on a kernfs_node
220 * @kn: the target kernfs_node
221 *
222 * Put a reference count of @kn and destroy it if it reached zero.
223 */
224void kernfs_put(struct kernfs_node *kn)
225{
226 struct kernfs_node *parent;
227 struct kernfs_root *root;
228
229 if (!kn || !atomic_dec_and_test(&kn->count))
230 return;
231 root = kernfs_root(kn);
232 repeat:
233 /* Moving/renaming is always done while holding reference.
234 * kn->parent won't change beneath us.
235 */
236 parent = kn->parent;
237
238 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
239 parent ? parent->name : "", kn->name);
240
241 if (kernfs_type(kn) == KERNFS_LINK)
242 kernfs_put(kn->symlink.target_kn);
243 if (!(kn->flags & KERNFS_STATIC_NAME))
244 kfree(kn->name);
245 if (kn->iattr) {
246 if (kn->iattr->ia_secdata)
247 security_release_secctx(kn->iattr->ia_secdata,
248 kn->iattr->ia_secdata_len);
249 simple_xattrs_free(&kn->iattr->xattrs);
250 }
251 kfree(kn->iattr);
252 ida_simple_remove(&root->ino_ida, kn->ino);
253 kmem_cache_free(kernfs_node_cache, kn);
254
255 kn = parent;
256 if (kn) {
257 if (atomic_dec_and_test(&kn->count))
258 goto repeat;
259 } else {
260 /* just released the root kn, free @root too */
261 ida_destroy(&root->ino_ida);
262 kfree(root);
263 }
264}
265EXPORT_SYMBOL_GPL(kernfs_put);
266
267static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
268{
269 struct kernfs_node *kn;
270
271 if (flags & LOOKUP_RCU)
272 return -ECHILD;
273
274 /* Always perform fresh lookup for negatives */
275 if (!dentry->d_inode)
276 goto out_bad_unlocked;
277
278 kn = dentry->d_fsdata;
279 mutex_lock(&kernfs_mutex);
280
281 /* The kernfs node has been deleted */
282 if (kn->flags & KERNFS_REMOVED)
283 goto out_bad;
284
285 /* The kernfs node has been moved? */
286 if (dentry->d_parent->d_fsdata != kn->parent)
287 goto out_bad;
288
289 /* The kernfs node has been renamed */
290 if (strcmp(dentry->d_name.name, kn->name) != 0)
291 goto out_bad;
292
293 /* The kernfs node has been moved to a different namespace */
294 if (kn->parent && kernfs_ns_enabled(kn->parent) &&
295 kernfs_info(dentry->d_sb)->ns != kn->ns)
296 goto out_bad;
297
298 mutex_unlock(&kernfs_mutex);
299out_valid:
300 return 1;
301out_bad:
302 mutex_unlock(&kernfs_mutex);
303out_bad_unlocked:
304 /*
305 * @dentry doesn't match the underlying kernfs node, drop the
306 * dentry and force lookup. If we have submounts we must allow the
307 * vfs caches to lie about the state of the filesystem to prevent
308 * leaks and other nasty things, so use check_submounts_and_drop()
309 * instead of d_drop().
310 */
311 if (check_submounts_and_drop(dentry) != 0)
312 goto out_valid;
313
314 return 0;
315}
316
317static void kernfs_dop_release(struct dentry *dentry)
318{
319 kernfs_put(dentry->d_fsdata);
320}
321
322const struct dentry_operations kernfs_dops = {
323 .d_revalidate = kernfs_dop_revalidate,
324 .d_release = kernfs_dop_release,
325};
326
327static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
328 const char *name, umode_t mode,
329 unsigned flags)
330{
331 char *dup_name = NULL;
332 struct kernfs_node *kn;
333 int ret;
334
335 if (!(flags & KERNFS_STATIC_NAME)) {
336 name = dup_name = kstrdup(name, GFP_KERNEL);
337 if (!name)
338 return NULL;
339 }
340
341 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
342 if (!kn)
343 goto err_out1;
344
345 ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
346 if (ret < 0)
347 goto err_out2;
348 kn->ino = ret;
349
350 atomic_set(&kn->count, 1);
351 atomic_set(&kn->active, 0);
352
353 kn->name = name;
354 kn->mode = mode;
355 kn->flags = flags | KERNFS_REMOVED;
356
357 return kn;
358
359 err_out2:
360 kmem_cache_free(kernfs_node_cache, kn);
361 err_out1:
362 kfree(dup_name);
363 return NULL;
364}
365
366struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
367 const char *name, umode_t mode,
368 unsigned flags)
369{
370 struct kernfs_node *kn;
371
372 kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
373 if (kn) {
374 kernfs_get(parent);
375 kn->parent = parent;
376 }
377 return kn;
378}
379
380/**
381 * kernfs_addrm_start - prepare for kernfs_node add/remove
382 * @acxt: pointer to kernfs_addrm_cxt to be used
383 *
384 * This function is called when the caller is about to add or remove
385 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
386 * to keep and pass context to other addrm functions.
387 *
388 * LOCKING:
389 * Kernel thread context (may sleep). kernfs_mutex is locked on
390 * return.
391 */
392void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
393 __acquires(kernfs_mutex)
394{
395 memset(acxt, 0, sizeof(*acxt));
396
397 mutex_lock(&kernfs_mutex);
398}
399
400/**
401 * kernfs_add_one - add kernfs_node to parent without warning
402 * @acxt: addrm context to use
403 * @kn: kernfs_node to be added
404 *
405 * The caller must already have initialized @kn->parent. This
406 * function increments nlink of the parent's inode if @kn is a
 407 * directory and links @kn into the children list of the parent.
408 *
409 * This function should be called between calls to
410 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
411 * the same @acxt as passed to kernfs_addrm_start().
412 *
413 * LOCKING:
414 * Determined by kernfs_addrm_start().
415 *
416 * RETURNS:
417 * 0 on success, -EEXIST if entry with the given name already
418 * exists.
419 */
420int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
421{
422 struct kernfs_node *parent = kn->parent;
423 bool has_ns = kernfs_ns_enabled(parent);
424 struct kernfs_iattrs *ps_iattr;
425 int ret;
426
427 if (has_ns != (bool)kn->ns) {
428 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
429 has_ns ? "required" : "invalid", parent->name, kn->name);
430 return -EINVAL;
431 }
432
433 if (kernfs_type(parent) != KERNFS_DIR)
434 return -EINVAL;
435
436 if (parent->flags & KERNFS_REMOVED)
437 return -ENOENT;
438
439 kn->hash = kernfs_name_hash(kn->name, kn->ns);
440
441 ret = kernfs_link_sibling(kn);
442 if (ret)
443 return ret;
444
445 /* Update timestamps on the parent */
446 ps_iattr = parent->iattr;
447 if (ps_iattr) {
448 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
449 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
450 }
451
452 /* Mark the entry added into directory tree */
453 kn->flags &= ~KERNFS_REMOVED;
454
455 return 0;
456}
457
458/**
459 * kernfs_remove_one - remove kernfs_node from parent
460 * @acxt: addrm context to use
461 * @kn: kernfs_node to be removed
462 *
463 * Mark @kn removed and drop nlink of parent inode if @kn is a
464 * directory. @kn is unlinked from the children list.
465 *
466 * This function should be called between calls to
467 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
468 * passed the same @acxt as passed to kernfs_addrm_start().
469 *
470 * LOCKING:
471 * Determined by kernfs_addrm_start().
472 */
473static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
474 struct kernfs_node *kn)
475{
476 struct kernfs_iattrs *ps_iattr;
477
478 /*
479 * Removal can be called multiple times on the same node. Only the
480 * first invocation is effective and puts the base ref.
481 */
482 if (kn->flags & KERNFS_REMOVED)
483 return;
484
485 if (kn->parent) {
486 kernfs_unlink_sibling(kn);
487
488 /* Update timestamps on the parent */
489 ps_iattr = kn->parent->iattr;
490 if (ps_iattr) {
491 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
492 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
493 }
494 }
495
496 kn->flags |= KERNFS_REMOVED;
497 kn->u.removed_list = acxt->removed;
498 acxt->removed = kn;
499}
500
501/**
502 * kernfs_addrm_finish - finish up kernfs_node add/remove
503 * @acxt: addrm context to finish up
504 *
505 * Finish up kernfs_node add/remove. Resources acquired by
506 * kernfs_addrm_start() are released and removed kernfs_nodes are
507 * cleaned up.
508 *
509 * LOCKING:
510 * kernfs_mutex is released.
511 */
512void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
513 __releases(kernfs_mutex)
514{
515 /* release resources acquired by kernfs_addrm_start() */
516 mutex_unlock(&kernfs_mutex);
517
518 /* kill removed kernfs_nodes */
519 while (acxt->removed) {
520 struct kernfs_node *kn = acxt->removed;
521
522 acxt->removed = kn->u.removed_list;
523
524 kernfs_deactivate(kn);
525 kernfs_unmap_bin_file(kn);
526 kernfs_put(kn);
527 }
528}
529
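The protocol these helpers define is: kernfs_addrm_start() takes kernfs_mutex, any number of kernfs_add_one()/kernfs_remove_one() calls run against the same context, and kernfs_addrm_finish() drops the mutex before deactivating, unmapping and releasing everything queued on acxt->removed. In outline, exactly as kernfs_create_dir_ns() below uses it:

	struct kernfs_addrm_cxt acxt;
	int rc;

	kernfs_addrm_start(&acxt);	/* grabs kernfs_mutex */
	rc = kernfs_add_one(&acxt, kn);	/* may be repeated for a batch */
	kernfs_addrm_finish(&acxt);	/* drops mutex, reaps removals */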
530/**
531 * kernfs_find_ns - find kernfs_node with the given name
532 * @parent: kernfs_node to search under
533 * @name: name to look for
534 * @ns: the namespace tag to use
535 *
536 * Look for kernfs_node with name @name under @parent. Returns pointer to
537 * the found kernfs_node on success, %NULL on failure.
538 */
539static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
540 const unsigned char *name,
541 const void *ns)
542{
543 struct rb_node *node = parent->dir.children.rb_node;
544 bool has_ns = kernfs_ns_enabled(parent);
545 unsigned int hash;
546
547 lockdep_assert_held(&kernfs_mutex);
548
549 if (has_ns != (bool)ns) {
550 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
551 has_ns ? "required" : "invalid", parent->name, name);
552 return NULL;
553 }
554
555 hash = kernfs_name_hash(name, ns);
556 while (node) {
557 struct kernfs_node *kn;
558 int result;
559
560 kn = rb_to_kn(node);
561 result = kernfs_name_compare(hash, name, ns, kn);
562 if (result < 0)
563 node = node->rb_left;
564 else if (result > 0)
565 node = node->rb_right;
566 else
567 return kn;
568 }
569 return NULL;
570}
571
572/**
573 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
574 * @parent: kernfs_node to search under
575 * @name: name to look for
576 * @ns: the namespace tag to use
577 *
578 * Look for kernfs_node with name @name under @parent and get a reference
579 * if found. This function may sleep and returns pointer to the found
580 * kernfs_node on success, %NULL on failure.
581 */
582struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
583 const char *name, const void *ns)
584{
585 struct kernfs_node *kn;
586
587 mutex_lock(&kernfs_mutex);
588 kn = kernfs_find_ns(parent, name, ns);
589 kernfs_get(kn);
590 mutex_unlock(&kernfs_mutex);
591
592 return kn;
593}
594EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
595
596/**
597 * kernfs_create_root - create a new kernfs hierarchy
598 * @kdops: optional directory syscall operations for the hierarchy
599 * @priv: opaque data associated with the new directory
600 *
601 * Returns the root of the new hierarchy on success, ERR_PTR() value on
602 * failure.
603 */
604struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
605{
606 struct kernfs_root *root;
607 struct kernfs_node *kn;
608
609 root = kzalloc(sizeof(*root), GFP_KERNEL);
610 if (!root)
611 return ERR_PTR(-ENOMEM);
612
613 ida_init(&root->ino_ida);
614
615 kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
616 KERNFS_DIR);
617 if (!kn) {
618 ida_destroy(&root->ino_ida);
619 kfree(root);
620 return ERR_PTR(-ENOMEM);
621 }
622
623 kn->flags &= ~KERNFS_REMOVED;
624 kn->priv = priv;
625 kn->dir.root = root;
626
627 root->dir_ops = kdops;
628 root->kn = kn;
629
630 return root;
631}
632
633/**
634 * kernfs_destroy_root - destroy a kernfs hierarchy
635 * @root: root of the hierarchy to destroy
636 *
637 * Destroy the hierarchy anchored at @root by removing all existing
638 * directories and destroying @root.
639 */
640void kernfs_destroy_root(struct kernfs_root *root)
641{
642 kernfs_remove(root->kn); /* will also free @root */
643}
644
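Putting the creation entry points together: kernfs_create_root() above composes with kernfs_create_dir_ns() below as in this hypothetical client sketch (the "example" directory name and the NULL kdops/priv/ns arguments are assumptions for illustration; with no dir_ops, mkdir/rmdir/rename from userspace return -EPERM):

	static int example_setup(void)
	{
		struct kernfs_root *root;
		struct kernfs_node *dir;

		root = kernfs_create_root(NULL, NULL);
		if (IS_ERR(root))
			return PTR_ERR(root);

		dir = kernfs_create_dir_ns(root->kn, "example", 0755, NULL, NULL);
		if (IS_ERR(dir)) {
			kernfs_destroy_root(root);
			return PTR_ERR(dir);
		}
		return 0;
	}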
645/**
646 * kernfs_create_dir_ns - create a directory
647 * @parent: parent in which to create a new directory
648 * @name: name of the new directory
649 * @mode: mode of the new directory
650 * @priv: opaque data associated with the new directory
651 * @ns: optional namespace tag of the directory
652 *
653 * Returns the created node on success, ERR_PTR() value on failure.
654 */
655struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
656 const char *name, umode_t mode,
657 void *priv, const void *ns)
658{
659 struct kernfs_addrm_cxt acxt;
660 struct kernfs_node *kn;
661 int rc;
662
663 /* allocate */
664 kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
665 if (!kn)
666 return ERR_PTR(-ENOMEM);
667
668 kn->dir.root = parent->dir.root;
669 kn->ns = ns;
670 kn->priv = priv;
671
672 /* link in */
673 kernfs_addrm_start(&acxt);
674 rc = kernfs_add_one(&acxt, kn);
675 kernfs_addrm_finish(&acxt);
676
677 if (!rc)
678 return kn;
679
680 kernfs_put(kn);
681 return ERR_PTR(rc);
682}
683
684static struct dentry *kernfs_iop_lookup(struct inode *dir,
685 struct dentry *dentry,
686 unsigned int flags)
687{
688 struct dentry *ret;
689 struct kernfs_node *parent = dentry->d_parent->d_fsdata;
690 struct kernfs_node *kn;
691 struct inode *inode;
692 const void *ns = NULL;
693
694 mutex_lock(&kernfs_mutex);
695
696 if (kernfs_ns_enabled(parent))
697 ns = kernfs_info(dir->i_sb)->ns;
698
699 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
700
701 /* no such entry */
702 if (!kn) {
703 ret = NULL;
704 goto out_unlock;
705 }
706 kernfs_get(kn);
707 dentry->d_fsdata = kn;
708
709 /* attach dentry and inode */
710 inode = kernfs_get_inode(dir->i_sb, kn);
711 if (!inode) {
712 ret = ERR_PTR(-ENOMEM);
713 goto out_unlock;
714 }
715
716 /* instantiate and hash dentry */
717 ret = d_materialise_unique(dentry, inode);
718 out_unlock:
719 mutex_unlock(&kernfs_mutex);
720 return ret;
721}
722
723static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
724 umode_t mode)
725{
726 struct kernfs_node *parent = dir->i_private;
727 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops;
728
729 if (!kdops || !kdops->mkdir)
730 return -EPERM;
731
732 return kdops->mkdir(parent, dentry->d_name.name, mode);
733}
734
735static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
736{
737 struct kernfs_node *kn = dentry->d_fsdata;
738 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
739
740 if (!kdops || !kdops->rmdir)
741 return -EPERM;
742
743 return kdops->rmdir(kn);
744}
745
746static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
747 struct inode *new_dir, struct dentry *new_dentry)
748{
749 struct kernfs_node *kn = old_dentry->d_fsdata;
750 struct kernfs_node *new_parent = new_dir->i_private;
751 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
752
753 if (!kdops || !kdops->rename)
754 return -EPERM;
755
756 return kdops->rename(kn, new_parent, new_dentry->d_name.name);
757}
758
759const struct inode_operations kernfs_dir_iops = {
760 .lookup = kernfs_iop_lookup,
761 .permission = kernfs_iop_permission,
762 .setattr = kernfs_iop_setattr,
763 .getattr = kernfs_iop_getattr,
764 .setxattr = kernfs_iop_setxattr,
765 .removexattr = kernfs_iop_removexattr,
766 .getxattr = kernfs_iop_getxattr,
767 .listxattr = kernfs_iop_listxattr,
768
769 .mkdir = kernfs_iop_mkdir,
770 .rmdir = kernfs_iop_rmdir,
771 .rename = kernfs_iop_rename,
772};
773
774static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
775{
776 struct kernfs_node *last;
777
778 while (true) {
779 struct rb_node *rbn;
780
781 last = pos;
782
783 if (kernfs_type(pos) != KERNFS_DIR)
784 break;
785
786 rbn = rb_first(&pos->dir.children);
787 if (!rbn)
788 break;
789
790 pos = rb_to_kn(rbn);
791 }
792
793 return last;
794}
795
796/**
797 * kernfs_next_descendant_post - find the next descendant for post-order walk
798 * @pos: the current position (%NULL to initiate traversal)
799 * @root: kernfs_node whose descendants to walk
800 *
801 * Find the next descendant to visit for post-order traversal of @root's
 802 * descendants. @root is included in the iteration and is the last node
 803 * to be visited.
804 */
805static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
806 struct kernfs_node *root)
807{
808 struct rb_node *rbn;
809
810 lockdep_assert_held(&kernfs_mutex);
811
812 /* if first iteration, visit leftmost descendant which may be root */
813 if (!pos)
814 return kernfs_leftmost_descendant(root);
815
816 /* if we visited @root, we're done */
817 if (pos == root)
818 return NULL;
819
820 /* if there's an unvisited sibling, visit its leftmost descendant */
821 rbn = rb_next(&pos->rb);
822 if (rbn)
823 return kernfs_leftmost_descendant(rb_to_kn(rbn));
824
825 /* no sibling left, visit parent */
826 return pos->parent;
827}
828
829static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
830 struct kernfs_node *kn)
831{
832 struct kernfs_node *pos, *next;
833
834 if (!kn)
835 return;
836
837 pr_debug("kernfs %s: removing\n", kn->name);
838
839 next = NULL;
840 do {
841 pos = next;
842 next = kernfs_next_descendant_post(pos, kn);
843 if (pos)
844 kernfs_remove_one(acxt, pos);
845 } while (next);
846}
847
848/**
849 * kernfs_remove - remove a kernfs_node recursively
850 * @kn: the kernfs_node to remove
851 *
852 * Remove @kn along with all its subdirectories and files.
853 */
854void kernfs_remove(struct kernfs_node *kn)
855{
856 struct kernfs_addrm_cxt acxt;
857
858 kernfs_addrm_start(&acxt);
859 __kernfs_remove(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861}
862
863/**
864 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
865 * @parent: parent of the target
866 * @name: name of the kernfs_node to remove
867 * @ns: namespace tag of the kernfs_node to remove
868 *
869 * Look for the kernfs_node with @name and @ns under @parent and remove it.
870 * Returns 0 on success, -ENOENT if such entry doesn't exist.
871 */
872int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
873 const void *ns)
874{
875 struct kernfs_addrm_cxt acxt;
876 struct kernfs_node *kn;
877
878 if (!parent) {
879 WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
880 name);
881 return -ENOENT;
882 }
883
884 kernfs_addrm_start(&acxt);
885
886 kn = kernfs_find_ns(parent, name, ns);
887 if (kn)
888 __kernfs_remove(&acxt, kn);
889
890 kernfs_addrm_finish(&acxt);
891
892 if (kn)
893 return 0;
894 else
895 return -ENOENT;
896}
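/*
 * Editorial sketch (not part of the patch): typical callers, with
 * "parent" and "kn" as hypothetical nodes owned by a kernfs user:
 *
 *	kernfs_remove_by_name_ns(parent, "my_attr", NULL);  - one entry
 *	kernfs_remove(kn);                                  - whole subtree
 */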
897
898/**
899 * kernfs_rename_ns - move and rename a kernfs_node
900 * @kn: target node
 901 * @new_parent: new parent to put @kn under
902 * @new_name: new name
903 * @new_ns: new namespace tag
904 */
905int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
906 const char *new_name, const void *new_ns)
907{
908 int error;
909
910 mutex_lock(&kernfs_mutex);
911
912 error = -ENOENT;
913 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED)
914 goto out;
915
916 error = 0;
917 if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
918 (strcmp(kn->name, new_name) == 0))
919 goto out; /* nothing to rename */
920
921 error = -EEXIST;
922 if (kernfs_find_ns(new_parent, new_name, new_ns))
923 goto out;
924
925 /* rename kernfs_node */
926 if (strcmp(kn->name, new_name) != 0) {
927 error = -ENOMEM;
928 new_name = kstrdup(new_name, GFP_KERNEL);
929 if (!new_name)
930 goto out;
931
932 if (kn->flags & KERNFS_STATIC_NAME)
933 kn->flags &= ~KERNFS_STATIC_NAME;
934 else
935 kfree(kn->name);
936
937 kn->name = new_name;
938 }
939
940 /*
941 * Move to the appropriate place in the appropriate directories rbtree.
942 */
943 kernfs_unlink_sibling(kn);
944 kernfs_get(new_parent);
945 kernfs_put(kn->parent);
946 kn->ns = new_ns;
947 kn->hash = kernfs_name_hash(kn->name, kn->ns);
948 kn->parent = new_parent;
949 kernfs_link_sibling(kn);
950
951 error = 0;
952 out:
953 mutex_unlock(&kernfs_mutex);
954 return error;
955}
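/*
 * Editorial sketch (not part of the patch): moving a hypothetical
 * node "kn" under "new_parent" while keeping its name and tag:
 *
 *	error = kernfs_rename_ns(kn, new_parent, kn->name, kn->ns);
 */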
956
 957/* Relationship between kn->mode and the DT_xxx types */
958static inline unsigned char dt_type(struct kernfs_node *kn)
959{
960 return (kn->mode >> 12) & 15;
961}
962
963static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
964{
965 kernfs_put(filp->private_data);
966 return 0;
967}
968
969static struct kernfs_node *kernfs_dir_pos(const void *ns,
970 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
971{
972 if (pos) {
973 int valid = !(pos->flags & KERNFS_REMOVED) &&
974 pos->parent == parent && hash == pos->hash;
975 kernfs_put(pos);
976 if (!valid)
977 pos = NULL;
978 }
979 if (!pos && (hash > 1) && (hash < INT_MAX)) {
980 struct rb_node *node = parent->dir.children.rb_node;
981 while (node) {
982 pos = rb_to_kn(node);
983
984 if (hash < pos->hash)
985 node = node->rb_left;
986 else if (hash > pos->hash)
987 node = node->rb_right;
988 else
989 break;
990 }
991 }
992 /* Skip over entries in the wrong namespace */
993 while (pos && pos->ns != ns) {
994 struct rb_node *node = rb_next(&pos->rb);
995 if (!node)
996 pos = NULL;
997 else
998 pos = rb_to_kn(node);
999 }
1000 return pos;
1001}
1002
1003static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1004 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1005{
1006 pos = kernfs_dir_pos(ns, parent, ino, pos);
1007 if (pos)
1008 do {
1009 struct rb_node *node = rb_next(&pos->rb);
1010 if (!node)
1011 pos = NULL;
1012 else
1013 pos = rb_to_kn(node);
1014 } while (pos && pos->ns != ns);
1015 return pos;
1016}
1017
1018static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1019{
1020 struct dentry *dentry = file->f_path.dentry;
1021 struct kernfs_node *parent = dentry->d_fsdata;
1022 struct kernfs_node *pos = file->private_data;
1023 const void *ns = NULL;
1024
1025 if (!dir_emit_dots(file, ctx))
1026 return 0;
1027 mutex_lock(&kernfs_mutex);
1028
1029 if (kernfs_ns_enabled(parent))
1030 ns = kernfs_info(dentry->d_sb)->ns;
1031
1032 for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1033 pos;
1034 pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1035 const char *name = pos->name;
1036 unsigned int type = dt_type(pos);
1037 int len = strlen(name);
1038 ino_t ino = pos->ino;
1039
1040 ctx->pos = pos->hash;
1041 file->private_data = pos;
1042 kernfs_get(pos);
1043
1044 mutex_unlock(&kernfs_mutex);
1045 if (!dir_emit(ctx, name, len, ino, type))
1046 return 0;
1047 mutex_lock(&kernfs_mutex);
1048 }
1049 mutex_unlock(&kernfs_mutex);
1050 file->private_data = NULL;
1051 ctx->pos = INT_MAX;
1052 return 0;
1053}
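/*
 * Editorial note (not part of the patch): ctx->pos holds the name
 * hash of the last emitted node rather than an index, so a reader
 * that stops and resumes can be repositioned by the rbtree search in
 * kernfs_dir_pos() even if entries were added or removed in between.
 * kernfs_mutex is dropped around dir_emit() because copying the entry
 * to userspace may fault and sleep.
 */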
1054
1055static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
1056 int whence)
1057{
1058 struct inode *inode = file_inode(file);
1059 loff_t ret;
1060
1061 mutex_lock(&inode->i_mutex);
1062 ret = generic_file_llseek(file, offset, whence);
1063 mutex_unlock(&inode->i_mutex);
1064
1065 return ret;
1066}
1067
1068const struct file_operations kernfs_dir_fops = {
1069 .read = generic_read_dir,
1070 .iterate = kernfs_fop_readdir,
1071 .release = kernfs_dir_fop_release,
1072 .llseek = kernfs_dir_fop_llseek,
1073};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..dbf397bfdff2
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,867 @@
1/*
2 * fs/kernfs/file.c - kernfs file implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/poll.h>
15#include <linux/pagemap.h>
16#include <linux/sched.h>
17
18#include "kernfs-internal.h"
19
20/*
21 * There's one kernfs_open_file for each open file and one kernfs_open_node
22 * for each kernfs_node with one or more open files.
23 *
24 * kernfs_node->attr.open points to kernfs_open_node. attr.open is
25 * protected by kernfs_open_node_lock.
26 *
27 * filp->private_data points to seq_file whose ->private points to
28 * kernfs_open_file. kernfs_open_files are chained at
29 * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
30 */
31static DEFINE_SPINLOCK(kernfs_open_node_lock);
32static DEFINE_MUTEX(kernfs_open_file_mutex);
33
34struct kernfs_open_node {
35 atomic_t refcnt;
36 atomic_t event;
37 wait_queue_head_t poll;
38 struct list_head files; /* goes through kernfs_open_file.list */
39};
40
41static struct kernfs_open_file *kernfs_of(struct file *file)
42{
43 return ((struct seq_file *)file->private_data)->private;
44}
45
46/*
47 * Determine the kernfs_ops for the given kernfs_node. This function must
48 * be called while holding an active reference.
49 */
50static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
51{
52 if (kn->flags & KERNFS_LOCKDEP)
53 lockdep_assert_held(kn);
54 return kn->attr.ops;
55}
56
57/*
58 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
59 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
60 * a seq_file iteration which is fully initialized with an active reference
61 * or an aborted kernfs_seq_start() due to get_active failure. The
62 * position pointer is the only context for each seq_file iteration and
63 * thus the stop condition should be encoded in it. As the return value is
64 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
65 * choice to indicate get_active failure.
66 *
67 * Unfortunately, this is complicated due to the optional custom seq_file
68 * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
69 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
70 * custom seq_file operations and thus can't decide whether put_active
71 * should be performed or not only on ERR_PTR(-ENODEV).
72 *
73 * This is worked around by factoring out the custom seq_stop() and
74 * put_active part into kernfs_seq_stop_active(), skipping it from
75 * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
76 * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
77 * that kernfs_seq_stop_active() is skipped only after get_active failure.
78 */
79static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
80{
81 struct kernfs_open_file *of = sf->private;
82 const struct kernfs_ops *ops = kernfs_ops(of->kn);
83
84 if (ops->seq_stop)
85 ops->seq_stop(sf, v);
86 kernfs_put_active(of->kn);
87}
88
89static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
90{
91 struct kernfs_open_file *of = sf->private;
92 const struct kernfs_ops *ops;
93
94 /*
95 * @of->mutex nests outside active ref and is just to ensure that
96 * the ops aren't called concurrently for the same open file.
97 */
98 mutex_lock(&of->mutex);
99 if (!kernfs_get_active(of->kn))
100 return ERR_PTR(-ENODEV);
101
102 ops = kernfs_ops(of->kn);
103 if (ops->seq_start) {
104 void *next = ops->seq_start(sf, ppos);
105 /* see the comment above kernfs_seq_stop_active() */
106 if (next == ERR_PTR(-ENODEV))
107 kernfs_seq_stop_active(sf, next);
108 return next;
109 } else {
110 /*
111 * The same behavior and code as single_open(). Returns
112 * !NULL if pos is at the beginning; otherwise, NULL.
113 */
114 return NULL + !*ppos;
115 }
116}
117
118static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
119{
120 struct kernfs_open_file *of = sf->private;
121 const struct kernfs_ops *ops = kernfs_ops(of->kn);
122
123 if (ops->seq_next) {
124 void *next = ops->seq_next(sf, v, ppos);
125 /* see the comment above kernfs_seq_stop_active() */
126 if (next == ERR_PTR(-ENODEV))
127 kernfs_seq_stop_active(sf, next);
128 return next;
129 } else {
130 /*
131 * The same behavior and code as single_open(), always
132 * terminate after the initial read.
133 */
134 ++*ppos;
135 return NULL;
136 }
137}
138
139static void kernfs_seq_stop(struct seq_file *sf, void *v)
140{
141 struct kernfs_open_file *of = sf->private;
142
143 if (v != ERR_PTR(-ENODEV))
144 kernfs_seq_stop_active(sf, v);
145 mutex_unlock(&of->mutex);
146}
147
148static int kernfs_seq_show(struct seq_file *sf, void *v)
149{
150 struct kernfs_open_file *of = sf->private;
151
152 of->event = atomic_read(&of->kn->attr.open->event);
153
154 return of->kn->attr.ops->seq_show(sf, v);
155}
156
157static const struct seq_operations kernfs_seq_ops = {
158 .start = kernfs_seq_start,
159 .next = kernfs_seq_next,
160 .stop = kernfs_seq_stop,
161 .show = kernfs_seq_show,
162};
163
164/*
165 * As reading a bin file can have side-effects, the exact offset and bytes
166 * specified in read(2) call should be passed to the read callback making
167 * it difficult to use seq_file. Implement simplistic custom buffering for
168 * bin files.
169 */
170static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
171 char __user *user_buf, size_t count,
172 loff_t *ppos)
173{
174 ssize_t len = min_t(size_t, count, PAGE_SIZE);
175 const struct kernfs_ops *ops;
176 char *buf;
177
178 buf = kmalloc(len, GFP_KERNEL);
179 if (!buf)
180 return -ENOMEM;
181
182 /*
183 * @of->mutex nests outside active ref and is just to ensure that
184 * the ops aren't called concurrently for the same open file.
185 */
186 mutex_lock(&of->mutex);
187 if (!kernfs_get_active(of->kn)) {
188 len = -ENODEV;
189 mutex_unlock(&of->mutex);
190 goto out_free;
191 }
192
193 ops = kernfs_ops(of->kn);
194 if (ops->read)
195 len = ops->read(of, buf, len, *ppos);
196 else
197 len = -EINVAL;
198
199 kernfs_put_active(of->kn);
200 mutex_unlock(&of->mutex);
201
202 if (len < 0)
203 goto out_free;
204
205 if (copy_to_user(user_buf, buf, len)) {
206 len = -EFAULT;
207 goto out_free;
208 }
209
210 *ppos += len;
211
212 out_free:
213 kfree(buf);
214 return len;
215}
216
217/**
218 * kernfs_fop_read - kernfs vfs read callback
219 * @file: file pointer
 220 * @user_buf: buffer to copy the read data into
221 * @count: number of bytes
222 * @ppos: starting offset
223 */
224static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
225 size_t count, loff_t *ppos)
226{
227 struct kernfs_open_file *of = kernfs_of(file);
228
229 if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
230 return seq_read(file, user_buf, count, ppos);
231 else
232 return kernfs_file_direct_read(of, user_buf, count, ppos);
233}
234
235/**
236 * kernfs_fop_write - kernfs vfs write callback
237 * @file: file pointer
238 * @user_buf: data to write
239 * @count: number of bytes
240 * @ppos: starting offset
241 *
242 * Copy data in from userland and pass it to the matching kernfs write
243 * operation.
244 *
 245 * There is no easy way for us to know if userspace is only doing partial
 246 * writes, so we don't support them. We expect the entire buffer to come
 247 * on the first write. Hint: if you're writing a value, first read the
 248 * file, modify only the value you're changing, then write the entire
 249 * buffer back.
250 */
251static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos)
253{
254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops;
257 char *buf;
258
259 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf)
261 return -ENOMEM;
262
263 if (copy_from_user(buf, user_buf, len)) {
264 len = -EFAULT;
265 goto out_free;
266 }
267 buf[len] = '\0'; /* guarantee string termination */
268
269 /*
270 * @of->mutex nests outside active ref and is just to ensure that
271 * the ops aren't called concurrently for the same open file.
272 */
273 mutex_lock(&of->mutex);
274 if (!kernfs_get_active(of->kn)) {
275 mutex_unlock(&of->mutex);
276 len = -ENODEV;
277 goto out_free;
278 }
279
280 ops = kernfs_ops(of->kn);
281 if (ops->write)
282 len = ops->write(of, buf, len, *ppos);
283 else
284 len = -EINVAL;
285
286 kernfs_put_active(of->kn);
287 mutex_unlock(&of->mutex);
288
289 if (len > 0)
290 *ppos += len;
291out_free:
292 kfree(buf);
293 return len;
294}
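/*
 * Editorial sketch (not part of the patch): the read-modify-write
 * pattern the comment above recommends, as seen from userspace.
 * Hypothetical and without error handling.
 */
#if 0	/* illustrative userspace code, not kernel code */
	char buf[4096];
	int fd = open("/sys/.../my_attr", O_RDWR);
	ssize_t len = read(fd, buf, sizeof(buf) - 1);

	buf[len] = '\0';
	/* ... edit only the value being changed ... */
	lseek(fd, 0, SEEK_SET);
	write(fd, buf, strlen(buf));	/* one write, whole buffer */
	close(fd);
#endif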
295
296static void kernfs_vma_open(struct vm_area_struct *vma)
297{
298 struct file *file = vma->vm_file;
299 struct kernfs_open_file *of = kernfs_of(file);
300
301 if (!of->vm_ops)
302 return;
303
304 if (!kernfs_get_active(of->kn))
305 return;
306
307 if (of->vm_ops->open)
308 of->vm_ops->open(vma);
309
310 kernfs_put_active(of->kn);
311}
312
313static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
314{
315 struct file *file = vma->vm_file;
316 struct kernfs_open_file *of = kernfs_of(file);
317 int ret;
318
319 if (!of->vm_ops)
320 return VM_FAULT_SIGBUS;
321
322 if (!kernfs_get_active(of->kn))
323 return VM_FAULT_SIGBUS;
324
325 ret = VM_FAULT_SIGBUS;
326 if (of->vm_ops->fault)
327 ret = of->vm_ops->fault(vma, vmf);
328
329 kernfs_put_active(of->kn);
330 return ret;
331}
332
333static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
334 struct vm_fault *vmf)
335{
336 struct file *file = vma->vm_file;
337 struct kernfs_open_file *of = kernfs_of(file);
338 int ret;
339
340 if (!of->vm_ops)
341 return VM_FAULT_SIGBUS;
342
343 if (!kernfs_get_active(of->kn))
344 return VM_FAULT_SIGBUS;
345
346 ret = 0;
347 if (of->vm_ops->page_mkwrite)
348 ret = of->vm_ops->page_mkwrite(vma, vmf);
349 else
350 file_update_time(file);
351
352 kernfs_put_active(of->kn);
353 return ret;
354}
355
356static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
357 void *buf, int len, int write)
358{
359 struct file *file = vma->vm_file;
360 struct kernfs_open_file *of = kernfs_of(file);
361 int ret;
362
363 if (!of->vm_ops)
364 return -EINVAL;
365
366 if (!kernfs_get_active(of->kn))
367 return -EINVAL;
368
369 ret = -EINVAL;
370 if (of->vm_ops->access)
371 ret = of->vm_ops->access(vma, addr, buf, len, write);
372
373 kernfs_put_active(of->kn);
374 return ret;
375}
376
377#ifdef CONFIG_NUMA
378static int kernfs_vma_set_policy(struct vm_area_struct *vma,
379 struct mempolicy *new)
380{
381 struct file *file = vma->vm_file;
382 struct kernfs_open_file *of = kernfs_of(file);
383 int ret;
384
385 if (!of->vm_ops)
386 return 0;
387
388 if (!kernfs_get_active(of->kn))
389 return -EINVAL;
390
391 ret = 0;
392 if (of->vm_ops->set_policy)
393 ret = of->vm_ops->set_policy(vma, new);
394
395 kernfs_put_active(of->kn);
396 return ret;
397}
398
399static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
400 unsigned long addr)
401{
402 struct file *file = vma->vm_file;
403 struct kernfs_open_file *of = kernfs_of(file);
404 struct mempolicy *pol;
405
406 if (!of->vm_ops)
407 return vma->vm_policy;
408
409 if (!kernfs_get_active(of->kn))
410 return vma->vm_policy;
411
412 pol = vma->vm_policy;
413 if (of->vm_ops->get_policy)
414 pol = of->vm_ops->get_policy(vma, addr);
415
416 kernfs_put_active(of->kn);
417 return pol;
418}
419
420static int kernfs_vma_migrate(struct vm_area_struct *vma,
421 const nodemask_t *from, const nodemask_t *to,
422 unsigned long flags)
423{
424 struct file *file = vma->vm_file;
425 struct kernfs_open_file *of = kernfs_of(file);
426 int ret;
427
428 if (!of->vm_ops)
429 return 0;
430
431 if (!kernfs_get_active(of->kn))
432 return 0;
433
434 ret = 0;
435 if (of->vm_ops->migrate)
436 ret = of->vm_ops->migrate(vma, from, to, flags);
437
438 kernfs_put_active(of->kn);
439 return ret;
440}
441#endif
442
443static const struct vm_operations_struct kernfs_vm_ops = {
444 .open = kernfs_vma_open,
445 .fault = kernfs_vma_fault,
446 .page_mkwrite = kernfs_vma_page_mkwrite,
447 .access = kernfs_vma_access,
448#ifdef CONFIG_NUMA
449 .set_policy = kernfs_vma_set_policy,
450 .get_policy = kernfs_vma_get_policy,
451 .migrate = kernfs_vma_migrate,
452#endif
453};
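/*
 * Editorial note (not part of the patch): every wrapper above follows
 * the same shape - bail out unless the file was mmapped through a
 * kernfs op, pin the node with kernfs_get_active(), forward to the
 * implementation's vm_operations_struct saved in of->vm_ops, then
 * unpin. This keeps established user mappings from calling into a
 * node that has already been deactivated for removal.
 */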
454
455static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
456{
457 struct kernfs_open_file *of = kernfs_of(file);
458 const struct kernfs_ops *ops;
459 int rc;
460
461 /*
462 * mmap path and of->mutex are prone to triggering spurious lockdep
463 * warnings and we don't want to add spurious locking dependency
464 * between the two. Check whether mmap is actually implemented
465 * without grabbing @of->mutex by testing HAS_MMAP flag. See the
466 * comment in kernfs_file_open() for more details.
467 */
468 if (!(of->kn->flags & KERNFS_HAS_MMAP))
469 return -ENODEV;
470
471 mutex_lock(&of->mutex);
472
473 rc = -ENODEV;
474 if (!kernfs_get_active(of->kn))
475 goto out_unlock;
476
477 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma);
479
480 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
482 * to satisfy versions of X which crash if the mmap fails: that
483 * substitutes a new vm_file, and we don't then want bin_vm_ops.
484 */
485 if (vma->vm_file != file)
486 goto out_put;
487
488 rc = -EINVAL;
489 if (of->mmapped && of->vm_ops != vma->vm_ops)
490 goto out_put;
491
492 /*
493 * It is not possible to successfully wrap close.
494 * So error if someone is trying to use close.
495 */
496 rc = -EINVAL;
497 if (vma->vm_ops && vma->vm_ops->close)
498 goto out_put;
499
500 rc = 0;
501 of->mmapped = 1;
502 of->vm_ops = vma->vm_ops;
503 vma->vm_ops = &kernfs_vm_ops;
504out_put:
505 kernfs_put_active(of->kn);
506out_unlock:
507 mutex_unlock(&of->mutex);
508
509 return rc;
510}
511
512/**
513 * kernfs_get_open_node - get or create kernfs_open_node
514 * @kn: target kernfs_node
515 * @of: kernfs_open_file for this instance of open
516 *
517 * If @kn->attr.open exists, increment its reference count; otherwise,
518 * create one. @of is chained to the files list.
519 *
520 * LOCKING:
521 * Kernel thread context (may sleep).
522 *
523 * RETURNS:
524 * 0 on success, -errno on failure.
525 */
526static int kernfs_get_open_node(struct kernfs_node *kn,
527 struct kernfs_open_file *of)
528{
529 struct kernfs_open_node *on, *new_on = NULL;
530
531 retry:
532 mutex_lock(&kernfs_open_file_mutex);
533 spin_lock_irq(&kernfs_open_node_lock);
534
535 if (!kn->attr.open && new_on) {
536 kn->attr.open = new_on;
537 new_on = NULL;
538 }
539
540 on = kn->attr.open;
541 if (on) {
542 atomic_inc(&on->refcnt);
543 list_add_tail(&of->list, &on->files);
544 }
545
546 spin_unlock_irq(&kernfs_open_node_lock);
547 mutex_unlock(&kernfs_open_file_mutex);
548
549 if (on) {
550 kfree(new_on);
551 return 0;
552 }
553
554 /* not there, initialize a new one and retry */
555 new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
556 if (!new_on)
557 return -ENOMEM;
558
559 atomic_set(&new_on->refcnt, 0);
560 atomic_set(&new_on->event, 1);
561 init_waitqueue_head(&new_on->poll);
562 INIT_LIST_HEAD(&new_on->files);
563 goto retry;
564}
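/*
 * Editorial note (not part of the patch): kernfs_get_open_node() is
 * the usual optimistic-allocation loop - look up under the locks and,
 * if the object is missing, drop the locks, allocate, and retry. A
 * racing opener that installs kn->attr.open first simply wins; the
 * loser's unused allocation is freed by the kfree(new_on) above.
 */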
565
566/**
567 * kernfs_put_open_node - put kernfs_open_node
 568 * @kn: target kernfs_node
569 * @of: associated kernfs_open_file
570 *
571 * Put @kn->attr.open and unlink @of from the files list. If
572 * reference count reaches zero, disassociate and free it.
573 *
574 * LOCKING:
575 * None.
576 */
577static void kernfs_put_open_node(struct kernfs_node *kn,
578 struct kernfs_open_file *of)
579{
580 struct kernfs_open_node *on = kn->attr.open;
581 unsigned long flags;
582
583 mutex_lock(&kernfs_open_file_mutex);
584 spin_lock_irqsave(&kernfs_open_node_lock, flags);
585
586 if (of)
587 list_del(&of->list);
588
589 if (atomic_dec_and_test(&on->refcnt))
590 kn->attr.open = NULL;
591 else
592 on = NULL;
593
594 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
595 mutex_unlock(&kernfs_open_file_mutex);
596
597 kfree(on);
598}
599
600static int kernfs_fop_open(struct inode *inode, struct file *file)
601{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
603 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap;
606 int error = -EACCES;
607
608 if (!kernfs_get_active(kn))
609 return -ENODEV;
610
611 ops = kernfs_ops(kn);
612
613 has_read = ops->seq_show || ops->read || ops->mmap;
614 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap;
616
617 /* check perms and supported operations */
618 if ((file->f_mode & FMODE_WRITE) &&
619 (!(inode->i_mode & S_IWUGO) || !has_write))
620 goto err_out;
621
622 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out;
625
626 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM;
628 of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
629 if (!of)
630 goto err_out;
631
632 /*
633 * The following is done to give a different lockdep key to
634 * @of->mutex for files which implement mmap. This is a rather
635 * crude way to avoid false positive lockdep warning around
636 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
637 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
638 * which mm->mmap_sem nests, while holding @of->mutex. As each
639 * open file has a separate mutex, it's okay as long as those don't
640 * happen on the same file. At this point, we can't easily give
641 * each file a separate locking class. Let's differentiate on
642 * whether the file has mmap or not for now.
643 *
644 * Both paths of the branch look the same. They're supposed to
645 * look that way and give @of->mutex different static lockdep keys.
646 */
647 if (has_mmap)
648 mutex_init(&of->mutex);
649 else
650 mutex_init(&of->mutex);
651
652 of->kn = kn;
653 of->file = file;
654
655 /*
656 * Always instantiate seq_file even if read access doesn't use
 657	 * seq_file or is not requested. This unifies private data access,
658 * and readable regular files are the vast majority anyway.
659 */
660 if (ops->seq_show)
661 error = seq_open(file, &kernfs_seq_ops);
662 else
663 error = seq_open(file, NULL);
664 if (error)
665 goto err_free;
666
667 ((struct seq_file *)file->private_data)->private = of;
668
669 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
670 if (file->f_mode & FMODE_WRITE)
671 file->f_mode |= FMODE_PWRITE;
672
673 /* make sure we have open node struct */
674 error = kernfs_get_open_node(kn, of);
675 if (error)
676 goto err_close;
677
678 /* open succeeded, put active references */
679 kernfs_put_active(kn);
680 return 0;
681
682err_close:
683 seq_release(inode, file);
684err_free:
685 kfree(of);
686err_out:
687 kernfs_put_active(kn);
688 return error;
689}
690
691static int kernfs_fop_release(struct inode *inode, struct file *filp)
692{
693 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
694 struct kernfs_open_file *of = kernfs_of(filp);
695
696 kernfs_put_open_node(kn, of);
697 seq_release(inode, filp);
698 kfree(of);
699
700 return 0;
701}
702
703void kernfs_unmap_bin_file(struct kernfs_node *kn)
704{
705 struct kernfs_open_node *on;
706 struct kernfs_open_file *of;
707
708 if (!(kn->flags & KERNFS_HAS_MMAP))
709 return;
710
711 spin_lock_irq(&kernfs_open_node_lock);
712 on = kn->attr.open;
713 if (on)
714 atomic_inc(&on->refcnt);
715 spin_unlock_irq(&kernfs_open_node_lock);
716 if (!on)
717 return;
718
719 mutex_lock(&kernfs_open_file_mutex);
720 list_for_each_entry(of, &on->files, list) {
721 struct inode *inode = file_inode(of->file);
722 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
723 }
724 mutex_unlock(&kernfs_open_file_mutex);
725
726 kernfs_put_open_node(kn, NULL);
727}
728
729/*
730 * Kernfs attribute files are pollable. The idea is that you read
731 * the content and then you use 'poll' or 'select' to wait for
732 * the content to change. When the content changes (assuming the
733 * manager for the kobject supports notification), poll will
734 * return POLLERR|POLLPRI, and select will return the fd whether
735 * it is waiting for read, write, or exceptions.
736 * Once poll/select indicates that the value has changed, you
737 * need to close and re-open the file, or seek to 0 and read again.
738 * Reminder: this only works for attributes which actively support
739 * it, and it is not possible to test an attribute from userspace
 740 * to see if it supports poll (neither 'poll' nor 'select' returns
741 * an appropriate error code). When in doubt, set a suitable timeout value.
742 */
743static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
744{
745 struct kernfs_open_file *of = kernfs_of(filp);
746 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
747 struct kernfs_open_node *on = kn->attr.open;
748
749 /* need parent for the kobj, grab both */
750 if (!kernfs_get_active(kn))
751 goto trigger;
752
753 poll_wait(filp, &on->poll, wait);
754
755 kernfs_put_active(kn);
756
757 if (of->event != atomic_read(&on->event))
758 goto trigger;
759
760 return DEFAULT_POLLMASK;
761
762 trigger:
763 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
764}
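/*
 * Editorial sketch (not part of the patch): the userspace half of the
 * protocol described above; the kernel half signals a change with
 * kernfs_notify() (see below). Hypothetical and without error handling.
 */
#if 0	/* illustrative userspace code, not kernel code */
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
	char buf[4096];

	read(fd, buf, sizeof(buf));	/* consume the current value */
	poll(&pfd, 1, -1);		/* returns when POLLERR|POLLPRI raised */
	lseek(fd, 0, SEEK_SET);
	read(fd, buf, sizeof(buf));	/* re-read the new value */
#endif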
765
766/**
767 * kernfs_notify - notify a kernfs file
768 * @kn: file to notify
769 *
770 * Notify @kn such that poll(2) on @kn wakes up.
771 */
772void kernfs_notify(struct kernfs_node *kn)
773{
774 struct kernfs_open_node *on;
775 unsigned long flags;
776
777 spin_lock_irqsave(&kernfs_open_node_lock, flags);
778
779 if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
780 on = kn->attr.open;
781 if (on) {
782 atomic_inc(&on->event);
783 wake_up_interruptible(&on->poll);
784 }
785 }
786
787 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
788}
789EXPORT_SYMBOL_GPL(kernfs_notify);
790
791const struct file_operations kernfs_file_fops = {
792 .read = kernfs_fop_read,
793 .write = kernfs_fop_write,
794 .llseek = generic_file_llseek,
795 .mmap = kernfs_fop_mmap,
796 .open = kernfs_fop_open,
797 .release = kernfs_fop_release,
798 .poll = kernfs_fop_poll,
799};
800
801/**
802 * __kernfs_create_file - kernfs internal function to create a file
803 * @parent: directory to create the file in
804 * @name: name of the file
805 * @mode: mode of the file
806 * @size: size of the file
807 * @ops: kernfs operations for the file
808 * @priv: private data for the file
809 * @ns: optional namespace tag of the file
 810 * @name_is_static: don't copy the file name
811 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
812 *
813 * Returns the created node on success, ERR_PTR() value on error.
814 */
815struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
816 const char *name,
817 umode_t mode, loff_t size,
818 const struct kernfs_ops *ops,
819 void *priv, const void *ns,
820 bool name_is_static,
821 struct lock_class_key *key)
822{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn;
825 unsigned flags;
826 int rc;
827
828 flags = KERNFS_FILE;
829 if (name_is_static)
830 flags |= KERNFS_STATIC_NAME;
831
832 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
833 if (!kn)
834 return ERR_PTR(-ENOMEM);
835
836 kn->attr.ops = ops;
837 kn->attr.size = size;
838 kn->ns = ns;
839 kn->priv = priv;
840
841#ifdef CONFIG_DEBUG_LOCK_ALLOC
842 if (key) {
843 lockdep_init_map(&kn->dep_map, "s_active", key, 0);
844 kn->flags |= KERNFS_LOCKDEP;
845 }
846#endif
847
848 /*
 849	 * kn->attr.ops is accessible only while holding active ref. We
850 * need to know whether some ops are implemented outside active
851 * ref. Cache their existence in flags.
852 */
853 if (ops->seq_show)
854 kn->flags |= KERNFS_HAS_SEQ_SHOW;
855 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP;
857
858 kernfs_addrm_start(&acxt);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) {
863 kernfs_put(kn);
864 return ERR_PTR(rc);
865 }
866 return kn;
867}
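/*
 * Editorial sketch (not part of the patch): a minimal read-only
 * attribute built on the API above; "my_show" and the names used are
 * hypothetical.
 */
#if 0	/* illustrative usage, not part of the file */
static int my_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "hello\n");
	return 0;
}

static const struct kernfs_ops my_ops = {
	.seq_show	= my_show,
};

/* with a parent directory node in hand: */
static struct kernfs_node *my_create(struct kernfs_node *parent)
{
	return __kernfs_create_file(parent, "hello", 0444, 0, &my_ops,
				    NULL, NULL, false, NULL);
}
#endif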
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
1/*
2 * fs/kernfs/inode.c - kernfs inode implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/pagemap.h>
12#include <linux/backing-dev.h>
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/xattr.h>
17#include <linux/security.h>
18
19#include "kernfs-internal.h"
20
21static const struct address_space_operations kernfs_aops = {
22 .readpage = simple_readpage,
23 .write_begin = simple_write_begin,
24 .write_end = simple_write_end,
25};
26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr,
36 .getattr = kernfs_iop_getattr,
37 .setxattr = kernfs_iop_setxattr,
38 .removexattr = kernfs_iop_removexattr,
39 .getxattr = kernfs_iop_getxattr,
40 .listxattr = kernfs_iop_listxattr,
41};
42
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{
51 struct iattr *iattrs;
52
53 if (kn->iattr)
54 return kn->iattr;
55
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr)
58 return NULL;
59 iattrs = &kn->iattr->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = kn->mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 simple_xattrs_init(&kn->iattr->xattrs);
68
69 return kn->iattr;
70}
71
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
73{
74 struct kernfs_iattrs *attrs;
75 struct iattr *iattrs;
76 unsigned int ia_valid = iattr->ia_valid;
77
78 attrs = kernfs_iattrs(kn);
79 if (!attrs)
80 return -ENOMEM;
81
82 iattrs = &attrs->ia_iattr;
83
84 if (ia_valid & ATTR_UID)
85 iattrs->ia_uid = iattr->ia_uid;
86 if (ia_valid & ATTR_GID)
87 iattrs->ia_gid = iattr->ia_gid;
88 if (ia_valid & ATTR_ATIME)
89 iattrs->ia_atime = iattr->ia_atime;
90 if (ia_valid & ATTR_MTIME)
91 iattrs->ia_mtime = iattr->ia_mtime;
92 if (ia_valid & ATTR_CTIME)
93 iattrs->ia_ctime = iattr->ia_ctime;
94 if (ia_valid & ATTR_MODE) {
95 umode_t mode = iattr->ia_mode;
96 iattrs->ia_mode = kn->mode = mode;
97 }
98 return 0;
99}
100
101/**
102 * kernfs_setattr - set iattr on a node
103 * @kn: target node
104 * @iattr: iattr to set
105 *
106 * Returns 0 on success, -errno on failure.
107 */
108int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
109{
110 int ret;
111
112 mutex_lock(&kernfs_mutex);
113 ret = __kernfs_setattr(kn, iattr);
114 mutex_unlock(&kernfs_mutex);
115 return ret;
116}
117
118int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
119{
120 struct inode *inode = dentry->d_inode;
121 struct kernfs_node *kn = dentry->d_fsdata;
122 int error;
123
124 if (!kn)
125 return -EINVAL;
126
127 mutex_lock(&kernfs_mutex);
128 error = inode_change_ok(inode, iattr);
129 if (error)
130 goto out;
131
132 error = __kernfs_setattr(kn, iattr);
133 if (error)
134 goto out;
135
136 /* this ignores size changes */
137 setattr_copy(inode, iattr);
138
139out:
140 mutex_unlock(&kernfs_mutex);
141 return error;
142}
143
144static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
145 u32 *secdata_len)
146{
147 struct kernfs_iattrs *attrs;
148 void *old_secdata;
149 size_t old_secdata_len;
150
151 attrs = kernfs_iattrs(kn);
152 if (!attrs)
153 return -ENOMEM;
154
155 old_secdata = attrs->ia_secdata;
156 old_secdata_len = attrs->ia_secdata_len;
157
158 attrs->ia_secdata = *secdata;
159 attrs->ia_secdata_len = *secdata_len;
160
161 *secdata = old_secdata;
162 *secdata_len = old_secdata_len;
163 return 0;
164}
165
166int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
167 const void *value, size_t size, int flags)
168{
169 struct kernfs_node *kn = dentry->d_fsdata;
170 struct kernfs_iattrs *attrs;
171 void *secdata;
172 int error;
173 u32 secdata_len = 0;
174
175 attrs = kernfs_iattrs(kn);
176 if (!attrs)
177 return -ENOMEM;
178
179 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
180 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
181 error = security_inode_setsecurity(dentry->d_inode, suffix,
182 value, size, flags);
183 if (error)
184 return error;
185 error = security_inode_getsecctx(dentry->d_inode,
186 &secdata, &secdata_len);
187 if (error)
188 return error;
189
190 mutex_lock(&kernfs_mutex);
191 error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
192 mutex_unlock(&kernfs_mutex);
193
194 if (secdata)
195 security_release_secctx(secdata, secdata_len);
196 return error;
197 } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
198 return simple_xattr_set(&attrs->xattrs, name, value, size,
199 flags);
200 }
201
202 return -EINVAL;
203}
204
205int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
206{
207 struct kernfs_node *kn = dentry->d_fsdata;
208 struct kernfs_iattrs *attrs;
209
210 attrs = kernfs_iattrs(kn);
211 if (!attrs)
212 return -ENOMEM;
213
214 return simple_xattr_remove(&attrs->xattrs, name);
215}
216
217ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
218 size_t size)
219{
220 struct kernfs_node *kn = dentry->d_fsdata;
221 struct kernfs_iattrs *attrs;
222
223 attrs = kernfs_iattrs(kn);
224 if (!attrs)
225 return -ENOMEM;
226
227 return simple_xattr_get(&attrs->xattrs, name, buf, size);
228}
229
230ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
231{
232 struct kernfs_node *kn = dentry->d_fsdata;
233 struct kernfs_iattrs *attrs;
234
235 attrs = kernfs_iattrs(kn);
236 if (!attrs)
237 return -ENOMEM;
238
239 return simple_xattr_list(&attrs->xattrs, buf, size);
240}
241
242static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
243{
244 inode->i_mode = mode;
245 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
246}
247
248static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
249{
250 inode->i_uid = iattr->ia_uid;
251 inode->i_gid = iattr->ia_gid;
252 inode->i_atime = iattr->ia_atime;
253 inode->i_mtime = iattr->ia_mtime;
254 inode->i_ctime = iattr->ia_ctime;
255}
256
257static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
258{
259 struct kernfs_iattrs *attrs = kn->iattr;
260
261 inode->i_mode = kn->mode;
262 if (attrs) {
263 /*
 264		 * kernfs_node has non-default attributes; get them from the
 265		 * persistent copy in kernfs_node.
266 */
267 set_inode_attr(inode, &attrs->ia_iattr);
268 security_inode_notifysecctx(inode, attrs->ia_secdata,
269 attrs->ia_secdata_len);
270 }
271
272 if (kernfs_type(kn) == KERNFS_DIR)
273 set_nlink(inode, kn->dir.subdirs + 2);
274}
275
276int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
277 struct kstat *stat)
278{
279 struct kernfs_node *kn = dentry->d_fsdata;
280 struct inode *inode = dentry->d_inode;
281
282 mutex_lock(&kernfs_mutex);
283 kernfs_refresh_inode(kn, inode);
284 mutex_unlock(&kernfs_mutex);
285
286 generic_fillattr(inode, stat);
287 return 0;
288}
289
290static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
291{
292 kernfs_get(kn);
293 inode->i_private = kn;
294 inode->i_mapping->a_ops = &kernfs_aops;
295 inode->i_mapping->backing_dev_info = &kernfs_bdi;
296 inode->i_op = &kernfs_iops;
297
298 set_default_inode_attr(inode, kn->mode);
299 kernfs_refresh_inode(kn, inode);
300
301 /* initialize inode according to type */
302 switch (kernfs_type(kn)) {
303 case KERNFS_DIR:
304 inode->i_op = &kernfs_dir_iops;
305 inode->i_fop = &kernfs_dir_fops;
306 break;
307 case KERNFS_FILE:
308 inode->i_size = kn->attr.size;
309 inode->i_fop = &kernfs_file_fops;
310 break;
311 case KERNFS_LINK:
312 inode->i_op = &kernfs_symlink_iops;
313 break;
314 default:
315 BUG();
316 }
317
318 unlock_new_inode(inode);
319}
320
321/**
322 * kernfs_get_inode - get inode for kernfs_node
323 * @sb: super block
324 * @kn: kernfs_node to allocate inode for
325 *
326 * Get inode for @kn. If such inode doesn't exist, a new inode is
 327 * allocated and basics are initialized. A new inode is unlocked via
 328 * unlock_new_inode() before being returned.
329 *
330 * LOCKING:
331 * Kernel thread context (may sleep).
332 *
333 * RETURNS:
334 * Pointer to allocated inode on success, NULL on failure.
335 */
336struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
337{
338 struct inode *inode;
339
340 inode = iget_locked(sb, kn->ino);
341 if (inode && (inode->i_state & I_NEW))
342 kernfs_init_inode(kn, inode);
343
344 return inode;
345}
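/*
 * Editorial note (not part of the patch): iget_locked() returns any
 * inode already cached under kn->ino; only a freshly allocated inode
 * carries I_NEW, so kernfs_init_inode() - which also drops the I_NEW
 * lock via unlock_new_inode() - runs exactly once per inode.
 */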
346
347/*
348 * The kernfs_node serves as both an inode and a directory entry for
349 * kernfs. To prevent the kernfs inode numbers from being freed
350 * prematurely we take a reference to kernfs_node from the kernfs inode. A
351 * super_operations.evict_inode() implementation is needed to drop that
352 * reference upon inode destruction.
353 */
354void kernfs_evict_inode(struct inode *inode)
355{
356 struct kernfs_node *kn = inode->i_private;
357
358 truncate_inode_pages(&inode->i_data, 0);
359 clear_inode(inode);
360 kernfs_put(kn);
361}
362
363int kernfs_iop_permission(struct inode *inode, int mask)
364{
365 struct kernfs_node *kn;
366
367 if (mask & MAY_NOT_BLOCK)
368 return -ECHILD;
369
370 kn = inode->i_private;
371
372 mutex_lock(&kernfs_mutex);
373 kernfs_refresh_inode(kn, inode);
374 mutex_unlock(&kernfs_mutex);
375
376 return generic_permission(inode, mask);
377}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..eb536b76374a
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
1/*
2 * fs/kernfs/kernfs-internal.h - kernfs internal header file
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#ifndef __KERNFS_INTERNAL_H
12#define __KERNFS_INTERNAL_H
13
14#include <linux/lockdep.h>
15#include <linux/fs.h>
16#include <linux/mutex.h>
17#include <linux/xattr.h>
18
19#include <linux/kernfs.h>
20
21struct kernfs_iattrs {
22 struct iattr ia_iattr;
23 void *ia_secdata;
24 u32 ia_secdata_len;
25
26 struct simple_xattrs xattrs;
27};
28
29#define KN_DEACTIVATED_BIAS INT_MIN
30
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32
33/**
34 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
35 * @kn: kernfs_node of interest
36 *
37 * Return the kernfs_root @kn belongs to.
38 */
39static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
40{
 41	/* if parent exists, it's always a dir; otherwise, @kn is a dir */
42 if (kn->parent)
43 kn = kn->parent;
44 return kn->dir.root;
45}
46
47/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c
56 */
57struct kernfs_super_info {
58 /*
59 * The root associated with this super_block. Each super_block is
60 * identified by the root and ns it's associated with.
61 */
62 struct kernfs_root *root;
63
64 /*
65 * Each sb is associated with one namespace tag, currently the
66 * network namespace of the task which mounted this kernfs
67 * instance. If multiple tags become necessary, make the following
68 * an array and compare kernfs_node tag against every entry.
69 */
70 const void *ns;
71};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73
74extern struct kmem_cache *kernfs_node_cache;
75
76/*
77 * inode.c
78 */
79struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
80void kernfs_evict_inode(struct inode *inode);
81int kernfs_iop_permission(struct inode *inode, int mask);
82int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
83int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
84 struct kstat *stat);
85int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
86 size_t size, int flags);
87int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92
93/*
94 * dir.c
95 */
96extern struct mutex kernfs_mutex;
97extern const struct dentry_operations kernfs_dops;
98extern const struct file_operations kernfs_dir_fops;
99extern const struct inode_operations kernfs_dir_iops;
100
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode,
108 unsigned flags);
109
110/*
111 * file.c
112 */
113extern const struct file_operations kernfs_file_fops;
114
115void kernfs_unmap_bin_file(struct kernfs_node *kn);
116
117/*
118 * symlink.c
119 */
120extern const struct inode_operations kernfs_symlink_iops;
121
122#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0d6ce895a9ee
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,165 @@
1/*
2 * fs/kernfs/mount.c - kernfs mount implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/mount.h>
13#include <linux/init.h>
14#include <linux/magic.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h>
17
18#include "kernfs-internal.h"
19
20struct kmem_cache *kernfs_node_cache;
21
22static const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode,
26};
27
28static int kernfs_fill_super(struct super_block *sb)
29{
30 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode;
32 struct dentry *root;
33
34 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC;
37 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1;
39
40 /* get root inode, initialize and unlock it */
41 mutex_lock(&kernfs_mutex);
42 inode = kernfs_get_inode(sb, info->root->kn);
43 mutex_unlock(&kernfs_mutex);
44 if (!inode) {
45 pr_debug("kernfs: could not get root inode\n");
46 return -ENOMEM;
47 }
48
49 /* instantiate and link root dentry */
50 root = d_make_root(inode);
51 if (!root) {
52 pr_debug("%s: could not get root dentry!\n", __func__);
53 return -ENOMEM;
54 }
55 kernfs_get(info->root->kn);
56 root->d_fsdata = info->root->kn;
57 sb->s_root = root;
58 sb->s_d_op = &kernfs_dops;
59 return 0;
60}
61
62static int kernfs_test_super(struct super_block *sb, void *data)
63{
64 struct kernfs_super_info *sb_info = kernfs_info(sb);
65 struct kernfs_super_info *info = data;
66
67 return sb_info->root == info->root && sb_info->ns == info->ns;
68}
69
70static int kernfs_set_super(struct super_block *sb, void *data)
71{
72 int error;
73 error = set_anon_super(sb, data);
74 if (!error)
75 sb->s_fs_info = data;
76 return error;
77}
78
79/**
80 * kernfs_super_ns - determine the namespace tag of a kernfs super_block
81 * @sb: super_block of interest
82 *
83 * Return the namespace tag associated with kernfs super_block @sb.
84 */
85const void *kernfs_super_ns(struct super_block *sb)
86{
87 struct kernfs_super_info *info = kernfs_info(sb);
88
89 return info->ns;
90}
91
92/**
93 * kernfs_mount_ns - kernfs mount helper
94 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted
97 * @ns: optional namespace tag of the mount
98 *
99 * This is to be called from each kernfs user's file_system_type->mount()
100 * implementation, which should pass through the specified @fs_type and
101 * @flags, and specify the hierarchy and namespace tag to mount via @root
102 * and @ns, respectively.
103 *
104 * The return value can be passed to the vfs layer verbatim.
105 */
106struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
107 struct kernfs_root *root, const void *ns)
108{
109 struct super_block *sb;
110 struct kernfs_super_info *info;
111 int error;
112
113 info = kzalloc(sizeof(*info), GFP_KERNEL);
114 if (!info)
115 return ERR_PTR(-ENOMEM);
116
117 info->root = root;
118 info->ns = ns;
119
120 sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
121 if (IS_ERR(sb) || sb->s_fs_info != info)
122 kfree(info);
123 if (IS_ERR(sb))
124 return ERR_CAST(sb);
125 if (!sb->s_root) {
126 error = kernfs_fill_super(sb);
127 if (error) {
128 deactivate_locked_super(sb);
129 return ERR_PTR(error);
130 }
131 sb->s_flags |= MS_ACTIVE;
132 }
133
134 return dget(sb->s_root);
135}
136
137/**
138 * kernfs_kill_sb - kill_sb for kernfs
139 * @sb: super_block being killed
140 *
141 * This can be used directly for file_system_type->kill_sb(). If a kernfs
142 * user needs extra cleanup, it can implement its own kill_sb() and call
143 * this function at the end.
144 */
145void kernfs_kill_sb(struct super_block *sb)
146{
147 struct kernfs_super_info *info = kernfs_info(sb);
148 struct kernfs_node *root_kn = sb->s_root->d_fsdata;
149
150 /*
151 * Remove the superblock from fs_supers/s_instances
152 * so we can't find it, before freeing kernfs_super_info.
153 */
154 kill_anon_super(sb);
155 kfree(info);
156 kernfs_put(root_kn);
157}
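/*
 * Editorial sketch (not part of the patch): how a kernfs user is
 * expected to wire these helpers into a file_system_type; "my_root"
 * is a hypothetical kernfs_root created elsewhere.
 */
#if 0	/* illustrative usage, not part of the file */
static struct dentry *my_mount(struct file_system_type *fs_type,
			       int flags, const char *dev_name, void *data)
{
	return kernfs_mount_ns(fs_type, flags, my_root, NULL);
}

static struct file_system_type my_fs_type = {
	.name		= "myfs",
	.mount		= my_mount,
	.kill_sb	= kernfs_kill_sb,
};
#endif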
158
159void __init kernfs_init(void)
160{
161 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
162 sizeof(struct kernfs_node),
163 0, SLAB_PANIC, NULL);
164 kernfs_inode_init();
165}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..4d457055acb9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,151 @@
1/*
2 * fs/kernfs/symlink.c - kernfs symlink implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/gfp.h>
13#include <linux/namei.h>
14
15#include "kernfs-internal.h"
16
17/**
18 * kernfs_create_link - create a symlink
19 * @parent: directory to create the symlink in
20 * @name: name of the symlink
21 * @target: target node for the symlink to point to
22 *
23 * Returns the created node on success, ERR_PTR() value on error.
24 */
25struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
26 const char *name,
27 struct kernfs_node *target)
28{
29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error;
32
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
34 if (!kn)
35 return ERR_PTR(-ENOMEM);
36
37 if (kernfs_ns_enabled(parent))
38 kn->ns = target->ns;
39 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */
41
42 kernfs_addrm_start(&acxt);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error)
47 return kn;
48
49 kernfs_put(kn);
50 return ERR_PTR(error);
51}
52
53static int kernfs_get_target_path(struct kernfs_node *parent,
54 struct kernfs_node *target, char *path)
55{
56 struct kernfs_node *base, *kn;
57 char *s = path;
58 int len = 0;
59
60 /* go up to the root, stop at the base */
61 base = parent;
62 while (base->parent) {
63 kn = target->parent;
64 while (kn->parent && base != kn)
65 kn = kn->parent;
66
67 if (base == kn)
68 break;
69
70 strcpy(s, "../");
71 s += 3;
72 base = base->parent;
73 }
74
75 /* determine end of target string for reverse fillup */
76 kn = target;
77 while (kn->parent && kn != base) {
78 len += strlen(kn->name) + 1;
79 kn = kn->parent;
80 }
81
82 /* check limits */
83 if (len < 2)
84 return -EINVAL;
85 len--;
86 if ((s - path) + len > PATH_MAX)
87 return -ENAMETOOLONG;
88
89 /* reverse fillup of target string from target to base */
90 kn = target;
91 while (kn->parent && kn != base) {
92 int slen = strlen(kn->name);
93
94 len -= slen;
95 strncpy(s + len, kn->name, slen);
96 if (len)
97 s[--len] = '/';
98
99 kn = kn->parent;
100 }
101
102 return 0;
103}
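/*
 * Editorial example (not part of the patch): with @parent at /foo/bar
 * and @target at /foo/baz/qux, the first loop climbs @parent to the
 * shared ancestor /foo and emits one "../"; the reverse fillup then
 * appends "baz/qux", so the computed symlink body is "../baz/qux".
 */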
104
105static int kernfs_getlink(struct dentry *dentry, char *path)
106{
107 struct kernfs_node *kn = dentry->d_fsdata;
108 struct kernfs_node *parent = kn->parent;
109 struct kernfs_node *target = kn->symlink.target_kn;
110 int error;
111
112 mutex_lock(&kernfs_mutex);
113 error = kernfs_get_target_path(parent, target, path);
114 mutex_unlock(&kernfs_mutex);
115
116 return error;
117}
118
119static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
120{
121 int error = -ENOMEM;
122 unsigned long page = get_zeroed_page(GFP_KERNEL);
123 if (page) {
124 error = kernfs_getlink(dentry, (char *) page);
125 if (error < 0)
126 free_page((unsigned long)page);
127 }
128 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
129 return NULL;
130}
131
132static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
133 void *cookie)
134{
135 char *page = nd_get_link(nd);
136 if (!IS_ERR(page))
137 free_page((unsigned long)page);
138}
139
140const struct inode_operations kernfs_symlink_iops = {
141 .setxattr = kernfs_iop_setxattr,
142 .removexattr = kernfs_iop_removexattr,
143 .getxattr = kernfs_iop_getxattr,
144 .listxattr = kernfs_iop_listxattr,
145 .readlink = generic_readlink,
146 .follow_link = kernfs_iop_follow_link,
147 .put_link = kernfs_iop_put_link,
148 .setattr = kernfs_iop_setattr,
149 .getattr = kernfs_iop_getattr,
150 .permission = kernfs_iop_permission,
151};
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..22e536705c45 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2790,6 +2790,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
+	kernfs_init();
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type)
 		struct inode *inode = child->mnt_mountpoint->d_inode;
 		if (!S_ISDIR(inode->i_mode))
 			goto next;
-		if (inode->i_nlink != 2)
+		if (inode->i_nlink > 2)
 			goto next;
 	}
 	visible = true;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c01..a1a191634abc 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 		nilfs_clear_logs(&sci->sc_segbufs);
 
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
 							sci->sc_nfreesegs,
 							NULL);
 			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
 		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..0b9ff4395e6a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
82 * events. 82 * events.
83 */ 83 */
84static int dnotify_handle_event(struct fsnotify_group *group, 84static int dnotify_handle_event(struct fsnotify_group *group,
85 struct inode *inode,
85 struct fsnotify_mark *inode_mark, 86 struct fsnotify_mark *inode_mark,
86 struct fsnotify_mark *vfsmount_mark, 87 struct fsnotify_mark *vfsmount_mark,
87 struct fsnotify_event *event) 88 u32 mask, void *data, int data_type,
89 const unsigned char *file_name)
88{ 90{
89 struct dnotify_mark *dn_mark; 91 struct dnotify_mark *dn_mark;
90 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
95 96
96 BUG_ON(vfsmount_mark); 97 /* not a dir, dnotify doesn't care */
98 if (!S_ISDIR(inode->i_mode))
99 return 0;
97 100
98 to_tell = event->to_tell; 101 BUG_ON(vfsmount_mark);
99 102
100 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
101 104
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
122 return 0; 125 return 0;
123} 126}
124 127
125/*
126 * Given an inode and mask determine if dnotify would be interested in sending
127 * userspace notification for that pair.
128 */
129static bool dnotify_should_send_event(struct fsnotify_group *group,
130 struct inode *inode,
131 struct fsnotify_mark *inode_mark,
132 struct fsnotify_mark *vfsmount_mark,
133 __u32 mask, void *data, int data_type)
134{
135 /* not a dir, dnotify doesn't care */
136 if (!S_ISDIR(inode->i_mode))
137 return false;
138
139 return true;
140}
141
142static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 128static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
143{ 129{
144 struct dnotify_mark *dn_mark = container_of(fsn_mark, 130 struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
152 138
153static struct fsnotify_ops dnotify_fsnotify_ops = { 139static struct fsnotify_ops dnotify_fsnotify_ops = {
154 .handle_event = dnotify_handle_event, 140 .handle_event = dnotify_handle_event,
155 .should_send_event = dnotify_should_send_event,
156 .free_group_priv = NULL,
157 .freeing_mark = NULL,
158 .free_event_priv = NULL,
159}; 141};
160 142
161/* 143/*
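
With should_send_event() removed, a dnotify-style backend does its own filtering at the top of a single handle_event() callback that now receives the raw (inode, mask, data, file_name) tuple. A simplified userspace sketch of the merged shape (types and names here are illustrative, not the fsnotify API):

#include <stdio.h>
#include <sys/stat.h>

struct mark { unsigned mask; };

/* One callback instead of should_send_event() + handle_event():
 * uninteresting notifications are rejected with an early return. */
static int handle_event(const struct stat *st, struct mark *m,
			unsigned mask, const char *file_name)
{
	if (!S_ISDIR(st->st_mode))   /* not a dir: this backend doesn't care */
		return 0;
	if (!(mask & m->mask))       /* event not in the watch's mask */
		return 0;
	printf("deliver %#x for %s\n", mask, file_name ? file_name : "(none)");
	return 0;
}

int main(void)
{
	struct stat st = { .st_mode = S_IFDIR };
	struct mark m = { .mask = 0x2 };
	return handle_event(&st, &m, 0x2, "subdir");
}
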
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..58772623f02a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,31 +9,27 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11 11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 12#include "fanotify.h"
13
14static bool should_merge(struct fsnotify_event *old_fsn,
15 struct fsnotify_event *new_fsn)
13{ 16{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new); 17 struct fanotify_event_info *old, *new;
15 18
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 19#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
22 /* don't merge two permission events */ 20 /* don't merge two permission events */
23 if ((old->mask & FAN_ALL_PERM_EVENTS) && 21 if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) &&
24 (new->mask & FAN_ALL_PERM_EVENTS)) 22 (new_fsn->mask & FAN_ALL_PERM_EVENTS))
25 return false; 23 return false;
26#endif 24#endif
27 if ((old->path.mnt == new->path.mnt) && 25 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
28 (old->path.dentry == new->path.dentry)) 26 old = FANOTIFY_E(old_fsn);
29 return true; 27 new = FANOTIFY_E(new_fsn);
30 break; 28
31 case (FSNOTIFY_EVENT_NONE): 29 if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
32 return true; 30 old->path.mnt == new->path.mnt &&
33 default: 31 old->path.dentry == new->path.dentry)
34 BUG(); 32 return true;
35 };
36 }
37 return false; 33 return false;
38} 34}
39 35
@@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
41static struct fsnotify_event *fanotify_merge(struct list_head *list, 37static struct fsnotify_event *fanotify_merge(struct list_head *list,
42 struct fsnotify_event *event) 38 struct fsnotify_event *event)
43{ 39{
44 struct fsnotify_event_holder *test_holder; 40 struct fsnotify_event *test_event;
45 struct fsnotify_event *test_event = NULL; 41 bool do_merge = false;
46 struct fsnotify_event *new_event;
47 42
48 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 43 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
49 44
50 45 list_for_each_entry_reverse(test_event, list, list) {
51 list_for_each_entry_reverse(test_holder, list, event_list) { 46 if (should_merge(test_event, event)) {
52 if (should_merge(test_holder->event, event)) { 47 do_merge = true;
53 test_event = test_holder->event;
54 break; 48 break;
55 } 49 }
56 } 50 }
57 51
58 if (!test_event) 52 if (!do_merge)
59 return NULL; 53 return NULL;
60 54
61 fsnotify_get_event(test_event); 55 test_event->mask |= event->mask;
62 56 return test_event;
63 /* if they are exactly the same we are done */
64 if (test_event->mask == event->mask)
65 return test_event;
66
67 /*
68 * if the refcnt == 2 this is the only queue
69 * for this event and so we can update the mask
70 * in place.
71 */
72 if (atomic_read(&test_event->refcnt) == 2) {
73 test_event->mask |= event->mask;
74 return test_event;
75 }
76
77 new_event = fsnotify_clone_event(test_event);
78
79 /* done with test_event */
80 fsnotify_put_event(test_event);
81
82 /* couldn't allocate memory, merge was not possible */
83 if (unlikely(!new_event))
84 return ERR_PTR(-ENOMEM);
85
86 /* build new event and replace it on the list */
87 new_event->mask = (test_event->mask | event->mask);
88 fsnotify_replace_event(test_holder, new_event);
89
90 /* we hold a reference on new_event from clone_event */
91 return new_event;
92} 57}
93 58
94#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 59#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
95static int fanotify_get_response_from_access(struct fsnotify_group *group, 60static int fanotify_get_response_from_access(struct fsnotify_group *group,
96 struct fsnotify_event *event) 61 struct fanotify_event_info *event)
97{ 62{
98 int ret; 63 int ret;
99 64
@@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
106 return 0; 71 return 0;
107 72
108 /* userspace responded, convert to something usable */ 73 /* userspace responded, convert to something usable */
109 spin_lock(&event->lock);
110 switch (event->response) { 74 switch (event->response) {
111 case FAN_ALLOW: 75 case FAN_ALLOW:
112 ret = 0; 76 ret = 0;
@@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
116 ret = -EPERM; 80 ret = -EPERM;
117 } 81 }
118 event->response = 0; 82 event->response = 0;
119 spin_unlock(&event->lock);
120 83
121 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 84 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
122 group, event, ret); 85 group, event, ret);
@@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
125} 88}
126#endif 89#endif
127 90
128static int fanotify_handle_event(struct fsnotify_group *group, 91static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
129 struct fsnotify_mark *inode_mark,
130 struct fsnotify_mark *fanotify_mark,
131 struct fsnotify_event *event)
132{
133 int ret = 0;
134 struct fsnotify_event *notify_event = NULL;
135
136 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
137 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
138 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
139 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
140 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
141 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
142 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
143 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
144 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
145 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
146
147 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
148
149 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
150 if (IS_ERR(notify_event))
151 return PTR_ERR(notify_event);
152
153#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
154 if (event->mask & FAN_ALL_PERM_EVENTS) {
155 /* if we merged we need to wait on the new event */
156 if (notify_event)
157 event = notify_event;
158 ret = fanotify_get_response_from_access(group, event);
159 }
160#endif
161
162 if (notify_event)
163 fsnotify_put_event(notify_event);
164
165 return ret;
166}
167
168static bool fanotify_should_send_event(struct fsnotify_group *group,
169 struct inode *to_tell,
170 struct fsnotify_mark *inode_mark,
171 struct fsnotify_mark *vfsmnt_mark, 92 struct fsnotify_mark *vfsmnt_mark,
172 __u32 event_mask, void *data, int data_type) 93 u32 event_mask,
94 void *data, int data_type)
173{ 95{
174 __u32 marks_mask, marks_ignored_mask; 96 __u32 marks_mask, marks_ignored_mask;
175 struct path *path = data; 97 struct path *path = data;
176 98
177 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 99 pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
178 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 100 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
179 inode_mark, vfsmnt_mark, event_mask, data, data_type); 101 event_mask, data, data_type);
180 102
181 /* if we don't have enough info to send an event to userspace say no */ 103 /* if we don't have enough info to send an event to userspace say no */
182 if (data_type != FSNOTIFY_EVENT_PATH) 104 if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
217 return false; 139 return false;
218} 140}
219 141
142static int fanotify_handle_event(struct fsnotify_group *group,
143 struct inode *inode,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *fanotify_mark,
146 u32 mask, void *data, int data_type,
147 const unsigned char *file_name)
148{
149 int ret = 0;
150 struct fanotify_event_info *event;
151 struct fsnotify_event *fsn_event;
152 struct fsnotify_event *notify_fsn_event;
153
154 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
155 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
156 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
157 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
158 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
159 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
160 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
161 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
162 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
163 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
164
165 if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
166 data_type))
167 return 0;
168
169 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
170 mask);
171
172 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
173 if (unlikely(!event))
174 return -ENOMEM;
175
176 fsn_event = &event->fse;
177 fsnotify_init_event(fsn_event, inode, mask);
178 event->tgid = get_pid(task_tgid(current));
179 if (data_type == FSNOTIFY_EVENT_PATH) {
180 struct path *path = data;
181 event->path = *path;
182 path_get(&event->path);
183 } else {
184 event->path.mnt = NULL;
185 event->path.dentry = NULL;
186 }
187#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
188 event->response = 0;
189#endif
190
191 notify_fsn_event = fsnotify_add_notify_event(group, fsn_event,
192 fanotify_merge);
193 if (notify_fsn_event) {
194 /* Our event wasn't used in the end. Free it. */
195 fsnotify_destroy_event(group, fsn_event);
196 if (IS_ERR(notify_fsn_event))
197 return PTR_ERR(notify_fsn_event);
198 /* We need to ask about a different event after a merge... */
199 event = FANOTIFY_E(notify_fsn_event);
200 fsn_event = notify_fsn_event;
201 }
202
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (fsn_event->mask & FAN_ALL_PERM_EVENTS)
205 ret = fanotify_get_response_from_access(group, event);
206#endif
207 return ret;
208}
209
220static void fanotify_free_group_priv(struct fsnotify_group *group) 210static void fanotify_free_group_priv(struct fsnotify_group *group)
221{ 211{
222 struct user_struct *user; 212 struct user_struct *user;
@@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
226 free_uid(user); 216 free_uid(user);
227} 217}
228 218
219static void fanotify_free_event(struct fsnotify_event *fsn_event)
220{
221 struct fanotify_event_info *event;
222
223 event = FANOTIFY_E(fsn_event);
224 path_put(&event->path);
225 put_pid(event->tgid);
226 kmem_cache_free(fanotify_event_cachep, event);
227}
228
229const struct fsnotify_ops fanotify_fsnotify_ops = { 229const struct fsnotify_ops fanotify_fsnotify_ops = {
230 .handle_event = fanotify_handle_event, 230 .handle_event = fanotify_handle_event,
231 .should_send_event = fanotify_should_send_event,
232 .free_group_priv = fanotify_free_group_priv, 231 .free_group_priv = fanotify_free_group_priv,
233 .free_event_priv = NULL, 232 .free_event = fanotify_free_event,
234 .freeing_mark = NULL,
235}; 233};
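
The rewritten fanotify path allocates a private event per group, offers it to the notification queue, and frees its own copy if the queue merged it into an already-pending event. A hedged userspace sketch of that ownership pattern (hypothetical names, simplified merge logic):

#include <stdlib.h>
#include <stdio.h>

struct event { unsigned mask; };

/* Pretend queue: merges when a compatible event is already pending.
 * Returns the merged event, or NULL if ours was queued as-is. */
static struct event *try_queue(struct event *pending, struct event *ev)
{
	if (pending) {
		pending->mask |= ev->mask;   /* merge in place */
		return pending;
	}
	return NULL;
}

static int handle(struct event *pending, unsigned mask)
{
	struct event *ev = malloc(sizeof(*ev));
	struct event *merged;

	if (!ev)
		return -1;
	ev->mask = mask;

	merged = try_queue(pending, ev);
	if (merged) {
		free(ev);       /* our copy wasn't used in the end */
		ev = merged;    /* follow-up work targets the merged event */
	}
	printf("queued event mask %#x\n", ev->mask);
	return 0;
}

int main(void)
{
	struct event pending = { .mask = 0x1 };
	return handle(&pending, 0x2);
}
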
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..0e90174a116a
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,23 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/path.h>
3#include <linux/slab.h>
4
5extern struct kmem_cache *fanotify_event_cachep;
6
7struct fanotify_event_info {
8 struct fsnotify_event fse;
9 /*
10 * We hold ref to this path so it may be dereferenced at any point
11 * during this object's lifetime
12 */
13 struct path path;
14 struct pid *tgid;
15#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
16 u32 response; /* userspace answer to question */
17#endif
18};
19
20static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
21{
22 return container_of(fse, struct fanotify_event_info, fse);
23}
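
The new header relies on the usual embedding idiom: struct fanotify_event_info places the generic struct fsnotify_event inside itself, and FANOTIFY_E() recovers the container with container_of(), so the core queues only the generic part while the backend keeps its private fields alongside. A self-contained sketch of the idiom in plain C (not the kernel headers):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct generic_event { unsigned mask; };

struct private_event {
	struct generic_event base;   /* what the core sees and queues */
	int tgid;                    /* backend-private payload */
};

static struct private_event *PRIV_E(struct generic_event *ge)
{
	return container_of(ge, struct private_event, base);
}

int main(void)
{
	struct private_event ev = { .base = { .mask = 1 }, .tgid = 42 };
	struct generic_event *ge = &ev.base;     /* handed to the core */

	printf("tgid=%d\n", PRIV_E(ge)->tgid);   /* recovered by the backend */
	return 0;
}
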
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..57d7c083cb4b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
19 19
20#include "../../mount.h" 20#include "../../mount.h"
21#include "../fdinfo.h" 21#include "../fdinfo.h"
22#include "fanotify.h"
22 23
23#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
24#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
28 29
29static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
30static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly;
31 33
32struct fanotify_response_event { 34struct fanotify_response_event {
33 struct list_head list; 35 struct list_head list;
34 __s32 fd; 36 __s32 fd;
35 struct fsnotify_event *event; 37 struct fanotify_event_info *event;
36}; 38};
37 39
38/* 40/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
61} 63}
62 64
63static int create_fd(struct fsnotify_group *group, 65static int create_fd(struct fsnotify_group *group,
64 struct fsnotify_event *event, 66 struct fanotify_event_info *event,
65 struct file **file) 67 struct file **file)
66{ 68{
67 int client_fd; 69 int client_fd;
68 struct file *new_file; 70 struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
73 if (client_fd < 0) 75 if (client_fd < 0)
74 return client_fd; 76 return client_fd;
75 77
76 if (event->data_type != FSNOTIFY_EVENT_PATH) {
77 WARN_ON(1);
78 put_unused_fd(client_fd);
79 return -EINVAL;
80 }
81
82 /* 78 /*
83 * we need a new file handle for the userspace program so it can read even if it was 79 * we need a new file handle for the userspace program so it can read even if it was
84 * originally opened O_WRONLY. 80 * originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
109} 105}
110 106
111static int fill_event_metadata(struct fsnotify_group *group, 107static int fill_event_metadata(struct fsnotify_group *group,
112 struct fanotify_event_metadata *metadata, 108 struct fanotify_event_metadata *metadata,
113 struct fsnotify_event *event, 109 struct fsnotify_event *fsn_event,
114 struct file **file) 110 struct file **file)
115{ 111{
116 int ret = 0; 112 int ret = 0;
113 struct fanotify_event_info *event;
117 114
118 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
119 group, metadata, event); 116 group, metadata, fsn_event);
120 117
121 *file = NULL; 118 *file = NULL;
119 event = container_of(fsn_event, struct fanotify_event_info, fse);
122 metadata->event_len = FAN_EVENT_METADATA_LEN; 120 metadata->event_len = FAN_EVENT_METADATA_LEN;
123 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 121 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
124 metadata->vers = FANOTIFY_METADATA_VERSION; 122 metadata->vers = FANOTIFY_METADATA_VERSION;
125 metadata->reserved = 0; 123 metadata->reserved = 0;
126 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 124 metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
127 metadata->pid = pid_vnr(event->tgid); 125 metadata->pid = pid_vnr(event->tgid);
128 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 126 if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
129 metadata->fd = FAN_NOFD; 127 metadata->fd = FAN_NOFD;
130 else { 128 else {
131 metadata->fd = create_fd(group, event, file); 129 metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
209 if (!re) 207 if (!re)
210 return -ENOMEM; 208 return -ENOMEM;
211 209
212 re->event = event; 210 re->event = FANOTIFY_E(event);
213 re->fd = fd; 211 re->fd = fd;
214 212
215 mutex_lock(&group->fanotify_data.access_mutex); 213 mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
217 if (atomic_read(&group->fanotify_data.bypass_perm)) { 215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
218 mutex_unlock(&group->fanotify_data.access_mutex); 216 mutex_unlock(&group->fanotify_data.access_mutex);
219 kmem_cache_free(fanotify_response_event_cache, re); 217 kmem_cache_free(fanotify_response_event_cache, re);
220 event->response = FAN_ALLOW; 218 FANOTIFY_E(event)->response = FAN_ALLOW;
221 return 0; 219 return 0;
222 } 220 }
223 221
@@ -273,7 +271,7 @@ out_close_fd:
273out: 271out:
274#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
275 if (event->mask & FAN_ALL_PERM_EVENTS) { 273 if (event->mask & FAN_ALL_PERM_EVENTS) {
276 event->response = FAN_DENY; 274 FANOTIFY_E(event)->response = FAN_DENY;
277 wake_up(&group->fanotify_data.access_waitq); 275 wake_up(&group->fanotify_data.access_waitq);
278 } 276 }
279#endif 277#endif
@@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
321 if (IS_ERR(kevent)) 319 if (IS_ERR(kevent))
322 break; 320 break;
323 ret = copy_event_to_user(group, kevent, buf); 321 ret = copy_event_to_user(group, kevent, buf);
324 fsnotify_put_event(kevent); 322 fsnotify_destroy_event(group, kevent);
325 if (ret < 0) 323 if (ret < 0)
326 break; 324 break;
327 buf += ret; 325 buf += ret;
@@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
409static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 407static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
410{ 408{
411 struct fsnotify_group *group; 409 struct fsnotify_group *group;
412 struct fsnotify_event_holder *holder; 410 struct fsnotify_event *fsn_event;
413 void __user *p; 411 void __user *p;
414 int ret = -ENOTTY; 412 int ret = -ENOTTY;
415 size_t send_len = 0; 413 size_t send_len = 0;
@@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
421 switch (cmd) { 419 switch (cmd) {
422 case FIONREAD: 420 case FIONREAD:
423 mutex_lock(&group->notification_mutex); 421 mutex_lock(&group->notification_mutex);
424 list_for_each_entry(holder, &group->notification_list, event_list) 422 list_for_each_entry(fsn_event, &group->notification_list, list)
425 send_len += FAN_EVENT_METADATA_LEN; 423 send_len += FAN_EVENT_METADATA_LEN;
426 mutex_unlock(&group->notification_mutex); 424 mutex_unlock(&group->notification_mutex);
427 ret = put_user(send_len, (int __user *) p); 425 ret = put_user(send_len, (int __user *) p);
@@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void)
906 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 904 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
907 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, 905 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
908 SLAB_PANIC); 906 SLAB_PANIC);
907 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
909 908
910 return 0; 909 return 0;
911} 910}
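
Since fanotify records are fixed-size, the FIONREAD loop above reduces to queue length times FAN_EVENT_METADATA_LEN, one term per queued fsnotify_event. A trivial sketch of that accounting (the constant below is illustrative, not the ABI value):

#include <stddef.h>
#include <stdio.h>

#define FAN_EVENT_METADATA_LEN 24   /* illustrative, not the ABI value */

/* With fixed-size fanotify records, FIONREAD reduces to a count. */
static size_t pending_bytes(size_t queued_events)
{
	return queued_events * FAN_EVENT_METADATA_LEN;
}

int main(void)
{
	printf("%zu\n", pending_bytes(3));   /* 72 */
	return 0;
}
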
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..1d4e1ea2f37c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
128 struct fsnotify_mark *vfsmount_mark, 128 struct fsnotify_mark *vfsmount_mark,
129 __u32 mask, void *data, 129 __u32 mask, void *data,
130 int data_is, u32 cookie, 130 int data_is, u32 cookie,
131 const unsigned char *file_name, 131 const unsigned char *file_name)
132 struct fsnotify_event **event)
133{ 132{
134 struct fsnotify_group *group = NULL; 133 struct fsnotify_group *group = NULL;
135 __u32 inode_test_mask = 0; 134 __u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
170 169
171 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" 170 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" 171 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
173 " data=%p data_is=%d cookie=%d event=%p\n", 172 " data=%p data_is=%d cookie=%d\n",
174 __func__, group, to_tell, mask, inode_mark, 173 __func__, group, to_tell, mask, inode_mark,
175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, 174 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
176 data_is, cookie, *event); 175 data_is, cookie);
177 176
178 if (!inode_test_mask && !vfsmount_test_mask) 177 if (!inode_test_mask && !vfsmount_test_mask)
179 return 0; 178 return 0;
180 179
181 if (group->ops->should_send_event(group, to_tell, inode_mark, 180 return group->ops->handle_event(group, to_tell, inode_mark,
182 vfsmount_mark, mask, data, 181 vfsmount_mark, mask, data, data_is,
183 data_is) == false) 182 file_name);
184 return 0;
185
186 if (!*event) {
187 *event = fsnotify_create_event(to_tell, mask, data,
188 data_is, file_name,
189 cookie, GFP_KERNEL);
190 if (!*event)
191 return -ENOMEM;
192 }
193 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
194} 183}
195 184
196/* 185/*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
205 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; 194 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
206 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 195 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
207 struct fsnotify_group *inode_group, *vfsmount_group; 196 struct fsnotify_group *inode_group, *vfsmount_group;
208 struct fsnotify_event *event = NULL;
209 struct mount *mnt; 197 struct mount *mnt;
210 int idx, ret = 0; 198 int idx, ret = 0;
211 /* global tests shouldn't care about events on child only the specific event */ 199 /* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
258 246
259 if (inode_group > vfsmount_group) { 247 if (inode_group > vfsmount_group) {
260 /* handle inode */ 248 /* handle inode */
261 ret = send_to_group(to_tell, inode_mark, NULL, mask, data, 249 ret = send_to_group(to_tell, inode_mark, NULL, mask,
262 data_is, cookie, file_name, &event); 250 data, data_is, cookie, file_name);
263 /* we didn't use the vfsmount_mark */ 251 /* we didn't use the vfsmount_mark */
264 vfsmount_group = NULL; 252 vfsmount_group = NULL;
265 } else if (vfsmount_group > inode_group) { 253 } else if (vfsmount_group > inode_group) {
266 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, 254 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
267 data_is, cookie, file_name, &event); 255 data, data_is, cookie, file_name);
268 inode_group = NULL; 256 inode_group = NULL;
269 } else { 257 } else {
270 ret = send_to_group(to_tell, inode_mark, vfsmount_mark, 258 ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
271 mask, data, data_is, cookie, file_name, 259 mask, data, data_is, cookie,
272 &event); 260 file_name);
273 } 261 }
274 262
275 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) 263 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
285 ret = 0; 273 ret = 0;
286out: 274out:
287 srcu_read_unlock(&fsnotify_mark_srcu, idx); 275 srcu_read_unlock(&fsnotify_mark_srcu, idx);
288 /*
289 * fsnotify_create_event() took a reference so the event can't be cleaned
290 * up while we are still trying to add it to lists, drop that one.
291 */
292 if (event)
293 fsnotify_put_event(event);
294 276
295 return ret; 277 return ret;
296} 278}
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ee674fe2cec7 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
99 INIT_LIST_HEAD(&group->marks_list); 99 INIT_LIST_HEAD(&group->marks_list);
100 100
101 group->ops = ops; 101 group->ops = ops;
102 fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW);
102 103
103 return group; 104 return group;
104} 105}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..485eef3f4407 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */ 3#include <linux/slab.h> /* struct kmem_cache */
4 4
5extern struct kmem_cache *event_priv_cachep; 5struct inotify_event_info {
6 6 struct fsnotify_event fse;
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd; 7 int wd;
8 u32 sync_cookie;
9 int name_len;
10 char name[];
10}; 11};
11 12
12struct inotify_inode_mark { 13struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
14 int wd; 15 int wd;
15}; 16};
16 17
18static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
19{
20 return container_of(fse, struct inotify_event_info, fse);
21}
22
17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, 23extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
18 struct fsnotify_group *group); 24 struct fsnotify_group *group);
19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 25extern int inotify_handle_event(struct fsnotify_group *group,
26 struct inode *inode,
27 struct fsnotify_mark *inode_mark,
28 struct fsnotify_mark *vfsmount_mark,
29 u32 mask, void *data, int data_type,
30 const unsigned char *file_name);
20 31
21extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..aad1a35e9af1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,100 +34,87 @@
34#include "inotify.h" 34#include "inotify.h"
35 35
36/* 36/*
37 * Check if 2 events contain the same information. We do not compare private data 37 * Check if 2 events contain the same information.
38 * but at this moment that isn't a problem for any know fsnotify listeners.
39 */ 38 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) 39static bool event_compare(struct fsnotify_event *old_fsn,
40 struct fsnotify_event *new_fsn)
41{ 41{
42 if ((old->mask == new->mask) && 42 struct inotify_event_info *old, *new;
43 (old->to_tell == new->to_tell) && 43
44 (old->data_type == new->data_type) && 44 if (old_fsn->mask & FS_IN_IGNORED)
45 (old->name_len == new->name_len)) { 45 return false;
46 switch (old->data_type) { 46 old = INOTIFY_E(old_fsn);
47 case (FSNOTIFY_EVENT_INODE): 47 new = INOTIFY_E(new_fsn);
48 /* remember, after old was put on the wait_q we aren't 48 if ((old_fsn->mask == new_fsn->mask) &&
49 * allowed to look at the inode any more, only thing 49 (old_fsn->inode == new_fsn->inode) &&
50 * left to check was if the file_name is the same */ 50 (old->name_len == new->name_len) &&
51 if (!old->name_len || 51 (!old->name_len || !strcmp(old->name, new->name)))
52 !strcmp(old->file_name, new->file_name)) 52 return true;
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false; 53 return false;
69} 54}
70 55
71static struct fsnotify_event *inotify_merge(struct list_head *list, 56static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event) 57 struct fsnotify_event *event)
73{ 58{
74 struct fsnotify_event_holder *last_holder;
75 struct fsnotify_event *last_event; 59 struct fsnotify_event *last_event;
76 60
77 /* and the list better be locked by something too */ 61 last_event = list_entry(list->prev, struct fsnotify_event, list);
78 spin_lock(&event->lock); 62 if (!event_compare(last_event, event))
79 63 return NULL;
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event; 64 return last_event;
90} 65}
91 66
92static int inotify_handle_event(struct fsnotify_group *group, 67int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark, 68 struct inode *inode,
94 struct fsnotify_mark *vfsmount_mark, 69 struct fsnotify_mark *inode_mark,
95 struct fsnotify_event *event) 70 struct fsnotify_mark *vfsmount_mark,
71 u32 mask, void *data, int data_type,
72 const unsigned char *file_name)
96{ 73{
97 struct inotify_inode_mark *i_mark; 74 struct inotify_inode_mark *i_mark;
98 struct inode *to_tell; 75 struct inotify_event_info *event;
99 struct inotify_event_private_data *event_priv;
100 struct fsnotify_event_private_data *fsn_event_priv;
101 struct fsnotify_event *added_event; 76 struct fsnotify_event *added_event;
102 int wd, ret = 0; 77 struct fsnotify_event *fsn_event;
78 int ret = 0;
79 int len = 0;
80 int alloc_len = sizeof(struct inotify_event_info);
103 81
104 BUG_ON(vfsmount_mark); 82 BUG_ON(vfsmount_mark);
105 83
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, 84 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
107 event, event->to_tell, event->mask); 85 (data_type == FSNOTIFY_EVENT_PATH)) {
86 struct path *path = data;
108 87
109 to_tell = event->to_tell; 88 if (d_unlinked(path->dentry))
89 return 0;
90 }
91 if (file_name) {
92 len = strlen(file_name);
93 alloc_len += len + 1;
94 }
95
96 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
97 mask);
110 98
111 i_mark = container_of(inode_mark, struct inotify_inode_mark, 99 i_mark = container_of(inode_mark, struct inotify_inode_mark,
112 fsn_mark); 100 fsn_mark);
113 wd = i_mark->wd;
114 101
115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 102 event = kmalloc(alloc_len, GFP_KERNEL);
116 if (unlikely(!event_priv)) 103 if (unlikely(!event))
117 return -ENOMEM; 104 return -ENOMEM;
118 105
119 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 106 fsn_event = &event->fse;
120 107 fsnotify_init_event(fsn_event, inode, mask);
121 fsnotify_get_group(group); 108 event->wd = i_mark->wd;
122 fsn_event_priv->group = group; 109 event->name_len = len;
123 event_priv->wd = wd; 110 if (len)
111 strcpy(event->name, file_name);
124 112
125 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); 113 added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
126 if (added_event) { 114 if (added_event) {
127 inotify_free_event_priv(fsn_event_priv); 115 /* Our event wasn't used in the end. Free it. */
128 if (!IS_ERR(added_event)) 116 fsnotify_destroy_event(group, fsn_event);
129 fsnotify_put_event(added_event); 117 if (IS_ERR(added_event))
130 else
131 ret = PTR_ERR(added_event); 118 ret = PTR_ERR(added_event);
132 } 119 }
133 120
@@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
142 inotify_ignored_and_remove_idr(fsn_mark, group); 129 inotify_ignored_and_remove_idr(fsn_mark, group);
143} 130}
144 131
145static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
146 struct fsnotify_mark *inode_mark,
147 struct fsnotify_mark *vfsmount_mark,
148 __u32 mask, void *data, int data_type)
149{
150 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
151 (data_type == FSNOTIFY_EVENT_PATH)) {
152 struct path *path = data;
153
154 if (d_unlinked(path->dentry))
155 return false;
156 }
157
158 return true;
159}
160
161/* 132/*
162 * This is NEVER supposed to be called. Inotify marks should either have been 133 * This is NEVER supposed to be called. Inotify marks should either have been
163 * removed from the idr when the watch was removed or in the 134 * removed from the idr when the watch was removed or in the
@@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
202 free_uid(group->inotify_data.user); 173 free_uid(group->inotify_data.user);
203} 174}
204 175
205void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 176static void inotify_free_event(struct fsnotify_event *fsn_event)
206{ 177{
207 struct inotify_event_private_data *event_priv; 178 kfree(INOTIFY_E(fsn_event));
208
209
210 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
211 fsnotify_event_priv_data);
212
213 fsnotify_put_group(fsn_event_priv->group);
214 kmem_cache_free(event_priv_cachep, event_priv);
215} 179}
216 180
217const struct fsnotify_ops inotify_fsnotify_ops = { 181const struct fsnotify_ops inotify_fsnotify_ops = {
218 .handle_event = inotify_handle_event, 182 .handle_event = inotify_handle_event,
219 .should_send_event = inotify_should_send_event,
220 .free_group_priv = inotify_free_group_priv, 183 .free_group_priv = inotify_free_group_priv,
221 .free_event_priv = inotify_free_event_priv, 184 .free_event = inotify_free_event,
222 .freeing_mark = inotify_freeing_mark, 185 .freeing_mark = inotify_freeing_mark,
223}; 186};
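
inotify events are now allocated in one shot: the file name lives in a flexible array member at the end of inotify_event_info, so alloc_len is the struct size plus the name length plus one byte for the NUL. A compilable sketch of that sizing with illustrative types:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct event {
	unsigned mask;
	int name_len;
	char name[];          /* flexible array member: name stored inline */
};

static struct event *alloc_event(unsigned mask, const char *file_name)
{
	size_t len = file_name ? strlen(file_name) : 0;
	/* one allocation covers header and name (+1 for the NUL) */
	struct event *ev = malloc(sizeof(*ev) + (len ? len + 1 : 0));

	if (!ev)
		return NULL;
	ev->mask = mask;
	ev->name_len = (int)len;
	if (len)
		strcpy(ev->name, file_name);
	return ev;
}

int main(void)
{
	struct event *ev = alloc_event(0x100, "newfile");

	if (!ev)
		return 1;
	printf("%s (%d)\n", ev->name_len ? ev->name : "", ev->name_len);
	free(ev);
	return 0;
}
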
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..497395c8274b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly; 50static int inotify_max_user_watches __read_mostly;
51 51
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53struct kmem_cache *event_priv_cachep __read_mostly;
54 53
55#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
56 55
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
124 return ret; 123 return ret;
125} 124}
126 125
126static int round_event_name_len(struct fsnotify_event *fsn_event)
127{
128 struct inotify_event_info *event;
129
130 event = INOTIFY_E(fsn_event);
131 if (!event->name_len)
132 return 0;
133 return roundup(event->name_len + 1, sizeof(struct inotify_event));
134}
135
127/* 136/*
128 * Get an inotify_kernel_event if one exists and is small 137 * Get an inotify_kernel_event if one exists and is small
129 * enough to fit in "count". Return an error pointer if 138 * enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 153
145 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 154 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
146 155
147 if (event->name_len) 156 event_size += round_event_name_len(event);
148 event_size += roundup(event->name_len + 1, event_size);
149
150 if (event_size > count) 157 if (event_size > count)
151 return ERR_PTR(-EINVAL); 158 return ERR_PTR(-EINVAL);
152 159
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
164 * buffer we had in "get_one_event()" above. 171 * buffer we had in "get_one_event()" above.
165 */ 172 */
166static ssize_t copy_event_to_user(struct fsnotify_group *group, 173static ssize_t copy_event_to_user(struct fsnotify_group *group,
167 struct fsnotify_event *event, 174 struct fsnotify_event *fsn_event,
168 char __user *buf) 175 char __user *buf)
169{ 176{
170 struct inotify_event inotify_event; 177 struct inotify_event inotify_event;
171 struct fsnotify_event_private_data *fsn_priv; 178 struct inotify_event_info *event;
172 struct inotify_event_private_data *priv;
173 size_t event_size = sizeof(struct inotify_event); 179 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 180 size_t name_len;
175 181 size_t pad_name_len;
176 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
177 182
178 /* we get the inotify watch descriptor from the event private data */ 183 pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
179 spin_lock(&event->lock);
180 fsn_priv = fsnotify_remove_priv_from_event(group, event);
181 spin_unlock(&event->lock);
182
183 if (!fsn_priv)
184 inotify_event.wd = -1;
185 else {
186 priv = container_of(fsn_priv, struct inotify_event_private_data,
187 fsnotify_event_priv_data);
188 inotify_event.wd = priv->wd;
189 inotify_free_event_priv(fsn_priv);
190 }
191 184
185 event = INOTIFY_E(fsn_event);
186 name_len = event->name_len;
192 /* 187 /*
193 * round up event->name_len so it is a multiple of event_size 188 * round up name length so it is a multiple of event_size
194 * plus an extra byte for the terminating '\0'. 189 * plus an extra byte for the terminating '\0'.
195 */ 190 */
196 if (event->name_len) 191 pad_name_len = round_event_name_len(fsn_event);
197 name_len = roundup(event->name_len + 1, event_size); 192 inotify_event.len = pad_name_len;
198 inotify_event.len = name_len; 193 inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
199 194 inotify_event.wd = event->wd;
200 inotify_event.mask = inotify_mask_to_arg(event->mask);
201 inotify_event.cookie = event->sync_cookie; 195 inotify_event.cookie = event->sync_cookie;
202 196
203 /* send the main event */ 197 /* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
209 /* 203 /*
210 * fsnotify only stores the pathname, so here we have to send the pathname 204 * fsnotify only stores the pathname, so here we have to send the pathname
211 * and then pad that pathname out to a multiple of sizeof(inotify_event) 205 * and then pad that pathname out to a multiple of sizeof(inotify_event)
212 * with zeros. I get my zeros from the nul_inotify_event. 206 * with zeros.
213 */ 207 */
214 if (name_len) { 208 if (pad_name_len) {
215 unsigned int len_to_zero = name_len - event->name_len;
216 /* copy the path name */ 209 /* copy the path name */
217 if (copy_to_user(buf, event->file_name, event->name_len)) 210 if (copy_to_user(buf, event->name, name_len))
218 return -EFAULT; 211 return -EFAULT;
219 buf += event->name_len; 212 buf += name_len;
220 213
221 /* fill userspace with 0's */ 214 /* fill userspace with 0's */
222 if (clear_user(buf, len_to_zero)) 215 if (clear_user(buf, pad_name_len - name_len))
223 return -EFAULT; 216 return -EFAULT;
224 buf += len_to_zero; 217 event_size += pad_name_len;
225 event_size += name_len;
226 } 218 }
227 219
228 return event_size; 220 return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
254 if (IS_ERR(kevent)) 246 if (IS_ERR(kevent))
255 break; 247 break;
256 ret = copy_event_to_user(group, kevent, buf); 248 ret = copy_event_to_user(group, kevent, buf);
257 fsnotify_put_event(kevent); 249 fsnotify_destroy_event(group, kevent);
258 if (ret < 0) 250 if (ret < 0)
259 break; 251 break;
260 buf += ret; 252 buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
297 unsigned long arg) 289 unsigned long arg)
298{ 290{
299 struct fsnotify_group *group; 291 struct fsnotify_group *group;
300 struct fsnotify_event_holder *holder; 292 struct fsnotify_event *fsn_event;
301 struct fsnotify_event *event;
302 void __user *p; 293 void __user *p;
303 int ret = -ENOTTY; 294 int ret = -ENOTTY;
304 size_t send_len = 0; 295 size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
311 switch (cmd) { 302 switch (cmd) {
312 case FIONREAD: 303 case FIONREAD:
313 mutex_lock(&group->notification_mutex); 304 mutex_lock(&group->notification_mutex);
314 list_for_each_entry(holder, &group->notification_list, event_list) { 305 list_for_each_entry(fsn_event, &group->notification_list,
315 event = holder->event; 306 list) {
316 send_len += sizeof(struct inotify_event); 307 send_len += sizeof(struct inotify_event);
317 if (event->name_len) 308 send_len += round_event_name_len(fsn_event);
318 send_len += roundup(event->name_len + 1,
319 sizeof(struct inotify_event));
320 } 309 }
321 mutex_unlock(&group->notification_mutex); 310 mutex_unlock(&group->notification_mutex);
322 ret = put_user(send_len, (int __user *) p); 311 ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
503 struct fsnotify_group *group) 492 struct fsnotify_group *group)
504{ 493{
505 struct inotify_inode_mark *i_mark; 494 struct inotify_inode_mark *i_mark;
506 struct fsnotify_event *ignored_event, *notify_event;
507 struct inotify_event_private_data *event_priv;
508 struct fsnotify_event_private_data *fsn_event_priv;
509 int ret;
510
511 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
512
513 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
514 FSNOTIFY_EVENT_NONE, NULL, 0,
515 GFP_NOFS);
516 if (!ignored_event)
517 goto skip_send_ignore;
518
519 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
520 if (unlikely(!event_priv))
521 goto skip_send_ignore;
522
523 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
524
525 fsnotify_get_group(group);
526 fsn_event_priv->group = group;
527 event_priv->wd = i_mark->wd;
528
529 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
530 if (notify_event) {
531 if (IS_ERR(notify_event))
532 ret = PTR_ERR(notify_event);
533 else
534 fsnotify_put_event(notify_event);
535 inotify_free_event_priv(fsn_event_priv);
536 }
537 495
538skip_send_ignore: 496 /* Queue ignore event for the watch */
539 /* matches the reference taken when the event was created */ 497 inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
540 if (ignored_event) 498 NULL, FSNOTIFY_EVENT_NONE, NULL);
541 fsnotify_put_event(ignored_event);
542 499
500 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
543 /* remove this mark from the idr */ 501 /* remove this mark from the idr */
544 inotify_remove_from_idr(group, i_mark); 502 inotify_remove_from_idr(group, i_mark);
545 503
@@ -836,7 +794,6 @@ static int __init inotify_user_setup(void)
836 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 794 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
837 795
838 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 796 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
839 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
840 797
841 inotify_max_queued_events = 16384; 798 inotify_max_queued_events = 16384;
842 inotify_max_user_instances = 128; 799 inotify_max_user_instances = 128;
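
round_event_name_len() pads the stored name, plus its terminating NUL, up to the next multiple of sizeof(struct inotify_event); copy_event_to_user() then writes the name bytes and zero-fills the remainder. For example, with a 16-byte inotify_event and a 7-character name, roundup(7 + 1, 16) = 16, so 7 name bytes and 9 zeros go out. A sketch of the arithmetic (16 stands in for the real struct size):

#include <stdio.h>

#define EVSIZE 16   /* stand-in for sizeof(struct inotify_event) */

/* Round name_len + 1 (room for the NUL) up to a multiple of EVSIZE;
 * a zero-length name needs no padding at all. */
static unsigned pad_name_len(unsigned name_len)
{
	if (!name_len)
		return 0;
	return (name_len + 1 + EVSIZE - 1) / EVSIZE * EVSIZE;
}

int main(void)
{
	printf("%u %u %u\n",
	       pad_name_len(0),    /* 0  */
	       pad_name_len(7),    /* 16 */
	       pad_name_len(16));  /* 32 */
	return 0;
}
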
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..952237b8e2d2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
48#include <linux/fsnotify_backend.h> 48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h" 49#include "fsnotify.h"
50 50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 51static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 52
62/** 53/**
@@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
76 return list_empty(&group->notification_list) ? true : false; 67 return list_empty(&group->notification_list) ? true : false;
77} 68}
78 69
79void fsnotify_get_event(struct fsnotify_event *event) 70void fsnotify_destroy_event(struct fsnotify_group *group,
71 struct fsnotify_event *event)
80{ 72{
81 atomic_inc(&event->refcnt); 73 /* Overflow events are per-group and we don't want to free them */
82} 74 if (!event || event->mask == FS_Q_OVERFLOW)
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return; 75 return;
88 76
89 if (atomic_dec_and_test(&event->refcnt)) { 77 group->ops->free_event(event);
90 pr_debug("%s: event=%p\n", __func__, event);
91
92 if (event->data_type == FSNOTIFY_EVENT_PATH)
93 path_put(&event->path);
94
95 BUG_ON(!list_empty(&event->private_data_list));
96
97 kfree(event->file_name);
98 put_pid(event->tgid);
99 kmem_cache_free(fsnotify_event_cachep, event);
100 }
101}
102
103struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104{
105 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
106}
107
108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
109{
110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
112}
113
114/*
115 * Find the private data that the group previously attached to this event when
116 * the group added the event to the notification queue (fsnotify_add_notify_event)
117 */
118struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
119{
120 struct fsnotify_event_private_data *lpriv;
121 struct fsnotify_event_private_data *priv = NULL;
122
123 assert_spin_locked(&event->lock);
124
125 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
126 if (lpriv->group == group) {
127 priv = lpriv;
128 list_del(&priv->event_list);
129 break;
130 }
131 }
132 return priv;
133} 78}
134 79
135/* 80/*
@@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
137 * event off the queue to deal with. If the event is successfully added to the 82 * event off the queue to deal with. If the event is successfully added to the
138 * group's notification queue, a reference is taken on event. 83 * group's notification queue, a reference is taken on event.
139 */ 84 */
140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 85struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
141 struct fsnotify_event_private_data *priv, 86 struct fsnotify_event *event,
142 struct fsnotify_event *(*merge)(struct list_head *, 87 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *)) 88 struct fsnotify_event *))
144{ 89{
145 struct fsnotify_event *return_event = NULL; 90 struct fsnotify_event *return_event = NULL;
146 struct fsnotify_event_holder *holder = NULL;
147 struct list_head *list = &group->notification_list; 91 struct list_head *list = &group->notification_list;
148 92
149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
150
151 /*
152 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
153 * Check if we expect to be able to use that holder. If not alloc a new
154 * holder.
155 * For the overflow event it's possible that something will use the in
156 * event holder before we get the lock so we may need to jump back and
157 * alloc a new holder, this can't happen for most events...
158 */
159 if (!list_empty(&event->holder.event_list)) {
160alloc_holder:
161 holder = fsnotify_alloc_event_holder();
162 if (!holder)
163 return ERR_PTR(-ENOMEM);
164 }
165 94
166 mutex_lock(&group->notification_mutex); 95 mutex_lock(&group->notification_mutex);
167 96
168 if (group->q_len >= group->max_events) { 97 if (group->q_len >= group->max_events) {
169 event = q_overflow_event; 98 /* Queue overflow event only if it isn't already queued */
170 99 if (list_empty(&group->overflow_event.list))
171 /* 100 event = &group->overflow_event;
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event; 101 return_event = event;
177
178 /* sorry, no private data on the overflow event */
179 priv = NULL;
180 } 102 }
181 103
182 if (!list_empty(list) && merge) { 104 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp; 105 return_event = merge(list, event);
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
197 spin_lock(&event->lock);
198
199 if (list_empty(&event->holder.event_list)) {
200 if (unlikely(holder))
201 fsnotify_destroy_event_holder(holder);
202 holder = &event->holder;
203 } else if (unlikely(!holder)) {
204 /* between the time we checked above and got the lock the in
205 * event holder was used, go back and get a new one */
206 spin_unlock(&event->lock);
207 mutex_unlock(&group->notification_mutex);
208
209 if (return_event) { 106 if (return_event) {
210 fsnotify_put_event(return_event); 107 mutex_unlock(&group->notification_mutex);
211 return_event = NULL; 108 return return_event;
212 } 109 }
213
214 goto alloc_holder;
215 } 110 }
216 111
217 group->q_len++; 112 group->q_len++;
218 holder->event = event; 113 list_add_tail(&event->list, list);
219
220 fsnotify_get_event(event);
221 list_add_tail(&holder->event_list, list);
222 if (priv)
223 list_add_tail(&priv->event_list, &event->private_data_list);
224 spin_unlock(&event->lock);
225 mutex_unlock(&group->notification_mutex); 114 mutex_unlock(&group->notification_mutex);
226 115
227 wake_up(&group->notification_waitq); 116 wake_up(&group->notification_waitq);
@@ -230,32 +119,20 @@ alloc_holder:
230} 119}
231 120
232/* 121/*
233 * Remove and return the first event from the notification list. There is a 122 * Remove and return the first event from the notification list. It is the
234 * reference held on this event since it was on the list. It is the responsibility 123 * responsibility of the caller to destroy the obtained event
235 * of the caller to drop this reference.
236 */ 124 */
237struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 125struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
238{ 126{
239 struct fsnotify_event *event; 127 struct fsnotify_event *event;
240 struct fsnotify_event_holder *holder;
241 128
242 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 129 BUG_ON(!mutex_is_locked(&group->notification_mutex));
243 130
244 pr_debug("%s: group=%p\n", __func__, group); 131 pr_debug("%s: group=%p\n", __func__, group);
245 132
246 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 133 event = list_first_entry(&group->notification_list,
247 134 struct fsnotify_event, list);
248 event = holder->event; 135 list_del(&event->list);
249
250 spin_lock(&event->lock);
251 holder->event = NULL;
252 list_del_init(&holder->event_list);
253 spin_unlock(&event->lock);
254
255 /* event == holder means we are referenced through the in event holder */
256 if (holder != &event->holder)
257 fsnotify_destroy_event_holder(holder);
258
259 group->q_len--; 136 group->q_len--;
260 137
261 return event; 138 return event;
@@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
266 */ 143 */
267struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 144struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
268{ 145{
269 struct fsnotify_event *event;
270 struct fsnotify_event_holder *holder;
271
272 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 146 BUG_ON(!mutex_is_locked(&group->notification_mutex));
273 147
274 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 148 return list_first_entry(&group->notification_list,
275 event = holder->event; 149 struct fsnotify_event, list);
276
277 return event;
278} 150}
279 151
280/* 152/*
@@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
284void fsnotify_flush_notify(struct fsnotify_group *group) 156void fsnotify_flush_notify(struct fsnotify_group *group)
285{ 157{
286 struct fsnotify_event *event; 158 struct fsnotify_event *event;
287 struct fsnotify_event_private_data *priv;
288 159
289 mutex_lock(&group->notification_mutex); 160 mutex_lock(&group->notification_mutex);
290 while (!fsnotify_notify_queue_is_empty(group)) { 161 while (!fsnotify_notify_queue_is_empty(group)) {
291 event = fsnotify_remove_notify_event(group); 162 event = fsnotify_remove_notify_event(group);
292 /* if they don't implement free_event_priv they better not have attached any */ 163 fsnotify_destroy_event(group, event);
293 if (group->ops->free_event_priv) {
294 spin_lock(&event->lock);
295 priv = fsnotify_remove_priv_from_event(group, event);
296 spin_unlock(&event->lock);
297 if (priv)
298 group->ops->free_event_priv(priv);
299 }
300 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
301 } 164 }
302 mutex_unlock(&group->notification_mutex); 165 mutex_unlock(&group->notification_mutex);
303} 166}
304 167
305static void initialize_event(struct fsnotify_event *event)
306{
307 INIT_LIST_HEAD(&event->holder.event_list);
308 atomic_set(&event->refcnt, 1);
309
310 spin_lock_init(&event->lock);
311
312 INIT_LIST_HEAD(&event->private_data_list);
313}
314
315/*
316 * Caller damn well better be holding whatever mutex is protecting the
317 * old_holder->event_list and the new_event must be a clean event which
318 * cannot be found anywhere else in the kernel.
319 */
320int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
321 struct fsnotify_event *new_event)
322{
323 struct fsnotify_event *old_event = old_holder->event;
324 struct fsnotify_event_holder *new_holder = &new_event->holder;
325
326 enum event_spinlock_class {
327 SPINLOCK_OLD,
328 SPINLOCK_NEW,
329 };
330
331 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
332
333 /*
334 * if the new_event's embedded holder is in use someone
335 * screwed up and didn't give us a clean new event.
336 */
337 BUG_ON(!list_empty(&new_holder->event_list));
338
339 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
340 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
341
342 new_holder->event = new_event;
343 list_replace_init(&old_holder->event_list, &new_holder->event_list);
344
345 spin_unlock(&new_event->lock);
346 spin_unlock(&old_event->lock);
347
348 /* event == holder means we are referenced through the in event holder */
349 if (old_holder != &old_event->holder)
350 fsnotify_destroy_event_holder(old_holder);
351
352 fsnotify_get_event(new_event); /* on the list take reference */
353 fsnotify_put_event(old_event); /* off the list, drop reference */
354
355 return 0;
356}
357
358struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
359{
360 struct fsnotify_event *event;
361
362 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
363 if (!event)
364 return NULL;
365
366 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
367
368 memcpy(event, old_event, sizeof(*event));
369 initialize_event(event);
370
371 if (event->name_len) {
372 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
373 if (!event->file_name) {
374 kmem_cache_free(fsnotify_event_cachep, event);
375 return NULL;
376 }
377 }
378 event->tgid = get_pid(old_event->tgid);
379 if (event->data_type == FSNOTIFY_EVENT_PATH)
380 path_get(&event->path);
381
382 return event;
383}
384
385/* 168/*
386 * fsnotify_create_event - Allocate a new event which will be sent to each 169 * fsnotify_create_event - Allocate a new event which will be sent to each
387 * group's handle_event function if the group was interested in this 170 * group's handle_event function if the group was interested in this
388 * particular event. 171 * particular event.
389 * 172 *
390 * @to_tell the inode which is supposed to receive the event (sometimes a 173 * @inode the inode which is supposed to receive the event (sometimes a
391 * parent of the inode to which the event happened. 174 * parent of the inode to which the event happened.
392 * @mask what actually happened. 175 * @mask what actually happened.
393 * @data pointer to the object which was actually affected 176 * @data pointer to the object which was actually affected
394 * @data_type flag indicating whether the data is a file, path, inode, or nothing 177 * @data_type flag indicating whether the data is a file, path, inode, or nothing
395 * @name the filename, if available 178 * @name the filename, if available
396 */ 179 */
397struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 180void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
398 int data_type, const unsigned char *name, 181 u32 mask)
399 u32 cookie, gfp_t gfp)
400{ 182{
401 struct fsnotify_event *event; 183 INIT_LIST_HEAD(&event->list);
402 184 event->inode = inode;
403 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
404 if (!event)
405 return NULL;
406
407 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
408 __func__, event, to_tell, mask, data, data_type);
409
410 initialize_event(event);
411
412 if (name) {
413 event->file_name = kstrdup(name, gfp);
414 if (!event->file_name) {
415 kmem_cache_free(fsnotify_event_cachep, event);
416 return NULL;
417 }
418 event->name_len = strlen(event->file_name);
419 }
420
421 event->tgid = get_pid(task_tgid(current));
422 event->sync_cookie = cookie;
423 event->to_tell = to_tell;
424 event->data_type = data_type;
425
426 switch (data_type) {
427 case FSNOTIFY_EVENT_PATH: {
428 struct path *path = data;
429 event->path.dentry = path->dentry;
430 event->path.mnt = path->mnt;
431 path_get(&event->path);
432 break;
433 }
434 case FSNOTIFY_EVENT_INODE:
435 event->inode = data;
436 break;
437 case FSNOTIFY_EVENT_NONE:
438 event->inode = NULL;
439 event->path.dentry = NULL;
440 event->path.mnt = NULL;
441 break;
442 default:
443 BUG();
444 }
445
446 event->mask = mask; 185 event->mask = mask;
447
448 return event;
449}
450
451static __init int fsnotify_notification_init(void)
452{
453 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
454 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
455
456 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
457 FSNOTIFY_EVENT_NONE, NULL, 0,
458 GFP_KERNEL);
459 if (!q_overflow_event)
460 panic("unable to allocate fsnotify q_overflow_event\n");
461
462 return 0;
463} 186}
464subsys_initcall(fsnotify_notification_init);
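The net effect of the hunks above is that event setup collapses into fsnotify_init_event(), with allocation and any extra state left to each backend. A minimal sketch of how a backend might wrap the slimmed-down event after this change (my_event_info, my_cookie and my_alloc_event are hypothetical names, not part of the patch):

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

struct my_event_info {
	struct fsnotify_event fse;	/* must be first so container_of() works */
	u32 my_cookie;			/* hypothetical backend-private state */
};

static struct my_event_info *my_alloc_event(struct inode *inode, u32 mask)
{
	struct my_event_info *ev = kmalloc(sizeof(*ev), GFP_KERNEL);

	if (!ev)
		return NULL;
	fsnotify_init_event(&ev->fse, inode, mask);	/* initializer from the hunk above */
	ev->my_cookie = 0;
	return ev;
}

The refcount, private-data list and per-event spinlock that initialize_event() used to set up are gone from the core; a backend that still needs such state now carries it in its own wrapper.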
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
38 symlink.o \ 38 symlink.o \
39 sysfile.o \ 39 sysfile.o \
40 uptodate.o \ 40 uptodate.o \
41 ver.o \
42 quota_local.o \ 41 quota_local.o \
43 quota_global.o \ 42 quota_global.o \
44 xattr.o \ 43 xattr.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..8750ae1b8636 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7260 start = range->start >> osb->s_clustersize_bits; 7260 start = range->start >> osb->s_clustersize_bits;
7261 len = range->len >> osb->s_clustersize_bits; 7261 len = range->len >> osb->s_clustersize_bits;
7262 minlen = range->minlen >> osb->s_clustersize_bits; 7262 minlen = range->minlen >> osb->s_clustersize_bits;
7263 trimmed = 0;
7264
7265 if (!len) {
7266 range->len = 0;
7267 return 0;
7268 }
7269 7263
7270 if (minlen >= osb->bitmap_cpg) 7264 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7271 return -EINVAL; 7265 return -EINVAL;
7272 7266
7273 main_bm_inode = ocfs2_get_system_file_inode(osb, 7267 main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7293 goto out_unlock; 7287 goto out_unlock;
7294 } 7288 }
7295 7289
7290 len = range->len >> osb->s_clustersize_bits;
7296 if (start + len > le32_to_cpu(main_bm->i_clusters)) 7291 if (start + len > le32_to_cpu(main_bm->i_clusters))
7297 len = le32_to_cpu(main_bm->i_clusters) - start; 7292 len = le32_to_cpu(main_bm->i_clusters) - start;
7298 7293
@@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7307 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); 7302 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7308 last_bit = osb->bitmap_cpg; 7303 last_bit = osb->bitmap_cpg;
7309 7304
7305 trimmed = 0;
7310 for (group = first_group; group <= last_group;) { 7306 for (group = first_group; group <= last_group;) {
7311 if (first_bit + len >= osb->bitmap_cpg) 7307 if (first_bit + len >= osb->bitmap_cpg)
7312 last_bit = osb->bitmap_cpg; 7308 last_bit = osb->bitmap_cpg;
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o netdebug.o ver.o 4 quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
29#include "heartbeat.h" 29#include "heartbeat.h"
30#include "masklog.h" 30#include "masklog.h"
31#include "sys.h" 31#include "sys.h"
32#include "ver.h"
33 32
34/* for now we operate under the assertion that there can be only one 33/* for now we operate under the assertion that there can be only one
35 * cluster active at a time. Changing this will require trickling 34 * cluster active at a time. Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
945{ 944{
946 int ret = -1; 945 int ret = -1;
947 946
948 cluster_print_version();
949
950 ret = o2hb_init(); 947 ret = o2hb_init();
951 if (ret) 948 if (ret)
952 goto out; 949 goto out;
@@ -984,6 +981,7 @@ out:
984 981
985MODULE_AUTHOR("Oracle"); 982MODULE_AUTHOR("Oracle");
986MODULE_LICENSE("GPL"); 983MODULE_LICENSE("GPL");
984MODULE_DESCRIPTION("OCFS2 cluster management");
987 985
988module_init(init_o2nm) 986module_init(init_o2nm)
989module_exit(exit_o2nm) 987module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
7 7
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
43#include "dlmdomain.h" 43#include "dlmdomain.h"
44#include "dlmdebug.h" 44#include "dlmdebug.h"
45 45
46#include "dlmver.h"
47
48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 46#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
49#include "cluster/masklog.h" 47#include "cluster/masklog.h"
50 48
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
2328{ 2326{
2329 int status; 2327 int status;
2330 2328
2331 dlm_print_version();
2332
2333 status = dlm_init_mle_cache(); 2329 status = dlm_init_mle_cache();
2334 if (status) { 2330 if (status) {
2335 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); 2331 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
2379 2375
2380MODULE_AUTHOR("Oracle"); 2376MODULE_AUTHOR("Oracle");
2381MODULE_LICENSE("GPL"); 2377MODULE_LICENSE("GPL");
2378MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
2382 2379
2383module_init(dlm_init); 2380module_init(dlm_init);
2384module_exit(dlm_exit); 2381module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLM_VER_H
27#define DLM_VER_H
28
29void dlm_print_version(void);
30
31#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o 5ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
49 49
50#include "stackglue.h" 50#include "stackglue.h"
51#include "userdlm.h" 51#include "userdlm.h"
52#include "dlmfsver.h"
53 52
54#define MLOG_MASK_PREFIX ML_DLMFS 53#define MLOG_MASK_PREFIX ML_DLMFS
55#include "cluster/masklog.h" 54#include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
644 int status; 643 int status;
645 int cleanup_inode = 0, cleanup_worker = 0; 644 int cleanup_inode = 0, cleanup_worker = 0;
646 645
647 dlmfs_print_version();
648
649 status = bdi_init(&dlmfs_backing_dev_info); 646 status = bdi_init(&dlmfs_backing_dev_info);
650 if (status) 647 if (status)
651 return status; 648 return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
701 698
702MODULE_AUTHOR("Oracle"); 699MODULE_AUTHOR("Oracle");
703MODULE_LICENSE("GPL"); 700MODULE_LICENSE("GPL");
701MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
704 702
705module_init(init_dlmfs_fs) 703module_init(init_dlmfs_fs)
706module_exit(exit_dlmfs_fs) 704module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2996 2996
2997 /* for now, uuid == domain */ 2997 /* for now, uuid == domain */
2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2999 osb->osb_cluster_name,
3000 strlen(osb->osb_cluster_name),
2999 osb->uuid_str, 3001 osb->uuid_str,
3000 strlen(osb->uuid_str), 3002 strlen(osb->uuid_str),
3001 &lproto, ocfs2_do_node_down, osb, 3003 &lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
3005 goto bail; 3007 goto bail;
3006 } 3008 }
3007 3009
3008 status = ocfs2_cluster_this_node(&osb->node_num); 3010 status = ocfs2_cluster_this_node(conn, &osb->node_num);
3009 if (status < 0) { 3011 if (status < 0) {
3010 mlog_errno(status); 3012 mlog_errno(status);
3011 mlog(ML_ERROR, 3013 mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..f42eecef6478 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1869 } 1869 }
1870 size = sr->l_start + sr->l_len; 1870 size = sr->l_start + sr->l_len;
1871 1871
1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1873 cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1873 if (sr->l_len <= 0) { 1874 if (sr->l_len <= 0) {
1874 ret = -EINVAL; 1875 ret = -EINVAL;
1875 goto out_inode_unlock; 1876 goto out_inode_unlock;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/blkdev.h>
10#include <linux/compat.h> 11#include <linux/compat.h>
11 12
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
966 case FITRIM: 967 case FITRIM:
967 { 968 {
968 struct super_block *sb = inode->i_sb; 969 struct super_block *sb = inode->i_sb;
970 struct request_queue *q = bdev_get_queue(sb->s_bdev);
969 struct fstrim_range range; 971 struct fstrim_range range;
970 int ret = 0; 972 int ret = 0;
971 973
972 if (!capable(CAP_SYS_ADMIN)) 974 if (!capable(CAP_SYS_ADMIN))
973 return -EPERM; 975 return -EPERM;
974 976
977 if (!blk_queue_discard(q))
978 return -EOPNOTSUPP;
979
975 if (copy_from_user(&range, argp, sizeof(range))) 980 if (copy_from_user(&range, argp, sizeof(range)))
976 return -EFAULT; 981 return -EFAULT;
977 982
983 range.minlen = max_t(u64, q->limits.discard_granularity,
984 range.minlen);
978 ret = ocfs2_trim_fs(sb, &range); 985 ret = ocfs2_trim_fs(sb, &range);
979 if (ret < 0) 986 if (ret < 0)
980 return ret; 987 return ret;
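With the checks above in place, FITRIM on ocfs2 behaves like on other filesystems: the request queue must support discard, and minlen is raised to the discard granularity before ocfs2_trim_fs() runs. A small standalone userspace test, assuming a mounted ocfs2 volume (EOPNOTSUPP is the expected failure when the underlying device lacks discard support):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;	/* trim the whole filesystem */
	range.minlen = 0;	/* the kernel raises this to the discard granularity */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}

On success the kernel writes the number of trimmed bytes back into range.len, which is why the struct is passed by pointer.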
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
562} 562}
563 563
564static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
565 handle_t *handle,
566 struct buffer_head *di_bh,
567 u32 num_bits,
568 u16 chain)
569{
570 int ret;
571 u32 tmp_used;
572 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
573 struct ocfs2_chain_list *cl =
574 (struct ocfs2_chain_list *) &di->id2.i_chain;
575
576 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
577 OCFS2_JOURNAL_ACCESS_WRITE);
578 if (ret < 0) {
579 mlog_errno(ret);
580 goto out;
581 }
582
583 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
584 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
585 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
586 ocfs2_journal_dirty(handle, di_bh);
587
588out:
589 return ret;
590}
591
592static inline int ocfs2_block_group_set_bits(handle_t *handle,
593 struct inode *alloc_inode,
594 struct ocfs2_group_desc *bg,
595 struct buffer_head *group_bh,
596 unsigned int bit_off,
597 unsigned int num_bits)
598{
599 int status;
600 void *bitmap = bg->bg_bitmap;
601 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
602
603 /* All callers get the descriptor via
604 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
605 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
606 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
607
608 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
609 num_bits);
610
611 if (ocfs2_is_cluster_bitmap(alloc_inode))
612 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
613
614 status = ocfs2_journal_access_gd(handle,
615 INODE_CACHE(alloc_inode),
616 group_bh,
617 journal_type);
618 if (status < 0) {
619 mlog_errno(status);
620 goto bail;
621 }
622
623 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
624 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
625 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
626 " count %u but claims %u are freed. num_bits %d",
627 (unsigned long long)le64_to_cpu(bg->bg_blkno),
628 le16_to_cpu(bg->bg_bits),
629 le16_to_cpu(bg->bg_free_bits_count), num_bits);
630 return -EROFS;
631 }
632 while (num_bits--)
633 ocfs2_set_bit(bit_off++, bitmap);
634
635 ocfs2_journal_dirty(handle, group_bh);
636
637bail:
638 return status;
639}
640
641static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 564static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
642 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 565 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
643 u32 len, int ext_flags) 566 u32 len, int ext_flags)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
387 u8 osb_stackflags; 387 u8 osb_stackflags;
388 388
389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
390 char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
390 struct ocfs2_cluster_connection *cconn; 391 struct ocfs2_cluster_connection *cconn;
391 struct ocfs2_lock_res osb_super_lockres; 392 struct ocfs2_lock_res osb_super_lockres;
392 struct ocfs2_lock_res osb_rename_lockres; 393 struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
398 return 0; 398 return 0;
399} 399}
400 400
401static int o2cb_cluster_this_node(unsigned int *node) 401static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
402 unsigned int *node)
402{ 403{
403 int node_num; 404 int node_num;
404 405
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/sched.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "stackglue.h" 29#include "stackglue.h"
@@ -102,6 +103,12 @@
102#define OCFS2_TEXT_UUID_LEN 32 103#define OCFS2_TEXT_UUID_LEN 32
103#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 104#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
104#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 105#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
106#define VERSION_LOCK "version_lock"
107
108enum ocfs2_connection_type {
109 WITH_CONTROLD,
110 NO_CONTROLD
111};
105 112
106/* 113/*
107 * ocfs2_live_connection is refcounted because the filesystem and 114 * ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
110struct ocfs2_live_connection { 117struct ocfs2_live_connection {
111 struct list_head oc_list; 118 struct list_head oc_list;
112 struct ocfs2_cluster_connection *oc_conn; 119 struct ocfs2_cluster_connection *oc_conn;
120 enum ocfs2_connection_type oc_type;
121 atomic_t oc_this_node;
122 int oc_our_slot;
123 struct dlm_lksb oc_version_lksb;
124 char oc_lvb[DLM_LVB_LEN];
125 struct completion oc_sync_wait;
126 wait_queue_head_t oc_wait;
113}; 127};
114 128
115struct ocfs2_control_private { 129struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
198 * mount path. Since the VFS prevents multiple calls to 212 * mount path. Since the VFS prevents multiple calls to
199 * fill_super(), we can't get dupes here. 213 * fill_super(), we can't get dupes here.
200 */ 214 */
201static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, 215static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
202 struct ocfs2_live_connection **c_ret) 216 struct ocfs2_live_connection *c)
203{ 217{
204 int rc = 0; 218 int rc = 0;
205 struct ocfs2_live_connection *c;
206
207 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
208 if (!c)
209 return -ENOMEM;
210 219
211 mutex_lock(&ocfs2_control_lock); 220 mutex_lock(&ocfs2_control_lock);
212 c->oc_conn = conn; 221 c->oc_conn = conn;
213 222
214 if (atomic_read(&ocfs2_control_opened)) 223 if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
215 list_add(&c->oc_list, &ocfs2_live_connection_list); 224 list_add(&c->oc_list, &ocfs2_live_connection_list);
216 else { 225 else {
217 printk(KERN_ERR 226 printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
220 } 229 }
221 230
222 mutex_unlock(&ocfs2_control_lock); 231 mutex_unlock(&ocfs2_control_lock);
223
224 if (!rc)
225 *c_ret = c;
226 else
227 kfree(c);
228
229 return rc; 232 return rc;
230} 233}
231 234
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
799 return 0; 802 return 0;
800} 803}
801 804
805static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
806{
807 struct ocfs2_protocol_version *pv =
808 (struct ocfs2_protocol_version *)lvb;
809 /*
810 * ocfs2_protocol_version has two u8 variables, so we don't
811 * need any endian conversion.
812 */
813 ver->pv_major = pv->pv_major;
814 ver->pv_minor = pv->pv_minor;
815}
816
817static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
818{
819 struct ocfs2_protocol_version *pv =
820 (struct ocfs2_protocol_version *)lvb;
821 /*
822 * ocfs2_protocol_version has two u8 variables, so we don't
823 * need any endian conversion.
824 */
825 pv->pv_major = ver->pv_major;
826 pv->pv_minor = ver->pv_minor;
827}
828
829static void sync_wait_cb(void *arg)
830{
831 struct ocfs2_cluster_connection *conn = arg;
832 struct ocfs2_live_connection *lc = conn->cc_private;
833 complete(&lc->oc_sync_wait);
834}
835
836static int sync_unlock(struct ocfs2_cluster_connection *conn,
837 struct dlm_lksb *lksb, char *name)
838{
839 int error;
840 struct ocfs2_live_connection *lc = conn->cc_private;
841
842 error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
843 if (error) {
844 printk(KERN_ERR "%s lkid %x error %d\n",
845 name, lksb->sb_lkid, error);
846 return error;
847 }
848
849 wait_for_completion(&lc->oc_sync_wait);
850
851 if (lksb->sb_status != -DLM_EUNLOCK) {
852 printk(KERN_ERR "%s lkid %x status %d\n",
853 name, lksb->sb_lkid, lksb->sb_status);
854 return -1;
855 }
856 return 0;
857}
858
859static int sync_lock(struct ocfs2_cluster_connection *conn,
860 int mode, uint32_t flags,
861 struct dlm_lksb *lksb, char *name)
862{
863 int error, status;
864 struct ocfs2_live_connection *lc = conn->cc_private;
865
866 error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
867 name, strlen(name),
868 0, sync_wait_cb, conn, NULL);
869 if (error) {
870 printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
871 name, lksb->sb_lkid, flags, mode, error);
872 return error;
873 }
874
875 wait_for_completion(&lc->oc_sync_wait);
876
877 status = lksb->sb_status;
878
879 if (status && status != -EAGAIN) {
880 printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
881 name, lksb->sb_lkid, flags, mode, status);
882 }
883
884 return status;
885}
886
887
888static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
889 int flags)
890{
891 struct ocfs2_live_connection *lc = conn->cc_private;
892 return sync_lock(conn, mode, flags,
893 &lc->oc_version_lksb, VERSION_LOCK);
894}
895
896static int version_unlock(struct ocfs2_cluster_connection *conn)
897{
898 struct ocfs2_live_connection *lc = conn->cc_private;
899 return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
900}
901
902/* get_protocol_version()
903 *
904 * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
905 * The algorithm is:
906 * 1. Attempt to take the lock in EX mode (non-blocking).
907 * 2. If successful (which means it is the first mount), write the
908 * version number and downconvert to PR lock.
909 * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
910 * taking the PR lock.
911 */
912
913static int get_protocol_version(struct ocfs2_cluster_connection *conn)
914{
915 int ret;
916 struct ocfs2_live_connection *lc = conn->cc_private;
917 struct ocfs2_protocol_version pv;
918
919 running_proto.pv_major =
920 ocfs2_user_plugin.sp_max_proto.pv_major;
921 running_proto.pv_minor =
922 ocfs2_user_plugin.sp_max_proto.pv_minor;
923
924 lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
925 ret = version_lock(conn, DLM_LOCK_EX,
926 DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
927 if (!ret) {
928 conn->cc_version.pv_major = running_proto.pv_major;
929 conn->cc_version.pv_minor = running_proto.pv_minor;
930 version_to_lvb(&running_proto, lc->oc_lvb);
931 version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
932 } else if (ret == -EAGAIN) {
933 ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
934 if (ret)
935 goto out;
936 lvb_to_version(lc->oc_lvb, &pv);
937
938 if ((pv.pv_major != running_proto.pv_major) ||
939 (pv.pv_minor > running_proto.pv_minor)) {
940 ret = -EINVAL;
941 goto out;
942 }
943
944 conn->cc_version.pv_major = pv.pv_major;
945 conn->cc_version.pv_minor = pv.pv_minor;
946 }
947out:
948 return ret;
949}
950
951static void user_recover_prep(void *arg)
952{
953}
954
955static void user_recover_slot(void *arg, struct dlm_slot *slot)
956{
957 struct ocfs2_cluster_connection *conn = arg;
958 printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
959 slot->nodeid, slot->slot);
960 conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
961
962}
963
964static void user_recover_done(void *arg, struct dlm_slot *slots,
965 int num_slots, int our_slot,
966 uint32_t generation)
967{
968 struct ocfs2_cluster_connection *conn = arg;
969 struct ocfs2_live_connection *lc = conn->cc_private;
970 int i;
971
972 for (i = 0; i < num_slots; i++)
973 if (slots[i].slot == our_slot) {
974 atomic_set(&lc->oc_this_node, slots[i].nodeid);
975 break;
976 }
977
978 lc->oc_our_slot = our_slot;
979 wake_up(&lc->oc_wait);
980}
981
982static const struct dlm_lockspace_ops ocfs2_ls_ops = {
983 .recover_prep = user_recover_prep,
984 .recover_slot = user_recover_slot,
985 .recover_done = user_recover_done,
986};
987
988static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
989{
990 version_unlock(conn);
991 dlm_release_lockspace(conn->cc_lockspace, 2);
992 conn->cc_lockspace = NULL;
993 ocfs2_live_connection_drop(conn->cc_private);
994 conn->cc_private = NULL;
995 return 0;
996}
997
802static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 998static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
803{ 999{
804 dlm_lockspace_t *fsdlm; 1000 dlm_lockspace_t *fsdlm;
805 struct ocfs2_live_connection *uninitialized_var(control); 1001 struct ocfs2_live_connection *lc;
806 int rc = 0; 1002 int rc, ops_rv;
807 1003
808 BUG_ON(conn == NULL); 1004 BUG_ON(conn == NULL);
809 1005
810 rc = ocfs2_live_connection_new(conn, &control); 1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
1007 if (!lc) {
1008 rc = -ENOMEM;
1009 goto out;
1010 }
1011
1012 init_waitqueue_head(&lc->oc_wait);
1013 init_completion(&lc->oc_sync_wait);
1014 atomic_set(&lc->oc_this_node, 0);
1015 conn->cc_private = lc;
1016 lc->oc_type = NO_CONTROLD;
1017
1018 rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
1019 DLM_LSFL_FS, DLM_LVB_LEN,
1020 &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
1021 if (rc)
1022 goto out;
1023
1024 if (ops_rv == -EOPNOTSUPP) {
1025 lc->oc_type = WITH_CONTROLD;
1026 printk(KERN_NOTICE "ocfs2: You seem to be using an older "
1027 "version of dlm_controld and/or ocfs2-tools."
1028 " Please consider upgrading.\n");
1029 } else if (ops_rv) {
1030 rc = ops_rv;
1031 goto out;
1032 }
1033 conn->cc_lockspace = fsdlm;
1034
1035 rc = ocfs2_live_connection_attach(conn, lc);
811 if (rc) 1036 if (rc)
812 goto out; 1037 goto out;
813 1038
1039 if (lc->oc_type == NO_CONTROLD) {
1040 rc = get_protocol_version(conn);
1041 if (rc) {
1042 printk(KERN_ERR "ocfs2: Could not determine"
1043 " locking version\n");
1044 user_cluster_disconnect(conn);
1045 goto out;
1046 }
1047 wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
1048 }
1049
814 /* 1050 /*
815 * running_proto must have been set before we allowed any mounts 1051 * running_proto must have been set before we allowed any mounts
816 * to proceed. 1052 * to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
818 if (fs_protocol_compare(&running_proto, &conn->cc_version)) { 1054 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
819 printk(KERN_ERR 1055 printk(KERN_ERR
820 "Unable to mount with fs locking protocol version " 1056 "Unable to mount with fs locking protocol version "
821 "%u.%u because the userspace control daemon has " 1057 "%u.%u because negotiated protocol is %u.%u\n",
822 "negotiated %u.%u\n",
823 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1058 conn->cc_version.pv_major, conn->cc_version.pv_minor,
824 running_proto.pv_major, running_proto.pv_minor); 1059 running_proto.pv_major, running_proto.pv_minor);
825 rc = -EPROTO; 1060 rc = -EPROTO;
826 ocfs2_live_connection_drop(control); 1061 ocfs2_live_connection_drop(lc);
827 goto out; 1062 lc = NULL;
828 }
829
830 rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
831 NULL, NULL, NULL, &fsdlm);
832 if (rc) {
833 ocfs2_live_connection_drop(control);
834 goto out;
835 } 1063 }
836 1064
837 conn->cc_private = control;
838 conn->cc_lockspace = fsdlm;
839out: 1065out:
1066 if (rc && lc)
1067 kfree(lc);
840 return rc; 1068 return rc;
841} 1069}
842 1070
843static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
844{
845 dlm_release_lockspace(conn->cc_lockspace, 2);
846 conn->cc_lockspace = NULL;
847 ocfs2_live_connection_drop(conn->cc_private);
848 conn->cc_private = NULL;
849 return 0;
850}
851 1071
852static int user_cluster_this_node(unsigned int *this_node) 1072static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
1073 unsigned int *this_node)
853{ 1074{
854 int rc; 1075 int rc;
1076 struct ocfs2_live_connection *lc = conn->cc_private;
1077
1078 if (lc->oc_type == WITH_CONTROLD)
1079 rc = ocfs2_control_get_this_node();
1080 else if (lc->oc_type == NO_CONTROLD)
1081 rc = atomic_read(&lc->oc_this_node);
1082 else
1083 rc = -EINVAL;
855 1084
856 rc = ocfs2_control_get_this_node();
857 if (rc < 0) 1085 if (rc < 0)
858 return rc; 1086 return rc;
859 1087
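The negotiation rule buried in get_protocol_version() above is worth stating on its own: the first mount publishes its protocol version in the lock value block under an EX lock; every later mount takes the lock in PR mode, must match the published major version and offer at least the published minor, and then runs at the published minor. A userspace model of just that comparison (struct proto_ver and negotiate() are illustrative names, not kernel API):

#include <stdint.h>
#include <stdio.h>

struct proto_ver {
	uint8_t major;
	uint8_t minor;
};

/* Mirror of the check in get_protocol_version(): reject on major mismatch
 * or when the published minor exceeds ours, otherwise adopt the published
 * (lowest common) version. */
static int negotiate(struct proto_ver published, struct proto_ver ours,
		     struct proto_ver *result)
{
	if (published.major != ours.major || published.minor > ours.minor)
		return -1;	/* the kernel returns -EINVAL here */
	*result = published;
	return 0;
}

int main(void)
{
	struct proto_ver first = { 1, 0 }, later = { 1, 2 }, got;

	if (!negotiate(first, later, &got))
		printf("negotiated %u.%u\n", got.major, got.minor);	/* 1.0 */
	return 0;
}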
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..1324e6600e57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
309EXPORT_SYMBOL_GPL(ocfs2_plock); 309EXPORT_SYMBOL_GPL(ocfs2_plock);
310 310
311int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
312 const char *cluster_name,
313 int cluster_name_len,
312 const char *group, 314 const char *group,
313 int grouplen, 315 int grouplen,
314 struct ocfs2_locking_protocol *lproto, 316 struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name,
342 goto out; 344 goto out;
343 } 345 }
344 346
345 memcpy(new_conn->cc_name, group, grouplen); 347 strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
346 new_conn->cc_namelen = grouplen; 348 new_conn->cc_namelen = grouplen;
349 strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1);
350 new_conn->cc_cluster_name_len = cluster_name_len;
347 new_conn->cc_recovery_handler = recovery_handler; 351 new_conn->cc_recovery_handler = recovery_handler;
348 new_conn->cc_recovery_data = recovery_data; 352 new_conn->cc_recovery_data = recovery_data;
349 353
@@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
386 390
387 if (cluster_stack_name[0]) 391 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name; 392 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 393 return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
390 recovery_handler, recovery_data, conn); 394 lproto, recovery_handler, recovery_data,
395 conn);
391} 396}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 397EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393 398
@@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
460} 465}
461EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 466EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
462 467
463int ocfs2_cluster_this_node(unsigned int *node) 468int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
469 unsigned int *node)
464{ 470{
465 return active_stack->sp_ops->this_node(node); 471 return active_stack->sp_ops->this_node(conn, node);
466} 472}
467EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 473EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
468 474
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
45 */ 45 */
46#define GROUP_NAME_MAX 64 46#define GROUP_NAME_MAX 64
47 47
48/* This shadows OCFS2_CLUSTER_NAME_LEN */
49#define CLUSTER_NAME_MAX 16
50
48 51
49/* 52/*
50 * ocfs2_protocol_version changes when ocfs2 does something different in 53 * ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
97 * locking compatibility. 100 * locking compatibility.
98 */ 101 */
99struct ocfs2_cluster_connection { 102struct ocfs2_cluster_connection {
100 char cc_name[GROUP_NAME_MAX]; 103 char cc_name[GROUP_NAME_MAX + 1];
101 int cc_namelen; 104 int cc_namelen;
105 char cc_cluster_name[CLUSTER_NAME_MAX + 1];
106 int cc_cluster_name_len;
102 struct ocfs2_protocol_version cc_version; 107 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto; 108 struct ocfs2_locking_protocol *cc_proto;
104 void (*cc_recovery_handler)(int node_num, void *recovery_data); 109 void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
152 * ->this_node() returns the cluster's unique identifier for the 157 * ->this_node() returns the cluster's unique identifier for the
153 * local node. 158 * local node.
154 */ 159 */
155 int (*this_node)(unsigned int *node); 160 int (*this_node)(struct ocfs2_cluster_connection *conn,
161 unsigned int *node);
156 162
157 /* 163 /*
158 * Call the underlying dlm lock function. The ->dlm_lock() 164 * Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
239 245
240/* Used by the filesystem */ 246/* Used by the filesystem */
241int ocfs2_cluster_connect(const char *stack_name, 247int ocfs2_cluster_connect(const char *stack_name,
248 const char *cluster_name,
249 int cluster_name_len,
242 const char *group, 250 const char *group,
243 int grouplen, 251 int grouplen,
244 struct ocfs2_locking_protocol *lproto, 252 struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 268int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
261 int hangup_pending); 269 int hangup_pending);
262void ocfs2_cluster_hangup(const char *group, int grouplen); 270void ocfs2_cluster_hangup(const char *group, int grouplen);
263int ocfs2_cluster_this_node(unsigned int *node); 271int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
272 unsigned int *node);
264 273
265struct ocfs2_lock_res; 274struct ocfs2_lock_res;
266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 275int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
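Passing the connection into ->this_node lets a stack derive the local node from per-connection state instead of a global, which is exactly what the fs/dlm-based path in stack_user.c above needs. A sketch of a plugin-side implementation under the new signature (my_stack_this_node and my_stack_private are hypothetical):

struct my_stack_private {
	unsigned int local_node;	/* filled in during connect */
};

static int my_stack_this_node(struct ocfs2_cluster_connection *conn,
			      unsigned int *node)
{
	struct my_stack_private *priv = conn->cc_private;

	*node = priv->local_node;
	return 0;
}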
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
113 struct ocfs2_suballoc_result *res); 113 struct ocfs2_suballoc_result *res);
114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115 int nr); 115 int nr);
116static inline int ocfs2_block_group_set_bits(handle_t *handle,
117 struct inode *alloc_inode,
118 struct ocfs2_group_desc *bg,
119 struct buffer_head *group_bh,
120 unsigned int bit_off,
121 unsigned int num_bits);
122static int ocfs2_relink_block_group(handle_t *handle, 116static int ocfs2_relink_block_group(handle_t *handle,
123 struct inode *alloc_inode, 117 struct inode *alloc_inode,
124 struct buffer_head *fe_bh, 118 struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1343 return status; 1337 return status;
1344} 1338}
1345 1339
1346static inline int ocfs2_block_group_set_bits(handle_t *handle, 1340int ocfs2_block_group_set_bits(handle_t *handle,
1347 struct inode *alloc_inode, 1341 struct inode *alloc_inode,
1348 struct ocfs2_group_desc *bg, 1342 struct ocfs2_group_desc *bg,
1349 struct buffer_head *group_bh, 1343 struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1388 ocfs2_journal_dirty(handle, group_bh); 1382 ocfs2_journal_dirty(handle, group_bh);
1389 1383
1390bail: 1384bail:
1391 if (status)
1392 mlog_errno(status);
1393 return status; 1385 return status;
1394} 1386}
1395 1387
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 return ret; 1580 return ret;
1589} 1581}
1590 1582
1591static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1583int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1592 handle_t *handle, 1584 handle_t *handle,
1593 struct buffer_head *di_bh, 1585 struct buffer_head *di_bh,
1594 u32 num_bits, 1586 u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
86 u32 bits_wanted, 86 u32 bits_wanted,
87 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
88 88
89int ocfs2_alloc_dinode_update_counts(struct inode *inode,
90 handle_t *handle,
91 struct buffer_head *di_bh,
92 u32 num_bits,
93 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
89int ocfs2_claim_metadata(handle_t *handle, 101int ocfs2_claim_metadata(handle_t *handle,
90 struct ocfs2_alloc_context *ac, 102 struct ocfs2_alloc_context *ac,
91 u32 bits_wanted, 103 u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
68#include "super.h" 68#include "super.h"
69#include "sysfile.h" 69#include "sysfile.h"
70#include "uptodate.h" 70#include "uptodate.h"
71#include "ver.h"
72#include "xattr.h" 71#include "xattr.h"
73#include "quota.h" 72#include "quota.h"
74#include "refcounttree.h" 73#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
90 89
91MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
92MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
92MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94struct mount_options 94struct mount_options
95{ 95{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
1618{ 1618{
1619 int status, i; 1619 int status, i;
1620 1620
1621 ocfs2_print_version();
1622
1623 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1624 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1625 1623
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1947 1945
1948 ocfs2_shutdown_local_alloc(osb); 1946 ocfs2_shutdown_local_alloc(osb);
1949 1947
1950 ocfs2_truncate_log_shutdown(osb);
1951
1952 /* This will disable recovery and flush any recovery work. */ 1948 /* This will disable recovery and flush any recovery work. */
1953 ocfs2_recovery_exit(osb); 1949 ocfs2_recovery_exit(osb);
1954 1950
1951 /*
1952 * During dismount, recovering another node can call ocfs2_recover_orphans()
1953 * and queue the delayed work osb_truncate_log_wq, so shut the truncate log down only after recovery has exited.
1954 */
1955 ocfs2_truncate_log_shutdown(osb);
1956
1955 ocfs2_journal_shutdown(osb); 1957 ocfs2_journal_shutdown(osb);
1956 1958
1957 ocfs2_sync_blockdev(sb); 1959 ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2225 if (ocfs2_clusterinfo_valid(osb)) { 2227 if (ocfs2_clusterinfo_valid(osb)) {
2226 osb->osb_stackflags = 2228 osb->osb_stackflags =
2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2228 memcpy(osb->osb_cluster_stack, 2230 strlcpy(osb->osb_cluster_stack,
2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2230 OCFS2_STACK_LABEL_LEN); 2232 OCFS2_STACK_LABEL_LEN + 1);
2231 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
2232 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2233 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2233 mlog(ML_ERROR, 2234 mlog(ML_ERROR,
2234 "couldn't mount because of an invalid " 2235 "couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2237 status = -EINVAL; 2238 status = -EINVAL;
2238 goto bail; 2239 goto bail;
2239 } 2240 }
2241 strlcpy(osb->osb_cluster_name,
2242 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2243 OCFS2_CLUSTER_NAME_LEN + 1);
2240 } else { 2244 } else {
2241 /* The empty string is identical with classic tools that 2245 /* The empty string is identical with classic tools that
2242 * don't know about s_cluster_info. */ 2246 * don't know about s_cluster_info. */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.5.0"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..021e7c069b86 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,11 +22,80 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init); 25struct posix_acl **acl_by_type(struct inode *inode, int type)
26EXPORT_SYMBOL(posix_acl_alloc); 26{
27EXPORT_SYMBOL(posix_acl_valid); 27 switch (type) {
28EXPORT_SYMBOL(posix_acl_equiv_mode); 28 case ACL_TYPE_ACCESS:
29EXPORT_SYMBOL(posix_acl_from_mode); 29 return &inode->i_acl;
30 case ACL_TYPE_DEFAULT:
31 return &inode->i_default_acl;
32 default:
33 BUG();
34 }
35}
36EXPORT_SYMBOL(acl_by_type);
37
38struct posix_acl *get_cached_acl(struct inode *inode, int type)
39{
40 struct posix_acl **p = acl_by_type(inode, type);
41 struct posix_acl *acl = ACCESS_ONCE(*p);
42 if (acl) {
43 spin_lock(&inode->i_lock);
44 acl = *p;
45 if (acl != ACL_NOT_CACHED)
46 acl = posix_acl_dup(acl);
47 spin_unlock(&inode->i_lock);
48 }
49 return acl;
50}
51EXPORT_SYMBOL(get_cached_acl);
52
53struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
54{
55 return rcu_dereference(*acl_by_type(inode, type));
56}
57EXPORT_SYMBOL(get_cached_acl_rcu);
58
59void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
60{
61 struct posix_acl **p = acl_by_type(inode, type);
62 struct posix_acl *old;
63 spin_lock(&inode->i_lock);
64 old = *p;
65 rcu_assign_pointer(*p, posix_acl_dup(acl));
66 spin_unlock(&inode->i_lock);
67 if (old != ACL_NOT_CACHED)
68 posix_acl_release(old);
69}
70EXPORT_SYMBOL(set_cached_acl);
71
72void forget_cached_acl(struct inode *inode, int type)
73{
74 struct posix_acl **p = acl_by_type(inode, type);
75 struct posix_acl *old;
76 spin_lock(&inode->i_lock);
77 old = *p;
78 *p = ACL_NOT_CACHED;
79 spin_unlock(&inode->i_lock);
80 if (old != ACL_NOT_CACHED)
81 posix_acl_release(old);
82}
83EXPORT_SYMBOL(forget_cached_acl);
84
85void forget_all_cached_acls(struct inode *inode)
86{
87 struct posix_acl *old_access, *old_default;
88 spin_lock(&inode->i_lock);
89 old_access = inode->i_acl;
90 old_default = inode->i_default_acl;
91 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
92 spin_unlock(&inode->i_lock);
93 if (old_access != ACL_NOT_CACHED)
94 posix_acl_release(old_access);
95 if (old_default != ACL_NOT_CACHED)
96 posix_acl_release(old_default);
97}
98EXPORT_SYMBOL(forget_all_cached_acls);
30 99
31/* 100/*
32 * Init a fresh posix_acl 101 * Init a fresh posix_acl
@@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count)
37 atomic_set(&acl->a_refcount, 1); 106 atomic_set(&acl->a_refcount, 1);
38 acl->a_count = count; 107 acl->a_count = count;
39} 108}
109EXPORT_SYMBOL(posix_acl_init);
40 110
41/* 111/*
42 * Allocate a new ACL with the specified number of entries. 112 * Allocate a new ACL with the specified number of entries.
@@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags)
51 posix_acl_init(acl, count); 121 posix_acl_init(acl, count);
52 return acl; 122 return acl;
53} 123}
124EXPORT_SYMBOL(posix_acl_alloc);
54 125
55/* 126/*
56 * Clone an ACL. 127 * Clone an ACL.
@@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl)
146 return 0; 217 return 0;
147 return -EINVAL; 218 return -EINVAL;
148} 219}
220EXPORT_SYMBOL(posix_acl_valid);
149 221
150/* 222/*
151 * Returns 0 if the acl can be exactly represented in the traditional 223 * Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 258 *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
187 return not_equiv; 259 return not_equiv;
188} 260}
261EXPORT_SYMBOL(posix_acl_equiv_mode);
189 262
190/* 263/*
191 * Create an ACL representing the file mode permission bits of an inode. 264 * Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 280 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 281 return acl;
209} 282}
283EXPORT_SYMBOL(posix_acl_from_mode);
210 284
211/* 285/*
212 * Return 0 if current is granted want access to the inode 286 * Return 0 if current is granted want access to the inode
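Moving these helpers out of line and exporting them lets every filesystem's ->get_acl path share one cache discipline: consult the cache first, fall back to on-disk metadata, then populate the cache for the next caller. A hedged sketch of that consumer pattern (myfs_get_acl and myfs_read_acl_from_disk are hypothetical; the shape matches how filesystems already use these helpers):

#include <linux/fs.h>
#include <linux/posix_acl.h>

static struct posix_acl *myfs_get_acl(struct inode *inode, int type)
{
	struct posix_acl *acl;

	acl = get_cached_acl(inode, type);	/* returns its own reference */
	if (acl != ACL_NOT_CACHED)
		return acl;

	acl = myfs_read_acl_from_disk(inode, type);	/* hypothetical I/O */
	if (!IS_ERR(acl))
		set_cached_acl(inode, type, acl);	/* may cache NULL: "no ACL" */
	return acl;
}

Note the fast path in get_cached_acl() above: the unlocked ACCESS_ONCE() read only decides whether taking i_lock is worthwhile; the pointer is re-read under the lock before being duplicated.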
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..24270eceddbf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
26 unsigned long committed; 26 unsigned long committed;
27 struct vmalloc_info vmi; 27 struct vmalloc_info vmi;
28 long cached; 28 long cached;
29 long available;
30 unsigned long pagecache;
31 unsigned long wmark_low = 0;
29 unsigned long pages[NR_LRU_LISTS]; 32 unsigned long pages[NR_LRU_LISTS];
33 struct zone *zone;
30 int lru; 34 int lru;
31 35
32/* 36/*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
47 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
48 pages[lru] = global_page_state(NR_LRU_BASE + lru); 52 pages[lru] = global_page_state(NR_LRU_BASE + lru);
49 53
54 for_each_zone(zone)
55 wmark_low += zone->watermark[WMARK_LOW];
56
57 /*
58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping.
60 *
61 * Free memory cannot be taken below the low watermark, before the
62 * system starts swapping.
63 */
64 available = i.freeram - wmark_low;
65
66 /*
67 * Not all the page cache can be freed, otherwise the system will
68 * start swapping. Assume at least half of the page cache, or the
69 * low watermark worth of cache, needs to stay.
70 */
71 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
72 pagecache -= min(pagecache / 2, wmark_low);
73 available += pagecache;
74
75 /*
76 * Part of the reclaimable swap consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark.
78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
80 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
81
82 if (available < 0)
83 available = 0;
84
50 /* 85 /*
51 * Tagged format, for easy grepping and expansion. 86 * Tagged format, for easy grepping and expansion.
52 */ 87 */
53 seq_printf(m, 88 seq_printf(m,
54 "MemTotal: %8lu kB\n" 89 "MemTotal: %8lu kB\n"
55 "MemFree: %8lu kB\n" 90 "MemFree: %8lu kB\n"
91 "MemAvailable: %8lu kB\n"
56 "Buffers: %8lu kB\n" 92 "Buffers: %8lu kB\n"
57 "Cached: %8lu kB\n" 93 "Cached: %8lu kB\n"
58 "SwapCached: %8lu kB\n" 94 "SwapCached: %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
105 , 141 ,
106 K(i.totalram), 142 K(i.totalram),
107 K(i.freeram), 143 K(i.freeram),
144 K(available),
108 K(i.bufferram), 145 K(i.bufferram),
109 K(cached), 146 K(cached),
110 K(total_swapcache_pages()), 147 K(total_swapcache_pages()),
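Restated outside the diff, the MemAvailable heuristic combines three terms: free pages above the low watermark, roughly half the file page cache, and roughly half the reclaimable slab. A self-contained sketch of the same arithmetic (not the kernel's actual function; min() is the helper from <linux/kernel.h>):

#include <linux/kernel.h>	/* min() */

static unsigned long estimate_available(unsigned long freeram,
					unsigned long active_file,
					unsigned long inactive_file,
					unsigned long slab_reclaimable,
					unsigned long wmark_low)
{
	unsigned long pagecache;
	long available;

	/* Free pages above the low watermark are usable outright. */
	available = freeram - wmark_low;

	/* Assume at least half the file cache, or the low watermark's
	 * worth of it, must stay resident. */
	pagecache = active_file + inactive_file;
	pagecache -= min(pagecache / 2, wmark_low);
	available += pagecache;

	/* Same capping rule for reclaimable slab. */
	available += slab_reclaimable -
		     min(slab_reclaimable / 2, wmark_low);

	return available < 0 ? 0 : available;
}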
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b8e93a40a5d3..78c3c2097787 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi)
443 pstore_get_records(0); 443 pstore_get_records(0);
444 444
445 kmsg_dump_register(&pstore_dumper); 445 kmsg_dump_register(&pstore_dumper);
446 pstore_register_console(); 446
447 pstore_register_ftrace(); 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console();
449 pstore_register_ftrace();
450 }
448 451
449 if (pstore_update_ms >= 0) { 452 if (pstore_update_ms >= 0) {
450 pstore_timer.expires = jiffies + 453 pstore_timer.expires = jiffies +
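A backend sets PSTORE_FLAGS_FRAGILE when its write path is too slow or unsafe to be called from the console and ftrace hot paths. A hypothetical registration sketch (the "example" backend and its callbacks are invented for illustration):

static struct pstore_info example_pstore = {
	.owner	= THIS_MODULE,
	.name	= "example",
	.flags	= PSTORE_FLAGS_FRAGILE,	/* skip console/ftrace front ends */
	.open	= example_open,		/* hypothetical callbacks */
	.close	= example_close,
	.read	= example_read,
	.write	= example_write,
	.erase	= example_erase,
};

static int __init example_pstore_init(void)
{
	/* pstore_register() still hooks up the kmsg dumper, but with
	 * the flag set it skips pstore_register_console() and
	 * pstore_register_ftrace() for this backend. */
	return pstore_register(&example_pstore);
}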
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..6a3e2c420180 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -275,4 +275,4 @@ int __init init_ramfs_fs(void)
275 275
276 return err; 276 return err;
277} 277}
278module_init(init_ramfs_fs) 278fs_initcall(init_ramfs_fs);
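module_init() on built-in code expands to device_initcall() (level 6), while fs_initcall() runs earlier at level 5, so ramfs/rootfs is now registered before device-level initcalls that may depend on it. A sketch of the pattern, assuming a hypothetical example_fs_type:

/* Initcall levels, per include/linux/init.h:
 *   core(1) postcore(2) arch(3) subsys(4) fs(5) rootfs device(6) late(7)
 * module_init() on built-in code is device_initcall(), i.e. level 6.
 */
static int __init example_fs_init(void)
{
	return register_filesystem(&example_fs_type);	/* hypothetical */
}
fs_initcall(example_fs_init);	/* level 5: ahead of device_initcall users */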
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..1193ffd03565 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
901 io_fn_t fn; 901 io_fn_t fn;
902 iov_fn_t fnv; 902 iov_fn_t fnv;
903 903
904 ret = -EFAULT;
905 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
906 goto out;
907
908 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 904 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
909 UIO_FASTIOV, iovstack, &iov); 905 UIO_FASTIOV, iovstack, &iov);
910 if (ret <= 0) 906 if (ret <= 0)
diff --git a/fs/splice.c b/fs/splice.c
index 46a08f772d7d..12028fa41def 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = {
555 .get = generic_pipe_buf_get, 555 .get = generic_pipe_buf_get,
556}; 556};
557 557
558static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
559 struct pipe_buffer *buf)
560{
561 return 1;
562}
563
564/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal,
572 .get = generic_pipe_buf_get,
573};
574EXPORT_SYMBOL(nosteal_pipe_buf_ops);
575
558static ssize_t kernel_readv(struct file *file, const struct iovec *vec, 576static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
559 unsigned long vlen, loff_t offset) 577 unsigned long vlen, loff_t offset)
560{ 578{
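nosteal_pipe_buf_ops is for pipe buffers whose pages the producer still owns (a socket, for instance): returning 1 from ->steal forces splice to copy the data rather than take the page. A sketch modeled loosely on splice_to_pipe(), assuming the caller holds the pipe lock and has checked for a free slot (the helper name is invented):

static void example_queue_page(struct pipe_inode_info *pipe,
			       struct page *page, unsigned int offset,
			       unsigned int len)
{
	int slot = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
	struct pipe_buffer *buf = pipe->bufs + slot;

	get_page(page);			/* the pipe keeps its own reference */
	buf->page = page;
	buf->offset = offset;
	buf->len = len;
	buf->ops = &nosteal_pipe_buf_ops;	/* ->steal returns 1: copy, don't take */
	pipe->nrbufs++;
}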
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..cecd780e0f44 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
166 if (!s) 166 if (!s)
167 return NULL; 167 return NULL;
168 168
169 INIT_LIST_HEAD(&s->s_mounts);
170
169 if (security_sb_alloc(s)) 171 if (security_sb_alloc(s))
170 goto fail; 172 goto fail;
171 173
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
188 if (list_lru_init(&s->s_inode_lru)) 190 if (list_lru_init(&s->s_inode_lru))
189 goto fail; 191 goto fail;
190 192
191 INIT_LIST_HEAD(&s->s_mounts);
192 init_rwsem(&s->s_umount); 193 init_rwsem(&s->s_umount);
193 lockdep_set_class(&s->s_umount, &type->s_umount_key); 194 lockdep_set_class(&s->s_umount, &type->s_umount_key);
194 /* 195 /*
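Moving INIT_LIST_HEAD(&s->s_mounts) ahead of the first failure point matters because the error path runs list checks on the superblock: on a kzalloc'd list head, next is NULL rather than self-pointing, so list_empty() reports non-empty and teardown can warn or walk garbage. A minimal sketch of the hazard (example_destroy is hypothetical):

static void example_destroy(struct super_block *s)
{
	/* Safe only if INIT_LIST_HEAD(&s->s_mounts) already ran: a
	 * zeroed head has next == NULL, so list_empty() is false and
	 * the WARN fires for a superblock that never held a mount. */
	WARN_ON(!list_empty(&s->s_mounts));
	kfree(s);
}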
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
2# Makefile for the sysfs virtual filesystem 2# Makefile for the sysfs virtual filesystem
3# 3#
4 4
5obj-y := inode.o file.o dir.o symlink.o mount.o group.o 5obj-y := file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..ee0d761c3179 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
13#undef DEBUG 13#undef DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h>
17#include <linux/module.h>
18#include <linux/kobject.h> 16#include <linux/kobject.h>
19#include <linux/namei.h>
20#include <linux/idr.h>
21#include <linux/completion.h>
22#include <linux/mutex.h>
23#include <linux/slab.h> 17#include <linux/slab.h>
24#include <linux/security.h>
25#include <linux/hash.h>
26#include "sysfs.h" 18#include "sysfs.h"
27 19
28DEFINE_MUTEX(sysfs_mutex);
29DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
30 21
31#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32
33static DEFINE_SPINLOCK(sysfs_ino_lock);
34static DEFINE_IDA(sysfs_ino_ida);
35
36/**
37 * sysfs_name_hash
38 * @name: Null terminated string to hash
39 * @ns: Namespace tag to hash
40 *
 41 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
42 */
43static unsigned int sysfs_name_hash(const char *name, const void *ns)
44{
45 unsigned long hash = init_name_hash();
46 unsigned int len = strlen(name);
47 while (len--)
48 hash = partial_name_hash(*name++, hash);
49 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
50 hash &= 0x7fffffffU;
51 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
52 if (hash < 1)
53 hash += 2;
54 if (hash >= INT_MAX)
55 hash = INT_MAX - 1;
56 return hash;
57}
58
59static int sysfs_name_compare(unsigned int hash, const char *name,
60 const void *ns, const struct sysfs_dirent *sd)
61{
62 if (hash != sd->s_hash)
63 return hash - sd->s_hash;
64 if (ns != sd->s_ns)
65 return ns - sd->s_ns;
66 return strcmp(name, sd->s_name);
67}
68
69static int sysfs_sd_compare(const struct sysfs_dirent *left,
70 const struct sysfs_dirent *right)
71{
72 return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 right);
74}
75
76/**
77 * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 * @sd: sysfs_dirent of interest
79 *
80 * Link @sd into its sibling rbtree which starts from
81 * sd->s_parent->s_dir.children.
82 *
83 * Locking:
84 * mutex_lock(sysfs_mutex)
85 *
86 * RETURNS:
 87 * 0 on success, -EEXIST on failure.
88 */
89static int sysfs_link_sibling(struct sysfs_dirent *sd)
90{
91 struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
92 struct rb_node *parent = NULL;
93
94 if (sysfs_type(sd) == SYSFS_DIR)
95 sd->s_parent->s_dir.subdirs++;
96
97 while (*node) {
98 struct sysfs_dirent *pos;
99 int result;
100
101 pos = to_sysfs_dirent(*node);
102 parent = *node;
103 result = sysfs_sd_compare(sd, pos);
104 if (result < 0)
105 node = &pos->s_rb.rb_left;
106 else if (result > 0)
107 node = &pos->s_rb.rb_right;
108 else
109 return -EEXIST;
110 }
111 /* add new node and rebalance the tree */
112 rb_link_node(&sd->s_rb, parent, node);
113 rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
114 return 0;
115}
116
117/**
118 * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 * @sd: sysfs_dirent of interest
120 *
121 * Unlink @sd from its sibling rbtree which starts from
122 * sd->s_parent->s_dir.children.
123 *
124 * Locking:
125 * mutex_lock(sysfs_mutex)
126 */
127static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128{
129 if (sysfs_type(sd) == SYSFS_DIR)
130 sd->s_parent->s_dir.subdirs--;
131
132 rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133}
134
135/**
136 * sysfs_get_active - get an active reference to sysfs_dirent
137 * @sd: sysfs_dirent to get an active reference to
138 *
 139 * Get an active reference to @sd. This function is a noop if @sd
140 * is NULL.
141 *
142 * RETURNS:
143 * Pointer to @sd on success, NULL on failure.
144 */
145struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146{
147 if (unlikely(!sd))
148 return NULL;
149
150 if (!atomic_inc_unless_negative(&sd->s_active))
151 return NULL;
152
153 if (likely(!sysfs_ignore_lockdep(sd)))
154 rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
155 return sd;
156}
157
158/**
159 * sysfs_put_active - put an active reference to sysfs_dirent
160 * @sd: sysfs_dirent to put an active reference to
161 *
 162 * Put an active reference to @sd. This function is a noop if @sd
163 * is NULL.
164 */
165void sysfs_put_active(struct sysfs_dirent *sd)
166{
167 int v;
168
169 if (unlikely(!sd))
170 return;
171
172 if (likely(!sysfs_ignore_lockdep(sd)))
173 rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 v = atomic_dec_return(&sd->s_active);
175 if (likely(v != SD_DEACTIVATED_BIAS))
176 return;
177
178 /* atomic_dec_return() is a mb(), we'll always see the updated
179 * sd->u.completion.
180 */
181 complete(sd->u.completion);
182}
183
184/**
185 * sysfs_deactivate - deactivate sysfs_dirent
186 * @sd: sysfs_dirent to deactivate
187 *
188 * Deny new active references and drain existing ones.
189 */
190static void sysfs_deactivate(struct sysfs_dirent *sd)
191{
192 DECLARE_COMPLETION_ONSTACK(wait);
193 int v;
194
195 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196
197 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
198 return;
199
200 sd->u.completion = (void *)&wait;
201
202 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203 /* atomic_add_return() is a mb(), put_active() will always see
204 * the updated sd->u.completion.
205 */
206 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
207
208 if (v != SD_DEACTIVATED_BIAS) {
209 lock_contended(&sd->dep_map, _RET_IP_);
210 wait_for_completion(&wait);
211 }
212
213 lock_acquired(&sd->dep_map, _RET_IP_);
214 rwsem_release(&sd->dep_map, 1, _RET_IP_);
215}
216
217static int sysfs_alloc_ino(unsigned int *pino)
218{
219 int ino, rc;
220
221 retry:
222 spin_lock(&sysfs_ino_lock);
223 rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
224 spin_unlock(&sysfs_ino_lock);
225
226 if (rc == -EAGAIN) {
227 if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
228 goto retry;
229 rc = -ENOMEM;
230 }
231
232 *pino = ino;
233 return rc;
234}
235
236static void sysfs_free_ino(unsigned int ino)
237{
238 spin_lock(&sysfs_ino_lock);
239 ida_remove(&sysfs_ino_ida, ino);
240 spin_unlock(&sysfs_ino_lock);
241}
242
243void release_sysfs_dirent(struct sysfs_dirent *sd)
244{
245 struct sysfs_dirent *parent_sd;
246
247 repeat:
 248 /* Moving/renaming is always done while holding a reference.
249 * sd->s_parent won't change beneath us.
250 */
251 parent_sd = sd->s_parent;
252
253 WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
254 "sysfs: free using entry: %s/%s\n",
255 parent_sd ? parent_sd->s_name : "", sd->s_name);
256
257 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
258 sysfs_put(sd->s_symlink.target_sd);
259 if (sysfs_type(sd) & SYSFS_COPY_NAME)
260 kfree(sd->s_name);
261 if (sd->s_iattr && sd->s_iattr->ia_secdata)
262 security_release_secctx(sd->s_iattr->ia_secdata,
263 sd->s_iattr->ia_secdata_len);
264 kfree(sd->s_iattr);
265 sysfs_free_ino(sd->s_ino);
266 kmem_cache_free(sysfs_dir_cachep, sd);
267
268 sd = parent_sd;
269 if (sd && atomic_dec_and_test(&sd->s_count))
270 goto repeat;
271}
272
273static int sysfs_dentry_delete(const struct dentry *dentry)
274{
275 struct sysfs_dirent *sd = dentry->d_fsdata;
276 return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
277}
278
279static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
280{
281 struct sysfs_dirent *sd;
282 int type;
283
284 if (flags & LOOKUP_RCU)
285 return -ECHILD;
286
287 sd = dentry->d_fsdata;
288 mutex_lock(&sysfs_mutex);
289
290 /* The sysfs dirent has been deleted */
291 if (sd->s_flags & SYSFS_FLAG_REMOVED)
292 goto out_bad;
293
294 /* The sysfs dirent has been moved? */
295 if (dentry->d_parent->d_fsdata != sd->s_parent)
296 goto out_bad;
297
298 /* The sysfs dirent has been renamed */
299 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
300 goto out_bad;
301
302 /* The sysfs dirent has been moved to a different namespace */
303 type = KOBJ_NS_TYPE_NONE;
304 if (sd->s_parent) {
305 type = sysfs_ns_type(sd->s_parent);
306 if (type != KOBJ_NS_TYPE_NONE &&
307 sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
308 goto out_bad;
309 }
310
311 mutex_unlock(&sysfs_mutex);
312out_valid:
313 return 1;
314out_bad:
315 /* Remove the dentry from the dcache hashes.
316 * If this is a deleted dentry we use d_drop instead of d_delete
317 * so sysfs doesn't need to cope with negative dentries.
318 *
319 * If this is a dentry that has simply been renamed we
320 * use d_drop to remove it from the dcache lookup on its
 321 * old parent. If this dentry persists, it will be re-added
 322 * to the dcache hashes when a lookup is later performed at
 323 * its new name.
324 */
325 mutex_unlock(&sysfs_mutex);
326
327 /* If we have submounts we must allow the vfs caches
328 * to lie about the state of the filesystem to prevent
329 * leaks and other nasty things.
330 */
331 if (check_submounts_and_drop(dentry) != 0)
332 goto out_valid;
333
334 return 0;
335}
336
337static void sysfs_dentry_release(struct dentry *dentry)
338{
339 sysfs_put(dentry->d_fsdata);
340}
341
342const struct dentry_operations sysfs_dentry_ops = {
343 .d_revalidate = sysfs_dentry_revalidate,
344 .d_delete = sysfs_dentry_delete,
345 .d_release = sysfs_dentry_release,
346};
347
348struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
349{
350 char *dup_name = NULL;
351 struct sysfs_dirent *sd;
352
353 if (type & SYSFS_COPY_NAME) {
354 name = dup_name = kstrdup(name, GFP_KERNEL);
355 if (!name)
356 return NULL;
357 }
358
359 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
360 if (!sd)
361 goto err_out1;
362
363 if (sysfs_alloc_ino(&sd->s_ino))
364 goto err_out2;
365
366 atomic_set(&sd->s_count, 1);
367 atomic_set(&sd->s_active, 0);
368
369 sd->s_name = name;
370 sd->s_mode = mode;
371 sd->s_flags = type | SYSFS_FLAG_REMOVED;
372
373 return sd;
374
375 err_out2:
376 kmem_cache_free(sysfs_dir_cachep, sd);
377 err_out1:
378 kfree(dup_name);
379 return NULL;
380}
381
382/**
383 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
384 * @acxt: pointer to sysfs_addrm_cxt to be used
385 *
386 * This function is called when the caller is about to add or remove
387 * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used
388 * to keep and pass context to other addrm functions.
389 *
390 * LOCKING:
391 * Kernel thread context (may sleep). sysfs_mutex is locked on
392 * return.
393 */
394void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
395 __acquires(sysfs_mutex)
396{
397 memset(acxt, 0, sizeof(*acxt));
398
399 mutex_lock(&sysfs_mutex);
400}
401
402/**
403 * __sysfs_add_one - add sysfs_dirent to parent without warning
404 * @acxt: addrm context to use
405 * @sd: sysfs_dirent to be added
406 * @parent_sd: the parent sysfs_dirent to add @sd to
407 *
408 * Get @parent_sd and set @sd->s_parent to it and increment nlink of
409 * the parent inode if @sd is a directory and link into the children
410 * list of the parent.
411 *
412 * This function should be called between calls to
413 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
414 * passed the same @acxt as passed to sysfs_addrm_start().
415 *
416 * LOCKING:
417 * Determined by sysfs_addrm_start().
418 *
419 * RETURNS:
420 * 0 on success, -EEXIST if entry with the given name already
421 * exists.
422 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
424 struct sysfs_dirent *parent_sd)
425{
426 struct sysfs_inode_attrs *ps_iattr;
427 int ret;
428
429 if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
430 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
431 sysfs_ns_type(parent_sd) ? "required" : "invalid",
432 parent_sd->s_name, sd->s_name);
433 return -EINVAL;
434 }
435
436 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
437 sd->s_parent = sysfs_get(parent_sd);
438
439 ret = sysfs_link_sibling(sd);
440 if (ret)
441 return ret;
442
443 /* Update timestamps on the parent */
444 ps_iattr = parent_sd->s_iattr;
445 if (ps_iattr) {
446 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
447 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
448 }
449
450 /* Mark the entry added into directory tree */
451 sd->s_flags &= ~SYSFS_FLAG_REMOVED;
452
453 return 0;
454}
455
456/** 22/**
457 * sysfs_pathname - return full path to sysfs dirent 23 * sysfs_pathname - return full path to sysfs dirent
458 * @sd: sysfs_dirent whose path we want 24 * @kn: kernfs_node whose path we want
459 * @path: caller allocated buffer of size PATH_MAX 25 * @path: caller allocated buffer of size PATH_MAX
460 * 26 *
461 * Gives the name "/" to the sysfs_root entry; any path returned 27 * Gives the name "/" to the sysfs_root entry; any path returned
462 * is relative to wherever sysfs is mounted. 28 * is relative to wherever sysfs is mounted.
463 */ 29 */
464static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) 30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
465{ 31{
466 if (sd->s_parent) { 32 if (kn->parent) {
467 sysfs_pathname(sd->s_parent, path); 33 sysfs_pathname(kn->parent, path);
468 strlcat(path, "/", PATH_MAX); 34 strlcat(path, "/", PATH_MAX);
469 } 35 }
470 strlcat(path, sd->s_name, PATH_MAX); 36 strlcat(path, kn->name, PATH_MAX);
471 return path; 37 return path;
472} 38}
473 39
474void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) 40void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
475{ 41{
476 char *path; 42 char *path;
477 43
@@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
489} 55}
490 56
491/** 57/**
492 * sysfs_add_one - add sysfs_dirent to parent
493 * @acxt: addrm context to use
494 * @sd: sysfs_dirent to be added
495 * @parent_sd: the parent sysfs_dirent to add @sd to
496 *
497 * Get @parent_sd and set @sd->s_parent to it and increment nlink of
498 * the parent inode if @sd is a directory and link into the children
499 * list of the parent.
500 *
501 * This function should be called between calls to
502 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
503 * passed the same @acxt as passed to sysfs_addrm_start().
504 *
505 * LOCKING:
506 * Determined by sysfs_addrm_start().
507 *
508 * RETURNS:
509 * 0 on success, -EEXIST if entry with the given name already
510 * exists.
511 */
512int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
513 struct sysfs_dirent *parent_sd)
514{
515 int ret;
516
517 ret = __sysfs_add_one(acxt, sd, parent_sd);
518
519 if (ret == -EEXIST)
520 sysfs_warn_dup(parent_sd, sd->s_name);
521 return ret;
522}
523
524/**
525 * sysfs_remove_one - remove sysfs_dirent from parent
526 * @acxt: addrm context to use
527 * @sd: sysfs_dirent to be removed
528 *
529 * Mark @sd removed and drop nlink of parent inode if @sd is a
530 * directory. @sd is unlinked from the children list.
531 *
532 * This function should be called between calls to
533 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
534 * passed the same @acxt as passed to sysfs_addrm_start().
535 *
536 * LOCKING:
537 * Determined by sysfs_addrm_start().
538 */
539static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
540 struct sysfs_dirent *sd)
541{
542 struct sysfs_inode_attrs *ps_iattr;
543
544 /*
545 * Removal can be called multiple times on the same node. Only the
546 * first invocation is effective and puts the base ref.
547 */
548 if (sd->s_flags & SYSFS_FLAG_REMOVED)
549 return;
550
551 sysfs_unlink_sibling(sd);
552
553 /* Update timestamps on the parent */
554 ps_iattr = sd->s_parent->s_iattr;
555 if (ps_iattr) {
556 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
557 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
558 }
559
560 sd->s_flags |= SYSFS_FLAG_REMOVED;
561 sd->u.removed_list = acxt->removed;
562 acxt->removed = sd;
563}
564
565/**
566 * sysfs_addrm_finish - finish up sysfs_dirent add/remove
567 * @acxt: addrm context to finish up
568 *
569 * Finish up sysfs_dirent add/remove. Resources acquired by
570 * sysfs_addrm_start() are released and removed sysfs_dirents are
571 * cleaned up.
572 *
573 * LOCKING:
574 * sysfs_mutex is released.
575 */
576void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
577 __releases(sysfs_mutex)
578{
579 /* release resources acquired by sysfs_addrm_start() */
580 mutex_unlock(&sysfs_mutex);
581
582 /* kill removed sysfs_dirents */
583 while (acxt->removed) {
584 struct sysfs_dirent *sd = acxt->removed;
585
586 acxt->removed = sd->u.removed_list;
587
588 sysfs_deactivate(sd);
589 sysfs_unmap_bin_file(sd);
590 sysfs_put(sd);
591 }
592}
593
594/**
595 * sysfs_find_dirent - find sysfs_dirent with the given name
596 * @parent_sd: sysfs_dirent to search under
597 * @name: name to look for
598 * @ns: the namespace tag to use
599 *
600 * Look for sysfs_dirent with name @name under @parent_sd.
601 *
602 * LOCKING:
603 * mutex_lock(sysfs_mutex)
604 *
605 * RETURNS:
606 * Pointer to sysfs_dirent if found, NULL if not.
607 */
608struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
609 const unsigned char *name,
610 const void *ns)
611{
612 struct rb_node *node = parent_sd->s_dir.children.rb_node;
613 unsigned int hash;
614
615 if (!!sysfs_ns_type(parent_sd) != !!ns) {
616 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
617 sysfs_ns_type(parent_sd) ? "required" : "invalid",
618 parent_sd->s_name, name);
619 return NULL;
620 }
621
622 hash = sysfs_name_hash(name, ns);
623 while (node) {
624 struct sysfs_dirent *sd;
625 int result;
626
627 sd = to_sysfs_dirent(node);
628 result = sysfs_name_compare(hash, name, ns, sd);
629 if (result < 0)
630 node = node->rb_left;
631 else if (result > 0)
632 node = node->rb_right;
633 else
634 return sd;
635 }
636 return NULL;
637}
638
639/**
640 * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
641 * @parent_sd: sysfs_dirent to search under
642 * @name: name to look for
643 * @ns: the namespace tag to use
644 *
645 * Look for sysfs_dirent with name @name under @parent_sd and get
646 * it if found.
647 *
648 * LOCKING:
649 * Kernel thread context (may sleep). Grabs sysfs_mutex.
650 *
651 * RETURNS:
652 * Pointer to sysfs_dirent if found, NULL if not.
653 */
654struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
655 const unsigned char *name,
656 const void *ns)
657{
658 struct sysfs_dirent *sd;
659
660 mutex_lock(&sysfs_mutex);
661 sd = sysfs_find_dirent(parent_sd, name, ns);
662 sysfs_get(sd);
663 mutex_unlock(&sysfs_mutex);
664
665 return sd;
666}
667EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
668
669static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
670 enum kobj_ns_type type,
671 const char *name, const void *ns,
672 struct sysfs_dirent **p_sd)
673{
674 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
675 struct sysfs_addrm_cxt acxt;
676 struct sysfs_dirent *sd;
677 int rc;
678
679 /* allocate */
680 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
681 if (!sd)
682 return -ENOMEM;
683
684 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
685 sd->s_ns = ns;
686 sd->s_dir.kobj = kobj;
687
688 /* link in */
689 sysfs_addrm_start(&acxt);
690 rc = sysfs_add_one(&acxt, sd, parent_sd);
691 sysfs_addrm_finish(&acxt);
692
693 if (rc == 0)
694 *p_sd = sd;
695 else
696 sysfs_put(sd);
697
698 return rc;
699}
700
701int sysfs_create_subdir(struct kobject *kobj, const char *name,
702 struct sysfs_dirent **p_sd)
703{
704 return create_dir(kobj, kobj->sd,
705 KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
706}
707
708/**
709 * sysfs_read_ns_type: return associated ns_type
710 * @kobj: the kobject being queried
711 *
712 * Each kobject can be tagged with exactly one namespace type
713 * (i.e. network or user). Return the ns_type associated with
714 * this object if any
715 */
716static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
717{
718 const struct kobj_ns_type_operations *ops;
719 enum kobj_ns_type type;
720
721 ops = kobj_child_ns_ops(kobj);
722 if (!ops)
723 return KOBJ_NS_TYPE_NONE;
724
725 type = ops->type;
726 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
727 BUG_ON(type >= KOBJ_NS_TYPES);
728 BUG_ON(!kobj_ns_type_registered(type));
729
730 return type;
731}
732
733/**
734 * sysfs_create_dir_ns - create a directory for an object with a namespace tag 58 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
735 * @kobj: object we're creating directory for 59 * @kobj: object we're creating directory for
736 * @ns: the namespace tag to use 60 * @ns: the namespace tag to use
737 */ 61 */
738int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) 62int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
739{ 63{
740 enum kobj_ns_type type; 64 struct kernfs_node *parent, *kn;
741 struct sysfs_dirent *parent_sd, *sd;
742 int error = 0;
743 65
744 BUG_ON(!kobj); 66 BUG_ON(!kobj);
745 67
746 if (kobj->parent) 68 if (kobj->parent)
747 parent_sd = kobj->parent->sd; 69 parent = kobj->parent->sd;
748 else 70 else
749 parent_sd = &sysfs_root; 71 parent = sysfs_root_kn;
750 72
751 if (!parent_sd) 73 if (!parent)
752 return -ENOENT; 74 return -ENOENT;
753 75
754 type = sysfs_read_ns_type(kobj); 76 kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
755 77 S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
756 error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); 78 if (IS_ERR(kn)) {
757 if (!error) 79 if (PTR_ERR(kn) == -EEXIST)
758 kobj->sd = sd; 80 sysfs_warn_dup(parent, kobject_name(kobj));
759 return error; 81 return PTR_ERR(kn);
760}
761
762static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 unsigned int flags)
764{
765 struct dentry *ret = NULL;
766 struct dentry *parent = dentry->d_parent;
767 struct sysfs_dirent *parent_sd = parent->d_fsdata;
768 struct sysfs_dirent *sd;
769 struct inode *inode;
770 enum kobj_ns_type type;
771 const void *ns;
772
773 mutex_lock(&sysfs_mutex);
774
775 type = sysfs_ns_type(parent_sd);
776 ns = sysfs_info(dir->i_sb)->ns[type];
777
778 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
779
780 /* no such entry */
781 if (!sd) {
782 ret = ERR_PTR(-ENOENT);
783 goto out_unlock;
784 }
785 dentry->d_fsdata = sysfs_get(sd);
786
787 /* attach dentry and inode */
788 inode = sysfs_get_inode(dir->i_sb, sd);
789 if (!inode) {
790 ret = ERR_PTR(-ENOMEM);
791 goto out_unlock;
792 }
793
794 /* instantiate and hash dentry */
795 ret = d_materialise_unique(dentry, inode);
796 out_unlock:
797 mutex_unlock(&sysfs_mutex);
798 return ret;
799}
800
801const struct inode_operations sysfs_dir_inode_operations = {
802 .lookup = sysfs_lookup,
803 .permission = sysfs_permission,
804 .setattr = sysfs_setattr,
805 .getattr = sysfs_getattr,
806 .setxattr = sysfs_setxattr,
807};
808
809static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
810{
811 struct sysfs_dirent *last;
812
813 while (true) {
814 struct rb_node *rbn;
815
816 last = pos;
817
818 if (sysfs_type(pos) != SYSFS_DIR)
819 break;
820
821 rbn = rb_first(&pos->s_dir.children);
822 if (!rbn)
823 break;
824
825 pos = to_sysfs_dirent(rbn);
826 }
827
828 return last;
829}
830
831/**
832 * sysfs_next_descendant_post - find the next descendant for post-order walk
833 * @pos: the current position (%NULL to initiate traversal)
834 * @root: sysfs_dirent whose descendants to walk
835 *
836 * Find the next descendant to visit for post-order traversal of @root's
837 * descendants. @root is included in the iteration and the last node to be
838 * visited.
839 */
840static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
841 struct sysfs_dirent *root)
842{
843 struct rb_node *rbn;
844
845 lockdep_assert_held(&sysfs_mutex);
846
847 /* if first iteration, visit leftmost descendant which may be root */
848 if (!pos)
849 return sysfs_leftmost_descendant(root);
850
851 /* if we visited @root, we're done */
852 if (pos == root)
853 return NULL;
854
855 /* if there's an unvisited sibling, visit its leftmost descendant */
856 rbn = rb_next(&pos->s_rb);
857 if (rbn)
858 return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
859
860 /* no sibling left, visit parent */
861 return pos->s_parent;
862}
863
864static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
865 struct sysfs_dirent *sd)
866{
867 struct sysfs_dirent *pos, *next;
868
869 if (!sd)
870 return;
871
872 pr_debug("sysfs %s: removing\n", sd->s_name);
873
874 next = NULL;
875 do {
876 pos = next;
877 next = sysfs_next_descendant_post(pos, sd);
878 if (pos)
879 sysfs_remove_one(acxt, pos);
880 } while (next);
881}
882
883/**
884 * sysfs_remove - remove a sysfs_dirent recursively
885 * @sd: the sysfs_dirent to remove
886 *
887 * Remove @sd along with all its subdirectories and files.
888 */
889void sysfs_remove(struct sysfs_dirent *sd)
890{
891 struct sysfs_addrm_cxt acxt;
892
893 sysfs_addrm_start(&acxt);
894 __sysfs_remove(&acxt, sd);
895 sysfs_addrm_finish(&acxt);
896}
897
898/**
899 * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
900 * @dir_sd: parent of the target
901 * @name: name of the sysfs_dirent to remove
902 * @ns: namespace tag of the sysfs_dirent to remove
903 *
904 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
905 * it. Returns 0 on success, -ENOENT if such entry doesn't exist.
906 */
907int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
908 const void *ns)
909{
910 struct sysfs_addrm_cxt acxt;
911 struct sysfs_dirent *sd;
912
913 if (!dir_sd) {
914 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
915 name);
916 return -ENOENT;
917 } 82 }
918 83
919 sysfs_addrm_start(&acxt); 84 kobj->sd = kn;
920 85 return 0;
921 sd = sysfs_find_dirent(dir_sd, name, ns);
922 if (sd)
923 __sysfs_remove(&acxt, sd);
924
925 sysfs_addrm_finish(&acxt);
926
927 if (sd)
928 return 0;
929 else
930 return -ENOENT;
931} 86}
932 87
933/** 88/**
@@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
940 */ 95 */
941void sysfs_remove_dir(struct kobject *kobj) 96void sysfs_remove_dir(struct kobject *kobj)
942{ 97{
943 struct sysfs_dirent *sd = kobj->sd; 98 struct kernfs_node *kn = kobj->sd;
944 99
945 /* 100 /*
 946 * In general, kobject owner is responsible for ensuring removal 101
947 * doesn't race with other operations and sysfs doesn't provide any 102 * doesn't race with other operations and sysfs doesn't provide any
948 * protection; however, when @kobj is used as a symlink target, the 103 * protection; however, when @kobj is used as a symlink target, the
949 * symlinking entity usually doesn't own @kobj and thus has no 104 * symlinking entity usually doesn't own @kobj and thus has no
950 * control over removal. @kobj->sd may be removed anytime and 105 * control over removal. @kobj->sd may be removed anytime
951 * symlink code may end up dereferencing an already freed sd. 106 * and symlink code may end up dereferencing an already freed node.
952 * 107 *
953 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation 108 * sysfs_symlink_target_lock synchronizes @kobj->sd
954 * against symlink operations so that symlink code can safely 109 * disassociation against symlink operations so that symlink code
955 * dereference @kobj->sd. 110 * can safely dereference @kobj->sd.
956 */ 111 */
957 spin_lock(&sysfs_symlink_target_lock); 112 spin_lock(&sysfs_symlink_target_lock);
958 kobj->sd = NULL; 113 kobj->sd = NULL;
959 spin_unlock(&sysfs_symlink_target_lock); 114 spin_unlock(&sysfs_symlink_target_lock);
960 115
961 if (sd) { 116 if (kn) {
962 WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); 117 WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
963 sysfs_remove(sd); 118 kernfs_remove(kn);
964 } 119 }
965} 120}
966 121
967int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
968 const char *new_name, const void *new_ns)
969{
970 int error;
971
972 mutex_lock(&sysfs_mutex);
973
974 error = 0;
975 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
976 (strcmp(sd->s_name, new_name) == 0))
977 goto out; /* nothing to rename */
978
979 error = -EEXIST;
980 if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
981 goto out;
982
983 /* rename sysfs_dirent */
984 if (strcmp(sd->s_name, new_name) != 0) {
985 error = -ENOMEM;
986 new_name = kstrdup(new_name, GFP_KERNEL);
987 if (!new_name)
988 goto out;
989
990 kfree(sd->s_name);
991 sd->s_name = new_name;
992 }
993
994 /*
995 * Move to the appropriate place in the appropriate directories rbtree.
996 */
997 sysfs_unlink_sibling(sd);
998 sysfs_get(new_parent_sd);
999 sysfs_put(sd->s_parent);
1000 sd->s_ns = new_ns;
1001 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
1002 sd->s_parent = new_parent_sd;
1003 sysfs_link_sibling(sd);
1004
1005 error = 0;
1006 out:
1007 mutex_unlock(&sysfs_mutex);
1008 return error;
1009}
1010
1011int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
1012 const void *new_ns) 123 const void *new_ns)
1013{ 124{
1014 struct sysfs_dirent *parent_sd = kobj->sd->s_parent; 125 struct kernfs_node *parent = kobj->sd->parent;
1015 126
1016 return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); 127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
1017} 128}
1018 129
1019int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
1020 const void *new_ns) 131 const void *new_ns)
1021{ 132{
1022 struct sysfs_dirent *sd = kobj->sd; 133 struct kernfs_node *kn = kobj->sd;
1023 struct sysfs_dirent *new_parent_sd; 134 struct kernfs_node *new_parent;
1024 135
1025 BUG_ON(!sd->s_parent); 136 BUG_ON(!kn->parent);
1026 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 137 new_parent = new_parent_kobj && new_parent_kobj->sd ?
1027 new_parent_kobj->sd : &sysfs_root; 138 new_parent_kobj->sd : sysfs_root_kn;
1028 139
1029 return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); 140 return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
1030} 141}
1031
1032/* Relationship between s_mode and the DT_xxx types */
1033static inline unsigned char dt_type(struct sysfs_dirent *sd)
1034{
1035 return (sd->s_mode >> 12) & 15;
1036}
1037
1038static int sysfs_dir_release(struct inode *inode, struct file *filp)
1039{
1040 sysfs_put(filp->private_data);
1041 return 0;
1042}
1043
1044static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
1045 struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
1046{
1047 if (pos) {
1048 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
1049 pos->s_parent == parent_sd &&
1050 hash == pos->s_hash;
1051 sysfs_put(pos);
1052 if (!valid)
1053 pos = NULL;
1054 }
1055 if (!pos && (hash > 1) && (hash < INT_MAX)) {
1056 struct rb_node *node = parent_sd->s_dir.children.rb_node;
1057 while (node) {
1058 pos = to_sysfs_dirent(node);
1059
1060 if (hash < pos->s_hash)
1061 node = node->rb_left;
1062 else if (hash > pos->s_hash)
1063 node = node->rb_right;
1064 else
1065 break;
1066 }
1067 }
1068 /* Skip over entries in the wrong namespace */
1069 while (pos && pos->s_ns != ns) {
1070 struct rb_node *node = rb_next(&pos->s_rb);
1071 if (!node)
1072 pos = NULL;
1073 else
1074 pos = to_sysfs_dirent(node);
1075 }
1076 return pos;
1077}
1078
1079static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
1080 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
1081{
1082 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
1083 if (pos)
1084 do {
1085 struct rb_node *node = rb_next(&pos->s_rb);
1086 if (!node)
1087 pos = NULL;
1088 else
1089 pos = to_sysfs_dirent(node);
1090 } while (pos && pos->s_ns != ns);
1091 return pos;
1092}
1093
1094static int sysfs_readdir(struct file *file, struct dir_context *ctx)
1095{
1096 struct dentry *dentry = file->f_path.dentry;
1097 struct sysfs_dirent *parent_sd = dentry->d_fsdata;
1098 struct sysfs_dirent *pos = file->private_data;
1099 enum kobj_ns_type type;
1100 const void *ns;
1101
1102 type = sysfs_ns_type(parent_sd);
1103 ns = sysfs_info(dentry->d_sb)->ns[type];
1104
1105 if (!dir_emit_dots(file, ctx))
1106 return 0;
1107 mutex_lock(&sysfs_mutex);
1108 for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1109 pos;
1110 pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
1111 const char *name = pos->s_name;
1112 unsigned int type = dt_type(pos);
1113 int len = strlen(name);
1114 ino_t ino = pos->s_ino;
1115 ctx->pos = pos->s_hash;
1116 file->private_data = sysfs_get(pos);
1117
1118 mutex_unlock(&sysfs_mutex);
1119 if (!dir_emit(ctx, name, len, ino, type))
1120 return 0;
1121 mutex_lock(&sysfs_mutex);
1122 }
1123 mutex_unlock(&sysfs_mutex);
1124 file->private_data = NULL;
1125 ctx->pos = INT_MAX;
1126 return 0;
1127}
1128
1129static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
1130{
1131 struct inode *inode = file_inode(file);
1132 loff_t ret;
1133
1134 mutex_lock(&inode->i_mutex);
1135 ret = generic_file_llseek(file, offset, whence);
1136 mutex_unlock(&inode->i_mutex);
1137
1138 return ret;
1139}
1140
1141const struct file_operations sysfs_dir_operations = {
1142 .read = generic_read_dir,
1143 .iterate = sysfs_readdir,
1144 .release = sysfs_dir_release,
1145 .llseek = sysfs_dir_llseek,
1146};
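Everything deleted above now lives, in generalized form, in fs/kernfs/; what remains of sysfs is a set of thin wrappers that translate kobject-level calls into kernfs calls, as sysfs_create_dir_ns() shows. A sketch of the same wrapper pattern (example_mkdir is invented, not part of the patch):

static int example_mkdir(struct kernfs_node *parent, const char *name,
			 void *priv)
{
	struct kernfs_node *kn;

	kn = kernfs_create_dir_ns(parent, name,
				  S_IRWXU | S_IRUGO | S_IXUGO, priv, NULL);
	if (IS_ERR(kn)) {
		if (PTR_ERR(kn) == -EEXIST)
			sysfs_warn_dup(parent, name);
		return PTR_ERR(kn);
	}
	return 0;
}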
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b94f93685093..810cf6e613e5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/fsnotify.h>
18#include <linux/namei.h>
19#include <linux/poll.h>
20#include <linux/list.h> 17#include <linux/list.h>
21#include <linux/mutex.h> 18#include <linux/mutex.h>
22#include <linux/limits.h>
23#include <linux/uaccess.h>
24#include <linux/seq_file.h> 19#include <linux/seq_file.h>
25#include <linux/mm.h>
26 20
27#include "sysfs.h" 21#include "sysfs.h"
22#include "../kernfs/kernfs-internal.h"
28 23
29/* 24/*
30 * There's one sysfs_open_file for each open file and one sysfs_open_dirent 25 * Determine ktype->sysfs_ops for the given kernfs_node. This function
31 * for each sysfs_dirent with one or more open files.
32 *
33 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is
34 * protected by sysfs_open_dirent_lock.
35 *
36 * filp->private_data points to seq_file whose ->private points to
37 * sysfs_open_file. sysfs_open_files are chained at
38 * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
39 */
40static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
41static DEFINE_MUTEX(sysfs_open_file_mutex);
42
43struct sysfs_open_dirent {
44 atomic_t refcnt;
45 atomic_t event;
46 wait_queue_head_t poll;
47 struct list_head files; /* goes through sysfs_open_file.list */
48};
49
50struct sysfs_open_file {
51 struct sysfs_dirent *sd;
52 struct file *file;
53 struct mutex mutex;
54 int event;
55 struct list_head list;
56
57 bool mmapped;
58 const struct vm_operations_struct *vm_ops;
59};
60
61static bool sysfs_is_bin(struct sysfs_dirent *sd)
62{
63 return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
64}
65
66static struct sysfs_open_file *sysfs_of(struct file *file)
67{
68 return ((struct seq_file *)file->private_data)->private;
69}
70
71/*
72 * Determine ktype->sysfs_ops for the given sysfs_dirent. This function
73 * must be called while holding an active reference. 26 * must be called while holding an active reference.
74 */ 27 */
75static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) 28static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
76{ 29{
77 struct kobject *kobj = sd->s_parent->s_dir.kobj; 30 struct kobject *kobj = kn->parent->priv;
78 31
79 if (!sysfs_ignore_lockdep(sd)) 32 if (kn->flags & KERNFS_LOCKDEP)
80 lockdep_assert_held(sd); 33 lockdep_assert_held(kn);
81 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; 34 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
82} 35}
83 36
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
86 * details like buffering and seeking. The following function pipes 39 * details like buffering and seeking. The following function pipes
87 * sysfs_ops->show() result through seq_file. 40 * sysfs_ops->show() result through seq_file.
88 */ 41 */
89static int sysfs_seq_show(struct seq_file *sf, void *v) 42static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
90{ 43{
91 struct sysfs_open_file *of = sf->private; 44 struct kernfs_open_file *of = sf->private;
92 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 45 struct kobject *kobj = of->kn->parent->priv;
93 const struct sysfs_ops *ops; 46 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
94 char *buf;
95 ssize_t count; 47 ssize_t count;
48 char *buf;
96 49
97 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE */
98 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
102 } 55 }
103 56
104 /* 57 /*
105 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex 58 * Invoke show(). Control may reach here via seq file lseek even
106 * nests outside active ref and is just to ensure that the ops 59 * if @ops->show() isn't implemented.
107 * aren't called concurrently for the same open file.
108 */ 60 */
109 mutex_lock(&of->mutex); 61 if (ops->show) {
110 if (!sysfs_get_active(of->sd)) { 62 count = ops->show(kobj, of->kn->priv, buf);
111 mutex_unlock(&of->mutex); 63 if (count < 0)
112 return -ENODEV; 64 return count;
113 } 65 }
114 66
115 of->event = atomic_read(&of->sd->s_attr.open->event);
116
117 /*
118 * Lookup @ops and invoke show(). Control may reach here via seq
119 * file lseek even if @ops->show() isn't implemented.
120 */
121 ops = sysfs_file_ops(of->sd);
122 if (ops->show)
123 count = ops->show(kobj, of->sd->s_attr.attr, buf);
124 else
125 count = 0;
126
127 sysfs_put_active(of->sd);
128 mutex_unlock(&of->mutex);
129
130 if (count < 0)
131 return count;
132
133 /* 67 /*
134 * The code works fine with PAGE_SIZE return but it's likely to 68 * The code works fine with PAGE_SIZE return but it's likely to
135 * indicate truncated result or overflow in normal use cases. 69 * indicate truncated result or overflow in normal use cases.
@@ -144,728 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
144 return 0; 78 return 0;
145} 79}
146 80
147/* 81static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
148 * Read method for bin files. As reading a bin file can have side-effects, 82 size_t count, loff_t pos)
149 * the exact offset and bytes specified in read(2) call should be passed to
150 * the read callback making it difficult to use seq_file. Implement
151 * simplistic custom buffering for bin files.
152 */
153static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
154 size_t bytes, loff_t *off)
155{ 83{
156 struct sysfs_open_file *of = sysfs_of(file); 84 struct bin_attribute *battr = of->kn->priv;
157 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 85 struct kobject *kobj = of->kn->parent->priv;
158 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 86 loff_t size = file_inode(of->file)->i_size;
159 loff_t size = file_inode(file)->i_size;
160 int count = min_t(size_t, bytes, PAGE_SIZE);
161 loff_t offs = *off;
162 char *buf;
163 87
164 if (!bytes) 88 if (!count)
165 return 0; 89 return 0;
166 90
167 if (size) { 91 if (size) {
168 if (offs > size) 92 if (pos > size)
169 return 0; 93 return 0;
170 if (offs + count > size) 94 if (pos + count > size)
171 count = size - offs; 95 count = size - pos;
172 }
173
174 buf = kmalloc(count, GFP_KERNEL);
175 if (!buf)
176 return -ENOMEM;
177
178 /* need of->sd for battr, its parent for kobj */
179 mutex_lock(&of->mutex);
180 if (!sysfs_get_active(of->sd)) {
181 count = -ENODEV;
182 mutex_unlock(&of->mutex);
183 goto out_free;
184 }
185
186 if (battr->read)
187 count = battr->read(file, kobj, battr, buf, offs, count);
188 else
189 count = -EIO;
190
191 sysfs_put_active(of->sd);
192 mutex_unlock(&of->mutex);
193
194 if (count < 0)
195 goto out_free;
196
197 if (copy_to_user(userbuf, buf, count)) {
198 count = -EFAULT;
199 goto out_free;
200 } 96 }
201 97
202 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); 98 if (!battr->read)
203 99 return -EIO;
204 *off = offs + count;
205 100
206 out_free: 101 return battr->read(of->file, kobj, battr, buf, pos, count);
207 kfree(buf);
208 return count;
209} 102}
210 103
211/** 104/* kernfs write callback for regular sysfs files */
212 * flush_write_buffer - push buffer to kobject 105static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
213 * @of: open file 106 size_t count, loff_t pos)
214 * @buf: data buffer for file
215 * @off: file offset to write to
216 * @count: number of bytes
217 *
218 * Get the correct pointers for the kobject and the attribute we're dealing
219 * with, then call the store() method for it with @buf.
220 */
221static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
222 size_t count)
223{ 107{
224 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 108 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
225 int rc = 0; 109 struct kobject *kobj = of->kn->parent->priv;
226
227 /*
228 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex
229 * nests outside active ref and is just to ensure that the ops
230 * aren't called concurrently for the same open file.
231 */
232 mutex_lock(&of->mutex);
233 if (!sysfs_get_active(of->sd)) {
234 mutex_unlock(&of->mutex);
235 return -ENODEV;
236 }
237 110
238 if (sysfs_is_bin(of->sd)) { 111 if (!count)
239 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 112 return 0;
240
241 rc = -EIO;
242 if (battr->write)
243 rc = battr->write(of->file, kobj, battr, buf, off,
244 count);
245 } else {
246 const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
247
248 rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
249 }
250
251 sysfs_put_active(of->sd);
252 mutex_unlock(&of->mutex);
253 113
254 return rc; 114 return ops->store(kobj, of->kn->priv, buf, count);
255} 115}
256 116
257/** 117/* kernfs write callback for bin sysfs files */
258 * sysfs_write_file - write an attribute 118static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
259 * @file: file pointer 119 size_t count, loff_t pos)
260 * @user_buf: data to write
261 * @count: number of bytes
262 * @ppos: starting offset
263 *
264 * Copy data in from userland and pass it to the matching
265 * sysfs_ops->store() by invoking flush_write_buffer().
266 *
267 * There is no easy way for us to know if userspace is only doing a partial
268 * write, so we don't support them. We expect the entire buffer to come on
269 * the first write. Hint: if you're writing a value, first read the file,
 270 * modify only the value you're changing, then write the entire buffer
271 * back.
272 */
273static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
274 size_t count, loff_t *ppos)
275{ 120{
276 struct sysfs_open_file *of = sysfs_of(file); 121 struct bin_attribute *battr = of->kn->priv;
277 ssize_t len = min_t(size_t, count, PAGE_SIZE); 122 struct kobject *kobj = of->kn->parent->priv;
278 loff_t size = file_inode(file)->i_size; 123 loff_t size = file_inode(of->file)->i_size;
279 char *buf;
280 124
281 if (sysfs_is_bin(of->sd) && size) { 125 if (size) {
282 if (size <= *ppos) 126 if (size <= pos)
283 return 0; 127 return 0;
284 len = min_t(ssize_t, len, size - *ppos); 128 count = min_t(ssize_t, count, size - pos);
285 } 129 }
286 130 if (!count)
287 if (!len)
288 return 0; 131 return 0;
289 132
290 buf = kmalloc(len + 1, GFP_KERNEL); 133 if (!battr->write)
291 if (!buf) 134 return -EIO;
292 return -ENOMEM;
293 135
294 if (copy_from_user(buf, user_buf, len)) { 136 return battr->write(of->file, kobj, battr, buf, pos, count);
295 len = -EFAULT;
296 goto out_free;
297 }
298 buf[len] = '\0'; /* guarantee string termination */
299
300 len = flush_write_buffer(of, buf, *ppos, len);
301 if (len > 0)
302 *ppos += len;
303out_free:
304 kfree(buf);
305 return len;
306}
307
308static void sysfs_bin_vma_open(struct vm_area_struct *vma)
309{
310 struct file *file = vma->vm_file;
311 struct sysfs_open_file *of = sysfs_of(file);
312
313 if (!of->vm_ops)
314 return;
315
316 if (!sysfs_get_active(of->sd))
317 return;
318
319 if (of->vm_ops->open)
320 of->vm_ops->open(vma);
321
322 sysfs_put_active(of->sd);
323} 137}
324 138
325static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 139static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
140 struct vm_area_struct *vma)
326{ 141{
327 struct file *file = vma->vm_file; 142 struct bin_attribute *battr = of->kn->priv;
328 struct sysfs_open_file *of = sysfs_of(file); 143 struct kobject *kobj = of->kn->parent->priv;
329 int ret;
330 144
331 if (!of->vm_ops) 145 return battr->mmap(of->file, kobj, battr, vma);
332 return VM_FAULT_SIGBUS;
333
334 if (!sysfs_get_active(of->sd))
335 return VM_FAULT_SIGBUS;
336
337 ret = VM_FAULT_SIGBUS;
338 if (of->vm_ops->fault)
339 ret = of->vm_ops->fault(vma, vmf);
340
341 sysfs_put_active(of->sd);
342 return ret;
343} 146}
344 147
345static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, 148void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
346 struct vm_fault *vmf)
347{ 149{
348 struct file *file = vma->vm_file; 150 struct kernfs_node *kn = kobj->sd, *tmp;
349 struct sysfs_open_file *of = sysfs_of(file);
350 int ret;
351
352 if (!of->vm_ops)
353 return VM_FAULT_SIGBUS;
354 151
355 if (!sysfs_get_active(of->sd)) 152 if (kn && dir)
356 return VM_FAULT_SIGBUS; 153 kn = kernfs_find_and_get(kn, dir);
357
358 ret = 0;
359 if (of->vm_ops->page_mkwrite)
360 ret = of->vm_ops->page_mkwrite(vma, vmf);
361 else 154 else
362 file_update_time(file); 155 kernfs_get(kn);
363
364 sysfs_put_active(of->sd);
365 return ret;
366}
367
368static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
369 void *buf, int len, int write)
370{
371 struct file *file = vma->vm_file;
372 struct sysfs_open_file *of = sysfs_of(file);
373 int ret;
374
375 if (!of->vm_ops)
376 return -EINVAL;
377
378 if (!sysfs_get_active(of->sd))
379 return -EINVAL;
380
381 ret = -EINVAL;
382 if (of->vm_ops->access)
383 ret = of->vm_ops->access(vma, addr, buf, len, write);
384
385 sysfs_put_active(of->sd);
386 return ret;
387}
388
389#ifdef CONFIG_NUMA
390static int sysfs_bin_set_policy(struct vm_area_struct *vma,
391 struct mempolicy *new)
392{
393 struct file *file = vma->vm_file;
394 struct sysfs_open_file *of = sysfs_of(file);
395 int ret;
396
397 if (!of->vm_ops)
398 return 0;
399
400 if (!sysfs_get_active(of->sd))
401 return -EINVAL;
402
403 ret = 0;
404 if (of->vm_ops->set_policy)
405 ret = of->vm_ops->set_policy(vma, new);
406
407 sysfs_put_active(of->sd);
408 return ret;
409}
410
411static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
412 unsigned long addr)
413{
414 struct file *file = vma->vm_file;
415 struct sysfs_open_file *of = sysfs_of(file);
416 struct mempolicy *pol;
417
418 if (!of->vm_ops)
419 return vma->vm_policy;
420
421 if (!sysfs_get_active(of->sd))
422 return vma->vm_policy;
423
424 pol = vma->vm_policy;
425 if (of->vm_ops->get_policy)
426 pol = of->vm_ops->get_policy(vma, addr);
427
428 sysfs_put_active(of->sd);
429 return pol;
430}
431
432static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
433 const nodemask_t *to, unsigned long flags)
434{
435 struct file *file = vma->vm_file;
436 struct sysfs_open_file *of = sysfs_of(file);
437 int ret;
438
439 if (!of->vm_ops)
440 return 0;
441
442 if (!sysfs_get_active(of->sd))
443 return 0;
444
445 ret = 0;
446 if (of->vm_ops->migrate)
447 ret = of->vm_ops->migrate(vma, from, to, flags);
448
449 sysfs_put_active(of->sd);
450 return ret;
451}
452#endif
453
454static const struct vm_operations_struct sysfs_bin_vm_ops = {
455 .open = sysfs_bin_vma_open,
456 .fault = sysfs_bin_fault,
457 .page_mkwrite = sysfs_bin_page_mkwrite,
458 .access = sysfs_bin_access,
459#ifdef CONFIG_NUMA
460 .set_policy = sysfs_bin_set_policy,
461 .get_policy = sysfs_bin_get_policy,
462 .migrate = sysfs_bin_migrate,
463#endif
464};
465
466static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
467{
468 struct sysfs_open_file *of = sysfs_of(file);
469 struct bin_attribute *battr = of->sd->s_attr.bin_attr;
470 struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
471 int rc;
472
473 mutex_lock(&of->mutex);
474
475 /* need of->sd for battr, its parent for kobj */
476 rc = -ENODEV;
477 if (!sysfs_get_active(of->sd))
478 goto out_unlock;
479
480 if (!battr->mmap)
481 goto out_put;
482
483 rc = battr->mmap(file, kobj, battr, vma);
484 if (rc)
485 goto out_put;
486
487 /*
488 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
489 * to satisfy versions of X which crash if the mmap fails: that
490 * substitutes a new vm_file, and we don't then want bin_vm_ops.
491 */
492 if (vma->vm_file != file)
493 goto out_put;
494
495 rc = -EINVAL;
496 if (of->mmapped && of->vm_ops != vma->vm_ops)
497 goto out_put;
498
499 /*
500 * It is not possible to successfully wrap close.
501 * So error if someone is trying to use close.
502 */
503 rc = -EINVAL;
504 if (vma->vm_ops && vma->vm_ops->close)
505 goto out_put;
506
507 rc = 0;
508 of->mmapped = 1;
509 of->vm_ops = vma->vm_ops;
510 vma->vm_ops = &sysfs_bin_vm_ops;
511out_put:
512 sysfs_put_active(of->sd);
513out_unlock:
514 mutex_unlock(&of->mutex);
515
516 return rc;
517}
518
519/**
520 * sysfs_get_open_dirent - get or create sysfs_open_dirent
521 * @sd: target sysfs_dirent
522 * @of: sysfs_open_file for this instance of open
523 *
524 * If @sd->s_attr.open exists, increment its reference count;
525 * otherwise, create one. @of is chained to the files list.
526 *
527 * LOCKING:
528 * Kernel thread context (may sleep).
529 *
530 * RETURNS:
531 * 0 on success, -errno on failure.
532 */
533static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
534 struct sysfs_open_file *of)
535{
536 struct sysfs_open_dirent *od, *new_od = NULL;
537
538 retry:
539 mutex_lock(&sysfs_open_file_mutex);
540 spin_lock_irq(&sysfs_open_dirent_lock);
541
542 if (!sd->s_attr.open && new_od) {
543 sd->s_attr.open = new_od;
544 new_od = NULL;
545 }
546
547 od = sd->s_attr.open;
548 if (od) {
549 atomic_inc(&od->refcnt);
550 list_add_tail(&of->list, &od->files);
551 }
552
553 spin_unlock_irq(&sysfs_open_dirent_lock);
554 mutex_unlock(&sysfs_open_file_mutex);
555
556 if (od) {
557 kfree(new_od);
558 return 0;
559 }
560
561 /* not there, initialize a new one and retry */
562 new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
563 if (!new_od)
564 return -ENOMEM;
565
566 atomic_set(&new_od->refcnt, 0);
567 atomic_set(&new_od->event, 1);
568 init_waitqueue_head(&new_od->poll);
569 INIT_LIST_HEAD(&new_od->files);
570 goto retry;
571}

156
157 if (kn && attr) {
158 tmp = kernfs_find_and_get(kn, attr);
159 kernfs_put(kn);
160 kn = tmp;
161 }
162
163 if (kn) {
164 kernfs_notify(kn);
165 kernfs_put(kn);
166 }
167}
168EXPORT_SYMBOL_GPL(sysfs_notify);
169
170static const struct kernfs_ops sysfs_file_kfops_empty = {
171};
172
173static const struct kernfs_ops sysfs_file_kfops_ro = {
174 .seq_show = sysfs_kf_seq_show,
175};
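sysfs_get_open_dirent() above uses a common kernel idiom: try under the locks first, and if the object is missing, drop the locks, allocate with GFP_KERNEL (which may sleep), and retry; the allocation is simply freed if another opener won the race. Reduced to its skeleton (a sketch only; lock(), take_ref() and struct foo are hypothetical stand-ins):

	static int get_or_create(struct foo **slot)
	{
		struct foo *obj, *new = NULL;
	retry:
		lock();
		if (!*slot && new) {	/* install our preallocated copy */
			*slot = new;
			new = NULL;
		}
		obj = *slot;
		if (obj)
			take_ref(obj);
		unlock();

		if (obj) {
			kfree(new);	/* lost the race; allocation unused */
			return 0;
		}

		new = kmalloc(sizeof(*new), GFP_KERNEL);	/* no locks held */
		if (!new)
			return -ENOMEM;
		goto retry;
	}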
572
573/**
574 * sysfs_put_open_dirent - put sysfs_open_dirent
575 * @sd: target sysfs_dirent
576 * @of: associated sysfs_open_file
577 *
578 * Put @sd->s_attr.open and unlink @of from the files list. If
579 * reference count reaches zero, disassociate and free it.
580 *
581 * LOCKING:
582 * None.
583 */
584static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
585 struct sysfs_open_file *of)
586{
587 struct sysfs_open_dirent *od = sd->s_attr.open;
588 unsigned long flags;
589
590 mutex_lock(&sysfs_open_file_mutex);
591 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
592
593 if (of)
594 list_del(&of->list);
595
596 if (atomic_dec_and_test(&od->refcnt))
597 sd->s_attr.open = NULL;
598 else
599 od = NULL;
600
601 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
602 mutex_unlock(&sysfs_open_file_mutex);
603
604 kfree(od);
605}
606

176
177static const struct kernfs_ops sysfs_file_kfops_wo = {
178 .write = sysfs_kf_write,
179};
180
181static const struct kernfs_ops sysfs_file_kfops_rw = {
182 .seq_show = sysfs_kf_seq_show,
183 .write = sysfs_kf_write,
184};
185
186static const struct kernfs_ops sysfs_bin_kfops_ro = {
187 .read = sysfs_kf_bin_read,
188};
189
190static const struct kernfs_ops sysfs_bin_kfops_wo = {
191 .write = sysfs_kf_bin_write,
192};
193
194static const struct kernfs_ops sysfs_bin_kfops_rw = {
195 .read = sysfs_kf_bin_read,
196 .write = sysfs_kf_bin_write,
197};
198
199static const struct kernfs_ops sysfs_bin_kfops_mmap = {
200 .read = sysfs_kf_bin_read,
201 .write = sysfs_kf_bin_write,
202 .mmap = sysfs_kf_bin_mmap,
203};
204
607static int sysfs_open_file(struct inode *inode, struct file *file)
608{
609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
611 struct sysfs_open_file *of;
612 bool has_read, has_write, has_mmap;
613 int error = -EACCES;
614
615 /* need attr_sd for attr and ops, its parent for kobj */
616 if (!sysfs_get_active(attr_sd))
617 return -ENODEV;
618
619 if (sysfs_is_bin(attr_sd)) {
620 struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
621
622 has_read = battr->read || battr->mmap;
623 has_write = battr->write || battr->mmap;
624 has_mmap = battr->mmap;
625 } else {
626 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
627
628 /* every kobject with an attribute needs a ktype assigned */
629 if (WARN(!ops, KERN_ERR
630 "missing sysfs attribute operations for kobject: %s\n",
631 kobject_name(kobj)))
632 goto err_out;
633
634 has_read = ops->show;
635 has_write = ops->store;
636 has_mmap = false;
637 }
638
639 /* check perms and supported operations */
640 if ((file->f_mode & FMODE_WRITE) &&
641 (!(inode->i_mode & S_IWUGO) || !has_write))
642 goto err_out;
643
644 if ((file->f_mode & FMODE_READ) &&
645 (!(inode->i_mode & S_IRUGO) || !has_read))
646 goto err_out;
647
648 /* allocate a sysfs_open_file for the file */
649 error = -ENOMEM;
650 of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
651 if (!of)
652 goto err_out;
653
654 /*
655 * The following is done to give a different lockdep key to
656 * @of->mutex for files which implement mmap. This is a rather
657 * crude way to avoid false positive lockdep warning around
658 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
659 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
660 * which mm->mmap_sem nests, while holding @of->mutex. As each
661 * open file has a separate mutex, it's okay as long as those don't
662 * happen on the same file. At this point, we can't easily give
663 * each file a separate locking class. Let's differentiate on
664 * whether the file has mmap or not for now.
665 */
666 if (has_mmap)
667 mutex_init(&of->mutex);
668 else
669 mutex_init(&of->mutex);
670
671 of->sd = attr_sd;
672 of->file = file;
673
674 /*
675 * Always instantiate seq_file even if read access doesn't use
676 * seq_file or is not requested. This unifies private data access
677 * and readable regular files are the vast majority anyway.
678 */
679 if (sysfs_is_bin(attr_sd))
680 error = single_open(file, NULL, of);
681 else
682 error = single_open(file, sysfs_seq_show, of);
683 if (error)
684 goto err_free;
685
686 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
687 if (file->f_mode & FMODE_WRITE)
688 file->f_mode |= FMODE_PWRITE;
689
690 /* make sure we have open dirent struct */
691 error = sysfs_get_open_dirent(attr_sd, of);
692 if (error)
693 goto err_close;
694
695 /* open succeeded, put active references */
696 sysfs_put_active(attr_sd);
697 return 0;
698
699err_close:
700 single_release(inode, file);
701err_free:
702 kfree(of);
703err_out:
704 sysfs_put_active(attr_sd);
705 return error;
706}

205int sysfs_add_file_mode_ns(struct kernfs_node *parent,
206 const struct attribute *attr, bool is_bin,
207 umode_t mode, const void *ns)
208{
209 struct lock_class_key *key = NULL;
210 const struct kernfs_ops *ops;
211 struct kernfs_node *kn;
212 loff_t size;
213
214 if (!is_bin) {
215 struct kobject *kobj = parent->priv;
216 const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
217
218 /* every kobject with an attribute needs a ktype assigned */
219 if (WARN(!sysfs_ops, KERN_ERR
220 "missing sysfs attribute operations for kobject: %s\n",
221 kobject_name(kobj)))
222 return -EINVAL;
223
224 if (sysfs_ops->show && sysfs_ops->store)
225 ops = &sysfs_file_kfops_rw;
226 else if (sysfs_ops->show)
227 ops = &sysfs_file_kfops_ro;
228 else if (sysfs_ops->store)
229 ops = &sysfs_file_kfops_wo;
230 else
231 ops = &sysfs_file_kfops_empty;
232
233 size = PAGE_SIZE;
234 } else {
235 struct bin_attribute *battr = (void *)attr;
236
237 if (battr->mmap)
238 ops = &sysfs_bin_kfops_mmap;
239 else if (battr->read && battr->write)
240 ops = &sysfs_bin_kfops_rw;
241 else if (battr->read)
242 ops = &sysfs_bin_kfops_ro;
243 else if (battr->write)
244 ops = &sysfs_bin_kfops_wo;
245 else
246 ops = &sysfs_file_kfops_empty;
247
248 size = battr->size;
249 }
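The identical-looking branches around mutex_init() above are deliberate, not a copy-paste bug: mutex_init() is a macro that declares a static lock_class_key at each call site, so the two expansions hand @of->mutex two distinct lockdep classes, one for mmap-capable files and one for the rest. From include/linux/mutex.h (simplified):

	#define mutex_init(mutex)				\
	do {							\
		static struct lock_class_key __key;		\
								\
		__mutex_init((mutex), #mutex, &__key);		\
	} while (0)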
707
708static int sysfs_release(struct inode *inode, struct file *filp)
709{
710 struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
711 struct sysfs_open_file *of = sysfs_of(filp);
712
713 sysfs_put_open_dirent(sd, of);
714 single_release(inode, filp);
715 kfree(of);
716
717 return 0;
718}
719
720void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
721{
722 struct sysfs_open_dirent *od;
723 struct sysfs_open_file *of;
724
725 if (!sysfs_is_bin(sd))
726 return;
727
728 spin_lock_irq(&sysfs_open_dirent_lock);
729 od = sd->s_attr.open;
730 if (od)
731 atomic_inc(&od->refcnt);
732 spin_unlock_irq(&sysfs_open_dirent_lock);
733 if (!od)
734 return;
735
736 mutex_lock(&sysfs_open_file_mutex);
737 list_for_each_entry(of, &od->files, list) {
738 struct inode *inode = file_inode(of->file);
739 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
740 }
741 mutex_unlock(&sysfs_open_file_mutex);
742
743 sysfs_put_open_dirent(sd, NULL);
744}
745
746/* Sysfs attribute files are pollable. The idea is that you read
747 * the content and then you use 'poll' or 'select' to wait for
748 * the content to change. When the content changes (assuming the
749 * manager for the kobject supports notification), poll will
750 * return POLLERR|POLLPRI, and select will return the fd whether
751 * it is waiting for read, write, or exceptions.
752 * Once poll/select indicates that the value has changed, you
753 * need to close and re-open the file, or seek to 0 and read again.
754 * Reminder: this only works for attributes which actively support
755 * it, and it is not possible to test an attribute from userspace
756 * to see if it supports poll (Neither 'poll' nor 'select' return
757 * an appropriate error code). When in doubt, set a suitable timeout value.
758 */
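The protocol this comment describes (sysfs_poll() itself follows below), as a self-contained userspace sketch; the attribute path is only an example and must be one whose driver actually calls sysfs_notify(), and the 30-second timeout follows the advice above:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/sys/class/power_supply/BAT0/status", O_RDONLY);
		struct pollfd pfd;

		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf));		/* prime: consume current value */
		pfd.fd = fd;
		pfd.events = POLLPRI;			/* change shows up as POLLPRI|POLLERR */
		while (poll(&pfd, 1, 30000) > 0) {
			lseek(fd, 0, SEEK_SET);		/* "seek to 0 and read again" */
			n = read(fd, buf, sizeof(buf));
			if (n > 0)
				fwrite(buf, 1, n, stdout);
		}
		close(fd);
		return 0;
	}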
759static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
760{
761 struct sysfs_open_file *of = sysfs_of(filp);
762 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
763 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
764
765 /* need parent for the kobj, grab both */
766 if (!sysfs_get_active(attr_sd))
767 goto trigger;
768
769 poll_wait(filp, &od->poll, wait);
770
771 sysfs_put_active(attr_sd);
772
773 if (of->event != atomic_read(&od->event))
774 goto trigger;
775
776 return DEFAULT_POLLMASK;
777
778 trigger:
779 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
780}

250
251#ifdef CONFIG_DEBUG_LOCK_ALLOC
252 if (!attr->ignore_lockdep)
253 key = attr->key ?: (struct lock_class_key *)&attr->skey;
254#endif
255 kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
256 (void *)attr, ns, true, key);
257 if (IS_ERR(kn)) {
258 if (PTR_ERR(kn) == -EEXIST)
259 sysfs_warn_dup(parent, attr->name);
260 return PTR_ERR(kn);
781
782void sysfs_notify_dirent(struct sysfs_dirent *sd)
783{
784 struct sysfs_open_dirent *od;
785 unsigned long flags;
786
787 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
788
789 if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
790 od = sd->s_attr.open;
791 if (od) {
792 atomic_inc(&od->event);
793 wake_up_interruptible(&od->poll);
794 }
795 }
796
261 }
262 return 0;
797 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
798}
799EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
800
801void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
802{
803 struct sysfs_dirent *sd = k->sd;
804
805 mutex_lock(&sysfs_mutex);
806
807 if (sd && dir)
808 sd = sysfs_find_dirent(sd, dir, NULL);
809 if (sd && attr)
810 sd = sysfs_find_dirent(sd, attr, NULL);
811 if (sd)
812 sysfs_notify_dirent(sd);
813
814 mutex_unlock(&sysfs_mutex);
815}
816EXPORT_SYMBOL_GPL(sysfs_notify);
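Driver usage is unchanged by the kernfs conversion; a typical (hypothetical) call after updating the value behind an attribute:

	/* wake up anyone poll()ing <kobj>/state */
	sysfs_notify(&dev->kobj, NULL, "state");

	/* or, for an attribute living in a named group */
	sysfs_notify(&dev->kobj, "stats", "errors");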
817
818const struct file_operations sysfs_file_operations = {
819 .read = seq_read,
820 .write = sysfs_write_file,
821 .llseek = generic_file_llseek,
822 .open = sysfs_open_file,
823 .release = sysfs_release,
824 .poll = sysfs_poll,
825};
826
827const struct file_operations sysfs_bin_operations = {
828 .read = sysfs_bin_read,
829 .write = sysfs_write_file,
830 .llseek = generic_file_llseek,
831 .mmap = sysfs_bin_mmap,
832 .open = sysfs_open_file,
833 .release = sysfs_release,
834 .poll = sysfs_poll,
835};
836
837int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
838 const struct attribute *attr, int type,
839 umode_t amode, const void *ns)
840{
841 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
842 struct sysfs_addrm_cxt acxt;
843 struct sysfs_dirent *sd;
844 int rc;
845
846 sd = sysfs_new_dirent(attr->name, mode, type);
847 if (!sd)
848 return -ENOMEM;
849
850 sd->s_ns = ns;
851 sd->s_attr.attr = (void *)attr;
852 sysfs_dirent_init_lockdep(sd);
853
854 sysfs_addrm_start(&acxt);
855 rc = sysfs_add_one(&acxt, sd, dir_sd);
856 sysfs_addrm_finish(&acxt);
857
858 if (rc)
859 sysfs_put(sd);
860
861 return rc;
862}
863
864
865int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
866 int type)
867{
868 return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
869}

263}
264
265int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
266 bool is_bin)
267{
268 return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
269}
870 270
871/** 271/**
@@ -879,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
879{ 279{
880 BUG_ON(!kobj || !kobj->sd || !attr); 280 BUG_ON(!kobj || !kobj->sd || !attr);
881 281
882 return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, 282 return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
883 attr->mode, ns);
884 283
885} 284}
886EXPORT_SYMBOL_GPL(sysfs_create_file_ns); 285EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
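Note that the new sysfs_add_file_mode_ns() selects the kernfs_ops per ktype, not per attribute: it inspects kobj->ktype->sysfs_ops, and since the device ktype implements both show and store, in practice every non-bin device attribute gets sysfs_file_kfops_rw, with the file mode alone gating access. A hypothetical driver-side attribute for illustration (names and the delay_ms variable are assumptions, not code from this patch):

	static int delay_ms;

	static ssize_t delay_show(struct device *dev, struct device_attribute *attr,
				  char *buf)
	{
		return sprintf(buf, "%d\n", delay_ms);
	}

	static ssize_t delay_store(struct device *dev, struct device_attribute *attr,
				   const char *buf, size_t count)
	{
		int err = kstrtoint(buf, 0, &delay_ms);

		return err ? err : count;
	}
	static DEVICE_ATTR_RW(delay);	/* mode S_IRUGO | S_IWUSR, i.e. 0644 */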
@@ -908,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
908int sysfs_add_file_to_group(struct kobject *kobj, 307int sysfs_add_file_to_group(struct kobject *kobj,
909 const struct attribute *attr, const char *group) 308 const struct attribute *attr, const char *group)
910{ 309{
911 struct sysfs_dirent *dir_sd; 310 struct kernfs_node *parent;
912 int error; 311 int error;
913 312
914 if (group) 313 if (group) {
915 dir_sd = sysfs_get_dirent(kobj->sd, group); 314 parent = kernfs_find_and_get(kobj->sd, group);
916 else 315 } else {
917 dir_sd = sysfs_get(kobj->sd); 316 parent = kobj->sd;
317 kernfs_get(parent);
318 }
918 319
919 if (!dir_sd) 320 if (!parent)
920 return -ENOENT; 321 return -ENOENT;
921 322
922 error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); 323 error = sysfs_add_file(parent, attr, false);
923 sysfs_put(dir_sd); 324 kernfs_put(parent);
924 325
925 return error; 326 return error;
926} 327}
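For reference, a hypothetical caller; the function resolves the named group directory, adds one file there, and drops the directory reference again (pdev and dev_attr_errors are assumed names):

	/* publish an extra counter under the existing "stats" group */
	err = sysfs_add_file_to_group(&pdev->dev.kobj,
				      &dev_attr_errors.attr, "stats");
	if (err)
		dev_warn(&pdev->dev, "stats/errors not created: %d\n", err);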
@@ -936,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
936int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, 337int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
937 umode_t mode) 338 umode_t mode)
938{ 339{
939 struct sysfs_dirent *sd; 340 struct kernfs_node *kn;
940 struct iattr newattrs; 341 struct iattr newattrs;
941 int rc; 342 int rc;
942 343
943 mutex_lock(&sysfs_mutex); 344 kn = kernfs_find_and_get(kobj->sd, attr->name);
944 345 if (!kn)
945 rc = -ENOENT; 346 return -ENOENT;
946 sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
947 if (!sd)
948 goto out;
949 347
950 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); 348 newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
951 newattrs.ia_valid = ATTR_MODE; 349 newattrs.ia_valid = ATTR_MODE;
952 rc = sysfs_sd_setattr(sd, &newattrs);
953 350
954 out: 351 rc = kernfs_setattr(kn, &newattrs);
955 mutex_unlock(&sysfs_mutex); 352
353 kernfs_put(kn);
956 return rc; 354 return rc;
957} 355}
958EXPORT_SYMBOL_GPL(sysfs_chmod_file); 356EXPORT_SYMBOL_GPL(sysfs_chmod_file);
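Only the permission bits change: the new code masks the caller's mode with S_IALLUGO and keeps the file-type bits from kn->mode. A hypothetical call (dev and dev_attr_enable are assumed names):

	/* relax "enable" from 0200 to 0644 once probing has succeeded */
	err = sysfs_chmod_file(&dev->kobj, &dev_attr_enable.attr,
			       S_IRUGO | S_IWUSR);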
@@ -968,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
968void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, 366void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
969 const void *ns) 367 const void *ns)
970{ 368{
971 struct sysfs_dirent *dir_sd = kobj->sd; 369 struct kernfs_node *parent = kobj->sd;
972 370
973 sysfs_hash_and_remove(dir_sd, attr->name, ns); 371 kernfs_remove_by_name_ns(parent, attr->name, ns);
974} 372}
975EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
976 374
@@ -991,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
991void sysfs_remove_file_from_group(struct kobject *kobj, 389void sysfs_remove_file_from_group(struct kobject *kobj,
992 const struct attribute *attr, const char *group) 390 const struct attribute *attr, const char *group)
993{ 391{
994 struct sysfs_dirent *dir_sd; 392 struct kernfs_node *parent;
995 393
996 if (group) 394 if (group) {
997 dir_sd = sysfs_get_dirent(kobj->sd, group); 395 parent = kernfs_find_and_get(kobj->sd, group);
998 else 396 } else {
999 dir_sd = sysfs_get(kobj->sd); 397 parent = kobj->sd;
1000 if (dir_sd) { 398 kernfs_get(parent);
1001 sysfs_hash_and_remove(dir_sd, attr->name, NULL); 399 }
1002 sysfs_put(dir_sd); 400
401 if (parent) {
402 kernfs_remove_by_name(parent, attr->name);
403 kernfs_put(parent);
1003 } 404 }
1004} 405}
1005EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 406EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -1014,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
1014{ 415{
1015 BUG_ON(!kobj || !kobj->sd || !attr); 416 BUG_ON(!kobj || !kobj->sd || !attr);
1016 417
1017 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 418 return sysfs_add_file(kobj->sd, &attr->attr, true);
1018} 419}
1019EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 420EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1020 421
@@ -1026,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1026void sysfs_remove_bin_file(struct kobject *kobj, 427void sysfs_remove_bin_file(struct kobject *kobj,
1027 const struct bin_attribute *attr) 428 const struct bin_attribute *attr)
1028{ 429{
1029 sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); 430 kernfs_remove_by_name(kobj->sd, attr->attr.name);
1030} 431}
1031EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
1032 433
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..6b579387c67a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
18#include "sysfs.h" 18#include "sysfs.h"
19 19
20 20
21static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 21static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
22 const struct attribute_group *grp) 22 const struct attribute_group *grp)
23{ 23{
24 struct attribute *const *attr; 24 struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
26 26
27 if (grp->attrs) 27 if (grp->attrs)
28 for (attr = grp->attrs; *attr; attr++) 28 for (attr = grp->attrs; *attr; attr++)
29 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 29 kernfs_remove_by_name(parent, (*attr)->name);
30 if (grp->bin_attrs) 30 if (grp->bin_attrs)
31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) 31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
32 sysfs_remove_bin_file(kobj, *bin_attr); 32 sysfs_remove_bin_file(kobj, *bin_attr);
33} 33}
34 34
35static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 35static int create_files(struct kernfs_node *parent, struct kobject *kobj,
36 const struct attribute_group *grp, int update) 36 const struct attribute_group *grp, int update)
37{ 37{
38 struct attribute *const *attr; 38 struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
49 * re-adding (if required) the file. 49 * re-adding (if required) the file.
50 */ 50 */
51 if (update) 51 if (update)
52 sysfs_hash_and_remove(dir_sd, (*attr)->name, 52 kernfs_remove_by_name(parent, (*attr)->name);
53 NULL);
54 if (grp->is_visible) { 53 if (grp->is_visible) {
55 mode = grp->is_visible(kobj, *attr, i); 54 mode = grp->is_visible(kobj, *attr, i);
56 if (!mode) 55 if (!mode)
57 continue; 56 continue;
58 } 57 }
59 error = sysfs_add_file_mode_ns(dir_sd, *attr, 58 error = sysfs_add_file_mode_ns(parent, *attr, false,
60 SYSFS_KOBJ_ATTR,
61 (*attr)->mode | mode, 59 (*attr)->mode | mode,
62 NULL); 60 NULL);
63 if (unlikely(error)) 61 if (unlikely(error))
64 break; 62 break;
65 } 63 }
66 if (error) { 64 if (error) {
67 remove_files(dir_sd, kobj, grp); 65 remove_files(parent, kobj, grp);
68 goto exit; 66 goto exit;
69 } 67 }
70 } 68 }
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
78 break; 76 break;
79 } 77 }
80 if (error) 78 if (error)
81 remove_files(dir_sd, kobj, grp); 79 remove_files(parent, kobj, grp);
82 } 80 }
83exit: 81exit:
84 return error; 82 return error;
@@ -88,7 +86,7 @@ exit:
88static int internal_create_group(struct kobject *kobj, int update, 86static int internal_create_group(struct kobject *kobj, int update,
89 const struct attribute_group *grp) 87 const struct attribute_group *grp)
90{ 88{
91 struct sysfs_dirent *sd; 89 struct kernfs_node *kn;
92 int error; 90 int error;
93 91
94 BUG_ON(!kobj || (!update && !kobj->sd)); 92 BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update,
102 return -EINVAL; 100 return -EINVAL;
103 } 101 }
104 if (grp->name) { 102 if (grp->name) {
105 error = sysfs_create_subdir(kobj, grp->name, &sd); 103 kn = kernfs_create_dir(kobj->sd, grp->name,
106 if (error) 104 S_IRWXU | S_IRUGO | S_IXUGO, kobj);
107 return error; 105 if (IS_ERR(kn)) {
106 if (PTR_ERR(kn) == -EEXIST)
107 sysfs_warn_dup(kobj->sd, grp->name);
108 return PTR_ERR(kn);
109 }
108 } else 110 } else
109 sd = kobj->sd; 111 kn = kobj->sd;
110 sysfs_get(sd); 112 kernfs_get(kn);
111 error = create_files(sd, kobj, grp, update); 113 error = create_files(kn, kobj, grp, update);
112 if (error) { 114 if (error) {
113 if (grp->name) 115 if (grp->name)
114 sysfs_remove(sd); 116 kernfs_remove(kn);
115 } 117 }
116 sysfs_put(sd); 118 kernfs_put(kn);
117 return error; 119 return error;
118} 120}
119 121
@@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
203void sysfs_remove_group(struct kobject *kobj, 205void sysfs_remove_group(struct kobject *kobj,
204 const struct attribute_group *grp) 206 const struct attribute_group *grp)
205{ 207{
206 struct sysfs_dirent *dir_sd = kobj->sd; 208 struct kernfs_node *parent = kobj->sd;
207 struct sysfs_dirent *sd; 209 struct kernfs_node *kn;
208 210
209 if (grp->name) { 211 if (grp->name) {
210 sd = sysfs_get_dirent(dir_sd, grp->name); 212 kn = kernfs_find_and_get(parent, grp->name);
211 if (!sd) { 213 if (!kn) {
212 WARN(!sd, KERN_WARNING 214 WARN(!kn, KERN_WARNING
213 "sysfs group %p not found for kobject '%s'\n", 215 "sysfs group %p not found for kobject '%s'\n",
214 grp, kobject_name(kobj)); 216 grp, kobject_name(kobj));
215 return; 217 return;
216 } 218 }
217 } else 219 } else {
218 sd = sysfs_get(dir_sd); 220 kn = parent;
221 kernfs_get(kn);
222 }
219 223
220 remove_files(sd, kobj, grp); 224 remove_files(kn, kobj, grp);
221 if (grp->name) 225 if (grp->name)
222 sysfs_remove(sd); 226 kernfs_remove(kn);
223 227
224 sysfs_put(sd); 228 kernfs_put(kn);
225} 229}
226EXPORT_SYMBOL_GPL(sysfs_remove_group); 230EXPORT_SYMBOL_GPL(sysfs_remove_group);
227 231
@@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
257int sysfs_merge_group(struct kobject *kobj, 261int sysfs_merge_group(struct kobject *kobj,
258 const struct attribute_group *grp) 262 const struct attribute_group *grp)
259{ 263{
260 struct sysfs_dirent *dir_sd; 264 struct kernfs_node *parent;
261 int error = 0; 265 int error = 0;
262 struct attribute *const *attr; 266 struct attribute *const *attr;
263 int i; 267 int i;
264 268
265 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 269 parent = kernfs_find_and_get(kobj->sd, grp->name);
266 if (!dir_sd) 270 if (!parent)
267 return -ENOENT; 271 return -ENOENT;
268 272
269 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) 273 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
270 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); 274 error = sysfs_add_file(parent, *attr, false);
271 if (error) { 275 if (error) {
272 while (--i >= 0) 276 while (--i >= 0)
273 sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); 277 kernfs_remove_by_name(parent, (*--attr)->name);
274 } 278 }
275 sysfs_put(dir_sd); 279 kernfs_put(parent);
276 280
277 return error; 281 return error;
278} 282}
@@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
286void sysfs_unmerge_group(struct kobject *kobj, 290void sysfs_unmerge_group(struct kobject *kobj,
287 const struct attribute_group *grp) 291 const struct attribute_group *grp)
288{ 292{
289 struct sysfs_dirent *dir_sd; 293 struct kernfs_node *parent;
290 struct attribute *const *attr; 294 struct attribute *const *attr;
291 295
292 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 296 parent = kernfs_find_and_get(kobj->sd, grp->name);
293 if (dir_sd) { 297 if (parent) {
294 for (attr = grp->attrs; *attr; ++attr) 298 for (attr = grp->attrs; *attr; ++attr)
295 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 299 kernfs_remove_by_name(parent, (*attr)->name);
296 sysfs_put(dir_sd); 300 kernfs_put(parent);
297 } 301 }
298} 302}
299EXPORT_SYMBOL_GPL(sysfs_unmerge_group); 303EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
308int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, 312int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
309 struct kobject *target, const char *link_name) 313 struct kobject *target, const char *link_name)
310{ 314{
311 struct sysfs_dirent *dir_sd; 315 struct kernfs_node *parent;
312 int error = 0; 316 int error = 0;
313 317
314 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 318 parent = kernfs_find_and_get(kobj->sd, group_name);
315 if (!dir_sd) 319 if (!parent)
316 return -ENOENT; 320 return -ENOENT;
317 321
318 error = sysfs_create_link_sd(dir_sd, target, link_name); 322 error = sysfs_create_link_sd(parent, target, link_name);
319 sysfs_put(dir_sd); 323 kernfs_put(parent);
320 324
321 return error; 325 return error;
322} 326}
@@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
331void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, 335void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
332 const char *link_name) 336 const char *link_name)
333{ 337{
334 struct sysfs_dirent *dir_sd; 338 struct kernfs_node *parent;
335 339
336 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 340 parent = kernfs_find_and_get(kobj->sd, group_name);
337 if (dir_sd) { 341 if (parent) {
338 sysfs_hash_and_remove(dir_sd, link_name, NULL); 342 kernfs_remove_by_name(parent, link_name);
339 sysfs_put(dir_sd); 343 kernfs_put(parent);
340 } 344 }
341} 345}
342EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); 346EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
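All of the helpers above operate on struct attribute_group. For orientation, a hypothetical group definition (names assumed); a non-NULL .name makes internal_create_group() create the subdirectory via kernfs_create_dir() as shown earlier:

	static struct attribute *foo_attrs[] = {
		&dev_attr_delay.attr,
		&dev_attr_enable.attr,
		NULL,			/* the array must be NULL-terminated */
	};

	static const struct attribute_group foo_group = {
		.name	= "foo",	/* omit to place files in the kobject dir itself */
		.attrs	= foo_attrs,
	};

	/* in probe: */
	err = sysfs_create_group(&dev->kobj, &foo_group);
	/* in remove: */
	sysfs_remove_group(&dev->kobj, &foo_group);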
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * fs/sysfs/inode.c - basic sysfs inode and dentry operations
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
11 */
12
13#undef DEBUG
14
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/backing-dev.h>
18#include <linux/capability.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/sysfs.h>
23#include <linux/xattr.h>
24#include <linux/security.h>
25#include "sysfs.h"
26
27static const struct address_space_operations sysfs_aops = {
28 .readpage = simple_readpage,
29 .write_begin = simple_write_begin,
30 .write_end = simple_write_end,
31};
32
33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
35 .ra_pages = 0, /* No readahead */
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
37};
38
39static const struct inode_operations sysfs_inode_operations = {
40 .permission = sysfs_permission,
41 .setattr = sysfs_setattr,
42 .getattr = sysfs_getattr,
43 .setxattr = sysfs_setxattr,
44};
45
46int __init sysfs_inode_init(void)
47{
48 return bdi_init(&sysfs_backing_dev_info);
49}
50
51static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
52{
53 struct sysfs_inode_attrs *attrs;
54 struct iattr *iattrs;
55
56 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
57 if (!attrs)
58 return NULL;
59 iattrs = &attrs->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = sd->s_mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 return attrs;
68}
69
70int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
71{
72 struct sysfs_inode_attrs *sd_attrs;
73 struct iattr *iattrs;
74 unsigned int ia_valid = iattr->ia_valid;
75
76 sd_attrs = sd->s_iattr;
77
78 if (!sd_attrs) {
79 /* setting attributes for the first time, allocate now */
80 sd_attrs = sysfs_init_inode_attrs(sd);
81 if (!sd_attrs)
82 return -ENOMEM;
83 sd->s_iattr = sd_attrs;
84 }
85 /* attributes were changed at least once in past */
86 iattrs = &sd_attrs->ia_iattr;
87
88 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 return 0;
103}
104
105int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
106{
107 struct inode *inode = dentry->d_inode;
108 struct sysfs_dirent *sd = dentry->d_fsdata;
109 int error;
110
111 if (!sd)
112 return -EINVAL;
113
114 mutex_lock(&sysfs_mutex);
115 error = inode_change_ok(inode, iattr);
116 if (error)
117 goto out;
118
119 error = sysfs_sd_setattr(sd, iattr);
120 if (error)
121 goto out;
122
123 /* this ignores size changes */
124 setattr_copy(inode, iattr);
125
126out:
127 mutex_unlock(&sysfs_mutex);
128 return error;
129}
130
131static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
132 u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 if (!sd->s_iattr) {
139 sd->s_iattr = sysfs_init_inode_attrs(sd);
140 if (!sd->s_iattr)
141 return -ENOMEM;
142 }
143
144 iattrs = sd->s_iattr;
145 old_secdata = iattrs->ia_secdata;
146 old_secdata_len = iattrs->ia_secdata_len;
147
148 iattrs->ia_secdata = *secdata;
149 iattrs->ia_secdata_len = *secdata_len;
150
151 *secdata = old_secdata;
152 *secdata_len = old_secdata_len;
153 return 0;
154}
155
156int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
157 size_t size, int flags)
158{
159 struct sysfs_dirent *sd = dentry->d_fsdata;
160 void *secdata;
161 int error;
162 u32 secdata_len = 0;
163
164 if (!sd)
165 return -EINVAL;
166
167 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
168 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
169 error = security_inode_setsecurity(dentry->d_inode, suffix,
170 value, size, flags);
171 if (error)
172 goto out;
173 error = security_inode_getsecctx(dentry->d_inode,
174 &secdata, &secdata_len);
175 if (error)
176 goto out;
177
178 mutex_lock(&sysfs_mutex);
179 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
180 mutex_unlock(&sysfs_mutex);
181
182 if (secdata)
183 security_release_secctx(secdata, secdata_len);
184 } else
185 return -EINVAL;
186out:
187 return error;
188}
189
190static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
191{
192 inode->i_mode = mode;
193 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
194}
195
196static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
197{
198 inode->i_uid = iattr->ia_uid;
199 inode->i_gid = iattr->ia_gid;
200 inode->i_atime = iattr->ia_atime;
201 inode->i_mtime = iattr->ia_mtime;
202 inode->i_ctime = iattr->ia_ctime;
203}
204
205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
206{
207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
208
209 inode->i_mode = sd->s_mode;
210 if (iattrs) {
211 /* sysfs_dirent has non-default attributes
212 * get them from persistent copy in sysfs_dirent
213 */
214 set_inode_attr(inode, &iattrs->ia_iattr);
215 security_inode_notifysecctx(inode,
216 iattrs->ia_secdata,
217 iattrs->ia_secdata_len);
218 }
219
220 if (sysfs_type(sd) == SYSFS_DIR)
221 set_nlink(inode, sd->s_dir.subdirs + 2);
222}
223
224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
225 struct kstat *stat)
226{
227 struct sysfs_dirent *sd = dentry->d_fsdata;
228 struct inode *inode = dentry->d_inode;
229
230 mutex_lock(&sysfs_mutex);
231 sysfs_refresh_inode(sd, inode);
232 mutex_unlock(&sysfs_mutex);
233
234 generic_fillattr(inode, stat);
235 return 0;
236}
237
238static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
239{
240 struct bin_attribute *bin_attr;
241
242 inode->i_private = sysfs_get(sd);
243 inode->i_mapping->a_ops = &sysfs_aops;
244 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
245 inode->i_op = &sysfs_inode_operations;
246
247 set_default_inode_attr(inode, sd->s_mode);
248 sysfs_refresh_inode(sd, inode);
249
250 /* initialize inode according to type */
251 switch (sysfs_type(sd)) {
252 case SYSFS_DIR:
253 inode->i_op = &sysfs_dir_inode_operations;
254 inode->i_fop = &sysfs_dir_operations;
255 break;
256 case SYSFS_KOBJ_ATTR:
257 inode->i_size = PAGE_SIZE;
258 inode->i_fop = &sysfs_file_operations;
259 break;
260 case SYSFS_KOBJ_BIN_ATTR:
261 bin_attr = sd->s_attr.bin_attr;
262 inode->i_size = bin_attr->size;
263 inode->i_fop = &sysfs_bin_operations;
264 break;
265 case SYSFS_KOBJ_LINK:
266 inode->i_op = &sysfs_symlink_inode_operations;
267 break;
268 default:
269 BUG();
270 }
271
272 unlock_new_inode(inode);
273}
274
275/**
276 * sysfs_get_inode - get inode for sysfs_dirent
277 * @sb: super block
278 * @sd: sysfs_dirent to allocate inode for
279 *
280 * Get inode for @sd. If such inode doesn't exist, a new inode
281 * is allocated and basics are initialized. New inode is
282 * returned locked.
283 *
284 * LOCKING:
285 * Kernel thread context (may sleep).
286 *
287 * RETURNS:
288 * Pointer to allocated inode on success, NULL on failure.
289 */
290struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
291{
292 struct inode *inode;
293
294 inode = iget_locked(sb, sd->s_ino);
295 if (inode && (inode->i_state & I_NEW))
296 sysfs_init_inode(sd, inode);
297
298 return inode;
299}
300
301/*
302 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
303 * To prevent the sysfs inode numbers from being freed prematurely we take a
304 * reference to sysfs_dirent from the sysfs inode. A
305 * super_operations.evict_inode() implementation is needed to drop that
306 * reference upon inode destruction.
307 */
308void sysfs_evict_inode(struct inode *inode)
309{
310 struct sysfs_dirent *sd = inode->i_private;
311
312 truncate_inode_pages(&inode->i_data, 0);
313 clear_inode(inode);
314 sysfs_put(sd);
315}
316
317int sysfs_permission(struct inode *inode, int mask)
318{
319 struct sysfs_dirent *sd;
320
321 if (mask & MAY_NOT_BLOCK)
322 return -ECHILD;
323
324 sd = inode->i_private;
325
326 mutex_lock(&sysfs_mutex);
327 sysfs_refresh_inode(sd, inode);
328 mutex_unlock(&sysfs_mutex);
329
330 return generic_permission(inode, mask);
331}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..6211230814fd 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,41 @@
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/pagemap.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/magic.h>
21#include <linux/slab.h>
22#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
23 19
24#include "sysfs.h" 20#include "sysfs.h"
25 21
26 22static struct kernfs_root *sysfs_root;
27static struct vfsmount *sysfs_mnt; 23struct kernfs_node *sysfs_root_kn;
28struct kmem_cache *sysfs_dir_cachep;
29
30static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode,
33 .evict_inode = sysfs_evict_inode,
34};
35
36struct sysfs_dirent sysfs_root = {
37 .s_name = "",
38 .s_count = ATOMIC_INIT(1),
39 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
40 .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
41 .s_ino = 1,
42};
43
44static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
45{
46 struct inode *inode;
47 struct dentry *root;
48
49 sb->s_blocksize = PAGE_CACHE_SIZE;
50 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
51 sb->s_magic = SYSFS_MAGIC;
52 sb->s_op = &sysfs_ops;
53 sb->s_time_gran = 1;
54
55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex);
59 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n");
61 return -ENOMEM;
62 }
63
64 /* instantiate and link root dentry */
65 root = d_make_root(inode);
66 if (!root) {
67 pr_debug("%s: could not get root dentry!\n", __func__);
68 return -ENOMEM;
69 }
70 root->d_fsdata = &sysfs_root;
71 sb->s_root = root;
72 sb->s_d_op = &sysfs_dentry_ops;
73 return 0;
74}
75
76static int sysfs_test_super(struct super_block *sb, void *data)
77{
78 struct sysfs_super_info *sb_info = sysfs_info(sb);
79 struct sysfs_super_info *info = data;
80 enum kobj_ns_type type;
81 int found = 1;
82
83 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
84 if (sb_info->ns[type] != info->ns[type])
85 found = 0;
86 }
87 return found;
88}
89
90static int sysfs_set_super(struct super_block *sb, void *data)
91{
92 int error;
93 error = set_anon_super(sb, data);
94 if (!error)
95 sb->s_fs_info = data;
96 return error;
97}
98
99static void free_sysfs_super_info(struct sysfs_super_info *info)
100{
101 int type;
102 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
103 kobj_ns_drop(type, info->ns[type]);
104 kfree(info);
105}
106 24
107static struct dentry *sysfs_mount(struct file_system_type *fs_type, 25static struct dentry *sysfs_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data) 26 int flags, const char *dev_name, void *data)
109{ 27{
110 struct sysfs_super_info *info; 28 struct dentry *root;
111 enum kobj_ns_type type; 29 void *ns;
112 struct super_block *sb;
113 int error;
114 30
115 if (!(flags & MS_KERNMOUNT)) { 31 if (!(flags & MS_KERNMOUNT)) {
116 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) 32 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
117 return ERR_PTR(-EPERM); 33 return ERR_PTR(-EPERM);
118 34
119 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { 35 if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
120 if (!kobj_ns_current_may_mount(type)) 36 return ERR_PTR(-EPERM);
121 return ERR_PTR(-EPERM);
122 }
123 }
124
125 info = kzalloc(sizeof(*info), GFP_KERNEL);
126 if (!info)
127 return ERR_PTR(-ENOMEM);
128
129 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
130 info->ns[type] = kobj_ns_grab_current(type);
131
132 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
133 if (IS_ERR(sb) || sb->s_fs_info != info)
134 free_sysfs_super_info(info);
135 if (IS_ERR(sb))
136 return ERR_CAST(sb);
137 if (!sb->s_root) {
138 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
139 if (error) {
140 deactivate_locked_super(sb);
141 return ERR_PTR(error);
142 }
143 sb->s_flags |= MS_ACTIVE;
144 } 37 }
145 38
146 return dget(sb->s_root); 39 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
40 root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns);
41 if (IS_ERR(root))
42 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
43 return root;
147} 44}
148 45
149static void sysfs_kill_sb(struct super_block *sb) 46static void sysfs_kill_sb(struct super_block *sb)
150{ 47{
151 struct sysfs_super_info *info = sysfs_info(sb); 48 void *ns = (void *)kernfs_super_ns(sb);
152 /* Remove the superblock from fs_supers/s_instances 49
153 * so we can't find it, before freeing sysfs_super_info. 50 kernfs_kill_sb(sb);
154 */ 51 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
155 kill_anon_super(sb);
156 free_sysfs_super_info(info);
157} 52}
158 53
159static struct file_system_type sysfs_fs_type = { 54static struct file_system_type sysfs_fs_type = {
@@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = {
165 60
166int __init sysfs_init(void) 61int __init sysfs_init(void)
167{ 62{
168 int err = -ENOMEM; 63 int err;
169 64
170 sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", 65 sysfs_root = kernfs_create_root(NULL, NULL);
171 sizeof(struct sysfs_dirent), 66 if (IS_ERR(sysfs_root))
172 0, 0, NULL); 67 return PTR_ERR(sysfs_root);
173 if (!sysfs_dir_cachep)
174 goto out;
175 68
176 err = sysfs_inode_init(); 69 sysfs_root_kn = sysfs_root->kn;
177 if (err)
178 goto out_err;
179 70
180 err = register_filesystem(&sysfs_fs_type); 71 err = register_filesystem(&sysfs_fs_type);
181 if (!err) { 72 if (err) {
182 sysfs_mnt = kern_mount(&sysfs_fs_type); 73 kernfs_destroy_root(sysfs_root);
183 if (IS_ERR(sysfs_mnt)) { 74 return err;
184 printk(KERN_ERR "sysfs: could not mount!\n"); 75 }
185 err = PTR_ERR(sysfs_mnt);
186 sysfs_mnt = NULL;
187 unregister_filesystem(&sysfs_fs_type);
188 goto out_err;
189 }
190 } else
191 goto out_err;
192out:
193 return err;
194out_err:
195 kmem_cache_destroy(sysfs_dir_cachep);
196 sysfs_dir_cachep = NULL;
197 goto out;
198}
199
200#undef sysfs_get
201struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
202{
203 return __sysfs_get(sd);
204}
205EXPORT_SYMBOL_GPL(sysfs_get);
206 76
207#undef sysfs_put 77 return 0;
208void sysfs_put(struct sysfs_dirent *sd)
209{
210 __sysfs_put(sd);
211} 78}
212EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
15#include <linux/mount.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/kobject.h> 15#include <linux/kobject.h>
18#include <linux/namei.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/security.h> 17#include <linux/security.h>
21 18
22#include "sysfs.h" 19#include "sysfs.h"
23 20
24static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, 21static int sysfs_do_create_link_sd(struct kernfs_node *parent,
25 struct kobject *target, 22 struct kobject *target_kobj,
26 const char *name, int warn) 23 const char *name, int warn)
27{ 24{
28 struct sysfs_dirent *target_sd = NULL; 25 struct kernfs_node *kn, *target = NULL;
29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
32 int error;
33 26
34 BUG_ON(!name || !parent_sd); 27 BUG_ON(!name || !parent);
35 28
36 /* 29 /*
37 * We don't own @target and it may be removed at any time. 30 * We don't own @target_kobj and it may be removed at any time.
38 * Synchronize using sysfs_symlink_target_lock. See 31 * Synchronize using sysfs_symlink_target_lock. See
39 * sysfs_remove_dir() for details. 32 * sysfs_remove_dir() for details.
40 */ 33 */
41 spin_lock(&sysfs_symlink_target_lock); 34 spin_lock(&sysfs_symlink_target_lock);
42 if (target->sd) 35 if (target_kobj->sd) {
43 target_sd = sysfs_get(target->sd); 36 target = target_kobj->sd;
37 kernfs_get(target);
38 }
44 spin_unlock(&sysfs_symlink_target_lock); 39 spin_unlock(&sysfs_symlink_target_lock);
45 40
46 error = -ENOENT; 41 if (!target)
47 if (!target_sd) 42 return -ENOENT;
48 goto out_put;
49
50 error = -ENOMEM;
51 sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
52 if (!sd)
53 goto out_put;
54 43
55 ns_type = sysfs_ns_type(parent_sd); 44 kn = kernfs_create_link(parent, name, target);
56 if (ns_type) 45 kernfs_put(target);
57 sd->s_ns = target_sd->s_ns;
58 sd->s_symlink.target_sd = target_sd;
59 target_sd = NULL; /* reference is now owned by the symlink */
60
61 sysfs_addrm_start(&acxt);
62 /* Symlinks must be between directories with the same ns_type */
63 if (!ns_type ||
64 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
65 if (warn)
66 error = sysfs_add_one(&acxt, sd, parent_sd);
67 else
68 error = __sysfs_add_one(&acxt, sd, parent_sd);
69 } else {
70 error = -EINVAL;
71 WARN(1, KERN_WARNING
72 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
73 parent_sd->s_name,
74 sd->s_name,
75 sd->s_symlink.target_sd->s_parent->s_name,
76 sd->s_symlink.target_sd->s_name);
77 }
78 sysfs_addrm_finish(&acxt);
79 46
80 if (error) 47 if (!IS_ERR(kn))
81 goto out_put; 48 return 0;
82 49
83 return 0; 50 if (warn && PTR_ERR(kn) == -EEXIST)
84 51 sysfs_warn_dup(parent, name);
85 out_put: 52 return PTR_ERR(kn);
86 sysfs_put(target_sd);
87 sysfs_put(sd);
88 return error;
89} 53}
90 54
91/** 55/**
92 * sysfs_create_link_sd - create symlink to a given object. 56 * sysfs_create_link_sd - create symlink to a given object.
93 * @sd: directory we're creating the link in. 57 * @kn: directory we're creating the link in.
94 * @target: object we're pointing to. 58 * @target: object we're pointing to.
95 * @name: name of the symlink. 59 * @name: name of the symlink.
96 */ 60 */
97int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, 61int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
98 const char *name) 62 const char *name)
99{ 63{
100 return sysfs_do_create_link_sd(sd, target, name, 1); 64 return sysfs_do_create_link_sd(kn, target, name, 1);
101} 65}
102 66
103static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, 67static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
104 const char *name, int warn) 68 const char *name, int warn)
105{ 69{
106 struct sysfs_dirent *parent_sd = NULL; 70 struct kernfs_node *parent = NULL;
107 71
108 if (!kobj) 72 if (!kobj)
109 parent_sd = &sysfs_root; 73 parent = sysfs_root_kn;
110 else 74 else
111 parent_sd = kobj->sd; 75 parent = kobj->sd;
112 76
113 if (!parent_sd) 77 if (!parent)
114 return -EFAULT; 78 return -EFAULT;
115 79
116 return sysfs_do_create_link_sd(parent_sd, target, name, warn); 80 return sysfs_do_create_link_sd(parent, target, name, warn);
117} 81}
118 82
119/** 83/**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
164 * sysfs_remove_dir() for details. 128 * sysfs_remove_dir() for details.
165 */ 129 */
166 spin_lock(&sysfs_symlink_target_lock); 130 spin_lock(&sysfs_symlink_target_lock);
167 if (targ->sd && sysfs_ns_type(kobj->sd)) 131 if (targ->sd && kernfs_ns_enabled(kobj->sd))
168 ns = targ->sd->s_ns; 132 ns = targ->sd->ns;
169 spin_unlock(&sysfs_symlink_target_lock); 133 spin_unlock(&sysfs_symlink_target_lock);
170 sysfs_hash_and_remove(kobj->sd, name, ns); 134 kernfs_remove_by_name_ns(kobj->sd, name, ns);
171} 135}
172 136
173/** 137/**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
177 */ 141 */
178void sysfs_remove_link(struct kobject *kobj, const char *name) 142void sysfs_remove_link(struct kobject *kobj, const char *name)
179{ 143{
180 struct sysfs_dirent *parent_sd = NULL; 144 struct kernfs_node *parent = NULL;
181 145
182 if (!kobj) 146 if (!kobj)
183 parent_sd = &sysfs_root; 147 parent = sysfs_root_kn;
184 else 148 else
185 parent_sd = kobj->sd; 149 parent = kobj->sd;
186 150
187 sysfs_hash_and_remove(parent_sd, name, NULL); 151 kernfs_remove_by_name(parent, name);
188} 152}
189EXPORT_SYMBOL_GPL(sysfs_remove_link); 153EXPORT_SYMBOL_GPL(sysfs_remove_link);
190 154
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
201int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, 165int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
202 const char *old, const char *new, const void *new_ns) 166 const char *old, const char *new, const void *new_ns)
203{ 167{
204 struct sysfs_dirent *parent_sd, *sd = NULL; 168 struct kernfs_node *parent, *kn = NULL;
205 const void *old_ns = NULL; 169 const void *old_ns = NULL;
206 int result; 170 int result;
207 171
208 if (!kobj) 172 if (!kobj)
209 parent_sd = &sysfs_root; 173 parent = sysfs_root_kn;
210 else 174 else
211 parent_sd = kobj->sd; 175 parent = kobj->sd;
212 176
213 if (targ->sd) 177 if (targ->sd)
214 old_ns = targ->sd->s_ns; 178 old_ns = targ->sd->ns;
215 179
216 result = -ENOENT; 180 result = -ENOENT;
217 sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); 181 kn = kernfs_find_and_get_ns(parent, old, old_ns);
218 if (!sd) 182 if (!kn)
219 goto out; 183 goto out;
220 184
221 result = -EINVAL; 185 result = -EINVAL;
222 if (sysfs_type(sd) != SYSFS_KOBJ_LINK) 186 if (kernfs_type(kn) != KERNFS_LINK)
223 goto out; 187 goto out;
224 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 188 if (kn->symlink.target_kn->priv != targ)
225 goto out; 189 goto out;
226 190
227 result = sysfs_rename(sd, parent_sd, new, new_ns); 191 result = kernfs_rename_ns(kn, parent, new, new_ns);
228 192
229out: 193out:
230 sysfs_put(sd); 194 kernfs_put(kn);
231 return result; 195 return result;
232} 196}
233EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); 197EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
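The public entry points sit on top of these helpers; typical (hypothetical) driver usage pairs creation at bind time with removal at unbind time:

	/* /sys/.../<dev>/controller -> <ctrl>'s sysfs directory */
	err = sysfs_create_link(&dev->kobj, &ctrl->kobj, "controller");

	/* later, on teardown */
	sysfs_remove_link(&dev->kobj, "controller");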
234
235static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
236 struct sysfs_dirent *target_sd, char *path)
237{
238 struct sysfs_dirent *base, *sd;
239 char *s = path;
240 int len = 0;
241
242 /* go up to the root, stop at the base */
243 base = parent_sd;
244 while (base->s_parent) {
245 sd = target_sd->s_parent;
246 while (sd->s_parent && base != sd)
247 sd = sd->s_parent;
248
249 if (base == sd)
250 break;
251
252 strcpy(s, "../");
253 s += 3;
254 base = base->s_parent;
255 }
256
257 /* determine end of target string for reverse fillup */
258 sd = target_sd;
259 while (sd->s_parent && sd != base) {
260 len += strlen(sd->s_name) + 1;
261 sd = sd->s_parent;
262 }
263
264 /* check limits */
265 if (len < 2)
266 return -EINVAL;
267 len--;
268 if ((s - path) + len > PATH_MAX)
269 return -ENAMETOOLONG;
270
271 /* reverse fillup of target string from target to base */
272 sd = target_sd;
273 while (sd->s_parent && sd != base) {
274 int slen = strlen(sd->s_name);
275
276 len -= slen;
277 strncpy(s + len, sd->s_name, slen);
278 if (len)
279 s[--len] = '/';
280
281 sd = sd->s_parent;
282 }
283
284 return 0;
285}
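Worked example for the function above (this logic now lives in kernfs): linking from parent /sys/devices/foo to target /sys/class/block/sda, the first loop walks up from foo emitting "../" per level until it reaches a common ancestor of the target (here the root, so "../../"), and the reverse fillup then writes the target components back-to-front, yielding "../../class/block/sda" in the buffer.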
286
287static int sysfs_getlink(struct dentry *dentry, char *path)
288{
289 struct sysfs_dirent *sd = dentry->d_fsdata;
290 struct sysfs_dirent *parent_sd = sd->s_parent;
291 struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
292 int error;
293
294 mutex_lock(&sysfs_mutex);
295 error = sysfs_get_target_path(parent_sd, target_sd, path);
296 mutex_unlock(&sysfs_mutex);
297
298 return error;
299}
300
301static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
302{
303 int error = -ENOMEM;
304 unsigned long page = get_zeroed_page(GFP_KERNEL);
305 if (page) {
306 error = sysfs_getlink(dentry, (char *) page);
307 if (error < 0)
308 free_page((unsigned long)page);
309 }
310 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
311 return NULL;
312}
313
314static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
315 void *cookie)
316{
317 char *page = nd_get_link(nd);
318 if (!IS_ERR(page))
319 free_page((unsigned long)page);
320}
321
322const struct inode_operations sysfs_symlink_inode_operations = {
323 .setxattr = sysfs_setxattr,
324 .readlink = generic_readlink,
325 .follow_link = sysfs_follow_link,
326 .put_link = sysfs_put_link,
327 .setattr = sysfs_setattr,
328 .getattr = sysfs_getattr,
329 .permission = sysfs_permission,
330};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#ifndef __SYSFS_INTERNAL_H
12#include <linux/kobject_ns.h> 12#define __SYSFS_INTERNAL_H
13#include <linux/fs.h>
14#include <linux/rbtree.h>
15 13
16struct sysfs_open_dirent; 14#include <linux/sysfs.h>
17
18/* type-specific structures for sysfs_dirent->s_* union members */
19struct sysfs_elem_dir {
20 struct kobject *kobj;
21
22 unsigned long subdirs;
23 /* children rbtree starts here and goes through sd->s_rb */
24 struct rb_root children;
25};
26
27struct sysfs_elem_symlink {
28 struct sysfs_dirent *target_sd;
29};
30
31struct sysfs_elem_attr {
32 union {
33 struct attribute *attr;
34 struct bin_attribute *bin_attr;
35 };
36 struct sysfs_open_dirent *open;
37};
38
39struct sysfs_inode_attrs {
40 struct iattr ia_iattr;
41 void *ia_secdata;
42 u32 ia_secdata_len;
43};
44
45/*
46 * sysfs_dirent - the building block of sysfs hierarchy. Each and
47 * every sysfs node is represented by single sysfs_dirent.
48 *
49 * As long as s_count reference is held, the sysfs_dirent itself is
50 * accessible. Dereferencing s_elem or any other outer entity
51 * requires s_active reference.
52 */
53struct sysfs_dirent {
54 atomic_t s_count;
55 atomic_t s_active;
56#ifdef CONFIG_DEBUG_LOCK_ALLOC
57 struct lockdep_map dep_map;
58#endif
59 struct sysfs_dirent *s_parent;
60 const char *s_name;
61
62 struct rb_node s_rb;
63
64 union {
65 struct completion *completion;
66 struct sysfs_dirent *removed_list;
67 } u;
68
69 const void *s_ns; /* namespace tag */
70 unsigned int s_hash; /* ns + name hash */
71 union {
72 struct sysfs_elem_dir s_dir;
73 struct sysfs_elem_symlink s_symlink;
74 struct sysfs_elem_attr s_attr;
75 };
76
77 unsigned short s_flags;
78 umode_t s_mode;
79 unsigned int s_ino;
80 struct sysfs_inode_attrs *s_iattr;
81};
82
83#define SD_DEACTIVATED_BIAS INT_MIN
84
85#define SYSFS_TYPE_MASK 0x00ff
86#define SYSFS_DIR 0x0001
87#define SYSFS_KOBJ_ATTR 0x0002
88#define SYSFS_KOBJ_BIN_ATTR 0x0004
89#define SYSFS_KOBJ_LINK 0x0008
90#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
91#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
92
93/* identify any namespace tag on sysfs_dirents */
94#define SYSFS_NS_TYPE_MASK 0xf00
95#define SYSFS_NS_TYPE_SHIFT 8
96
97#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
98#define SYSFS_FLAG_REMOVED 0x02000
99
100static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
101{
102 return sd->s_flags & SYSFS_TYPE_MASK;
103}
104
105/*
106 * Return any namespace tags on this dirent.
107 * enum kobj_ns_type is defined in linux/kobject.h
108 */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-#define sysfs_dirent_init_lockdep(sd)				\
-do {								\
-	struct attribute *attr = sd->s_attr.attr;		\
-	struct lock_class_key *key = attr->key;			\
-	if (!key)						\
-		key = &attr->skey;				\
-								\
-	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
-} while (0)
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	int type = sysfs_type(sd);
-
-	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
-		sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-#define sysfs_dirent_init_lockdep(sd) do {} while (0)
-
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
-/*
- * Context structure to be used while adding/removing nodes.
- */
-struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*removed;
-};
 
 /*
  * mount.c
  */
-
-/*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
- */
-struct sysfs_super_info {
-	void *ns[KOBJ_NS_TYPES];
-};
-#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
-extern struct sysfs_dirent sysfs_root;
-extern struct kmem_cache *sysfs_dir_cachep;
+extern struct kernfs_node *sysfs_root_kn;
 
 /*
  * dir.c
  */
-extern struct mutex sysfs_mutex;
 extern spinlock_t sysfs_symlink_target_lock;
-extern const struct dentry_operations sysfs_dentry_ops;
-
-extern const struct file_operations sysfs_dir_operations;
-extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd);
-void sysfs_remove(struct sysfs_dirent *sd);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns);
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns);
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
-
-void release_sysfs_dirent(struct sysfs_dirent *sd);
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd);
-
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns);
-
-static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		WARN_ON(!atomic_read(&sd->s_count));
-		atomic_inc(&sd->s_count);
-	}
-	return sd;
-}
-#define sysfs_get(sd) __sysfs_get(sd)
-
-static inline void __sysfs_put(struct sysfs_dirent *sd)
-{
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		release_sysfs_dirent(sd);
-}
-#define sysfs_put(sd) __sysfs_put(sd)
-
-/*
- * inode.c
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_evict_inode(struct inode *inode);
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags);
-int sysfs_inode_init(void);
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
 
 /*
  * file.c
  */
-extern const struct file_operations sysfs_file_operations;
-extern const struct file_operations sysfs_bin_operations;
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd,
-		   const struct attribute *attr, int type);
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
+int sysfs_add_file(struct kernfs_node *parent,
+		   const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
 			   umode_t amode, const void *ns);
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
 /*
  * symlink.c
  */
-extern const struct inode_operations sysfs_symlink_inode_operations;
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name);
+
+#endif	/* __SYSFS_INTERNAL_H */
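The removed __sysfs_get()/__sysfs_put() helpers above are a textbook atomic reference count: take a reference only on a live object, free storage when the last reference drops. A minimal userspace sketch of the same idiom follows; the names and types here are illustrative, not the kernel API.

#include <stdatomic.h>
#include <stdlib.h>

struct node {
	atomic_int count;		/* starts at 1 for the creator */
	char name[32];
};

static struct node *node_get(struct node *n)
{
	if (n) {
		/* taking a reference on a dead object is a bug */
		if (atomic_load(&n->count) == 0)
			abort();
		atomic_fetch_add(&n->count, 1);
	}
	return n;
}

static void node_put(struct node *n)
{
	/* fetch_sub returns the old value; 1 means we were last */
	if (n && atomic_fetch_sub(&n->count, 1) == 1)
		free(n);
}

int main(void)
{
	struct node *n = calloc(1, sizeof(*n));

	if (!n)
		return 1;
	atomic_init(&n->count, 1);
	node_get(n);	/* second reference */
	node_put(n);	/* drop it again */
	node_put(n);	/* creator's reference; frees n */
	return 0;
}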
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5f6fc17d6bc5..9737cba1357d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	else
 		udf_truncate_tail_extent(inode);
 	mark_inode_dirty(inode);
+	up_write(&iinfo->i_data_sem);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi)
@@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
 	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		mark_inode_dirty(dir);
-	up_write(&iinfo->i_data_sem);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71c8c9d2b882..a26739451b53 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1217,7 +1217,7 @@ __xfs_get_blocks(
 		lockmode = XFS_ILOCK_EXCL;
 		xfs_ilock(ip, lockmode);
 	} else {
-		lockmode = xfs_ilock_map_shared(ip);
+		lockmode = xfs_ilock_data_map_shared(ip);
 	}
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b86127072ac3..01b6a0102fbd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -164,6 +164,7 @@ xfs_attr_get(
 {
 	int			error;
 	struct xfs_name		xname;
+	uint			lock_mode;
 
 	XFS_STATS_INC(xs_attr_get);
 
@@ -174,9 +175,9 @@ xfs_attr_get(
 	if (error)
 		return error;
 
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	xfs_iunlock(ip, lock_mode);
 	return(error);
 }
 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d174b128153..01db96f60cf0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -507,17 +507,17 @@ xfs_attr_list_int(
 {
 	int		error;
 	xfs_inode_t	*dp = context->dp;
+	uint		lock_mode;
 
 	XFS_STATS_INC(xs_attr_list);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return EIO;
 
-	xfs_ilock(dp, XFS_ILOCK_SHARED);
-
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
+	lock_mode = xfs_ilock_attr_map_shared(dp);
 	if (!xfs_inode_hasattr(dp)) {
 		error = 0;
 	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
@@ -527,9 +527,7 @@ xfs_attr_list_int(
 	} else {
 		error = xfs_attr_node_list(context);
 	}
-
-	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
+	xfs_iunlock(dp, lock_mode);
 	return error;
 }
 
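The helpers introduced by this series, such as xfs_ilock_attr_map_shared(), return the lock mode they actually took, and that value is what the caller hands back to xfs_iunlock(). A small standalone sketch of the idiom, under invented names:

#include <stdbool.h>

enum lock_mode { LOCK_SHARED, LOCK_EXCL };

struct inode_like {
	bool need_map_read;	/* extent map not yet in memory */
};

static void take_lock(struct inode_like *ip, enum lock_mode m)
{
	(void)ip; (void)m;	/* real locking elided in this sketch */
}

static void drop_lock(struct inode_like *ip, enum lock_mode m)
{
	(void)ip; (void)m;
}

/* take exclusive when the map may have to be read in; return the
 * mode that was chosen so the unlock can match it exactly */
static enum lock_mode ilock_attr_map_shared(struct inode_like *ip)
{
	enum lock_mode m = ip->need_map_read ? LOCK_EXCL : LOCK_SHARED;

	take_lock(ip, m);
	return m;
}

int main(void)
{
	struct inode_like ip = { .need_map_read = true };
	enum lock_mode m = ilock_attr_map_shared(&ip);

	/* ... inspect attributes ... */
	drop_lock(&ip, m);	/* unlock with the mode we were given */
	return 0;
}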
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 739e0a52deda..5549d69ddb45 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -110,7 +110,7 @@ xfs_attr3_rmt_verify(
 	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
 		return false;
 	if (be32_to_cpu(rmt->rm_offset) +
-	    be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+	    be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
 		return false;
 	if (rmt->rm_owner == 0)
 		return false;
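The one-character fix above (>= tightened to >) is an off-by-one in a bounds check: an attribute that ends exactly at XATTR_SIZE_MAX is legal and was being rejected by the verifier. The same check in isolation, where LIMIT is a stand-in; a full check would also guard against off + len wrapping around:

#include <assert.h>

#define LIMIT 65536u	/* stand-in for XATTR_SIZE_MAX */

/* valid when the region ends at or before the limit */
static int in_bounds(unsigned int off, unsigned int len)
{
	return off + len <= LIMIT;	/* equivalently: !(off + len > LIMIT) */
}

int main(void)
{
	assert(in_bounds(LIMIT - 4, 4));	/* ends exactly at LIMIT: ok */
	assert(!in_bounds(LIMIT - 4, 5));	/* one byte past: rejected */
	return 0;
}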
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3ef11b22e750..152543c4ca70 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1635,7 +1635,7 @@ xfs_bmap_last_extent(
  * blocks at the end of the file which do not start at the previous data block,
  * we will try to align the new blocks at stripe unit boundaries.
  *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
  * at, or past the EOF.
  */
 STATIC int
@@ -1650,9 +1650,14 @@ xfs_bmap_isaeof(
 	bma->aeof = 0;
 	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
 				     &is_empty);
-	if (error || is_empty)
+	if (error)
 		return error;
 
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
 	/*
 	 * Check if we are allocation or past the last extent, or at least into
 	 * the last delayed allocated extent.
@@ -3643,10 +3648,19 @@ xfs_bmap_btalloc(
 	int		isaligned;
 	int		tryagain;
 	int		error;
+	int		stripe_align;
 
 	ASSERT(ap->length);
 
 	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
 	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3655,6 +3669,8 @@ xfs_bmap_btalloc(
 		ASSERT(!error);
 		ASSERT(ap->length);
 	}
+
+
 	nullfb = *ap->firstblock == NULLFSBLOCK;
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
@@ -3730,7 +3746,7 @@ xfs_bmap_btalloc(
 	 */
 	if (!ap->flist->xbf_low && ap->aeof) {
 		if (!ap->offset) {
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			atype = args.type;
 			isaligned = 1;
 			/*
@@ -3755,13 +3771,13 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= args.maxlen)
-				nextminlen = blen - mp->m_dalign;
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
 			else
 				nextminlen = args.minlen;
-			if (nextminlen + mp->m_dalign > args.minlen + 1)
+			if (nextminlen + stripe_align > args.minlen + 1)
 				args.minalignslop =
-					nextminlen + mp->m_dalign -
+					nextminlen + stripe_align -
 					args.minlen - 1;
 			else
 				args.minalignslop = 0;
@@ -3783,7 +3799,7 @@ xfs_bmap_btalloc(
 			 */
 			args.type = atype;
 			args.fsbno = ap->blkno;
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			args.minlen = nextminlen;
 			args.minalignslop = 0;
 			isaligned = 1;
@@ -3997,6 +4013,7 @@ xfs_bmapi_read(
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
 			   XFS_BMAPI_IGSTATE)));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4191,6 +4208,7 @@ xfs_bmapi_delay(
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
 	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4484,6 +4502,7 @@ xfs_bmapi_write(
 	ASSERT(tp != NULL);
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5035,6 +5054,7 @@ xfs_bunmapi(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
 
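The xfs_bmap.c hunks above replace hard-coded uses of mp->m_dalign with a stripe_align value chosen once per allocation: the full stripe width when the swalloc mount option is set, otherwise the stripe unit. That selection logic, lifted into a standalone hedged sketch (the struct is illustrative, not the real xfs_mount):

#include <stdio.h>

struct mount_like {
	unsigned int swidth;	/* stripe width, in fs blocks */
	unsigned int dalign;	/* stripe unit, in fs blocks */
	int	     swalloc;	/* stand-in for XFS_MOUNT_SWALLOC */
};

/* prefer the full stripe width when swalloc is set, else the unit */
static unsigned int pick_stripe_align(const struct mount_like *mp)
{
	if (mp->swidth && mp->swalloc)
		return mp->swidth;
	if (mp->dalign)
		return mp->dalign;
	return 0;		/* no alignment constraint */
}

int main(void)
{
	struct mount_like mp = { .swidth = 64, .dalign = 16, .swalloc = 1 };

	printf("align=%u\n", pick_stripe_align(&mp));	/* prints 64 */
	return 0;
}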
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5887e41c0323..f264616080ca 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -287,6 +287,7 @@ xfs_bmapi_allocate(
 	INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
 	queue_work(xfs_alloc_wq, &args->work);
 	wait_for_completion(&done);
+	destroy_work_on_stack(&args->work);
 	return args->result;
 }
 
@@ -617,22 +618,27 @@ xfs_getbmap(
 		return XFS_ERROR(ENOMEM);
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-		if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
+	if (whichfork == XFS_DATA_FORK) {
+		if (!(iflags & BMV_IF_DELALLOC) &&
+		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 			error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
 			if (error)
 				goto out_unlock_iolock;
+
+			/*
+			 * Even after flushing the inode, there can still be
+			 * delalloc blocks on the inode beyond EOF due to
+			 * speculative preallocation. These are not removed
+			 * until the release function is called or the inode
+			 * is inactivated. Hence we cannot assert here that
+			 * ip->i_delayed_blks == 0.
+			 */
 		}
-		/*
-		 * even after flushing the inode, there can still be delalloc
-		 * blocks on the inode beyond EOF due to speculative
-		 * preallocation. These are not removed until the release
-		 * function is called or the inode is inactivated. Hence we
-		 * cannot assert here that ip->i_delayed_blks == 0.
-		 */
-	}
 
-	lock = xfs_ilock_map_shared(ip);
+		lock = xfs_ilock_data_map_shared(ip);
+	} else {
+		lock = xfs_ilock_attr_map_shared(ip);
+	}
 
 	/*
 	 * Don't let nex be bigger than the number of extents
@@ -737,7 +743,7 @@ xfs_getbmap(
 out_free_map:
 	kmem_free(map);
 out_unlock_ilock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes(
 	xfs_buf_unlock(bp);
 
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+		uint lock_mode;
+
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
+
+		lock_mode = xfs_ilock_data_map_shared(ip);
 		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+		xfs_iunlock(ip, lock_mode);
+
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -1187,7 +1199,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
@@ -1200,7 +1217,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c7f0b77dcb00..9fccfb594291 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -698,7 +698,11 @@ xfs_buf_read_uncached(
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	xfsbdstrat(target->bt_mount, bp);
+	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	xfs_buf_iorequest(bp);
 	xfs_buf_iowait(bp);
 	return bp;
 }
@@ -1089,7 +1093,7 @@ xfs_bioerror(
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
-STATIC int
+int
 xfs_bioerror_relse(
 	struct xfs_buf	*bp)
 {
@@ -1152,7 +1156,7 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
 
 	xfs_bdstrat_cb(bp);
 
@@ -1164,25 +1168,6 @@ xfs_bwrite(
 	return error;
 }
 
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		xfs_bioerror_relse(bp);
-		return;
-	}
-
-	xfs_buf_iorequest(bp);
-}
-
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t	*bp,
@@ -1516,6 +1501,12 @@ xfs_wait_buftarg(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
 			xfs_buf_rele(bp);
 		}
 		if (loop++ != 0)
@@ -1602,12 +1593,11 @@ xfs_free_buftarg(
 	kmem_free(btp);
 }
 
-STATIC int
-xfs_setsize_buftarg_flags(
+int
+xfs_setsize_buftarg(
 	xfs_buftarg_t		*btp,
 	unsigned int		blocksize,
-	unsigned int		sectorsize,
-	int			verbose)
+	unsigned int		sectorsize)
 {
 	btp->bt_bsize = blocksize;
 	btp->bt_sshift = ffs(sectorsize) - 1;
@@ -1628,26 +1618,17 @@ xfs_setsize_buftarg_flags(
 }
 
 /*
  * When allocating the initial buffer target we have not yet
  * read in the superblock, so don't know what sized sectors
  * are being used at this early stage.  Play safe.
  */
 STATIC int
 xfs_setsize_buftarg_early(
 	xfs_buftarg_t		*btp,
 	struct block_device	*bdev)
 {
-	return xfs_setsize_buftarg_flags(btp,
-			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
-	xfs_buftarg_t		*btp,
-	unsigned int		blocksize,
-	unsigned int		sectorsize)
-{
-	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+	return xfs_setsize_buftarg(btp, PAGE_SIZE,
+				   bdev_logical_block_size(bdev));
 }
 
 xfs_buftarg_t *
@@ -1799,7 +1780,7 @@ __xfs_buf_delwri_submit(
 
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
 		bp->b_flags |= XBF_WRITE;
 
 		if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e65683361017..1cf21a4a9f22 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -45,6 +45,7 @@ typedef enum {
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */
 
 /* I/O hints for the BIO layer */
 #define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
+	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
 	{ XBF_FLUSH,		"FLUSH" }, \
@@ -80,6 +82,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	{ _XBF_COMPOUND,	"COMPOUND" }
 
+
 /*
  * Internal state flags.
  */
@@ -269,9 +272,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
 extern void xfs_buf_ioend(xfs_buf_t *, int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -282,6 +282,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 #define xfs_buf_zero(bp, off, len) \
 	xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
 	return bp ? bp->b_error : ENOMEM;
@@ -301,7 +303,8 @@ extern void xfs_buf_terminate(void);
 
 #define XFS_BUF_ZEROFLAGS(bp) \
 	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
-			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+			    XBF_WRITE_FAIL))
 
 void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
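XBF_WRITE_FAIL is threaded through three places in xfs_buf.h: the flag definition, the flag-name table used by tracing, and the state-clearing mask. A generic sketch of keeping a bit flag and its printable name in sync, with invented names:

#include <stdio.h>

#define FL_READ		(1u << 0)
#define FL_WRITE	(1u << 1)
#define FL_WRITE_FAIL	(1u << 24)	/* async writes failed */

struct flag_name {
	unsigned int	bit;
	const char	*name;
};

/* every new flag needs a matching entry, as in XFS_BUF_FLAGS */
static const struct flag_name flag_names[] = {
	{ FL_READ,	 "READ" },
	{ FL_WRITE,	 "WRITE" },
	{ FL_WRITE_FAIL, "WRITE_FAIL" },
};

static void print_flags(unsigned int flags)
{
	for (size_t i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++)
		if (flags & flag_names[i].bit)
			printf("%s ", flag_names[i].name);
	printf("\n");
}

int main(void)
{
	print_flags(FL_WRITE | FL_WRITE_FAIL);	/* "WRITE WRITE_FAIL" */
	return 0;
}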
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a64f67ba25d3..33149113e333 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -182,21 +182,47 @@ xfs_buf_item_size(
 	trace_xfs_buf_item_size(bip);
 }
 
-static struct xfs_log_iovec *
+static inline void
+xfs_buf_item_copy_iovec(
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			first_bit,
+	uint			nbits)
+{
+	offset += first_bit * XFS_BLF_CHUNK;
+	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+			xfs_buf_offset(bp, offset),
+			nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			next_bit,
+	int			last_bit)
+{
+	return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+		(xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+		 XFS_BLF_CHUNK);
+}
+
+static void
 xfs_buf_item_format_segment(
 	struct xfs_buf_log_item	*bip,
-	struct xfs_log_iovec	*vecp,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
 	uint			offset,
 	struct xfs_buf_log_format *blfp)
 {
 	struct xfs_buf		*bp = bip->bli_buf;
 	uint			base_size;
-	uint			nvecs;
 	int			first_bit;
 	int			last_bit;
 	int			next_bit;
 	uint			nbits;
-	uint			buffer_offset;
 
 	/* copy the flags across from the base format item */
 	blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -208,21 +234,17 @@ xfs_buf_item_format_segment(
 	 */
 	base_size = xfs_buf_log_format_size(blfp);
 
-	nvecs = 0;
 	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
 		/*
 		 * If the map is not be dirty in the transaction, mark
 		 * the size as zero and do not advance the vector pointer.
 		 */
-		goto out;
+		return;
 	}
 
-	vecp->i_addr = blfp;
-	vecp->i_len = base_size;
-	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
-	vecp++;
-	nvecs = 1;
+	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+	blfp->blf_size = 1;
 
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
@@ -232,14 +254,13 @@ xfs_buf_item_format_segment(
 		 */
 		trace_xfs_buf_item_format_stale(bip);
 		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
-		goto out;
+		return;
 	}
 
 
 	/*
 	 * Fill in an iovec for each set of contiguous chunks.
 	 */
-
 	last_bit = first_bit;
 	nbits = 1;
 	for (;;) {
@@ -252,42 +273,22 @@ xfs_buf_item_format_segment(
 		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 					(uint)last_bit + 1);
 		/*
-		 * If we run out of bits fill in the last iovec and get
-		 * out of the loop.
-		 * Else if we start a new set of bits then fill in the
-		 * iovec for the series we were looking at and start
-		 * counting the bits in the new one.
-		 * Else we're still in the same set of bits so just
-		 * keep counting and scanning.
+		 * If we run out of bits fill in the last iovec and get out of
+		 * the loop.  Else if we start a new set of bits then fill in
+		 * the iovec for the series we were looking at and start
+		 * counting the bits in the new one.  Else we're still in the
+		 * same set of bits so just keep counting and scanning.
 		 */
 		if (next_bit == -1) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
 			break;
-		} else if (next_bit != last_bit + 1) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
-			vecp++;
-			first_bit = next_bit;
-			last_bit = next_bit;
-			nbits = 1;
-		} else if (xfs_buf_offset(bp, offset +
-					  (next_bit << XFS_BLF_SHIFT)) !=
-			   (xfs_buf_offset(bp, offset +
-					   (last_bit << XFS_BLF_SHIFT)) +
-			    XFS_BLF_CHUNK)) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
-			vecp++;
+		} else if (next_bit != last_bit + 1 ||
+			   xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
 			first_bit = next_bit;
 			last_bit = next_bit;
 			nbits = 1;
@@ -296,9 +297,6 @@ xfs_buf_item_format_segment(
 			nbits++;
 		}
 	}
-out:
-	blfp->blf_size = nvecs;
-	return vecp;
 }
 
 /*
@@ -310,10 +308,11 @@ out:
 STATIC void
 xfs_buf_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*vecp)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
+	struct xfs_log_iovec	*vecp = NULL;
 	uint			offset = 0;
 	int			i;
 
@@ -354,8 +353,8 @@ xfs_buf_item_format(
 	}
 
 	for (i = 0; i < bip->bli_format_count; i++) {
-		vecp = xfs_buf_item_format_segment(bip, vecp, offset,
-						&bip->bli_formats[i]);
+		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+					    &bip->bli_formats[i]);
 		offset += bp->b_maps[i].bm_len;
 	}
 
@@ -496,6 +495,14 @@ xfs_buf_item_unpin(
 	}
 }
 
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
 STATIC uint
 xfs_buf_item_push(
 	struct xfs_log_item	*lip,
@@ -524,6 +531,14 @@ xfs_buf_item_push(
 
 	trace_xfs_buf_item_push(bip);
 
+	/* has a previous flush failed due to IO errors? */
+	if ((bp->b_flags & XBF_WRITE_FAIL) &&
+	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+		xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+			 (long long)bp->b_bn);
+	}
+
 	if (!xfs_buf_delwri_queue(bp, buffer_list))
 		rval = XFS_ITEM_FLUSHING;
 	xfs_buf_unlock(bp);
@@ -1096,8 +1111,9 @@ xfs_buf_iodone_callbacks(
 
 	xfs_buf_ioerror(bp, 0);	/* errno of 0 unsets the flag */
 
-	if (!XFS_BUF_ISSTALE(bp)) {
-		bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+	if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+			       XBF_DONE | XBF_WRITE_FAIL;
 		xfs_buf_iorequest(bp);
 	} else {
 		xfs_buf_relse(bp);
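The new warning in xfs_buf_item_push() is gated by DEFINE_RATELIMIT_STATE(..., 30 * HZ, 10), i.e. at most 10 messages per 30-second window. A userspace approximation of that windowed rate limiter, illustrative only and not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
	time_t window_start;
	int interval;	/* seconds per window */
	int burst;	/* messages allowed per window */
	int emitted;	/* messages emitted in the current window */
};

static bool ratelimit_ok(struct ratelimit *rl)
{
	time_t now = time(NULL);

	if (now - rl->window_start >= rl->interval) {
		rl->window_start = now;	/* start a fresh window */
		rl->emitted = 0;
	}
	return rl->emitted++ < rl->burst;
}

int main(void)
{
	struct ratelimit rl = { .interval = 30, .burst = 10 };

	for (int i = 0; i < 100; i++)
		if (ratelimit_ok(&rl))
			printf("warning %d\n", i);	/* only 10 print */
	return 0;
}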
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 56369d4509d5..48c7d18f68c3 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup(
  */
 int						/* error */
 xfs_dir2_node_removename(
-	xfs_da_args_t		*args)		/* operation arguments */
+	struct xfs_da_args	*args)		/* operation arguments */
 {
-	xfs_da_state_blk_t	*blk;		/* leaf block */
+	struct xfs_da_state_blk	*blk;		/* leaf block */
 	int			error;		/* error return value */
 	int			rval;		/* operation return value */
-	xfs_da_state_t		*state;		/* btree cursor */
+	struct xfs_da_state	*state;		/* btree cursor */
 
 	trace_xfs_dir2_node_removename(args);
 
@@ -2084,19 +2084,18 @@ xfs_dir2_node_removename(
 	state->mp = args->dp->i_mount;
 	state->blocksize = state->mp->m_dirblksize;
 	state->node_ents = state->mp->m_dir_node_ents;
-	/*
-	 * Look up the entry we're deleting, set up the cursor.
-	 */
+
+	/* Look up the entry we're deleting, set up the cursor. */
 	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
-		rval = error;
-	/*
-	 * Didn't find it, upper layer screwed up.
-	 */
+		goto out_free;
+
+	/* Didn't find it, upper layer screwed up. */
 	if (rval != EEXIST) {
-		xfs_da_state_free(state);
-		return rval;
+		error = rval;
+		goto out_free;
 	}
+
 	blk = &state->path.blk[state->path.active - 1];
 	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
 	ASSERT(state->extravalid);
@@ -2107,7 +2106,7 @@ xfs_dir2_node_removename(
 	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
 				      &state->extrablk, &rval);
 	if (error)
-		return error;
+		goto out_free;
 	/*
 	 * Fix the hash values up the btree.
 	 */
@@ -2122,6 +2121,7 @@ xfs_dir2_node_removename(
 	 */
 	if (!error)
 		error = xfs_dir2_node_to_leaf(state);
+out_free:
 	xfs_da_state_free(state);
 	return error;
 }
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index c4e50c6ed584..aead369e1c30 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -674,6 +674,7 @@ xfs_readdir(
 {
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
+	uint		lock_mode;
 
 	trace_xfs_readdir(dp);
 
@@ -683,6 +684,7 @@ xfs_readdir(
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
 	XFS_STATS_INC(xs_dir_getdents);
 
+	lock_mode = xfs_ilock_data_map_shared(dp);
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(dp, ctx);
 	else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
@@ -691,5 +693,7 @@ xfs_readdir(
 		rval = xfs_dir2_block_getdents(dp, ctx);
 	else
 		rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+	xfs_iunlock(dp, lock_mode);
+
 	return rval;
 }
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index aafc6e46cb58..3725fb1b902b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -170,6 +170,7 @@ xfs_dir2_block_to_sf(
 	char			*ptr;		/* current data pointer */
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* shortform directory header */
+	xfs_dir2_sf_hdr_t	*dst;		/* temporary data buffer */
 
 	trace_xfs_dir2_block_to_sf(args);
 
@@ -177,35 +178,20 @@ xfs_dir2_block_to_sf(
 	mp = dp->i_mount;
 
 	/*
-	 * Make a copy of the block data, so we can shrink the inode
-	 * and add local data.
+	 * allocate a temporary destination buffer the size of the inode
+	 * to format the data into. Once we have formatted the data, we
+	 * can free the block and copy the formatted data into the inode literal
+	 * area.
 	 */
-	hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
-	memcpy(hdr, bp->b_addr, mp->m_dirblksize);
-	logflags = XFS_ILOG_CORE;
-	if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
-		ASSERT(error != ENOSPC);
-		goto out;
-	}
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	hdr = bp->b_addr;
 
 	/*
-	 * The buffer is now unconditionally gone, whether
-	 * xfs_dir2_shrink_inode worked or not.
-	 *
-	 * Convert the inode to local format.
-	 */
-	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-	dp->i_df.if_flags |= XFS_IFINLINE;
-	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-	logflags |= XFS_ILOG_DDATA;
-	/*
 	 * Copy the header into the newly allocate local space.
 	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfp = (xfs_dir2_sf_hdr_t *)dst;
 	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
-	dp->i_d.di_size = size;
+
 	/*
 	 * Set up to loop over the block's entries.
 	 */
@@ -258,10 +244,34 @@ xfs_dir2_block_to_sf(
 		ptr += dp->d_ops->data_entsize(dep->namelen);
 	}
 	ASSERT((char *)sfep - (char *)sfp == size);
+
+	/* now we are done with the block, we can shrink the inode */
+	logflags = XFS_ILOG_CORE;
+	error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto out;
+	}
+
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format and copy the data in.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+	logflags |= XFS_ILOG_DDATA;
+	memcpy(dp->i_df.if_u1.if_data, dst, size);
+	dp->i_d.di_size = size;
 	xfs_dir2_sf_check(args);
 out:
 	xfs_trans_log_inode(args->trans, dp, logflags);
-	kmem_free(hdr);
+	kmem_free(dst);
 	return error;
 }
 
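xfs_dir2_block_to_sf() now formats the short-form directory into a scratch buffer first, frees the directory block, and only then installs the result in the inode, so a failed shrink no longer leaves a half-converted inode behind. A hedged sketch of that reordering with hypothetical names:

#include <stdlib.h>
#include <string.h>

static int shrink_source(void) { return 0; }	/* may fail */

static int convert(char *inode_area, const char *block, size_t size)
{
	char *dst = malloc(size);
	int error;

	if (!dst)
		return -1;
	memcpy(dst, block, size);	/* 1. format into scratch space */

	error = shrink_source();	/* 2. give back the old block */
	if (error)
		goto out;		/* inode never touched on failure */

	memcpy(inode_area, dst, size);	/* 3. install the new format */
out:
	free(dst);
	return error;
}

int main(void)
{
	char inode_area[16], block[16] = "dir-entries";

	return convert(inode_area, block, sizeof(block));
}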
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 6b1e695caf0e..7aeb4c895b32 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -469,16 +469,17 @@ xfs_qm_dqtobp(
 	struct xfs_mount	*mp = dqp->q_mount;
 	xfs_dqid_t		id = be32_to_cpu(dqp->q_core.d_id);
 	struct xfs_trans	*tp = (tpp ? *tpp : NULL);
+	uint			lock_mode;
 
 	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
-	xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	lock_mode = xfs_ilock_data_map_shared(quotip);
 	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		/*
 		 * Return if this type of quotas is turned off while we
 		 * didn't have the quota inode lock.
 		 */
-		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		xfs_iunlock(quotip, lock_mode);
 		return ESRCH;
 	}
 
@@ -488,7 +489,7 @@ xfs_qm_dqtobp(
 	error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
 			       XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
 
-	xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+	xfs_iunlock(quotip, lock_mode);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 92e5f62eefc6..f33fbaaa4d8a 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size(
 STATIC void
 xfs_qm_dquot_logitem_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*logvec)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
-
-	logvec->i_addr = &qlip->qli_format;
-	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
-	logvec++;
-	logvec->i_addr = &qlip->qli_dquot->q_core;
-	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-	logvec->i_type = XLOG_REG_TYPE_DQUOT;
-
-	qlip->qli_format.qlf_size = 2;
-
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_dq_logformat *qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+	qlf->qlf_type = XFS_LI_DQUOT;
+	qlf->qlf_size = 2;
+	qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+	qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+	qlf->qlf_len = 1;
+	qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+			&qlip->qli_dquot->q_core,
+			sizeof(struct xfs_disk_dquot));
 }
 
 /*
@@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init(
 	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
 					&xfs_dquot_item_ops);
 	lp->qli_dquot = dqp;
-	lp->qli_format.qlf_type = XFS_LI_DQUOT;
-	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
-	lp->qli_format.qlf_blkno = dqp->q_blkno;
-	lp->qli_format.qlf_len = 1;
-	/*
-	 * This is just the offset of this dquot within its buffer
-	 * (which is currently 1 FSB and probably won't change).
-	 * Hence 32 bits for this offset should be just fine.
-	 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
-	 * here, and recompute it at recovery time.
-	 */
-	lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
 }
 
 /*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size(
 	*nbytes += sizeof(struct xfs_qoff_logitem);
 }
 
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
 STATIC void
 xfs_qm_qoff_logitem_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_qoff_logitem	*qflip = QOFF_ITEM(lip);
-
-	ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
-	log_vector->i_addr = &qflip->qql_format;
-	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
-	qflip->qql_format.qf_size = 1;
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_qoff_logformat *qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+	qlf->qf_type = XFS_LI_QUOTAOFF;
+	qlf->qf_size = 1;
+	qlf->qf_flags = qflip->qql_flags;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
 }
 
 /*
@@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init(
 	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
 			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
 	qf->qql_item.li_mountp = mp;
-	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
-	qf->qql_format.qf_flags = flags;
 	qf->qql_start_lip = start;
+	qf->qql_flags = flags;
 	return qf;
 }
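The dquot and quotaoff formatters now build log iovecs through xlog_prepare_iovec()/xlog_finish_iovec() (reserve space, format in place, seal with the final length) and xlog_copy_iovec() (copy a finished blob as one vector). A self-contained sketch of that builder shape; the names mirror the patch but the implementation here is invented:

#include <stddef.h>
#include <string.h>

struct iovec_like { void *base; size_t len; int type; };

struct vec_builder {
	char buf[256];		/* backing store for formatted data */
	size_t off;
	struct iovec_like v[8];
	int used;
};

/* reserve a region; the caller formats into the returned pointer
 * (no bounds checking here, this is only a sketch) */
static void *vec_prepare(struct vec_builder *b, int type)
{
	struct iovec_like *iv = &b->v[b->used];

	iv->base = b->buf + b->off;
	iv->type = type;
	return iv->base;
}

/* seal the reserved region once its final length is known */
static void vec_finish(struct vec_builder *b, size_t len)
{
	b->v[b->used].len = len;
	b->off += len;
	b->used++;
}

/* convenience: copy an already-formatted blob as one vector */
static void vec_copy(struct vec_builder *b, int type,
		     const void *data, size_t len)
{
	memcpy(vec_prepare(b, type), data, len);
	vec_finish(b, len);
}

int main(void)
{
	struct vec_builder b = { .off = 0, .used = 0 };
	int hdr = 42;

	vec_copy(&b, 1 /* type */, &hdr, sizeof(hdr));
	return b.used == 1 ? 0 : 1;
}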
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..502e9464634a 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
 	xfs_log_item_t		 qli_item;	/* common portion */
 	struct xfs_dquot	*qli_dquot;	/* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	/* lsn at last flush */
-	xfs_dq_logformat_t	 qli_format;	/* logged structure */
 } xfs_dq_logitem_t;
 
 typedef struct xfs_qoff_logitem {
 	xfs_log_item_t		 qql_item;	/* common portion */
 	struct xfs_qoff_logitem *qql_start_lip;	/* qoff-start logitem, if any */
-	xfs_qoff_logformat_t	 qql_format;	/* logged structure */
+	unsigned int		 qql_flags;
 } xfs_qoff_logitem_t;
 
 
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3680d04f973f..fb7a4c1ce1c5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -26,6 +26,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_extfree_item.h"
+#include "xfs_log.h"
 
 
 kmem_zone_t	*xfs_efi_zone;
@@ -101,9 +102,10 @@ xfs_efi_item_size(
 STATIC void
 xfs_efi_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
 
 	ASSERT(atomic_read(&efip->efi_next_extent) ==
 				efip->efi_format.efi_nextents);
@@ -111,10 +113,9 @@ xfs_efi_item_format(
 	efip->efi_format.efi_type = XFS_LI_EFI;
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = &efip->efi_format;
-	log_vector->i_len = xfs_efi_item_sizeof(efip);
-	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
-	ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+			&efip->efi_format,
+			xfs_efi_item_sizeof(efip));
 }
 
 
@@ -368,19 +369,19 @@ xfs_efd_item_size(
 STATIC void
 xfs_efd_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
 	efdp->efd_format.efd_type = XFS_LI_EFD;
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = &efdp->efd_format;
-	log_vector->i_len = xfs_efd_item_sizeof(efdp);
-	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
-	ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+			&efdp->efd_format,
+			xfs_efd_item_sizeof(efdp));
 }
 
 /*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52c91e143725..e00121592632 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -912,7 +912,7 @@ xfs_dir_open(
 	 * If there are any blocks, read-ahead block 0 as we're almost
 	 * certain to have the next operation be a read there.
 	 */
-	mode = xfs_ilock_map_shared(ip);
+	mode = xfs_ilock_data_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
 		xfs_dir3_data_readahead(NULL, ip, 0, -1);
 	xfs_iunlock(ip, mode);
@@ -1215,7 +1215,7 @@ xfs_seek_data(
 	uint			lock;
 	int			error;
 
-	lock = xfs_ilock_map_shared(ip);
+	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
@@ -1294,7 +1294,7 @@ out:
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 
 	if (error)
 		return -error;
@@ -1319,7 +1319,7 @@ xfs_seek_hole(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -XFS_ERROR(EIO);
 
-	lock = xfs_ilock_map_shared(ip);
+	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
@@ -1402,7 +1402,7 @@ out:
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 
 	if (error)
 		return -error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e87719c5bebe..5d7f105a1c82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment(
52{ 52{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 53 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 54 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) 55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 56 return args->mp->m_sb.sb_inoalignmt;
57 return 1; 57 return 1;
58} 58}
@@ -170,27 +170,20 @@ xfs_ialloc_inode_init(
170{ 170{
171 struct xfs_buf *fbuf; 171 struct xfs_buf *fbuf;
172 struct xfs_dinode *free; 172 struct xfs_dinode *free;
173 int blks_per_cluster, nbufs, ninodes; 173 int nbufs, blks_per_cluster, inodes_per_cluster;
174 int version; 174 int version;
175 int i, j; 175 int i, j;
176 xfs_daddr_t d; 176 xfs_daddr_t d;
177 xfs_ino_t ino = 0; 177 xfs_ino_t ino = 0;
178 178
179 /* 179 /*
180 * Loop over the new block(s), filling in the inodes. 180 * Loop over the new block(s), filling in the inodes. For small block
181 * For small block sizes, manipulate the inodes in buffers 181 * sizes, manipulate the inodes in buffers which are multiples of the
182 * which are multiples of the blocks size. 182 * blocks size.
183 */ 183 */
184 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 184 blks_per_cluster = xfs_icluster_size_fsb(mp);
185 blks_per_cluster = 1; 185 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
186 nbufs = length; 186 nbufs = length / blks_per_cluster;
187 ninodes = mp->m_sb.sb_inopblock;
188 } else {
189 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
190 mp->m_sb.sb_blocksize;
191 nbufs = length / blks_per_cluster;
192 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
193 }
194 187
195 /* 188 /*
196 * Figure out what version number to use in the inodes we create. If 189 * Figure out what version number to use in the inodes we create. If
@@ -225,7 +218,7 @@ xfs_ialloc_inode_init(
225 * they track in the AIL as if they were physically logged. 218 * they track in the AIL as if they were physically logged.
226 */ 219 */
227 if (tp) 220 if (tp)
228 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), 221 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
229 mp->m_sb.sb_inodesize, length, gen); 222 mp->m_sb.sb_inodesize, length, gen);
230 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 223 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
231 version = 2; 224 version = 2;
@@ -246,7 +239,7 @@ xfs_ialloc_inode_init(
246 /* Initialize the inode buffers and log them appropriately. */ 239 /* Initialize the inode buffers and log them appropriately. */
247 fbuf->b_ops = &xfs_inode_buf_ops; 240 fbuf->b_ops = &xfs_inode_buf_ops;
248 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 241 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
249 for (i = 0; i < ninodes; i++) { 242 for (i = 0; i < inodes_per_cluster; i++) {
250 int ioffset = i << mp->m_sb.sb_inodelog; 243 int ioffset = i << mp->m_sb.sb_inodelog;
251 uint isize = xfs_dinode_size(version); 244 uint isize = xfs_dinode_size(version);
252 245
@@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc(
329 * Locking will ensure that we don't have two callers in here 322 * Locking will ensure that we don't have two callers in here
330 * at one time. 323 * at one time.
331 */ 324 */
332 newlen = XFS_IALLOC_INODES(args.mp); 325 newlen = args.mp->m_ialloc_inos;
333 if (args.mp->m_maxicount && 326 if (args.mp->m_maxicount &&
334 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 327 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
335 return XFS_ERROR(ENOSPC); 328 return XFS_ERROR(ENOSPC);
336 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); 329 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
337 /* 330 /*
338 * First try to allocate inodes contiguous with the last-allocated 331 * First try to allocate inodes contiguous with the last-allocated
339 * chunk of inodes. If the filesystem is striped, this will fill 332 * chunk of inodes. If the filesystem is striped, this will fill
@@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc(
343 newino = be32_to_cpu(agi->agi_newino); 336 newino = be32_to_cpu(agi->agi_newino);
344 agno = be32_to_cpu(agi->agi_seqno); 337 agno = be32_to_cpu(agi->agi_seqno);
345 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 338 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
346 XFS_IALLOC_BLOCKS(args.mp); 339 args.mp->m_ialloc_blks;
347 if (likely(newino != NULLAGINO && 340 if (likely(newino != NULLAGINO &&
348 (args.agbno < be32_to_cpu(agi->agi_length)))) { 341 (args.agbno < be32_to_cpu(agi->agi_length)))) {
349 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 342 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -585,7 +578,7 @@ xfs_ialloc_ag_select(
585 * Is there enough free space for the file plus a block of 578 * Is there enough free space for the file plus a block of
586 * inodes (if we need to allocate some)? 579 * inodes (if we need to allocate some)?
587 */ 580 */
588 ineed = XFS_IALLOC_BLOCKS(mp); 581 ineed = mp->m_ialloc_blks;
589 longest = pag->pagf_longest; 582 longest = pag->pagf_longest;
590 if (!longest) 583 if (!longest)
591 longest = pag->pagf_flcount > 0; 584 longest = pag->pagf_flcount > 0;
@@ -999,7 +992,7 @@ xfs_dialloc(
999 * inode. 992 * inode.
1000 */ 993 */
1001 if (mp->m_maxicount && 994 if (mp->m_maxicount &&
1002 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { 995 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
1003 noroom = 1; 996 noroom = 1;
1004 okalloc = 0; 997 okalloc = 0;
1005 } 998 }
@@ -1202,7 +1195,7 @@ xfs_difree(
1202 * When an inode cluster is free, it becomes eligible for removal 1195 * When an inode cluster is free, it becomes eligible for removal
1203 */ 1196 */
1204 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1197 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1205 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { 1198 (rec.ir_freecount == mp->m_ialloc_inos)) {
1206 1199
1207 *delete = 1; 1200 *delete = 1;
1208 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1201 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
@@ -1212,7 +1205,7 @@ xfs_difree(
1212 * AGI and Superblock inode counts, and mark the disk space 1205 * AGI and Superblock inode counts, and mark the disk space
1213 * to be freed when the transaction is committed. 1206 * to be freed when the transaction is committed.
1214 */ 1207 */
1215 ilen = XFS_IALLOC_INODES(mp); 1208 ilen = mp->m_ialloc_inos;
1216 be32_add_cpu(&agi->agi_count, -ilen); 1209 be32_add_cpu(&agi->agi_count, -ilen);
1217 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1210 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1218 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1211 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1228,9 +1221,9 @@ xfs_difree(
1228 goto error0; 1221 goto error0;
1229 } 1222 }
1230 1223
1231 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, 1224 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1232 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), 1225 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1233 XFS_IALLOC_BLOCKS(mp), flist, mp); 1226 mp->m_ialloc_blks, flist, mp);
1234 } else { 1227 } else {
1235 *delete = 0; 1228 *delete = 0;
1236 1229
@@ -1311,7 +1304,7 @@ xfs_imap_lookup(
1311 1304
1312 /* check that the returned record contains the required inode */ 1305 /* check that the returned record contains the required inode */
1313 if (rec.ir_startino > agino || 1306 if (rec.ir_startino > agino ||
1314 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) 1307 rec.ir_startino + mp->m_ialloc_inos <= agino)
1315 return EINVAL; 1308 return EINVAL;
1316 1309
1317 /* for untrusted inodes check it is allocated first */ 1310 /* for untrusted inodes check it is allocated first */
@@ -1384,7 +1377,7 @@ xfs_imap(
1384 return XFS_ERROR(EINVAL); 1377 return XFS_ERROR(EINVAL);
1385 } 1378 }
1386 1379
1387 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1380 blks_per_cluster = xfs_icluster_size_fsb(mp);
1388 1381
1389 /* 1382 /*
1390 * For bulkstat and handle lookups, we have an untrusted inode number 1383 * For bulkstat and handle lookups, we have an untrusted inode number
@@ -1405,7 +1398,7 @@ xfs_imap(
1405 * If the inode cluster size is the same as the blocksize or 1398 * If the inode cluster size is the same as the blocksize or
1406 * smaller we get to the buffer by simple arithmetic. 1399 * smaller we get to the buffer by simple arithmetic.
1407 */ 1400 */
1408 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { 1401 if (blks_per_cluster == 1) {
1409 offset = XFS_INO_TO_OFFSET(mp, ino); 1402 offset = XFS_INO_TO_OFFSET(mp, ino);
1410 ASSERT(offset < mp->m_sb.sb_inopblock); 1403 ASSERT(offset < mp->m_sb.sb_inopblock);
1411 1404
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index a8f76a5ff418..812365d17e67 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -25,17 +25,18 @@ struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27 27
28/* 28/* Move inodes in clusters of this size */
29 * Allocation parameters for inode allocation.
30 */
31#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
32#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
33
34/*
35 * Move inodes in clusters of this size.
36 */
37#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
38#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size 30
31/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int
33xfs_icluster_size_fsb(
34 struct xfs_mount *mp)
35{
36 if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
37 return 1;
38 return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
39}
39 40
40/* 41/*
41 * Make an inode pointer out of the buffer/offset. 42 * Make an inode pointer out of the buffer/offset.
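
The new xfs_icluster_size_fsb() helper replaces the open-coded cluster-size
checks elsewhere in this series. As a minimal standalone sketch of the same
arithmetic (the 4096-byte block size and 8192-byte cluster size are
illustrative assumptions, not values taken from this patch):

#include <stdio.h>

int main(void)
{
	int sb_blocksize = 4096;		/* assumed fs block size */
	int sb_blocklog = 12;			/* log2(sb_blocksize) */
	int m_inode_cluster_size = 8192;	/* XFS_INODE_BIG_CLUSTER_SIZE */
	int blks_per_cluster;

	/* same decision as xfs_icluster_size_fsb() */
	if (sb_blocksize >= m_inode_cluster_size)
		blks_per_cluster = 1;
	else
		blks_per_cluster = m_inode_cluster_size >> sb_blocklog;

	printf("blocks per inode cluster: %d\n", blks_per_cluster);	/* 2 */
	return 0;
}
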
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d2eaccfa73f4..7e4549233251 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -28,6 +28,7 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_icreate_item.h" 30#include "xfs_icreate_item.h"
31#include "xfs_log.h"
31 32
32kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ 33kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
33 34
@@ -58,13 +59,14 @@ xfs_icreate_item_size(
58STATIC void 59STATIC void
59xfs_icreate_item_format( 60xfs_icreate_item_format(
60 struct xfs_log_item *lip, 61 struct xfs_log_item *lip,
61 struct xfs_log_iovec *log_vector) 62 struct xfs_log_vec *lv)
62{ 63{
63 struct xfs_icreate_item *icp = ICR_ITEM(lip); 64 struct xfs_icreate_item *icp = ICR_ITEM(lip);
65 struct xfs_log_iovec *vecp = NULL;
64 66
65 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; 67 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
66 log_vector->i_len = sizeof(struct xfs_icreate_log); 68 &icp->ic_format,
67 log_vector->i_type = XLOG_REG_TYPE_ICREATE; 69 sizeof(struct xfs_icreate_log));
68} 70}
69 71
70 72
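
With ->iop_format() now receiving a struct xfs_log_vec, a single-region item
reduces to one xlog_copy_iovec() call, as the icreate item above shows. A
hedged sketch of the general shape, where my_item, MY_ITEM and my_format are
hypothetical names used only for illustration:

/*
 * Sketch of the new single-region ->iop_format() pattern, using the
 * xlog_copy_iovec() helper added to xfs_log.h later in this series.
 */
STATIC void
my_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct my_item		*mip = MY_ITEM(lip);	/* hypothetical item */
	struct xfs_log_iovec	*vecp = NULL;	/* cursor, advanced by helper */

	/* copies the format structure into lv's buffer and records its type */
	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
			&mip->my_format, sizeof(mip->my_format));
}
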
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 001aa893ed59..3a137e9f9a7d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -77,48 +77,44 @@ xfs_get_extsz_hint(
77} 77}
78 78
79/* 79/*
80 * This is a wrapper routine around the xfs_ilock() routine used to centralize 80 * These two are wrapper routines around the xfs_ilock() routine used to
81 * some grungy code. It is used in places that wish to lock the inode solely 81 * centralize some grungy code. They are used in places that wish to lock the
82 * for reading the extents. The reason these places can't just call 82 * inode solely for reading the extents. The reason these places can't just
83 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the 83 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
84 * extents from disk for a file in b-tree format. If the inode is in b-tree 84 * bringing in of the extents from disk for a file in b-tree format. If the
85 * format, then we need to lock the inode exclusively until the extents are read 85 * inode is in b-tree format, then we need to lock the inode exclusively until
86 * in. Locking it exclusively all the time would limit our parallelism 86 * the extents are read in. Locking it exclusively all the time would limit
87 * unnecessarily, though. What we do instead is check to see if the extents 87 * our parallelism unnecessarily, though. What we do instead is check to see
88 * have been read in yet, and only lock the inode exclusively if they have not. 88 * if the extents have been read in yet, and only lock the inode exclusively
89 * if they have not.
89 * 90 *
90 * The function returns a value which should be given to the corresponding 91 * The functions return a value which should be given to the corresponding
91 * xfs_iunlock_map_shared(). This value is the mode in which the lock was 92 * xfs_iunlock() call.
92 * actually taken.
93 */ 93 */
94uint 94uint
95xfs_ilock_map_shared( 95xfs_ilock_data_map_shared(
96 xfs_inode_t *ip) 96 struct xfs_inode *ip)
97{ 97{
98 uint lock_mode; 98 uint lock_mode = XFS_ILOCK_SHARED;
99 99
100 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && 100 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
101 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { 101 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
102 lock_mode = XFS_ILOCK_EXCL; 102 lock_mode = XFS_ILOCK_EXCL;
103 } else {
104 lock_mode = XFS_ILOCK_SHARED;
105 }
106
107 xfs_ilock(ip, lock_mode); 103 xfs_ilock(ip, lock_mode);
108
109 return lock_mode; 104 return lock_mode;
110} 105}
111 106
112/* 107uint
113 * This is simply the unlock routine to go with xfs_ilock_map_shared(). 108xfs_ilock_attr_map_shared(
114 * All it does is call xfs_iunlock() with the given lock_mode. 109 struct xfs_inode *ip)
115 */
116void
117xfs_iunlock_map_shared(
118 xfs_inode_t *ip,
119 unsigned int lock_mode)
120{ 110{
121 xfs_iunlock(ip, lock_mode); 111 uint lock_mode = XFS_ILOCK_SHARED;
112
113 if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
114 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
115 lock_mode = XFS_ILOCK_EXCL;
116 xfs_ilock(ip, lock_mode);
117 return lock_mode;
122} 118}
123 119
124/* 120/*
@@ -588,9 +584,9 @@ xfs_lookup(
588 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 584 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
589 return XFS_ERROR(EIO); 585 return XFS_ERROR(EIO);
590 586
591 lock_mode = xfs_ilock_map_shared(dp); 587 lock_mode = xfs_ilock_data_map_shared(dp);
592 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 588 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
593 xfs_iunlock_map_shared(dp, lock_mode); 589 xfs_iunlock(dp, lock_mode);
594 590
595 if (error) 591 if (error)
596 goto out; 592 goto out;
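
The same pairing applies to the new attribute-fork helper, which has no
caller visible in this hunk. A minimal sketch, assuming ip is a struct
xfs_inode in scope:

	uint	lock_mode;

	lock_mode = xfs_ilock_attr_map_shared(ip);	/* EXCL only if attr
							 * extents are unread */
	/* ... read attribute fork extents ... */
	xfs_iunlock(ip, lock_mode);	/* plain unlock replaces the old
					 * xfs_iunlock_map_shared() */
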
@@ -2141,8 +2137,8 @@ xfs_ifree_cluster(
2141{ 2137{
2142 xfs_mount_t *mp = free_ip->i_mount; 2138 xfs_mount_t *mp = free_ip->i_mount;
2143 int blks_per_cluster; 2139 int blks_per_cluster;
2140 int inodes_per_cluster;
2144 int nbufs; 2141 int nbufs;
2145 int ninodes;
2146 int i, j; 2142 int i, j;
2147 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2148 xfs_buf_t *bp; 2144 xfs_buf_t *bp;
@@ -2152,18 +2148,11 @@ xfs_ifree_cluster(
2152 struct xfs_perag *pag; 2148 struct xfs_perag *pag;
2153 2149
2154 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2150 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2155 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2151 blks_per_cluster = xfs_icluster_size_fsb(mp);
2156 blks_per_cluster = 1; 2152 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2157 ninodes = mp->m_sb.sb_inopblock; 2153 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2158 nbufs = XFS_IALLOC_BLOCKS(mp);
2159 } else {
2160 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2161 mp->m_sb.sb_blocksize;
2162 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2163 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2164 }
2165 2154
2166 for (j = 0; j < nbufs; j++, inum += ninodes) { 2155 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2167 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2156 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2168 XFS_INO_TO_AGBNO(mp, inum)); 2157 XFS_INO_TO_AGBNO(mp, inum));
2169 2158
@@ -2225,7 +2214,7 @@ xfs_ifree_cluster(
2225 * transaction stale above, which means there is no point in 2214 * transaction stale above, which means there is no point in
2226 * even trying to lock them. 2215 * even trying to lock them.
2227 */ 2216 */
2228 for (i = 0; i < ninodes; i++) { 2217 for (i = 0; i < inodes_per_cluster; i++) {
2229retry: 2218retry:
2230 rcu_read_lock(); 2219 rcu_read_lock();
2231 ip = radix_tree_lookup(&pag->pag_ici_root, 2220 ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2906,13 +2895,13 @@ xfs_iflush_cluster(
2906 2895
2907 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2896 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2908 2897
2909 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2898 inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
2910 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2899 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2911 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2900 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2912 if (!ilist) 2901 if (!ilist)
2913 goto out_put; 2902 goto out_put;
2914 2903
2915 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2904 mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
2916 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2905 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2917 rcu_read_lock(); 2906 rcu_read_lock();
2918 /* really need a gang lookup range call here */ 2907 /* really need a gang lookup range call here */
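
To see what the cluster arithmetic in xfs_ifree_cluster() and
xfs_iflush_cluster() works out to, a standalone sketch (all values assumed
for illustration: an 8192-byte cluster and 256-byte inodes, so
sb_inodelog = 8):

#include <stdio.h>

int main(void)
{
	unsigned int m_inode_cluster_size = 8192;	/* assumed */
	unsigned int sb_inodelog = 8;			/* assumed: 256B inodes */
	unsigned int inodes_per_cluster, mask;

	inodes_per_cluster = m_inode_cluster_size >> sb_inodelog;	/* 32 */
	mask = ~(inodes_per_cluster - 1);	/* rounds an agino down to the
						 * start of its cluster */

	printf("inodes per cluster: %u, mask: 0x%x\n",
	       inodes_per_cluster, mask);	/* 32, 0xffffffe0 */
	return 0;
}
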
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9e6efccbae04..65e2350f449c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
337void xfs_iunlock(xfs_inode_t *, uint); 337void xfs_iunlock(xfs_inode_t *, uint);
338void xfs_ilock_demote(xfs_inode_t *, uint); 338void xfs_ilock_demote(xfs_inode_t *, uint);
339int xfs_isilocked(xfs_inode_t *, uint); 339int xfs_isilocked(xfs_inode_t *, uint);
340uint xfs_ilock_map_shared(xfs_inode_t *); 340uint xfs_ilock_data_map_shared(struct xfs_inode *);
341void xfs_iunlock_map_shared(xfs_inode_t *, uint); 341uint xfs_ilock_attr_map_shared(struct xfs_inode *);
342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
343 xfs_nlink_t, xfs_dev_t, prid_t, int, 343 xfs_nlink_t, xfs_dev_t, prid_t, int,
344 struct xfs_buf **, xfs_inode_t **); 344 struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index cfee14a83cfe..73514c0486b7 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -431,6 +431,8 @@ xfs_iread_extents(
431 xfs_ifork_t *ifp; 431 xfs_ifork_t *ifp;
432 xfs_extnum_t nextents; 432 xfs_extnum_t nextents;
433 433
434 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
435
434 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
435 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
436 ip->i_mount); 438 ip->i_mount);
@@ -721,15 +723,16 @@ xfs_idestroy_fork(
721} 723}
722 724
723/* 725/*
724 * xfs_iextents_copy() 726 * Convert in-core extents to on-disk form
725 * 727 *
726 * This is called to copy the REAL extents (as opposed to the delayed 728 * For either the data or attr fork in extent format, we need to endian convert
727 * allocation extents) from the inode into the given buffer. It 729 * the in-core extents as we place them into the on-disk inode.
728 * returns the number of bytes copied into the buffer.
729 * 730 *
730 * If there are no delayed allocation extents, then we can just 731 * In the case of the data fork, the in-core and on-disk fork sizes can be
731 * memcpy() the extents into the buffer. Otherwise, we need to 732 * different due to delayed allocation extents. We only copy on-disk extents
732 * examine each extent in turn and skip those which are delayed. 733 * here, so callers must always use the physical fork size to determine the
734 * size of the buffer passed to this routine. We will return the size actually
735 * used.
733 */ 736 */
734int 737int
735xfs_iextents_copy( 738xfs_iextents_copy(
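
A sketch of the caller contract the new comment describes, assuming ip and
whichfork are in scope: allocate by the physical fork size, then trust only
the byte count the routine returns.

	struct xfs_bmbt_rec	*buf;
	int			copied;

	buf = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
	copied = xfs_iextents_copy(ip, buf, whichfork);
	/* only "copied" bytes of buf hold valid on-disk (big-endian) extents */
	kmem_free(buf);
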
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c0d391f9a6e..686889b4a1e5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -30,6 +30,7 @@
30#include "xfs_trace.h" 30#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 31#include "xfs_trans_priv.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_log.h"
33 34
34 35
35kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 36kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
39 return container_of(lip, struct xfs_inode_log_item, ili_item); 40 return container_of(lip, struct xfs_inode_log_item, ili_item);
40} 41}
41 42
42
43/*
44 * This returns the number of iovecs needed to log the given inode item.
45 *
46 * We need one iovec for the inode log format structure, one for the
47 * inode core, and possibly one for the inode data/extents/b-tree root
48 * and one for the inode attribute data/extents/b-tree root.
49 */
50STATIC void 43STATIC void
51xfs_inode_item_size( 44xfs_inode_item_data_fork_size(
52 struct xfs_log_item *lip, 45 struct xfs_inode_log_item *iip,
53 int *nvecs, 46 int *nvecs,
54 int *nbytes) 47 int *nbytes)
55{ 48{
56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
57 struct xfs_inode *ip = iip->ili_inode; 49 struct xfs_inode *ip = iip->ili_inode;
58 50
59 *nvecs += 2;
60 *nbytes += sizeof(struct xfs_inode_log_format) +
61 xfs_icdinode_size(ip->i_d.di_version);
62
63 switch (ip->i_d.di_format) { 51 switch (ip->i_d.di_format) {
64 case XFS_DINODE_FMT_EXTENTS: 52 case XFS_DINODE_FMT_EXTENTS:
65 if ((iip->ili_fields & XFS_ILOG_DEXT) && 53 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
@@ -70,7 +58,6 @@ xfs_inode_item_size(
70 *nvecs += 1; 58 *nvecs += 1;
71 } 59 }
72 break; 60 break;
73
74 case XFS_DINODE_FMT_BTREE: 61 case XFS_DINODE_FMT_BTREE:
75 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 62 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
76 ip->i_df.if_broot_bytes > 0) { 63 ip->i_df.if_broot_bytes > 0) {
@@ -78,7 +65,6 @@ xfs_inode_item_size(
78 *nvecs += 1; 65 *nvecs += 1;
79 } 66 }
80 break; 67 break;
81
82 case XFS_DINODE_FMT_LOCAL: 68 case XFS_DINODE_FMT_LOCAL:
83 if ((iip->ili_fields & XFS_ILOG_DDATA) && 69 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
84 ip->i_df.if_bytes > 0) { 70 ip->i_df.if_bytes > 0) {
@@ -90,19 +76,20 @@ xfs_inode_item_size(
90 case XFS_DINODE_FMT_DEV: 76 case XFS_DINODE_FMT_DEV:
91 case XFS_DINODE_FMT_UUID: 77 case XFS_DINODE_FMT_UUID:
92 break; 78 break;
93
94 default: 79 default:
95 ASSERT(0); 80 ASSERT(0);
96 break; 81 break;
97 } 82 }
83}
98 84
99 if (!XFS_IFORK_Q(ip)) 85STATIC void
100 return; 86xfs_inode_item_attr_fork_size(
101 87 struct xfs_inode_log_item *iip,
88 int *nvecs,
89 int *nbytes)
90{
91 struct xfs_inode *ip = iip->ili_inode;
102 92
103 /*
104 * Log any necessary attribute data.
105 */
106 switch (ip->i_d.di_aformat) { 93 switch (ip->i_d.di_aformat) {
107 case XFS_DINODE_FMT_EXTENTS: 94 case XFS_DINODE_FMT_EXTENTS:
108 if ((iip->ili_fields & XFS_ILOG_AEXT) && 95 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
@@ -113,7 +100,6 @@ xfs_inode_item_size(
113 *nvecs += 1; 100 *nvecs += 1;
114 } 101 }
115 break; 102 break;
116
117 case XFS_DINODE_FMT_BTREE: 103 case XFS_DINODE_FMT_BTREE:
118 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 104 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
119 ip->i_afp->if_broot_bytes > 0) { 105 ip->i_afp->if_broot_bytes > 0) {
@@ -121,7 +107,6 @@ xfs_inode_item_size(
121 *nvecs += 1; 107 *nvecs += 1;
122 } 108 }
123 break; 109 break;
124
125 case XFS_DINODE_FMT_LOCAL: 110 case XFS_DINODE_FMT_LOCAL:
126 if ((iip->ili_fields & XFS_ILOG_ADATA) && 111 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
127 ip->i_afp->if_bytes > 0) { 112 ip->i_afp->if_bytes > 0) {
@@ -129,7 +114,6 @@ xfs_inode_item_size(
129 *nvecs += 1; 114 *nvecs += 1;
130 } 115 }
131 break; 116 break;
132
133 default: 117 default:
134 ASSERT(0); 118 ASSERT(0);
135 break; 119 break;
@@ -137,98 +121,67 @@ xfs_inode_item_size(
137} 121}
138 122
139/* 123/*
140 * xfs_inode_item_format_extents - convert in-core extents to on-disk form 124 * This returns the number of iovecs needed to log the given inode item.
141 *
142 * For either the data or attr fork in extent format, we need to endian convert
143 * the in-core extent as we place them into the on-disk inode. In this case, we
144 * need to do this conversion before we write the extents into the log. Because
145 * we don't have the disk inode to write into here, we allocate a buffer and
146 * format the extents into it via xfs_iextents_copy(). We free the buffer in
147 * the unlock routine after the copy for the log has been made.
148 * 125 *
149 * In the case of the data fork, the in-core and on-disk fork sizes can be 126 * We need one iovec for the inode log format structure, one for the
150 * different due to delayed allocation extents. We only log on-disk extents 127 * inode core, and possibly one for the inode data/extents/b-tree root
151 * here, so always use the physical fork size to determine the size of the 128 * and one for the inode attribute data/extents/b-tree root.
152 * buffer we need to allocate.
153 */ 129 */
154STATIC void 130STATIC void
155xfs_inode_item_format_extents( 131xfs_inode_item_size(
156 struct xfs_inode *ip, 132 struct xfs_log_item *lip,
157 struct xfs_log_iovec *vecp, 133 int *nvecs,
158 int whichfork, 134 int *nbytes)
159 int type)
160{ 135{
161 xfs_bmbt_rec_t *ext_buffer; 136 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
137 struct xfs_inode *ip = iip->ili_inode;
162 138
163 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); 139 *nvecs += 2;
164 if (whichfork == XFS_DATA_FORK) 140 *nbytes += sizeof(struct xfs_inode_log_format) +
165 ip->i_itemp->ili_extents_buf = ext_buffer; 141 xfs_icdinode_size(ip->i_d.di_version);
166 else
167 ip->i_itemp->ili_aextents_buf = ext_buffer;
168 142
169 vecp->i_addr = ext_buffer; 143 xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
170 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); 144 if (XFS_IFORK_Q(ip))
171 vecp->i_type = type; 145 xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
172} 146}
173 147
174/* 148/*
175 * This is called to fill in the vector of log iovecs for the 149 * If this is a v1 format inode, then we need to log it as such. This means
176 * given inode log item. It fills the first item with an inode 150 * that we have to copy the link count from the new field to the old. We
177 * log format structure, the second with the on-disk inode structure, 151 * don't have to worry about the new fields, because nothing trusts them as
178 * and a possible third and/or fourth with the inode data/extents/b-tree 152 * long as the old inode version number is there.
179 * root and inode attributes data/extents/b-tree root.
180 */ 153 */
181STATIC void 154STATIC void
182xfs_inode_item_format( 155xfs_inode_item_format_v1_inode(
183 struct xfs_log_item *lip, 156 struct xfs_inode *ip)
184 struct xfs_log_iovec *vecp) 157{
158 if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) {
159 /*
160 * Convert it back.
161 */
162 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
163 ip->i_d.di_onlink = ip->i_d.di_nlink;
164 } else {
165 /*
166 * The superblock version has already been bumped,
167 * so just make the conversion to the new inode
168 * format permanent.
169 */
170 ip->i_d.di_version = 2;
171 ip->i_d.di_onlink = 0;
172 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
173 }
174}
175
176STATIC void
177xfs_inode_item_format_data_fork(
178 struct xfs_inode_log_item *iip,
179 struct xfs_inode_log_format *ilf,
180 struct xfs_log_vec *lv,
181 struct xfs_log_iovec **vecp)
185{ 182{
186 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
187 struct xfs_inode *ip = iip->ili_inode; 183 struct xfs_inode *ip = iip->ili_inode;
188 uint nvecs;
189 size_t data_bytes; 184 size_t data_bytes;
190 xfs_mount_t *mp;
191
192 vecp->i_addr = &iip->ili_format;
193 vecp->i_len = sizeof(xfs_inode_log_format_t);
194 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
195 vecp++;
196 nvecs = 1;
197
198 vecp->i_addr = &ip->i_d;
199 vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
200 vecp->i_type = XLOG_REG_TYPE_ICORE;
201 vecp++;
202 nvecs++;
203
204 /*
205 * If this is really an old format inode, then we need to
206 * log it as such. This means that we have to copy the link
207 * count from the new field to the old. We don't have to worry
208 * about the new fields, because nothing trusts them as long as
209 * the old inode version number is there. If the superblock already
210 * has a new version number, then we don't bother converting back.
211 */
212 mp = ip->i_mount;
213 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
214 if (ip->i_d.di_version == 1) {
215 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
216 /*
217 * Convert it back.
218 */
219 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
220 ip->i_d.di_onlink = ip->i_d.di_nlink;
221 } else {
222 /*
223 * The superblock version has already been bumped,
224 * so just make the conversion to the new inode
225 * format permanent.
226 */
227 ip->i_d.di_version = 2;
228 ip->i_d.di_onlink = 0;
229 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
230 }
231 }
232 185
233 switch (ip->i_d.di_format) { 186 switch (ip->i_d.di_format) {
234 case XFS_DINODE_FMT_EXTENTS: 187 case XFS_DINODE_FMT_EXTENTS:
@@ -239,36 +192,23 @@ xfs_inode_item_format(
239 if ((iip->ili_fields & XFS_ILOG_DEXT) && 192 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
240 ip->i_d.di_nextents > 0 && 193 ip->i_d.di_nextents > 0 &&
241 ip->i_df.if_bytes > 0) { 194 ip->i_df.if_bytes > 0) {
195 struct xfs_bmbt_rec *p;
196
242 ASSERT(ip->i_df.if_u1.if_extents != NULL); 197 ASSERT(ip->i_df.if_u1.if_extents != NULL);
243 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); 198 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
244 ASSERT(iip->ili_extents_buf == NULL); 199
245 200 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
246#ifdef XFS_NATIVE_HOST 201 data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
247 if (ip->i_d.di_nextents == ip->i_df.if_bytes / 202 xlog_finish_iovec(lv, *vecp, data_bytes);
248 (uint)sizeof(xfs_bmbt_rec_t)) { 203
249 /* 204 ASSERT(data_bytes <= ip->i_df.if_bytes);
250 * There are no delayed allocation 205
251 * extents, so just point to the 206 ilf->ilf_dsize = data_bytes;
252 * real extents array. 207 ilf->ilf_size++;
253 */
254 vecp->i_addr = ip->i_df.if_u1.if_extents;
255 vecp->i_len = ip->i_df.if_bytes;
256 vecp->i_type = XLOG_REG_TYPE_IEXT;
257 } else
258#endif
259 {
260 xfs_inode_item_format_extents(ip, vecp,
261 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
262 }
263 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
264 iip->ili_format.ilf_dsize = vecp->i_len;
265 vecp++;
266 nvecs++;
267 } else { 208 } else {
268 iip->ili_fields &= ~XFS_ILOG_DEXT; 209 iip->ili_fields &= ~XFS_ILOG_DEXT;
269 } 210 }
270 break; 211 break;
271
272 case XFS_DINODE_FMT_BTREE: 212 case XFS_DINODE_FMT_BTREE:
273 iip->ili_fields &= 213 iip->ili_fields &=
274 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 214 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -277,80 +217,70 @@ xfs_inode_item_format(
277 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 217 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
278 ip->i_df.if_broot_bytes > 0) { 218 ip->i_df.if_broot_bytes > 0) {
279 ASSERT(ip->i_df.if_broot != NULL); 219 ASSERT(ip->i_df.if_broot != NULL);
280 vecp->i_addr = ip->i_df.if_broot; 220 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
281 vecp->i_len = ip->i_df.if_broot_bytes; 221 ip->i_df.if_broot,
282 vecp->i_type = XLOG_REG_TYPE_IBROOT; 222 ip->i_df.if_broot_bytes);
283 vecp++; 223 ilf->ilf_dsize = ip->i_df.if_broot_bytes;
284 nvecs++; 224 ilf->ilf_size++;
285 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
286 } else { 225 } else {
287 ASSERT(!(iip->ili_fields & 226 ASSERT(!(iip->ili_fields &
288 XFS_ILOG_DBROOT)); 227 XFS_ILOG_DBROOT));
289 iip->ili_fields &= ~XFS_ILOG_DBROOT; 228 iip->ili_fields &= ~XFS_ILOG_DBROOT;
290 } 229 }
291 break; 230 break;
292
293 case XFS_DINODE_FMT_LOCAL: 231 case XFS_DINODE_FMT_LOCAL:
294 iip->ili_fields &= 232 iip->ili_fields &=
295 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 233 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
296 XFS_ILOG_DEV | XFS_ILOG_UUID); 234 XFS_ILOG_DEV | XFS_ILOG_UUID);
297 if ((iip->ili_fields & XFS_ILOG_DDATA) && 235 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
298 ip->i_df.if_bytes > 0) { 236 ip->i_df.if_bytes > 0) {
299 ASSERT(ip->i_df.if_u1.if_data != NULL);
300 ASSERT(ip->i_d.di_size > 0);
301
302 vecp->i_addr = ip->i_df.if_u1.if_data;
303 /* 237 /*
304 * Round i_bytes up to a word boundary. 238 * Round i_bytes up to a word boundary.
305 * The underlying memory is guaranteed to 239 * The underlying memory is guaranteed to
306 * be there by xfs_idata_realloc(). 240 * be there by xfs_idata_realloc().
307 */ 241 */
308 data_bytes = roundup(ip->i_df.if_bytes, 4); 242 data_bytes = roundup(ip->i_df.if_bytes, 4);
309 ASSERT((ip->i_df.if_real_bytes == 0) || 243 ASSERT(ip->i_df.if_real_bytes == 0 ||
310 (ip->i_df.if_real_bytes == data_bytes)); 244 ip->i_df.if_real_bytes == data_bytes);
311 vecp->i_len = (int)data_bytes; 245 ASSERT(ip->i_df.if_u1.if_data != NULL);
312 vecp->i_type = XLOG_REG_TYPE_ILOCAL; 246 ASSERT(ip->i_d.di_size > 0);
313 vecp++; 247 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
314 nvecs++; 248 ip->i_df.if_u1.if_data, data_bytes);
315 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 249 ilf->ilf_dsize = (unsigned)data_bytes;
250 ilf->ilf_size++;
316 } else { 251 } else {
317 iip->ili_fields &= ~XFS_ILOG_DDATA; 252 iip->ili_fields &= ~XFS_ILOG_DDATA;
318 } 253 }
319 break; 254 break;
320
321 case XFS_DINODE_FMT_DEV: 255 case XFS_DINODE_FMT_DEV:
322 iip->ili_fields &= 256 iip->ili_fields &=
323 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 257 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
324 XFS_ILOG_DEXT | XFS_ILOG_UUID); 258 XFS_ILOG_DEXT | XFS_ILOG_UUID);
325 if (iip->ili_fields & XFS_ILOG_DEV) { 259 if (iip->ili_fields & XFS_ILOG_DEV)
326 iip->ili_format.ilf_u.ilfu_rdev = 260 ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
327 ip->i_df.if_u2.if_rdev;
328 }
329 break; 261 break;
330
331 case XFS_DINODE_FMT_UUID: 262 case XFS_DINODE_FMT_UUID:
332 iip->ili_fields &= 263 iip->ili_fields &=
333 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 264 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
334 XFS_ILOG_DEXT | XFS_ILOG_DEV); 265 XFS_ILOG_DEXT | XFS_ILOG_DEV);
335 if (iip->ili_fields & XFS_ILOG_UUID) { 266 if (iip->ili_fields & XFS_ILOG_UUID)
336 iip->ili_format.ilf_u.ilfu_uuid = 267 ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
337 ip->i_df.if_u2.if_uuid;
338 }
339 break; 268 break;
340
341 default: 269 default:
342 ASSERT(0); 270 ASSERT(0);
343 break; 271 break;
344 } 272 }
273}
345 274
346 /* 275STATIC void
347 * If there are no attributes associated with the file, then we're done. 276xfs_inode_item_format_attr_fork(
348 */ 277 struct xfs_inode_log_item *iip,
349 if (!XFS_IFORK_Q(ip)) { 278 struct xfs_inode_log_format *ilf,
350 iip->ili_fields &= 279 struct xfs_log_vec *lv,
351 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 280 struct xfs_log_iovec **vecp)
352 goto out; 281{
353 } 282 struct xfs_inode *ip = iip->ili_inode;
283 size_t data_bytes;
354 284
355 switch (ip->i_d.di_aformat) { 285 switch (ip->i_d.di_aformat) {
356 case XFS_DINODE_FMT_EXTENTS: 286 case XFS_DINODE_FMT_EXTENTS:
@@ -360,30 +290,22 @@ xfs_inode_item_format(
360 if ((iip->ili_fields & XFS_ILOG_AEXT) && 290 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
361 ip->i_d.di_anextents > 0 && 291 ip->i_d.di_anextents > 0 &&
362 ip->i_afp->if_bytes > 0) { 292 ip->i_afp->if_bytes > 0) {
293 struct xfs_bmbt_rec *p;
294
363 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == 295 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
364 ip->i_d.di_anextents); 296 ip->i_d.di_anextents);
365 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 297 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
366#ifdef XFS_NATIVE_HOST 298
367 /* 299 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
368 * There are not delayed allocation extents 300 data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
369 * for attributes, so just point at the array. 301 xlog_finish_iovec(lv, *vecp, data_bytes);
370 */ 302
371 vecp->i_addr = ip->i_afp->if_u1.if_extents; 303 ilf->ilf_asize = data_bytes;
372 vecp->i_len = ip->i_afp->if_bytes; 304 ilf->ilf_size++;
373 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
374#else
375 ASSERT(iip->ili_aextents_buf == NULL);
376 xfs_inode_item_format_extents(ip, vecp,
377 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
378#endif
379 iip->ili_format.ilf_asize = vecp->i_len;
380 vecp++;
381 nvecs++;
382 } else { 305 } else {
383 iip->ili_fields &= ~XFS_ILOG_AEXT; 306 iip->ili_fields &= ~XFS_ILOG_AEXT;
384 } 307 }
385 break; 308 break;
386
387 case XFS_DINODE_FMT_BTREE: 309 case XFS_DINODE_FMT_BTREE:
388 iip->ili_fields &= 310 iip->ili_fields &=
389 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 311 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -392,61 +314,89 @@ xfs_inode_item_format(
392 ip->i_afp->if_broot_bytes > 0) { 314 ip->i_afp->if_broot_bytes > 0) {
393 ASSERT(ip->i_afp->if_broot != NULL); 315 ASSERT(ip->i_afp->if_broot != NULL);
394 316
395 vecp->i_addr = ip->i_afp->if_broot; 317 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
396 vecp->i_len = ip->i_afp->if_broot_bytes; 318 ip->i_afp->if_broot,
397 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 319 ip->i_afp->if_broot_bytes);
398 vecp++; 320 ilf->ilf_asize = ip->i_afp->if_broot_bytes;
399 nvecs++; 321 ilf->ilf_size++;
400 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
401 } else { 322 } else {
402 iip->ili_fields &= ~XFS_ILOG_ABROOT; 323 iip->ili_fields &= ~XFS_ILOG_ABROOT;
403 } 324 }
404 break; 325 break;
405
406 case XFS_DINODE_FMT_LOCAL: 326 case XFS_DINODE_FMT_LOCAL:
407 iip->ili_fields &= 327 iip->ili_fields &=
408 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 328 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
409 329
410 if ((iip->ili_fields & XFS_ILOG_ADATA) && 330 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
411 ip->i_afp->if_bytes > 0) { 331 ip->i_afp->if_bytes > 0) {
412 ASSERT(ip->i_afp->if_u1.if_data != NULL);
413
414 vecp->i_addr = ip->i_afp->if_u1.if_data;
415 /* 332 /*
416 * Round i_bytes up to a word boundary. 333 * Round i_bytes up to a word boundary.
417 * The underlying memory is guaranteed to 334 * The underlying memory is guaranteed to
418 * be there by xfs_idata_realloc(). 335 * be there by xfs_idata_realloc().
419 */ 336 */
420 data_bytes = roundup(ip->i_afp->if_bytes, 4); 337 data_bytes = roundup(ip->i_afp->if_bytes, 4);
421 ASSERT((ip->i_afp->if_real_bytes == 0) || 338 ASSERT(ip->i_afp->if_real_bytes == 0 ||
422 (ip->i_afp->if_real_bytes == data_bytes)); 339 ip->i_afp->if_real_bytes == data_bytes);
423 vecp->i_len = (int)data_bytes; 340 ASSERT(ip->i_afp->if_u1.if_data != NULL);
424 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; 341 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
425 vecp++; 342 ip->i_afp->if_u1.if_data,
426 nvecs++; 343 data_bytes);
427 iip->ili_format.ilf_asize = (unsigned)data_bytes; 344 ilf->ilf_asize = (unsigned)data_bytes;
345 ilf->ilf_size++;
428 } else { 346 } else {
429 iip->ili_fields &= ~XFS_ILOG_ADATA; 347 iip->ili_fields &= ~XFS_ILOG_ADATA;
430 } 348 }
431 break; 349 break;
432
433 default: 350 default:
434 ASSERT(0); 351 ASSERT(0);
435 break; 352 break;
436 } 353 }
437
438out:
439 /*
440 * Now update the log format that goes out to disk from the in-core
441 * values. We always write the inode core to make the arithmetic
442 * games in recovery easier, which isn't a big deal as just about any
443 * transaction would dirty it anyway.
444 */
445 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
446 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
447 iip->ili_format.ilf_size = nvecs;
448} 354}
449 355
356/*
357 * This is called to fill in the vector of log iovecs for the given inode
358 * log item. It fills the first item with an inode log format structure,
359 * the second with the on-disk inode structure, and a possible third and/or
360 * fourth with the inode data/extents/b-tree root and inode attributes
361 * data/extents/b-tree root.
362 */
363STATIC void
364xfs_inode_item_format(
365 struct xfs_log_item *lip,
366 struct xfs_log_vec *lv)
367{
368 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
369 struct xfs_inode *ip = iip->ili_inode;
370 struct xfs_inode_log_format *ilf;
371 struct xfs_log_iovec *vecp = NULL;
372
373 ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
374 ilf->ilf_type = XFS_LI_INODE;
375 ilf->ilf_ino = ip->i_ino;
376 ilf->ilf_blkno = ip->i_imap.im_blkno;
377 ilf->ilf_len = ip->i_imap.im_len;
378 ilf->ilf_boffset = ip->i_imap.im_boffset;
379 ilf->ilf_fields = XFS_ILOG_CORE;
380 ilf->ilf_size = 2; /* format + core */
381 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
382
383 if (ip->i_d.di_version == 1)
384 xfs_inode_item_format_v1_inode(ip);
385 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
386 &ip->i_d,
387 xfs_icdinode_size(ip->i_d.di_version));
388
389 xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
390 if (XFS_IFORK_Q(ip)) {
391 xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
392 } else {
393 iip->ili_fields &=
394 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
395 }
396
397 /* update the format with the exact fields we actually logged */
398 ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
399}
450 400
451/* 401/*
452 * This is called to pin the inode associated with the inode log 402 * This is called to pin the inode associated with the inode log
@@ -563,27 +513,6 @@ xfs_inode_item_unlock(
563 ASSERT(ip->i_itemp != NULL); 513 ASSERT(ip->i_itemp != NULL);
564 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
565 515
566 /*
567 * If the inode needed a separate buffer with which to log
568 * its extents, then free it now.
569 */
570 if (iip->ili_extents_buf != NULL) {
571 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
572 ASSERT(ip->i_d.di_nextents > 0);
573 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
574 ASSERT(ip->i_df.if_bytes > 0);
575 kmem_free(iip->ili_extents_buf);
576 iip->ili_extents_buf = NULL;
577 }
578 if (iip->ili_aextents_buf != NULL) {
579 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
580 ASSERT(ip->i_d.di_anextents > 0);
581 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
582 ASSERT(ip->i_afp->if_bytes > 0);
583 kmem_free(iip->ili_aextents_buf);
584 iip->ili_aextents_buf = NULL;
585 }
586
587 lock_flags = iip->ili_lock_flags; 516 lock_flags = iip->ili_lock_flags;
588 iip->ili_lock_flags = 0; 517 iip->ili_lock_flags = 0;
589 if (lock_flags) 518 if (lock_flags)
@@ -670,11 +599,6 @@ xfs_inode_item_init(
670 iip->ili_inode = ip; 599 iip->ili_inode = ip;
671 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 600 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
672 &xfs_inode_item_ops); 601 &xfs_inode_item_ops);
673 iip->ili_format.ilf_type = XFS_LI_INODE;
674 iip->ili_format.ilf_ino = ip->i_ino;
675 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
676 iip->ili_format.ilf_len = ip->i_imap.im_len;
677 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
678} 602}
679 603
680/* 604/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index dce4d656768c..488d81254e28 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item {
34 unsigned short ili_logged; /* flushed logged data */ 34 unsigned short ili_logged; /* flushed logged data */
35 unsigned int ili_last_fields; /* fields when flushed */ 35 unsigned int ili_last_fields; /* fields when flushed */
36 unsigned int ili_fields; /* fields to be logged */ 36 unsigned int ili_fields; /* fields to be logged */
37 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
38 data exts */
39 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
40 attr exts */
41 xfs_inode_log_format_t ili_format; /* logged structure */
42} xfs_inode_log_item_t; 37} xfs_inode_log_item_t;
43 38
44static inline int xfs_inode_clean(xfs_inode_t *ip) 39static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 33ad9a77791f..518aa56b8f2e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -112,15 +112,11 @@ xfs_find_handle(
112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); 112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
113 hsize = sizeof(xfs_fsid_t); 113 hsize = sizeof(xfs_fsid_t);
114 } else { 114 } else {
115 int lock_mode;
116
117 lock_mode = xfs_ilock_map_shared(ip);
118 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 115 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
119 sizeof(handle.ha_fid.fid_len); 116 sizeof(handle.ha_fid.fid_len);
120 handle.ha_fid.fid_pad = 0; 117 handle.ha_fid.fid_pad = 0;
121 handle.ha_fid.fid_gen = ip->i_d.di_gen; 118 handle.ha_fid.fid_gen = ip->i_d.di_gen;
122 handle.ha_fid.fid_ino = ip->i_ino; 119 handle.ha_fid.fid_ino = ip->i_ino;
123 xfs_iunlock_map_shared(ip, lock_mode);
124 120
125 hsize = XFS_HSIZE(handle); 121 hsize = XFS_HSIZE(handle);
126 } 122 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 27e0e544e963..0ce1d759156e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -459,14 +459,12 @@ xfs_vn_getattr(
459 459
460static void 460static void
461xfs_setattr_mode( 461xfs_setattr_mode(
462 struct xfs_trans *tp,
463 struct xfs_inode *ip, 462 struct xfs_inode *ip,
464 struct iattr *iattr) 463 struct iattr *iattr)
465{ 464{
466 struct inode *inode = VFS_I(ip); 465 struct inode *inode = VFS_I(ip);
467 umode_t mode = iattr->ia_mode; 466 umode_t mode = iattr->ia_mode;
468 467
469 ASSERT(tp);
470 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
471 469
472 ip->i_d.di_mode &= S_IFMT; 470 ip->i_d.di_mode &= S_IFMT;
@@ -476,6 +474,32 @@ xfs_setattr_mode(
476 inode->i_mode |= mode & ~S_IFMT; 474 inode->i_mode |= mode & ~S_IFMT;
477} 475}
478 476
477static void
478xfs_setattr_time(
479 struct xfs_inode *ip,
480 struct iattr *iattr)
481{
482 struct inode *inode = VFS_I(ip);
483
484 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
485
486 if (iattr->ia_valid & ATTR_ATIME) {
487 inode->i_atime = iattr->ia_atime;
488 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
489 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
490 }
491 if (iattr->ia_valid & ATTR_CTIME) {
492 inode->i_ctime = iattr->ia_ctime;
493 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
494 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
495 }
496 if (iattr->ia_valid & ATTR_MTIME) {
497 inode->i_mtime = iattr->ia_mtime;
498 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
499 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
500 }
501}
502
479int 503int
480xfs_setattr_nonsize( 504xfs_setattr_nonsize(
481 struct xfs_inode *ip, 505 struct xfs_inode *ip,
@@ -618,7 +642,8 @@ xfs_setattr_nonsize(
618 } 642 }
619 if (!gid_eq(igid, gid)) { 643 if (!gid_eq(igid, gid)) {
620 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { 644 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
621 ASSERT(!XFS_IS_PQUOTA_ON(mp)); 645 ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
646 !XFS_IS_PQUOTA_ON(mp));
622 ASSERT(mask & ATTR_GID); 647 ASSERT(mask & ATTR_GID);
623 ASSERT(gdqp); 648 ASSERT(gdqp);
624 olddquot2 = xfs_qm_vop_chown(tp, ip, 649 olddquot2 = xfs_qm_vop_chown(tp, ip,
@@ -629,30 +654,10 @@ xfs_setattr_nonsize(
629 } 654 }
630 } 655 }
631 656
632 /*
633 * Change file access modes.
634 */
635 if (mask & ATTR_MODE) 657 if (mask & ATTR_MODE)
636 xfs_setattr_mode(tp, ip, iattr); 658 xfs_setattr_mode(ip, iattr);
637 659 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
638 /* 660 xfs_setattr_time(ip, iattr);
639 * Change file access or modified times.
640 */
641 if (mask & ATTR_ATIME) {
642 inode->i_atime = iattr->ia_atime;
643 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
644 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
645 }
646 if (mask & ATTR_CTIME) {
647 inode->i_ctime = iattr->ia_ctime;
648 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
649 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
650 }
651 if (mask & ATTR_MTIME) {
652 inode->i_mtime = iattr->ia_mtime;
653 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
654 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
655 }
656 661
657 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 662 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
658 663
@@ -867,22 +872,10 @@ xfs_setattr_size(
867 xfs_inode_clear_eofblocks_tag(ip); 872 xfs_inode_clear_eofblocks_tag(ip);
868 } 873 }
869 874
870 /*
871 * Change file access modes.
872 */
873 if (mask & ATTR_MODE) 875 if (mask & ATTR_MODE)
874 xfs_setattr_mode(tp, ip, iattr); 876 xfs_setattr_mode(ip, iattr);
875 877 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
876 if (mask & ATTR_CTIME) { 878 xfs_setattr_time(ip, iattr);
877 inode->i_ctime = iattr->ia_ctime;
878 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
879 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
880 }
881 if (mask & ATTR_MTIME) {
882 inode->i_mtime = iattr->ia_mtime;
883 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
884 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
885 }
886 879
887 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 880 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
888 881
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c237ad15d500..f46338285152 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -209,9 +209,8 @@ xfs_bulkstat(
209 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ 209 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
210 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ 210 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
211 xfs_ino_t lastino; /* last inode number returned */ 211 xfs_ino_t lastino; /* last inode number returned */
212 int nbcluster; /* # of blocks in a cluster */ 212 int blks_per_cluster; /* # of blocks per cluster */
213 int nicluster; /* # of inodes in a cluster */ 213 int inodes_per_cluster;/* # of inodes per cluster */
214 int nimask; /* mask for inode clusters */
215 int nirbuf; /* size of irbuf */ 214 int nirbuf; /* size of irbuf */
216 int rval; /* return value error code */ 215 int rval; /* return value error code */
217 int tmp; /* result value from btree calls */ 216 int tmp; /* result value from btree calls */
@@ -243,11 +242,8 @@ xfs_bulkstat(
243 *done = 0; 242 *done = 0;
244 fmterror = 0; 243 fmterror = 0;
245 ubufp = ubuffer; 244 ubufp = ubuffer;
246 nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? 245 blks_per_cluster = xfs_icluster_size_fsb(mp);
247 mp->m_sb.sb_inopblock : 246 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
248 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
249 nimask = ~(nicluster - 1);
250 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
251 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); 247 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
252 if (!irbuf) 248 if (!irbuf)
253 return ENOMEM; 249 return ENOMEM;
@@ -390,12 +386,12 @@ xfs_bulkstat(
390 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 386 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
391 for (chunkidx = 0; 387 for (chunkidx = 0;
392 chunkidx < XFS_INODES_PER_CHUNK; 388 chunkidx < XFS_INODES_PER_CHUNK;
393 chunkidx += nicluster, 389 chunkidx += inodes_per_cluster,
394 agbno += nbcluster) { 390 agbno += blks_per_cluster) {
395 if (xfs_inobt_maskn(chunkidx, nicluster) 391 if (xfs_inobt_maskn(chunkidx,
396 & ~r.ir_free) 392 inodes_per_cluster) & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 393 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster, 394 agbno, blks_per_cluster,
399 &xfs_inode_buf_ops); 395 &xfs_inode_buf_ops);
400 } 396 }
401 blk_finish_plug(&plug); 397 blk_finish_plug(&plug);
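
To make the new loop stride concrete: the readahead loop now issues one
xfs_btree_reada_bufs() call per inode cluster. A standalone sketch with
assumed values (64 inodes per chunk, 32 inodes and 2 blocks per cluster, as
in the examples above):

#include <stdio.h>

int main(void)
{
	int chunk = 64;			/* XFS_INODES_PER_CHUNK, assumed */
	int inodes_per_cluster = 32;	/* assumed */
	int blks_per_cluster = 2;	/* assumed */
	int chunkidx, agbno = 0, nreads = 0;

	/* mirrors the stride of the readahead loop above */
	for (chunkidx = 0; chunkidx < chunk;
	     chunkidx += inodes_per_cluster, agbno += blks_per_cluster)
		nreads++;

	printf("readaheads per chunk: %d of %d blocks each\n",
	       nreads, blks_per_cluster);	/* 2 of 2 blocks each */
	return 0;
}
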
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index e148719e0a5d..b0f4ef77fa70 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,6 +30,52 @@ struct xfs_log_vec {
30 30
31#define XFS_LOG_VEC_ORDERED (-1) 31#define XFS_LOG_VEC_ORDERED (-1)
32 32
33static inline void *
34xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
35 uint type)
36{
37 struct xfs_log_iovec *vec = *vecp;
38
39 if (vec) {
40 ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
41 vec++;
42 } else {
43 vec = &lv->lv_iovecp[0];
44 }
45
46 vec->i_type = type;
47 vec->i_addr = lv->lv_buf + lv->lv_buf_len;
48
49 ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
50
51 *vecp = vec;
52 return vec->i_addr;
53}
54
55static inline void
56xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
57{
58 /*
59 * We need to make sure the next buffer is naturally aligned for the
60 * biggest basic data type we put into it. We already accounted for
61 * this when sizing the buffer.
62 */
63 lv->lv_buf_len += round_up(len, sizeof(uint64_t));
64 vec->i_len = len;
65}
66
67static inline void *
68xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
69 uint type, void *data, int len)
70{
71 void *buf;
72
73 buf = xlog_prepare_iovec(lv, vecp, type);
74 memcpy(buf, data, len);
75 xlog_finish_iovec(lv, *vecp, len);
76 return buf;
77}
78
33/* 79/*
34 * Structure used to pass callback function and the function's argument 80 * Structure used to pass callback function and the function's argument
35 * to the log manager. 81 * to the log manager.
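
A hedged usage sketch of the two-step helpers for a region whose length is
only known after formatting, mirroring the XLOG_REG_TYPE_IEXT case in
xfs_inode_item.c above; lv and ip are assumed to be in scope:

	struct xfs_log_iovec	*vecp = NULL;
	struct xfs_bmbt_rec	*p;
	int			len;

	p = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IEXT);	/* reserve */
	len = xfs_iextents_copy(ip, p, XFS_DATA_FORK);	/* format in place */
	xlog_finish_iovec(lv, vecp, len);	/* record len; the buffer
						 * advances by len rounded up
						 * to a uint64_t boundary */
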
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5eb51fc5eb84..cdebd832c3db 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -82,36 +82,6 @@ xlog_cil_init_post_recovery(
82 log->l_curr_block); 82 log->l_curr_block);
83} 83}
84 84
85STATIC int
86xlog_cil_lv_item_format(
87 struct xfs_log_item *lip,
88 struct xfs_log_vec *lv)
89{
90 int index;
91 char *ptr;
92
93 /* format new vectors into array */
94 lip->li_ops->iop_format(lip, lv->lv_iovecp);
95
96 /* copy data into existing array */
97 ptr = lv->lv_buf;
98 for (index = 0; index < lv->lv_niovecs; index++) {
99 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
100
101 memcpy(ptr, vec->i_addr, vec->i_len);
102 vec->i_addr = ptr;
103 ptr += vec->i_len;
104 }
105
106 /*
107 * some size calculations for log vectors over-estimate, so the caller
108 * doesn't know the amount of space actually used by the item. Return
109 * the byte count to the caller so they can check and store it
110 * appropriately.
111 */
112 return ptr - lv->lv_buf;
113}
114
115/* 85/*
116 * Prepare the log item for insertion into the CIL. Calculate the difference in 86 * Prepare the log item for insertion into the CIL. Calculate the difference in
117 * log space and vectors it will consume, and if it is a new item pin it as 87 * log space and vectors it will consume, and if it is a new item pin it as
@@ -232,6 +202,13 @@ xlog_cil_insert_format_items(
232 nbytes = 0; 202 nbytes = 0;
233 } 203 }
234 204
205 /*
206 * We 64-bit align the length of each iovec so that the start
207 * of the next one is naturally aligned. We'll need to
208 * account for that slack space here.
209 */
210 nbytes += niovecs * sizeof(uint64_t);
211
235 /* grab the old item if it exists for reservation accounting */ 212 /* grab the old item if it exists for reservation accounting */
236 old_lv = lip->li_lv; 213 old_lv = lip->li_lv;
237 214
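
The reserved slack is easy to quantify. A standalone sketch with assumed
values (three iovecs, 100 payload bytes):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int niovecs = 3;	/* assumed iovec count for one log item */
	int nbytes = 100;	/* assumed unaligned payload size */

	/* one uint64_t of slack per iovec, as reserved above */
	nbytes += niovecs * (int)sizeof(uint64_t);

	printf("reservation including alignment slack: %d bytes\n",
	       nbytes);		/* 124 */
	return 0;
}
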
@@ -254,34 +231,27 @@ xlog_cil_insert_format_items(
254 */ 231 */
255 *diff_iovecs -= lv->lv_niovecs; 232 *diff_iovecs -= lv->lv_niovecs;
256 *diff_len -= lv->lv_buf_len; 233 *diff_len -= lv->lv_buf_len;
257 234 } else {
258 /* Ensure the lv is set up according to ->iop_size */ 235 /* allocate new data chunk */
259 lv->lv_niovecs = niovecs; 236 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
260 lv->lv_buf = (char *)lv + buf_size - nbytes; 237 lv->lv_item = lip;
261 238 lv->lv_size = buf_size;
262 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); 239 if (ordered) {
263 goto insert; 240 /* track as an ordered logvec */
241 ASSERT(lip->li_lv == NULL);
242 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
243 goto insert;
244 }
245 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
264 } 246 }
265 247
266 /* allocate new data chunk */ 248 /* Ensure the lv is set up according to ->iop_size */
267 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
268 lv->lv_item = lip;
269 lv->lv_size = buf_size;
270 lv->lv_niovecs = niovecs; 249 lv->lv_niovecs = niovecs;
271 if (ordered) {
272 /* track as an ordered logvec */
273 ASSERT(lip->li_lv == NULL);
274 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
275 goto insert;
276 }
277
278 /* The allocated iovec region lies beyond the log vector. */
279 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
280 250
281 /* The allocated data region lies beyond the iovec region */ 251 /* The allocated data region lies beyond the iovec region */
252 lv->lv_buf_len = 0;
282 lv->lv_buf = (char *)lv + buf_size - nbytes; 253 lv->lv_buf = (char *)lv + buf_size - nbytes;
283 254 lip->li_ops->iop_format(lip, lv);
284 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
285insert: 255insert:
286 ASSERT(lv->lv_buf_len <= nbytes); 256 ASSERT(lv->lv_buf_len <= nbytes);
287 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 257 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b6b669df40f3..bce53ac81096 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,7 +193,10 @@ xlog_bread_noalign(
193 bp->b_io_length = nbblks; 193 bp->b_io_length = nbblks;
194 bp->b_error = 0; 194 bp->b_error = 0;
195 195
196 xfsbdstrat(log->l_mp, bp); 196 if (XFS_FORCED_SHUTDOWN(log->l_mp))
197 return XFS_ERROR(EIO);
198
199 xfs_buf_iorequest(bp);
197 error = xfs_buf_iowait(bp); 200 error = xfs_buf_iowait(bp);
198 if (error) 201 if (error)
199 xfs_buf_ioerror_alert(bp, __func__); 202 xfs_buf_ioerror_alert(bp, __func__);
@@ -1651,6 +1654,7 @@ xlog_recover_reorder_trans(
1651 int pass) 1654 int pass)
1652{ 1655{
1653 xlog_recover_item_t *item, *n; 1656 xlog_recover_item_t *item, *n;
1657 int error = 0;
1654 LIST_HEAD(sort_list); 1658 LIST_HEAD(sort_list);
1655 LIST_HEAD(cancel_list); 1659 LIST_HEAD(cancel_list);
1656 LIST_HEAD(buffer_list); 1660 LIST_HEAD(buffer_list);
@@ -1692,9 +1696,17 @@ xlog_recover_reorder_trans(
1692 "%s: unrecognized type of log operation", 1696 "%s: unrecognized type of log operation",
1693 __func__); 1697 __func__);
1694 ASSERT(0); 1698 ASSERT(0);
1695 return XFS_ERROR(EIO); 1699 /*
1700 * return the remaining items back to the transaction
1701 * item list so they can be freed in caller.
1702 */
1703 if (!list_empty(&sort_list))
1704 list_splice_init(&sort_list, &trans->r_itemq);
1705 error = XFS_ERROR(EIO);
1706 goto out;
1696 } 1707 }
1697 } 1708 }
1709out:
1698 ASSERT(list_empty(&sort_list)); 1710 ASSERT(list_empty(&sort_list));
1699 if (!list_empty(&buffer_list)) 1711 if (!list_empty(&buffer_list))
1700 list_splice(&buffer_list, &trans->r_itemq); 1712 list_splice(&buffer_list, &trans->r_itemq);
@@ -1704,7 +1716,7 @@ xlog_recover_reorder_trans(
1704 list_splice_tail(&inode_buffer_list, &trans->r_itemq); 1716 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1705 if (!list_empty(&cancel_list)) 1717 if (!list_empty(&cancel_list))
1706 list_splice_tail(&cancel_list, &trans->r_itemq); 1718 list_splice_tail(&cancel_list, &trans->r_itemq);
1707 return 0; 1719 return error;
1708} 1720}
1709 1721
1710/* 1722/*
@@ -2514,19 +2526,19 @@ xlog_recover_buffer_pass2(
 	 *
 	 * Also make sure that only inode buffers with good sizes stay in
 	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
-	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
+	 * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
 	 * buffers in the log can be a different size if the log was generated
 	 * by an older kernel using unclustered inode buffers or a newer kernel
 	 * running with a different inode cluster size.  Regardless, if the
-	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
-	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
 	 * the buffer out of the buffer cache so that the buffer won't
 	 * overlap with future reads of those inodes.
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
 	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
-			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
@@ -3199,10 +3211,10 @@ xlog_recover_do_icreate_pass2(
 	}
 
 	/* existing allocation is fixed value */
-	ASSERT(count == XFS_IALLOC_INODES(mp));
-	ASSERT(length == XFS_IALLOC_BLOCKS(mp));
-	if (count != XFS_IALLOC_INODES(mp) ||
-	     length != XFS_IALLOC_BLOCKS(mp)) {
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
 		return EINVAL;
 	}
@@ -3608,8 +3620,10 @@ xlog_recover_process_data(
 				error = XFS_ERROR(EIO);
 				break;
 			}
-			if (error)
+			if (error) {
+				xlog_recover_free_trans(trans);
 				return error;
+			}
 		}
 		dp += be32_to_cpu(ohead->oh_len);
 		num_logops--;
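Freeing the transaction on this error path closes a leak: once processing fails mid-stream, nobody else holds a reference to the half-built recovery transaction. The ownership rule in miniature (types and the free helper are stand-ins):

	/* Shape of the leak fix: the error path releases the transaction it
	 * owns before bailing out.  Types here are illustrative only. */
	#include <errno.h>
	#include <stdlib.h>

	struct rtrans { void *items; };

	static void free_trans(struct rtrans *t) { free(t); }

	static int process_ops(struct rtrans *trans, int bad)
	{
		int error = bad ? -EIO : 0;

		if (error) {
			free_trans(trans);	/* was leaked before the fix */
			return error;
		}
		return 0;			/* caller keeps ownership */
	}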
@@ -4397,7 +4411,13 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
-	xfsbdstrat(log->l_mp, bp);
+
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 14a4996cfec6..348e4d2ed6e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -134,8 +134,6 @@ xfs_qm_dqpurge(
 {
 	struct xfs_mount	*mp = dqp->q_mount;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
 
 	xfs_dqlock(dqp);
 	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -143,21 +141,6 @@ xfs_qm_dqpurge(
 		xfs_dqunlock(dqp);
 		return EAGAIN;
 	}
 
-	/*
-	 * If this quota has a hint attached, prepare for releasing it now.
-	 */
-	gdqp = dqp->q_gdquot;
-	if (gdqp) {
-		xfs_dqlock(gdqp);
-		dqp->q_gdquot = NULL;
-	}
-
-	pdqp = dqp->q_pdquot;
-	if (pdqp) {
-		xfs_dqlock(pdqp);
-		dqp->q_pdquot = NULL;
-	}
-
 	dqp->dq_flags |= XFS_DQ_FREEING;
 
 	xfs_dqflock(dqp);
@@ -206,11 +189,47 @@ xfs_qm_dqpurge(
 	XFS_STATS_DEC(xs_qm_dquot_unused);
 
 	xfs_qm_dqdestroy(dqp);
+	return 0;
+}
+
+/*
+ * Release the group or project dquot pointers the user dquots may be
+ * carrying around as a hint, and proceed to purge the user dquot cache
+ * if requested.
+ */
+STATIC int
+xfs_qm_dqpurge_hints(
+	struct xfs_dquot	*dqp,
+	void			*data)
+{
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	uint			flags = *((uint *)data);
+
+	xfs_dqlock(dqp);
+	if (dqp->dq_flags & XFS_DQ_FREEING) {
+		xfs_dqunlock(dqp);
+		return EAGAIN;
+	}
 
+	/* If this quota has a hint attached, prepare for releasing it now */
+	gdqp = dqp->q_gdquot;
 	if (gdqp)
-		xfs_qm_dqput(gdqp);
+		dqp->q_gdquot = NULL;
+
+	pdqp = dqp->q_pdquot;
 	if (pdqp)
-		xfs_qm_dqput(pdqp);
+		dqp->q_pdquot = NULL;
+
+	xfs_dqunlock(dqp);
+
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+	if (pdqp)
+		xfs_qm_dqrele(pdqp);
+
+	if (flags & XFS_QMOPT_UQUOTA)
+		return xfs_qm_dqpurge(dqp, NULL);
+
 	return 0;
 }
 
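Note the locking discipline in the new helper: the hint pointers are detached while the dquot lock is held, but the references are only dropped after unlocking, since releasing a dquot may block. A userspace model of that detach-under-lock, release-after-unlock pattern (mutex, refcount and names are stand-ins for xfs_dqlock()/xfs_qm_dqrele()):

	/* Model of "detach under lock, release after unlock". */
	#include <pthread.h>

	struct dquot {
		pthread_mutex_t	lock;
		int		refs;
		struct dquot	*group_hint;	/* counted reference if set */
	};

	static void dqrele(struct dquot *dq)
	{
		/* may block in the real code; must not run under dq->lock */
		__sync_fetch_and_sub(&dq->refs, 1);
	}

	static void purge_hint(struct dquot *dq)
	{
		struct dquot *gdq;

		pthread_mutex_lock(&dq->lock);
		gdq = dq->group_hint;		/* detach the hint... */
		dq->group_hint = NULL;
		pthread_mutex_unlock(&dq->lock);

		if (gdq)
			dqrele(gdq);		/* ...and release it unlocked */
	}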
@@ -222,8 +241,18 @@ xfs_qm_dqpurge_all(
 	struct xfs_mount	*mp,
 	uint			flags)
 {
-	if (flags & XFS_QMOPT_UQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+	/*
+	 * We have to release the group/project dquot hint(s) from the user
+	 * dquots first if they are there, otherwise we would run into an
+	 * infinite loop while walking through the radix tree to purge other
+	 * types of dquots, since their refcount is not zero if the user
+	 * dquot refers to them as a hint.
+	 *
+	 * Calling the special xfs_qm_dqpurge_hints() will end up going
+	 * through the general xfs_qm_dqpurge() against the user dquot cache
+	 * if requested.
+	 */
+	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags);
+
 	if (flags & XFS_QMOPT_GQUOTA)
 		xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
 	if (flags & XFS_QMOPT_PQUOTA)
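The comment captures the ordering constraint: as long as a user dquot holds a group or project hint, the hinted dquot's refcount stays above zero and a purge walk over it can never complete. In miniature (plain counters, purely illustrative):

	/* Why the user walk must run first: a user dquot's hint keeps the
	 * group dquot's refcount at 1, so purging the group cache first
	 * could never drive it to zero. */
	#include <stdio.h>

	int main(void)
	{
		int grp_refs = 1;	/* held by a user dquot's hint */

		printf("group purge before hint release: refs=%d (stuck)\n",
		       grp_refs);

		grp_refs--;		/* user walk drops the hint first */
		printf("after hint release: refs=%d (purgeable)\n", grp_refs);
		return 0;
	}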
@@ -1193,16 +1222,18 @@ xfs_qm_dqiterate(
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	do {
+		uint		lock_mode;
+
 		nmaps = XFS_DQITER_MAP_SIZE;
 		/*
 		 * We aren't changing the inode itself. Just changing
 		 * some of its data. No new blocks are added here, and
 		 * the inode is never added to the transaction.
 		 */
-		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		lock_mode = xfs_ilock_data_map_shared(qip);
 		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
 				       map, &nmaps, 0);
-		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		xfs_iunlock(qip, lock_mode);
 		if (error)
 			break;
 
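Here the lock helper picks the mode for the caller, so the caller must remember the returned mode and pass it back to xfs_iunlock() rather than hard-coding XFS_ILOCK_SHARED. The pattern in miniature (rwlock, names and the "extents loaded" trigger are stand-ins for what xfs_ilock_data_map_shared() actually decides on):

	/* Sketch of "lock helper picks the mode": exclusive if extra setup
	 * work is needed, shared otherwise; unlock with whatever was given. */
	#include <pthread.h>

	#define LOCK_SHARED	1
	#define LOCK_EXCL	2

	struct inode_s {
		pthread_rwlock_t	lock;
		int			extents_loaded;
	};

	static int ilock_data_map_shared(struct inode_s *ip)
	{
		/* pulling the extent list in requires exclusive access */
		if (!ip->extents_loaded) {
			pthread_rwlock_wrlock(&ip->lock);
			return LOCK_EXCL;
		}
		pthread_rwlock_rdlock(&ip->lock);
		return LOCK_SHARED;
	}

	static void iunlock(struct inode_s *ip, int mode)
	{
		(void)mode;		/* both modes unlock the same way here */
		pthread_rwlock_unlock(&ip->lock);
	}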
@@ -2082,24 +2113,21 @@ xfs_qm_vop_create_dqattach(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	if (udqp) {
+	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
-		ASSERT(XFS_IS_UQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
 
 		ip->i_udquot = xfs_qm_dqhold(udqp);
 		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (gdqp) {
+	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
 		ASSERT(ip->i_gdquot == NULL);
-		ASSERT(XFS_IS_GQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
 		ip->i_gdquot = xfs_qm_dqhold(gdqp);
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (pdqp) {
+	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
 		ASSERT(ip->i_pdquot == NULL);
-		ASSERT(XFS_IS_PQUOTA_ON(mp));
 		ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
 
 		ip->i_pdquot = xfs_qm_dqhold(pdqp);
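Folding the XFS_IS_*QUOTA_ON() checks into the branch conditions turns a debug-only assumption into a runtime guard: a dquot handed in while that quota type is off is now skipped rather than attached. The difference in miniature (stand-in types):

	/* Before: attach unconditionally and ASSERT the type is enabled
	 * (a no-op in non-debug builds).  After: the check is part of the
	 * branch itself. */
	#include <assert.h>

	struct dq { int id; };

	static void attach_old(struct dq *udq, int uquota_on, struct dq **slot)
	{
		if (udq) {
			assert(uquota_on);	/* compiled out with NDEBUG */
			*slot = udq;
		}
	}

	static void attach_new(struct dq *udq, int uquota_on, struct dq **slot)
	{
		if (udq && uquota_on)		/* enforced in all builds */
			*slot = udq;
	}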
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index a788b66a5cb1..797fd4636273 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
 
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
 
 struct xfs_inode;
 
 extern struct kmem_zone	*xfs_qm_dqtrxzone;
 
 /*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	!dqp->q_core.d_blk_hardlimit && \
+	!dqp->q_core.d_blk_softlimit && \
+	!dqp->q_core.d_rtb_hardlimit && \
+	!dqp->q_core.d_rtb_softlimit && \
+	!dqp->q_core.d_ino_hardlimit && \
+	!dqp->q_core.d_ino_softlimit && \
+	!dqp->q_core.d_bcount && \
+	!dqp->q_core.d_rtbcount && \
+	!dqp->q_core.d_icount)
+
+/*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
  * (136 * 30 = 4080) dquots. It's probably not worth trying to make
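These two definitions move here from xfs_quota_priv.h, which is deleted below. XFS_IS_DQUOT_UNINITIALIZED() simply tests that no limit or usage field has ever been set; a standalone equivalent (the struct is a stand-in for the on-disk xfs_disk_dquot fields the macro actually examines):

	/* Standalone check mirroring XFS_IS_DQUOT_UNINITIALIZED(): a dquot
	 * with no limits and no usage recorded is "uninitialized". */
	#include <stdio.h>
	#include <string.h>

	struct disk_dquot {
		unsigned long long d_blk_hardlimit, d_blk_softlimit;
		unsigned long long d_rtb_hardlimit, d_rtb_softlimit;
		unsigned long long d_ino_hardlimit, d_ino_softlimit;
		unsigned long long d_bcount, d_rtbcount, d_icount;
	};

	struct dquot { struct disk_dquot q_core; };

	#define DQUOT_UNINITIALIZED(dqp) ( \
		!(dqp)->q_core.d_blk_hardlimit && \
		!(dqp)->q_core.d_blk_softlimit && \
		!(dqp)->q_core.d_rtb_hardlimit && \
		!(dqp)->q_core.d_rtb_softlimit && \
		!(dqp)->q_core.d_ino_hardlimit && \
		!(dqp)->q_core.d_ino_softlimit && \
		!(dqp)->q_core.d_bcount && \
		!(dqp)->q_core.d_rtbcount && \
		!(dqp)->q_core.d_icount)

	int main(void)
	{
		struct dquot dq;

		memset(&dq, 0, sizeof(dq));
		printf("fresh dquot uninitialized: %d\n",
		       DQUOT_UNINITIALIZED(&dq));
		dq.q_core.d_bcount = 8;
		printf("after accounting a block: %d\n",
		       DQUOT_UNINITIALIZED(&dq));
		return 0;
	}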
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 437c9198031a..3daf5ea1eb8d 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = 0, error2 = 0;
+	int		error;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
@@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles(
 		return XFS_ERROR(EINVAL);
 	}
 
-	if (flags & XFS_DQ_USER)
+	if (flags & XFS_DQ_USER) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-	if (flags & XFS_DQ_GROUP)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_GROUP) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
 	if (flags & XFS_DQ_PROJ)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
-	return error ? error : error2;
+	return error;
 }
 
 /*
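The rewrite drops the shared error2 accumulator, which could silently lose a group-truncation failure when the project truncation ran afterwards and succeeded; the new code returns at the first failure instead. Both behaviors side by side (flag values and error codes are illustrative):

	/* Old vs new error handling for sequential operations. */
	#include <stdio.h>

	#define DQ_USER  1
	#define DQ_GROUP 2
	#define DQ_PROJ  4

	static int trunc(int which) { return which == DQ_GROUP ? -5 : 0; }

	static int old_style(int flags)
	{
		int error = 0, error2 = 0;

		if (flags & DQ_USER)  error  = trunc(DQ_USER);
		if (flags & DQ_GROUP) error2 = trunc(DQ_GROUP);
		if (flags & DQ_PROJ)  error2 = trunc(DQ_PROJ);	/* clobbers -5 */
		return error ? error : error2;
	}

	static int new_style(int flags)
	{
		int error;

		if (flags & DQ_USER)  { error = trunc(DQ_USER);  if (error) return error; }
		if (flags & DQ_GROUP) { error = trunc(DQ_GROUP); if (error) return error; }
		if (flags & DQ_PROJ)  return trunc(DQ_PROJ);
		return 0;
	}

	int main(void)
	{
		int flags = DQ_USER | DQ_GROUP | DQ_PROJ;

		printf("old: %d, new: %d\n", old_style(flags), new_style(flags));
		return 0;	/* old masks the failure, new reports it */
	}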
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93da..000000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE	10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9b96d35e483d..b5bc1ab3c4da 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -64,7 +64,7 @@ typedef struct xfs_log_item {
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
-	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
 	void (*iop_pin)(xfs_log_item_t *);
 	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
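With this signature change an item's iop_format method receives the whole log vector and formats straight into its buffer, advancing lv_buf_len itself, rather than filling a caller-provided iovec array that is copied afterwards (which is what the xfs_log_cil.c hunk above removes). A hedged sketch of a format method under the new contract (types and the copy helper are simplified stand-ins, not the kernel's):

	/* Sketch: the item writes its regions straight into the vector's
	 * buffer and bumps lv_buf_len, so no second copy is needed. */
	#include <string.h>

	struct log_iovec { void *i_addr; int i_len; int i_type; };
	struct log_vec {
		struct log_iovec	*lv_iovecp;
		int			lv_niovecs;
		char			*lv_buf;
		int			lv_buf_len;
	};

	static void *copy_iovec(struct log_vec *lv, int idx, int type,
				const void *data, int len)
	{
		struct log_iovec *vec = &lv->lv_iovecp[idx];

		vec->i_addr = lv->lv_buf + lv->lv_buf_len;
		vec->i_len = len;
		vec->i_type = type;
		memcpy(vec->i_addr, data, len);
		lv->lv_buf_len += len;	/* direct formatting, no extra copy */
		return vec->i_addr;
	}

	struct my_item { int hdr; char payload[16]; };

	static void my_item_format(struct my_item *ip, struct log_vec *lv)
	{
		copy_iovec(lv, 0, /* type */ 1, &ip->hdr, sizeof(ip->hdr));
		copy_iovec(lv, 1, /* type */ 2, ip->payload,
			   sizeof(ip->payload));
	}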
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c035d11b7734..647b6f1d8923 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -314,7 +314,18 @@ xfs_trans_read_buf_map(
 		ASSERT(bp->b_iodone == NULL);
 		XFS_BUF_READ(bp);
 		bp->b_ops = ops;
-		xfsbdstrat(tp->t_mountp, bp);
+
+		/*
+		 * XXX(hch): clean up the error handling here to be less
+		 * of a mess..
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			trace_xfs_bdstrat_shut(bp, _RET_IP_);
+			xfs_bioerror_relse(bp);
+		} else {
+			xfs_buf_iorequest(bp);
+		}
+
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index cd2a10e15d3a..41172861e857 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -295,8 +295,8 @@ xfs_trans_mod_dquot(
 /*
  * Given an array of dqtrx structures, lock all the dquots associated and join
  * them to the transaction, provided they have been modified.  We know that the
- * highest number of dquots of one type - usr, grp OR prj - involved in a
- * transaction is 2 so we don't need to make this very generic.
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2fd59c0dae66..2ffd3e331b49 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation(
 		xfs_calc_buf_res(5, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0)));
 }
 
@@ -282,7 +282,7 @@ xfs_calc_create_resv_modify(
  * For create we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
  *    the inode btree: max depth * blocksize
  *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
@@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc(
 {
 	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
@@ -385,9 +385,9 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
 		xfs_calc_buf_res(1, 0) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
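These reservation formulas are straight sums of xfs_calc_buf_res() terms over the filesystem geometry. A worked example of the create-allocation term, with wholly assumed numbers (the geometry, the per-buffer log overhead, and the omitted alloc-btree term are all illustrative, not the real helper's values):

	/* Worked example of xfs_calc_create_resv_alloc()-style arithmetic. */
	#include <stdio.h>

	#define BUF_LOG_OVERHEAD 128	/* assumed per-buffer logging overhead */

	static unsigned int calc_buf_res(unsigned int nbufs, unsigned int blen)
	{
		return nbufs * (blen + BUF_LOG_OVERHEAD);
	}

	int main(void)
	{
		unsigned int sectsize = 512, blocksize = 4096;
		unsigned int ialloc_blks = 16;	/* mp->m_ialloc_blks (assumed) */
		unsigned int in_maxlevels = 3;	/* inode btree depth (assumed) */

		unsigned int res = calc_buf_res(2, sectsize) +	/* agi + agf */
				   sectsize +			/* superblock */
				   calc_buf_res(ialloc_blks, blocksize) +
				   calc_buf_res(in_maxlevels, blocksize);
				   /* real formula adds an alloc-btree term */

		printf("create reservation (alloc side): %u bytes\n", res);
		return 0;
	}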
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9c..af5dbe06cb65 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+	((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 3e8e797c6d11..e8a77383c0d5 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -35,15 +35,6 @@ struct attrlist_cursor_kern;
 	{ IO_INVIS,	"INVIS"}
 
 /*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE			0	/* none */
-#define FI_REMAPF		1	/* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
-					   Prevent VM access to the pages until
-					   the operation completes. */
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)