diff options
Diffstat (limited to 'fs')
194 files changed, 6118 insertions, 6000 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 055562c580b4..9ff073f4090a 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c | |||
| @@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) | |||
| 148 | * @offset: offset in the page | 148 | * @offset: offset in the page |
| 149 | */ | 149 | */ |
| 150 | 150 | ||
| 151 | static void v9fs_invalidate_page(struct page *page, unsigned long offset) | 151 | static void v9fs_invalidate_page(struct page *page, unsigned int offset, |
| 152 | unsigned int length) | ||
| 152 | { | 153 | { |
| 153 | /* | 154 | /* |
| 154 | * If called with zero offset, we should release | 155 | * If called with zero offset, we should release |
| 155 | * the private state assocated with the page | 156 | * the private state assocated with the page |
| 156 | */ | 157 | */ |
| 157 | if (offset == 0) | 158 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
| 158 | v9fs_fscache_invalidate_page(page); | 159 | v9fs_fscache_invalidate_page(page); |
| 159 | } | 160 | } |
| 160 | 161 | ||
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index be1e34adc3c6..4d0c2e0be7e5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c | |||
| @@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) | |||
| 101 | } | 101 | } |
| 102 | 102 | ||
| 103 | /** | 103 | /** |
| 104 | * v9fs_dir_readdir - read a directory | 104 | * v9fs_dir_readdir - iterate through a directory |
| 105 | * @filp: opened file structure | 105 | * @file: opened file structure |
| 106 | * @dirent: directory structure ??? | 106 | * @ctx: actor we feed the entries to |
| 107 | * @filldir: function to populate directory structure ??? | ||
| 108 | * | 107 | * |
| 109 | */ | 108 | */ |
| 110 | 109 | ||
| 111 | static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) | 110 | static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) |
| 112 | { | 111 | { |
| 113 | int over; | 112 | bool over; |
| 114 | struct p9_wstat st; | 113 | struct p9_wstat st; |
| 115 | int err = 0; | 114 | int err = 0; |
| 116 | struct p9_fid *fid; | 115 | struct p9_fid *fid; |
| @@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 118 | int reclen = 0; | 117 | int reclen = 0; |
| 119 | struct p9_rdir *rdir; | 118 | struct p9_rdir *rdir; |
| 120 | 119 | ||
| 121 | p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); | 120 | p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); |
| 122 | fid = filp->private_data; | 121 | fid = file->private_data; |
| 123 | 122 | ||
| 124 | buflen = fid->clnt->msize - P9_IOHDRSZ; | 123 | buflen = fid->clnt->msize - P9_IOHDRSZ; |
| 125 | 124 | ||
| 126 | rdir = v9fs_alloc_rdir_buf(filp, buflen); | 125 | rdir = v9fs_alloc_rdir_buf(file, buflen); |
| 127 | if (!rdir) | 126 | if (!rdir) |
| 128 | return -ENOMEM; | 127 | return -ENOMEM; |
| 129 | 128 | ||
| 130 | while (1) { | 129 | while (1) { |
| 131 | if (rdir->tail == rdir->head) { | 130 | if (rdir->tail == rdir->head) { |
| 132 | err = v9fs_file_readn(filp, rdir->buf, NULL, | 131 | err = v9fs_file_readn(file, rdir->buf, NULL, |
| 133 | buflen, filp->f_pos); | 132 | buflen, ctx->pos); |
| 134 | if (err <= 0) | 133 | if (err <= 0) |
| 135 | return err; | 134 | return err; |
| 136 | 135 | ||
| @@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 148 | } | 147 | } |
| 149 | reclen = st.size+2; | 148 | reclen = st.size+2; |
| 150 | 149 | ||
| 151 | over = filldir(dirent, st.name, strlen(st.name), | 150 | over = !dir_emit(ctx, st.name, strlen(st.name), |
| 152 | filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); | 151 | v9fs_qid2ino(&st.qid), dt_type(&st)); |
| 153 | |||
| 154 | p9stat_free(&st); | 152 | p9stat_free(&st); |
| 155 | |||
| 156 | if (over) | 153 | if (over) |
| 157 | return 0; | 154 | return 0; |
| 158 | 155 | ||
| 159 | rdir->head += reclen; | 156 | rdir->head += reclen; |
| 160 | filp->f_pos += reclen; | 157 | ctx->pos += reclen; |
| 161 | } | 158 | } |
| 162 | } | 159 | } |
| 163 | } | 160 | } |
| 164 | 161 | ||
| 165 | /** | 162 | /** |
| 166 | * v9fs_dir_readdir_dotl - read a directory | 163 | * v9fs_dir_readdir_dotl - iterate through a directory |
| 167 | * @filp: opened file structure | 164 | * @file: opened file structure |
| 168 | * @dirent: buffer to fill dirent structures | 165 | * @ctx: actor we feed the entries to |
| 169 | * @filldir: function to populate dirent structures | ||
| 170 | * | 166 | * |
| 171 | */ | 167 | */ |
| 172 | static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, | 168 | static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) |
| 173 | filldir_t filldir) | ||
| 174 | { | 169 | { |
| 175 | int over; | ||
| 176 | int err = 0; | 170 | int err = 0; |
| 177 | struct p9_fid *fid; | 171 | struct p9_fid *fid; |
| 178 | int buflen; | 172 | int buflen; |
| 179 | struct p9_rdir *rdir; | 173 | struct p9_rdir *rdir; |
| 180 | struct p9_dirent curdirent; | 174 | struct p9_dirent curdirent; |
| 181 | u64 oldoffset = 0; | ||
| 182 | 175 | ||
| 183 | p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); | 176 | p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); |
| 184 | fid = filp->private_data; | 177 | fid = file->private_data; |
| 185 | 178 | ||
| 186 | buflen = fid->clnt->msize - P9_READDIRHDRSZ; | 179 | buflen = fid->clnt->msize - P9_READDIRHDRSZ; |
| 187 | 180 | ||
| 188 | rdir = v9fs_alloc_rdir_buf(filp, buflen); | 181 | rdir = v9fs_alloc_rdir_buf(file, buflen); |
| 189 | if (!rdir) | 182 | if (!rdir) |
| 190 | return -ENOMEM; | 183 | return -ENOMEM; |
| 191 | 184 | ||
| 192 | while (1) { | 185 | while (1) { |
| 193 | if (rdir->tail == rdir->head) { | 186 | if (rdir->tail == rdir->head) { |
| 194 | err = p9_client_readdir(fid, rdir->buf, buflen, | 187 | err = p9_client_readdir(fid, rdir->buf, buflen, |
| 195 | filp->f_pos); | 188 | ctx->pos); |
| 196 | if (err <= 0) | 189 | if (err <= 0) |
| 197 | return err; | 190 | return err; |
| 198 | 191 | ||
| @@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, | |||
| 210 | return -EIO; | 203 | return -EIO; |
| 211 | } | 204 | } |
| 212 | 205 | ||
| 213 | /* d_off in dirent structure tracks the offset into | 206 | if (!dir_emit(ctx, curdirent.d_name, |
| 214 | * the next dirent in the dir. However, filldir() | 207 | strlen(curdirent.d_name), |
| 215 | * expects offset into the current dirent. Hence | 208 | v9fs_qid2ino(&curdirent.qid), |
| 216 | * while calling filldir send the offset from the | 209 | curdirent.d_type)) |
| 217 | * previous dirent structure. | ||
| 218 | */ | ||
| 219 | over = filldir(dirent, curdirent.d_name, | ||
| 220 | strlen(curdirent.d_name), | ||
| 221 | oldoffset, v9fs_qid2ino(&curdirent.qid), | ||
| 222 | curdirent.d_type); | ||
| 223 | oldoffset = curdirent.d_off; | ||
| 224 | |||
| 225 | if (over) | ||
| 226 | return 0; | 210 | return 0; |
| 227 | 211 | ||
| 228 | filp->f_pos = curdirent.d_off; | 212 | ctx->pos = curdirent.d_off; |
| 229 | rdir->head += err; | 213 | rdir->head += err; |
| 230 | } | 214 | } |
| 231 | } | 215 | } |
| @@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) | |||
| 254 | const struct file_operations v9fs_dir_operations = { | 238 | const struct file_operations v9fs_dir_operations = { |
| 255 | .read = generic_read_dir, | 239 | .read = generic_read_dir, |
| 256 | .llseek = generic_file_llseek, | 240 | .llseek = generic_file_llseek, |
| 257 | .readdir = v9fs_dir_readdir, | 241 | .iterate = v9fs_dir_readdir, |
| 258 | .open = v9fs_file_open, | 242 | .open = v9fs_file_open, |
| 259 | .release = v9fs_dir_release, | 243 | .release = v9fs_dir_release, |
| 260 | }; | 244 | }; |
| @@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = { | |||
| 262 | const struct file_operations v9fs_dir_operations_dotl = { | 246 | const struct file_operations v9fs_dir_operations_dotl = { |
| 263 | .read = generic_read_dir, | 247 | .read = generic_read_dir, |
| 264 | .llseek = generic_file_llseek, | 248 | .llseek = generic_file_llseek, |
| 265 | .readdir = v9fs_dir_readdir_dotl, | 249 | .iterate = v9fs_dir_readdir_dotl, |
| 266 | .open = v9fs_file_open, | 250 | .open = v9fs_file_open, |
| 267 | .release = v9fs_dir_release, | 251 | .release = v9fs_dir_release, |
| 268 | .fsync = v9fs_file_fsync_dotl, | 252 | .fsync = v9fs_file_fsync_dotl, |
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 9cf874ce8336..ade28bb058e3 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c | |||
| @@ -17,47 +17,43 @@ | |||
| 17 | static DEFINE_RWLOCK(adfs_dir_lock); | 17 | static DEFINE_RWLOCK(adfs_dir_lock); |
| 18 | 18 | ||
| 19 | static int | 19 | static int |
| 20 | adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 20 | adfs_readdir(struct file *file, struct dir_context *ctx) |
| 21 | { | 21 | { |
| 22 | struct inode *inode = file_inode(filp); | 22 | struct inode *inode = file_inode(file); |
| 23 | struct super_block *sb = inode->i_sb; | 23 | struct super_block *sb = inode->i_sb; |
| 24 | struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; | 24 | struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; |
| 25 | struct object_info obj; | 25 | struct object_info obj; |
| 26 | struct adfs_dir dir; | 26 | struct adfs_dir dir; |
| 27 | int ret = 0; | 27 | int ret = 0; |
| 28 | 28 | ||
| 29 | if (filp->f_pos >> 32) | 29 | if (ctx->pos >> 32) |
| 30 | goto out; | 30 | return 0; |
| 31 | 31 | ||
| 32 | ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); | 32 | ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); |
| 33 | if (ret) | 33 | if (ret) |
| 34 | goto out; | 34 | return ret; |
| 35 | 35 | ||
| 36 | switch ((unsigned long)filp->f_pos) { | 36 | if (ctx->pos == 0) { |
| 37 | case 0: | 37 | if (!dir_emit_dot(file, ctx)) |
| 38 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) | ||
| 39 | goto free_out; | 38 | goto free_out; |
| 40 | filp->f_pos += 1; | 39 | ctx->pos = 1; |
| 41 | 40 | } | |
| 42 | case 1: | 41 | if (ctx->pos == 1) { |
| 43 | if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0) | 42 | if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) |
| 44 | goto free_out; | 43 | goto free_out; |
| 45 | filp->f_pos += 1; | 44 | ctx->pos = 2; |
| 46 | |||
| 47 | default: | ||
| 48 | break; | ||
| 49 | } | 45 | } |
| 50 | 46 | ||
| 51 | read_lock(&adfs_dir_lock); | 47 | read_lock(&adfs_dir_lock); |
| 52 | 48 | ||
| 53 | ret = ops->setpos(&dir, filp->f_pos - 2); | 49 | ret = ops->setpos(&dir, ctx->pos - 2); |
| 54 | if (ret) | 50 | if (ret) |
| 55 | goto unlock_out; | 51 | goto unlock_out; |
| 56 | while (ops->getnext(&dir, &obj) == 0) { | 52 | while (ops->getnext(&dir, &obj) == 0) { |
| 57 | if (filldir(dirent, obj.name, obj.name_len, | 53 | if (!dir_emit(ctx, obj.name, obj.name_len, |
| 58 | filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) | 54 | obj.file_id, DT_UNKNOWN)) |
| 59 | goto unlock_out; | 55 | break; |
| 60 | filp->f_pos += 1; | 56 | ctx->pos++; |
| 61 | } | 57 | } |
| 62 | 58 | ||
| 63 | unlock_out: | 59 | unlock_out: |
| @@ -65,8 +61,6 @@ unlock_out: | |||
| 65 | 61 | ||
| 66 | free_out: | 62 | free_out: |
| 67 | ops->free(&dir); | 63 | ops->free(&dir); |
| 68 | |||
| 69 | out: | ||
| 70 | return ret; | 64 | return ret; |
| 71 | } | 65 | } |
| 72 | 66 | ||
| @@ -192,7 +186,7 @@ out: | |||
| 192 | const struct file_operations adfs_dir_operations = { | 186 | const struct file_operations adfs_dir_operations = { |
| 193 | .read = generic_read_dir, | 187 | .read = generic_read_dir, |
| 194 | .llseek = generic_file_llseek, | 188 | .llseek = generic_file_llseek, |
| 195 | .readdir = adfs_readdir, | 189 | .iterate = adfs_readdir, |
| 196 | .fsync = generic_file_fsync, | 190 | .fsync = generic_file_fsync, |
| 197 | }; | 191 | }; |
| 198 | 192 | ||
diff --git a/fs/affs/dir.c b/fs/affs/dir.c index fd11a6d608ee..f1eba8c3644e 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c | |||
| @@ -15,12 +15,12 @@ | |||
| 15 | 15 | ||
| 16 | #include "affs.h" | 16 | #include "affs.h" |
| 17 | 17 | ||
| 18 | static int affs_readdir(struct file *, void *, filldir_t); | 18 | static int affs_readdir(struct file *, struct dir_context *); |
| 19 | 19 | ||
| 20 | const struct file_operations affs_dir_operations = { | 20 | const struct file_operations affs_dir_operations = { |
| 21 | .read = generic_read_dir, | 21 | .read = generic_read_dir, |
| 22 | .llseek = generic_file_llseek, | 22 | .llseek = generic_file_llseek, |
| 23 | .readdir = affs_readdir, | 23 | .iterate = affs_readdir, |
| 24 | .fsync = affs_file_fsync, | 24 | .fsync = affs_file_fsync, |
| 25 | }; | 25 | }; |
| 26 | 26 | ||
| @@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = { | |||
| 40 | }; | 40 | }; |
| 41 | 41 | ||
| 42 | static int | 42 | static int |
| 43 | affs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 43 | affs_readdir(struct file *file, struct dir_context *ctx) |
| 44 | { | 44 | { |
| 45 | struct inode *inode = file_inode(filp); | 45 | struct inode *inode = file_inode(file); |
| 46 | struct super_block *sb = inode->i_sb; | 46 | struct super_block *sb = inode->i_sb; |
| 47 | struct buffer_head *dir_bh; | 47 | struct buffer_head *dir_bh = NULL; |
| 48 | struct buffer_head *fh_bh; | 48 | struct buffer_head *fh_bh = NULL; |
| 49 | unsigned char *name; | 49 | unsigned char *name; |
| 50 | int namelen; | 50 | int namelen; |
| 51 | u32 i; | 51 | u32 i; |
| 52 | int hash_pos; | 52 | int hash_pos; |
| 53 | int chain_pos; | 53 | int chain_pos; |
| 54 | u32 f_pos; | ||
| 55 | u32 ino; | 54 | u32 ino; |
| 56 | int stored; | ||
| 57 | int res; | ||
| 58 | 55 | ||
| 59 | pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos); | 56 | pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); |
| 60 | 57 | ||
| 61 | stored = 0; | 58 | if (ctx->pos < 2) { |
| 62 | res = -EIO; | 59 | file->private_data = (void *)0; |
| 63 | dir_bh = NULL; | 60 | if (!dir_emit_dots(file, ctx)) |
| 64 | fh_bh = NULL; | ||
| 65 | f_pos = filp->f_pos; | ||
| 66 | |||
| 67 | if (f_pos == 0) { | ||
| 68 | filp->private_data = (void *)0; | ||
| 69 | if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0) | ||
| 70 | return 0; | 61 | return 0; |
| 71 | filp->f_pos = f_pos = 1; | ||
| 72 | stored++; | ||
| 73 | } | ||
| 74 | if (f_pos == 1) { | ||
| 75 | if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) | ||
| 76 | return stored; | ||
| 77 | filp->f_pos = f_pos = 2; | ||
| 78 | stored++; | ||
| 79 | } | 62 | } |
| 80 | 63 | ||
| 81 | affs_lock_dir(inode); | 64 | affs_lock_dir(inode); |
| 82 | chain_pos = (f_pos - 2) & 0xffff; | 65 | chain_pos = (ctx->pos - 2) & 0xffff; |
| 83 | hash_pos = (f_pos - 2) >> 16; | 66 | hash_pos = (ctx->pos - 2) >> 16; |
| 84 | if (chain_pos == 0xffff) { | 67 | if (chain_pos == 0xffff) { |
| 85 | affs_warning(sb, "readdir", "More than 65535 entries in chain"); | 68 | affs_warning(sb, "readdir", "More than 65535 entries in chain"); |
| 86 | chain_pos = 0; | 69 | chain_pos = 0; |
| 87 | hash_pos++; | 70 | hash_pos++; |
| 88 | filp->f_pos = ((hash_pos << 16) | chain_pos) + 2; | 71 | ctx->pos = ((hash_pos << 16) | chain_pos) + 2; |
| 89 | } | 72 | } |
| 90 | dir_bh = affs_bread(sb, inode->i_ino); | 73 | dir_bh = affs_bread(sb, inode->i_ino); |
| 91 | if (!dir_bh) | 74 | if (!dir_bh) |
| @@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 94 | /* If the directory hasn't changed since the last call to readdir(), | 77 | /* If the directory hasn't changed since the last call to readdir(), |
| 95 | * we can jump directly to where we left off. | 78 | * we can jump directly to where we left off. |
| 96 | */ | 79 | */ |
| 97 | ino = (u32)(long)filp->private_data; | 80 | ino = (u32)(long)file->private_data; |
| 98 | if (ino && filp->f_version == inode->i_version) { | 81 | if (ino && file->f_version == inode->i_version) { |
| 99 | pr_debug("AFFS: readdir() left off=%d\n", ino); | 82 | pr_debug("AFFS: readdir() left off=%d\n", ino); |
| 100 | goto inside; | 83 | goto inside; |
| 101 | } | 84 | } |
| @@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 105 | fh_bh = affs_bread(sb, ino); | 88 | fh_bh = affs_bread(sb, ino); |
| 106 | if (!fh_bh) { | 89 | if (!fh_bh) { |
| 107 | affs_error(sb, "readdir","Cannot read block %d", i); | 90 | affs_error(sb, "readdir","Cannot read block %d", i); |
| 108 | goto readdir_out; | 91 | return -EIO; |
| 109 | } | 92 | } |
| 110 | ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); | 93 | ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); |
| 111 | affs_brelse(fh_bh); | 94 | affs_brelse(fh_bh); |
| @@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 119 | ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); | 102 | ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); |
| 120 | if (!ino) | 103 | if (!ino) |
| 121 | continue; | 104 | continue; |
| 122 | f_pos = (hash_pos << 16) + 2; | 105 | ctx->pos = (hash_pos << 16) + 2; |
| 123 | inside: | 106 | inside: |
| 124 | do { | 107 | do { |
| 125 | fh_bh = affs_bread(sb, ino); | 108 | fh_bh = affs_bread(sb, ino); |
| 126 | if (!fh_bh) { | 109 | if (!fh_bh) { |
| 127 | affs_error(sb, "readdir","Cannot read block %d", ino); | 110 | affs_error(sb, "readdir","Cannot read block %d", ino); |
| 128 | goto readdir_done; | 111 | break; |
| 129 | } | 112 | } |
| 130 | 113 | ||
| 131 | namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); | 114 | namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); |
| 132 | name = AFFS_TAIL(sb, fh_bh)->name + 1; | 115 | name = AFFS_TAIL(sb, fh_bh)->name + 1; |
| 133 | pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", | 116 | pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", |
| 134 | namelen, name, ino, hash_pos, f_pos); | 117 | namelen, name, ino, hash_pos, (u32)ctx->pos); |
| 135 | if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0) | 118 | if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) |
| 136 | goto readdir_done; | 119 | goto readdir_done; |
| 137 | stored++; | 120 | ctx->pos++; |
| 138 | f_pos++; | ||
| 139 | ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); | 121 | ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); |
| 140 | affs_brelse(fh_bh); | 122 | affs_brelse(fh_bh); |
| 141 | fh_bh = NULL; | 123 | fh_bh = NULL; |
| 142 | } while (ino); | 124 | } while (ino); |
| 143 | } | 125 | } |
| 144 | readdir_done: | 126 | readdir_done: |
| 145 | filp->f_pos = f_pos; | 127 | file->f_version = inode->i_version; |
| 146 | filp->f_version = inode->i_version; | 128 | file->private_data = (void *)(long)ino; |
| 147 | filp->private_data = (void *)(long)ino; | ||
| 148 | res = stored; | ||
| 149 | 129 | ||
| 150 | readdir_out: | 130 | readdir_out: |
| 151 | affs_brelse(dir_bh); | 131 | affs_brelse(dir_bh); |
| 152 | affs_brelse(fh_bh); | 132 | affs_brelse(fh_bh); |
| 153 | affs_unlock_dir(inode); | 133 | affs_unlock_dir(inode); |
| 154 | pr_debug("AFFS: readdir()=%d\n", stored); | 134 | return 0; |
| 155 | return res; | ||
| 156 | } | 135 | } |
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7a465ed04444..34494fbead0a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, | 22 | static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, |
| 23 | unsigned int flags); | 23 | unsigned int flags); |
| 24 | static int afs_dir_open(struct inode *inode, struct file *file); | 24 | static int afs_dir_open(struct inode *inode, struct file *file); |
| 25 | static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); | 25 | static int afs_readdir(struct file *file, struct dir_context *ctx); |
| 26 | static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); | 26 | static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); |
| 27 | static int afs_d_delete(const struct dentry *dentry); | 27 | static int afs_d_delete(const struct dentry *dentry); |
| 28 | static void afs_d_release(struct dentry *dentry); | 28 | static void afs_d_release(struct dentry *dentry); |
| @@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 43 | const struct file_operations afs_dir_file_operations = { | 43 | const struct file_operations afs_dir_file_operations = { |
| 44 | .open = afs_dir_open, | 44 | .open = afs_dir_open, |
| 45 | .release = afs_release, | 45 | .release = afs_release, |
| 46 | .readdir = afs_readdir, | 46 | .iterate = afs_readdir, |
| 47 | .lock = afs_lock, | 47 | .lock = afs_lock, |
| 48 | .llseek = generic_file_llseek, | 48 | .llseek = generic_file_llseek, |
| 49 | }; | 49 | }; |
| @@ -119,9 +119,9 @@ struct afs_dir_page { | |||
| 119 | }; | 119 | }; |
| 120 | 120 | ||
| 121 | struct afs_lookup_cookie { | 121 | struct afs_lookup_cookie { |
| 122 | struct dir_context ctx; | ||
| 122 | struct afs_fid fid; | 123 | struct afs_fid fid; |
| 123 | const char *name; | 124 | struct qstr name; |
| 124 | size_t nlen; | ||
| 125 | int found; | 125 | int found; |
| 126 | }; | 126 | }; |
| 127 | 127 | ||
| @@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file) | |||
| 228 | /* | 228 | /* |
| 229 | * deal with one block in an AFS directory | 229 | * deal with one block in an AFS directory |
| 230 | */ | 230 | */ |
| 231 | static int afs_dir_iterate_block(unsigned *fpos, | 231 | static int afs_dir_iterate_block(struct dir_context *ctx, |
| 232 | union afs_dir_block *block, | 232 | union afs_dir_block *block, |
| 233 | unsigned blkoff, | 233 | unsigned blkoff) |
| 234 | void *cookie, | ||
| 235 | filldir_t filldir) | ||
| 236 | { | 234 | { |
| 237 | union afs_dirent *dire; | 235 | union afs_dirent *dire; |
| 238 | unsigned offset, next, curr; | 236 | unsigned offset, next, curr; |
| 239 | size_t nlen; | 237 | size_t nlen; |
| 240 | int tmp, ret; | 238 | int tmp; |
| 241 | 239 | ||
| 242 | _enter("%u,%x,%p,,",*fpos,blkoff,block); | 240 | _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); |
| 243 | 241 | ||
| 244 | curr = (*fpos - blkoff) / sizeof(union afs_dirent); | 242 | curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); |
| 245 | 243 | ||
| 246 | /* walk through the block, an entry at a time */ | 244 | /* walk through the block, an entry at a time */ |
| 247 | for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; | 245 | for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; |
| @@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos, | |||
| 256 | _debug("ENT[%Zu.%u]: unused", | 254 | _debug("ENT[%Zu.%u]: unused", |
| 257 | blkoff / sizeof(union afs_dir_block), offset); | 255 | blkoff / sizeof(union afs_dir_block), offset); |
| 258 | if (offset >= curr) | 256 | if (offset >= curr) |
| 259 | *fpos = blkoff + | 257 | ctx->pos = blkoff + |
| 260 | next * sizeof(union afs_dirent); | 258 | next * sizeof(union afs_dirent); |
| 261 | continue; | 259 | continue; |
| 262 | } | 260 | } |
| @@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos, | |||
| 302 | continue; | 300 | continue; |
| 303 | 301 | ||
| 304 | /* found the next entry */ | 302 | /* found the next entry */ |
| 305 | ret = filldir(cookie, | 303 | if (!dir_emit(ctx, dire->u.name, nlen, |
| 306 | dire->u.name, | ||
| 307 | nlen, | ||
| 308 | blkoff + offset * sizeof(union afs_dirent), | ||
| 309 | ntohl(dire->u.vnode), | 304 | ntohl(dire->u.vnode), |
| 310 | filldir == afs_lookup_filldir ? | 305 | ctx->actor == afs_lookup_filldir ? |
| 311 | ntohl(dire->u.unique) : DT_UNKNOWN); | 306 | ntohl(dire->u.unique) : DT_UNKNOWN)) { |
| 312 | if (ret < 0) { | ||
| 313 | _leave(" = 0 [full]"); | 307 | _leave(" = 0 [full]"); |
| 314 | return 0; | 308 | return 0; |
| 315 | } | 309 | } |
| 316 | 310 | ||
| 317 | *fpos = blkoff + next * sizeof(union afs_dirent); | 311 | ctx->pos = blkoff + next * sizeof(union afs_dirent); |
| 318 | } | 312 | } |
| 319 | 313 | ||
| 320 | _leave(" = 1 [more]"); | 314 | _leave(" = 1 [more]"); |
| @@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos, | |||
| 324 | /* | 318 | /* |
| 325 | * iterate through the data blob that lists the contents of an AFS directory | 319 | * iterate through the data blob that lists the contents of an AFS directory |
| 326 | */ | 320 | */ |
| 327 | static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, | 321 | static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, |
| 328 | filldir_t filldir, struct key *key) | 322 | struct key *key) |
| 329 | { | 323 | { |
| 330 | union afs_dir_block *dblock; | 324 | union afs_dir_block *dblock; |
| 331 | struct afs_dir_page *dbuf; | 325 | struct afs_dir_page *dbuf; |
| @@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, | |||
| 333 | unsigned blkoff, limit; | 327 | unsigned blkoff, limit; |
| 334 | int ret; | 328 | int ret; |
| 335 | 329 | ||
| 336 | _enter("{%lu},%u,,", dir->i_ino, *fpos); | 330 | _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); |
| 337 | 331 | ||
| 338 | if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { | 332 | if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { |
| 339 | _leave(" = -ESTALE"); | 333 | _leave(" = -ESTALE"); |
| @@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, | |||
| 341 | } | 335 | } |
| 342 | 336 | ||
| 343 | /* round the file position up to the next entry boundary */ | 337 | /* round the file position up to the next entry boundary */ |
| 344 | *fpos += sizeof(union afs_dirent) - 1; | 338 | ctx->pos += sizeof(union afs_dirent) - 1; |
| 345 | *fpos &= ~(sizeof(union afs_dirent) - 1); | 339 | ctx->pos &= ~(sizeof(union afs_dirent) - 1); |
| 346 | 340 | ||
| 347 | /* walk through the blocks in sequence */ | 341 | /* walk through the blocks in sequence */ |
| 348 | ret = 0; | 342 | ret = 0; |
| 349 | while (*fpos < dir->i_size) { | 343 | while (ctx->pos < dir->i_size) { |
| 350 | blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); | 344 | blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); |
| 351 | 345 | ||
| 352 | /* fetch the appropriate page from the directory */ | 346 | /* fetch the appropriate page from the directory */ |
| 353 | page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); | 347 | page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); |
| @@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, | |||
| 364 | do { | 358 | do { |
| 365 | dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / | 359 | dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / |
| 366 | sizeof(union afs_dir_block)]; | 360 | sizeof(union afs_dir_block)]; |
| 367 | ret = afs_dir_iterate_block(fpos, dblock, blkoff, | 361 | ret = afs_dir_iterate_block(ctx, dblock, blkoff); |
| 368 | cookie, filldir); | ||
| 369 | if (ret != 1) { | 362 | if (ret != 1) { |
| 370 | afs_dir_put_page(page); | 363 | afs_dir_put_page(page); |
| 371 | goto out; | 364 | goto out; |
| @@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, | |||
| 373 | 366 | ||
| 374 | blkoff += sizeof(union afs_dir_block); | 367 | blkoff += sizeof(union afs_dir_block); |
| 375 | 368 | ||
| 376 | } while (*fpos < dir->i_size && blkoff < limit); | 369 | } while (ctx->pos < dir->i_size && blkoff < limit); |
| 377 | 370 | ||
| 378 | afs_dir_put_page(page); | 371 | afs_dir_put_page(page); |
| 379 | ret = 0; | 372 | ret = 0; |
| @@ -387,23 +380,10 @@ out: | |||
| 387 | /* | 380 | /* |
| 388 | * read an AFS directory | 381 | * read an AFS directory |
| 389 | */ | 382 | */ |
| 390 | static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) | 383 | static int afs_readdir(struct file *file, struct dir_context *ctx) |
| 391 | { | 384 | { |
| 392 | unsigned fpos; | 385 | return afs_dir_iterate(file_inode(file), |
| 393 | int ret; | 386 | ctx, file->private_data); |
| 394 | |||
| 395 | _enter("{%Ld,{%lu}}", | ||
| 396 | file->f_pos, file_inode(file)->i_ino); | ||
| 397 | |||
| 398 | ASSERT(file->private_data != NULL); | ||
| 399 | |||
| 400 | fpos = file->f_pos; | ||
| 401 | ret = afs_dir_iterate(file_inode(file), &fpos, | ||
| 402 | cookie, filldir, file->private_data); | ||
| 403 | file->f_pos = fpos; | ||
| 404 | |||
| 405 | _leave(" = %d", ret); | ||
| 406 | return ret; | ||
| 407 | } | 387 | } |
| 408 | 388 | ||
| 409 | /* | 389 | /* |
| @@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, | |||
| 416 | { | 396 | { |
| 417 | struct afs_lookup_cookie *cookie = _cookie; | 397 | struct afs_lookup_cookie *cookie = _cookie; |
| 418 | 398 | ||
| 419 | _enter("{%s,%Zu},%s,%u,,%llu,%u", | 399 | _enter("{%s,%u},%s,%u,,%llu,%u", |
| 420 | cookie->name, cookie->nlen, name, nlen, | 400 | cookie->name.name, cookie->name.len, name, nlen, |
| 421 | (unsigned long long) ino, dtype); | 401 | (unsigned long long) ino, dtype); |
| 422 | 402 | ||
| 423 | /* insanity checks first */ | 403 | /* insanity checks first */ |
| 424 | BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); | 404 | BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); |
| 425 | BUILD_BUG_ON(sizeof(union afs_dirent) != 32); | 405 | BUILD_BUG_ON(sizeof(union afs_dirent) != 32); |
| 426 | 406 | ||
| 427 | if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { | 407 | if (cookie->name.len != nlen || |
| 408 | memcmp(cookie->name.name, name, nlen) != 0) { | ||
| 428 | _leave(" = 0 [no]"); | 409 | _leave(" = 0 [no]"); |
| 429 | return 0; | 410 | return 0; |
| 430 | } | 411 | } |
| @@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, | |||
| 444 | static int afs_do_lookup(struct inode *dir, struct dentry *dentry, | 425 | static int afs_do_lookup(struct inode *dir, struct dentry *dentry, |
| 445 | struct afs_fid *fid, struct key *key) | 426 | struct afs_fid *fid, struct key *key) |
| 446 | { | 427 | { |
| 447 | struct afs_lookup_cookie cookie; | 428 | struct afs_super_info *as = dir->i_sb->s_fs_info; |
| 448 | struct afs_super_info *as; | 429 | struct afs_lookup_cookie cookie = { |
| 449 | unsigned fpos; | 430 | .ctx.actor = afs_lookup_filldir, |
| 431 | .name = dentry->d_name, | ||
| 432 | .fid.vid = as->volume->vid | ||
| 433 | }; | ||
| 450 | int ret; | 434 | int ret; |
| 451 | 435 | ||
| 452 | _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); | 436 | _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); |
| 453 | 437 | ||
| 454 | as = dir->i_sb->s_fs_info; | ||
| 455 | |||
| 456 | /* search the directory */ | 438 | /* search the directory */ |
| 457 | cookie.name = dentry->d_name.name; | 439 | ret = afs_dir_iterate(dir, &cookie.ctx, key); |
| 458 | cookie.nlen = dentry->d_name.len; | ||
| 459 | cookie.fid.vid = as->volume->vid; | ||
| 460 | cookie.found = 0; | ||
| 461 | |||
| 462 | fpos = 0; | ||
| 463 | ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir, | ||
| 464 | key); | ||
| 465 | if (ret < 0) { | 440 | if (ret < 0) { |
| 466 | _leave(" = %d [iter]", ret); | 441 | _leave(" = %d [iter]", ret); |
| 467 | return ret; | 442 | return ret; |
diff --git a/fs/afs/file.c b/fs/afs/file.c index 8f6e9234d565..66d50fe2ee45 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c | |||
| @@ -19,7 +19,8 @@ | |||
| 19 | #include "internal.h" | 19 | #include "internal.h" |
| 20 | 20 | ||
| 21 | static int afs_readpage(struct file *file, struct page *page); | 21 | static int afs_readpage(struct file *file, struct page *page); |
| 22 | static void afs_invalidatepage(struct page *page, unsigned long offset); | 22 | static void afs_invalidatepage(struct page *page, unsigned int offset, |
| 23 | unsigned int length); | ||
| 23 | static int afs_releasepage(struct page *page, gfp_t gfp_flags); | 24 | static int afs_releasepage(struct page *page, gfp_t gfp_flags); |
| 24 | static int afs_launder_page(struct page *page); | 25 | static int afs_launder_page(struct page *page); |
| 25 | 26 | ||
| @@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page) | |||
| 310 | * - release a page and clean up its private data if offset is 0 (indicating | 311 | * - release a page and clean up its private data if offset is 0 (indicating |
| 311 | * the entire page) | 312 | * the entire page) |
| 312 | */ | 313 | */ |
| 313 | static void afs_invalidatepage(struct page *page, unsigned long offset) | 314 | static void afs_invalidatepage(struct page *page, unsigned int offset, |
| 315 | unsigned int length) | ||
| 314 | { | 316 | { |
| 315 | struct afs_writeback *wb = (struct afs_writeback *) page_private(page); | 317 | struct afs_writeback *wb = (struct afs_writeback *) page_private(page); |
| 316 | 318 | ||
| 317 | _enter("{%lu},%lu", page->index, offset); | 319 | _enter("{%lu},%u,%u", page->index, offset, length); |
| 318 | 320 | ||
| 319 | BUG_ON(!PageLocked(page)); | 321 | BUG_ON(!PageLocked(page)); |
| 320 | 322 | ||
| 321 | /* we clean up only if the entire page is being invalidated */ | 323 | /* we clean up only if the entire page is being invalidated */ |
| 322 | if (offset == 0) { | 324 | if (offset == 0 && length == PAGE_CACHE_SIZE) { |
| 323 | #ifdef CONFIG_AFS_FSCACHE | 325 | #ifdef CONFIG_AFS_FSCACHE |
| 324 | if (PageFsCache(page)) { | 326 | if (PageFsCache(page)) { |
| 325 | struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); | 327 | struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); |
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 085da86e07c2..ca8e55548d98 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c | |||
| @@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = { | |||
| 41 | .open = dcache_dir_open, | 41 | .open = dcache_dir_open, |
| 42 | .release = dcache_dir_close, | 42 | .release = dcache_dir_close, |
| 43 | .read = generic_read_dir, | 43 | .read = generic_read_dir, |
| 44 | .readdir = dcache_readdir, | 44 | .iterate = dcache_readdir, |
| 45 | .llseek = dcache_dir_lseek, | 45 | .llseek = dcache_dir_lseek, |
| 46 | .unlocked_ioctl = autofs4_root_ioctl, | 46 | .unlocked_ioctl = autofs4_root_ioctl, |
| 47 | #ifdef CONFIG_COMPAT | 47 | #ifdef CONFIG_COMPAT |
| @@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = { | |||
| 53 | .open = autofs4_dir_open, | 53 | .open = autofs4_dir_open, |
| 54 | .release = dcache_dir_close, | 54 | .release = dcache_dir_close, |
| 55 | .read = generic_read_dir, | 55 | .read = generic_read_dir, |
| 56 | .readdir = dcache_readdir, | 56 | .iterate = dcache_readdir, |
| 57 | .llseek = dcache_dir_lseek, | 57 | .llseek = dcache_dir_lseek, |
| 58 | }; | 58 | }; |
| 59 | 59 | ||
diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 922ad460bff9..7c93953030fb 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c | |||
| @@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 45 | return -EIO; | 45 | return -EIO; |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) | 48 | static int bad_file_readdir(struct file *file, struct dir_context *ctx) |
| 49 | { | 49 | { |
| 50 | return -EIO; | 50 | return -EIO; |
| 51 | } | 51 | } |
| @@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops = | |||
| 152 | .write = bad_file_write, | 152 | .write = bad_file_write, |
| 153 | .aio_read = bad_file_aio_read, | 153 | .aio_read = bad_file_aio_read, |
| 154 | .aio_write = bad_file_aio_write, | 154 | .aio_write = bad_file_aio_write, |
| 155 | .readdir = bad_file_readdir, | 155 | .iterate = bad_file_readdir, |
| 156 | .poll = bad_file_poll, | 156 | .poll = bad_file_poll, |
| 157 | .unlocked_ioctl = bad_file_unlocked_ioctl, | 157 | .unlocked_ioctl = bad_file_unlocked_ioctl, |
| 158 | .compat_ioctl = bad_file_compat_ioctl, | 158 | .compat_ioctl = bad_file_compat_ioctl, |
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index f95dddced968..e9c75e20db32 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c | |||
| @@ -31,7 +31,7 @@ MODULE_LICENSE("GPL"); | |||
| 31 | /* The units the vfs expects inode->i_blocks to be in */ | 31 | /* The units the vfs expects inode->i_blocks to be in */ |
| 32 | #define VFS_BLOCK_SIZE 512 | 32 | #define VFS_BLOCK_SIZE 512 |
| 33 | 33 | ||
| 34 | static int befs_readdir(struct file *, void *, filldir_t); | 34 | static int befs_readdir(struct file *, struct dir_context *); |
| 35 | static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); | 35 | static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); |
| 36 | static int befs_readpage(struct file *file, struct page *page); | 36 | static int befs_readpage(struct file *file, struct page *page); |
| 37 | static sector_t befs_bmap(struct address_space *mapping, sector_t block); | 37 | static sector_t befs_bmap(struct address_space *mapping, sector_t block); |
| @@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep; | |||
| 66 | 66 | ||
| 67 | static const struct file_operations befs_dir_operations = { | 67 | static const struct file_operations befs_dir_operations = { |
| 68 | .read = generic_read_dir, | 68 | .read = generic_read_dir, |
| 69 | .readdir = befs_readdir, | 69 | .iterate = befs_readdir, |
| 70 | .llseek = generic_file_llseek, | 70 | .llseek = generic_file_llseek, |
| 71 | }; | 71 | }; |
| 72 | 72 | ||
| @@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) | |||
| 211 | } | 211 | } |
| 212 | 212 | ||
| 213 | static int | 213 | static int |
| 214 | befs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 214 | befs_readdir(struct file *file, struct dir_context *ctx) |
| 215 | { | 215 | { |
| 216 | struct inode *inode = file_inode(filp); | 216 | struct inode *inode = file_inode(file); |
| 217 | struct super_block *sb = inode->i_sb; | 217 | struct super_block *sb = inode->i_sb; |
| 218 | befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; | 218 | befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; |
| 219 | befs_off_t value; | 219 | befs_off_t value; |
| @@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 221 | size_t keysize; | 221 | size_t keysize; |
| 222 | unsigned char d_type; | 222 | unsigned char d_type; |
| 223 | char keybuf[BEFS_NAME_LEN + 1]; | 223 | char keybuf[BEFS_NAME_LEN + 1]; |
| 224 | char *nlsname; | 224 | const char *dirname = file->f_path.dentry->d_name.name; |
| 225 | int nlsnamelen; | ||
| 226 | const char *dirname = filp->f_path.dentry->d_name.name; | ||
| 227 | 225 | ||
| 228 | befs_debug(sb, "---> befs_readdir() " | 226 | befs_debug(sb, "---> befs_readdir() " |
| 229 | "name %s, inode %ld, filp->f_pos %Ld", | 227 | "name %s, inode %ld, ctx->pos %Ld", |
| 230 | dirname, inode->i_ino, filp->f_pos); | 228 | dirname, inode->i_ino, ctx->pos); |
| 231 | 229 | ||
| 232 | result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1, | 230 | more: |
| 231 | result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, | ||
| 233 | keybuf, &keysize, &value); | 232 | keybuf, &keysize, &value); |
| 234 | 233 | ||
| 235 | if (result == BEFS_ERR) { | 234 | if (result == BEFS_ERR) { |
| @@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 251 | 250 | ||
| 252 | /* Convert to NLS */ | 251 | /* Convert to NLS */ |
| 253 | if (BEFS_SB(sb)->nls) { | 252 | if (BEFS_SB(sb)->nls) { |
| 253 | char *nlsname; | ||
| 254 | int nlsnamelen; | ||
| 254 | result = | 255 | result = |
| 255 | befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); | 256 | befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); |
| 256 | if (result < 0) { | 257 | if (result < 0) { |
| 257 | befs_debug(sb, "<--- befs_readdir() ERROR"); | 258 | befs_debug(sb, "<--- befs_readdir() ERROR"); |
| 258 | return result; | 259 | return result; |
| 259 | } | 260 | } |
| 260 | result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos, | 261 | if (!dir_emit(ctx, nlsname, nlsnamelen, |
| 261 | (ino_t) value, d_type); | 262 | (ino_t) value, d_type)) { |
| 263 | kfree(nlsname); | ||
| 264 | return 0; | ||
| 265 | } | ||
| 262 | kfree(nlsname); | 266 | kfree(nlsname); |
| 263 | |||
| 264 | } else { | 267 | } else { |
| 265 | result = filldir(dirent, keybuf, keysize, filp->f_pos, | 268 | if (!dir_emit(ctx, keybuf, keysize, |
| 266 | (ino_t) value, d_type); | 269 | (ino_t) value, d_type)) |
| 270 | return 0; | ||
| 267 | } | 271 | } |
| 268 | if (!result) | 272 | ctx->pos++; |
| 269 | filp->f_pos++; | 273 | goto more; |
| 270 | 274 | ||
| 271 | befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); | 275 | befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); |
| 272 | 276 | ||
| 273 | return 0; | 277 | return 0; |
| 274 | } | 278 | } |
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 3f422f6bb5ca..a399e6d9dc74 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c | |||
| @@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir, | |||
| 26 | const unsigned char *name, int namelen, | 26 | const unsigned char *name, int namelen, |
| 27 | struct bfs_dirent **res_dir); | 27 | struct bfs_dirent **res_dir); |
| 28 | 28 | ||
| 29 | static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) | 29 | static int bfs_readdir(struct file *f, struct dir_context *ctx) |
| 30 | { | 30 | { |
| 31 | struct inode *dir = file_inode(f); | 31 | struct inode *dir = file_inode(f); |
| 32 | struct buffer_head *bh; | 32 | struct buffer_head *bh; |
| 33 | struct bfs_dirent *de; | 33 | struct bfs_dirent *de; |
| 34 | struct bfs_sb_info *info = BFS_SB(dir->i_sb); | ||
| 35 | unsigned int offset; | 34 | unsigned int offset; |
| 36 | int block; | 35 | int block; |
| 37 | 36 | ||
| 38 | mutex_lock(&info->bfs_lock); | 37 | if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { |
| 39 | |||
| 40 | if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { | ||
| 41 | printf("Bad f_pos=%08lx for %s:%08lx\n", | 38 | printf("Bad f_pos=%08lx for %s:%08lx\n", |
| 42 | (unsigned long)f->f_pos, | 39 | (unsigned long)ctx->pos, |
| 43 | dir->i_sb->s_id, dir->i_ino); | 40 | dir->i_sb->s_id, dir->i_ino); |
| 44 | mutex_unlock(&info->bfs_lock); | 41 | return -EINVAL; |
| 45 | return -EBADF; | ||
| 46 | } | 42 | } |
| 47 | 43 | ||
| 48 | while (f->f_pos < dir->i_size) { | 44 | while (ctx->pos < dir->i_size) { |
| 49 | offset = f->f_pos & (BFS_BSIZE - 1); | 45 | offset = ctx->pos & (BFS_BSIZE - 1); |
| 50 | block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS); | 46 | block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); |
| 51 | bh = sb_bread(dir->i_sb, block); | 47 | bh = sb_bread(dir->i_sb, block); |
| 52 | if (!bh) { | 48 | if (!bh) { |
| 53 | f->f_pos += BFS_BSIZE - offset; | 49 | ctx->pos += BFS_BSIZE - offset; |
| 54 | continue; | 50 | continue; |
| 55 | } | 51 | } |
| 56 | do { | 52 | do { |
| 57 | de = (struct bfs_dirent *)(bh->b_data + offset); | 53 | de = (struct bfs_dirent *)(bh->b_data + offset); |
| 58 | if (de->ino) { | 54 | if (de->ino) { |
| 59 | int size = strnlen(de->name, BFS_NAMELEN); | 55 | int size = strnlen(de->name, BFS_NAMELEN); |
| 60 | if (filldir(dirent, de->name, size, f->f_pos, | 56 | if (!dir_emit(ctx, de->name, size, |
| 61 | le16_to_cpu(de->ino), | 57 | le16_to_cpu(de->ino), |
| 62 | DT_UNKNOWN) < 0) { | 58 | DT_UNKNOWN)) { |
| 63 | brelse(bh); | 59 | brelse(bh); |
| 64 | mutex_unlock(&info->bfs_lock); | ||
| 65 | return 0; | 60 | return 0; |
| 66 | } | 61 | } |
| 67 | } | 62 | } |
| 68 | offset += BFS_DIRENT_SIZE; | 63 | offset += BFS_DIRENT_SIZE; |
| 69 | f->f_pos += BFS_DIRENT_SIZE; | 64 | ctx->pos += BFS_DIRENT_SIZE; |
| 70 | } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size)); | 65 | } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); |
| 71 | brelse(bh); | 66 | brelse(bh); |
| 72 | } | 67 | } |
| 73 | 68 | return 0; | |
| 74 | mutex_unlock(&info->bfs_lock); | ||
| 75 | return 0; | ||
| 76 | } | 69 | } |
| 77 | 70 | ||
| 78 | const struct file_operations bfs_dir_operations = { | 71 | const struct file_operations bfs_dir_operations = { |
| 79 | .read = generic_read_dir, | 72 | .read = generic_read_dir, |
| 80 | .readdir = bfs_readdir, | 73 | .iterate = bfs_readdir, |
| 81 | .fsync = generic_file_fsync, | 74 | .fsync = generic_file_fsync, |
| 82 | .llseek = generic_file_llseek, | 75 | .llseek = generic_file_llseek, |
| 83 | }; | 76 | }; |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f26f38ccd194..eb34438ddedb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
| @@ -1681,8 +1681,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, | |||
| 1681 | * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree | 1681 | * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree |
| 1682 | * | 1682 | * |
| 1683 | */ | 1683 | */ |
| 1684 | int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, | 1684 | int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, |
| 1685 | filldir_t filldir, | ||
| 1686 | struct list_head *ins_list) | 1685 | struct list_head *ins_list) |
| 1687 | { | 1686 | { |
| 1688 | struct btrfs_dir_item *di; | 1687 | struct btrfs_dir_item *di; |
| @@ -1704,13 +1703,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, | |||
| 1704 | list_for_each_entry_safe(curr, next, ins_list, readdir_list) { | 1703 | list_for_each_entry_safe(curr, next, ins_list, readdir_list) { |
| 1705 | list_del(&curr->readdir_list); | 1704 | list_del(&curr->readdir_list); |
| 1706 | 1705 | ||
| 1707 | if (curr->key.offset < filp->f_pos) { | 1706 | if (curr->key.offset < ctx->pos) { |
| 1708 | if (atomic_dec_and_test(&curr->refs)) | 1707 | if (atomic_dec_and_test(&curr->refs)) |
| 1709 | kfree(curr); | 1708 | kfree(curr); |
| 1710 | continue; | 1709 | continue; |
| 1711 | } | 1710 | } |
| 1712 | 1711 | ||
| 1713 | filp->f_pos = curr->key.offset; | 1712 | ctx->pos = curr->key.offset; |
| 1714 | 1713 | ||
| 1715 | di = (struct btrfs_dir_item *)curr->data; | 1714 | di = (struct btrfs_dir_item *)curr->data; |
| 1716 | name = (char *)(di + 1); | 1715 | name = (char *)(di + 1); |
| @@ -1719,7 +1718,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, | |||
| 1719 | d_type = btrfs_filetype_table[di->type]; | 1718 | d_type = btrfs_filetype_table[di->type]; |
| 1720 | btrfs_disk_key_to_cpu(&location, &di->location); | 1719 | btrfs_disk_key_to_cpu(&location, &di->location); |
| 1721 | 1720 | ||
| 1722 | over = filldir(dirent, name, name_len, curr->key.offset, | 1721 | over = !dir_emit(ctx, name, name_len, |
| 1723 | location.objectid, d_type); | 1722 | location.objectid, d_type); |
| 1724 | 1723 | ||
| 1725 | if (atomic_dec_and_test(&curr->refs)) | 1724 | if (atomic_dec_and_test(&curr->refs)) |
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 1d5c5f7abe3e..a4b38f934d14 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h | |||
| @@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, | |||
| 139 | struct list_head *del_list); | 139 | struct list_head *del_list); |
| 140 | int btrfs_should_delete_dir_index(struct list_head *del_list, | 140 | int btrfs_should_delete_dir_index(struct list_head *del_list, |
| 141 | u64 index); | 141 | u64 index); |
| 142 | int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, | 142 | int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, |
| 143 | filldir_t filldir, | ||
| 144 | struct list_head *ins_list); | 143 | struct list_head *ins_list); |
| 145 | 144 | ||
| 146 | /* for init */ | 145 | /* for init */ |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8b60b660c8f..b0292b3ead54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) | |||
| 1013 | return try_release_extent_buffer(page); | 1013 | return try_release_extent_buffer(page); |
| 1014 | } | 1014 | } |
| 1015 | 1015 | ||
| 1016 | static void btree_invalidatepage(struct page *page, unsigned long offset) | 1016 | static void btree_invalidatepage(struct page *page, unsigned int offset, |
| 1017 | unsigned int length) | ||
| 1017 | { | 1018 | { |
| 1018 | struct extent_io_tree *tree; | 1019 | struct extent_io_tree *tree; |
| 1019 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 1020 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e7e7afb4a872..6bca9472f313 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
| @@ -2957,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
| 2957 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); | 2957 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); |
| 2958 | if (page->index > end_index || | 2958 | if (page->index > end_index || |
| 2959 | (page->index == end_index && !pg_offset)) { | 2959 | (page->index == end_index && !pg_offset)) { |
| 2960 | page->mapping->a_ops->invalidatepage(page, 0); | 2960 | page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 2961 | unlock_page(page); | 2961 | unlock_page(page); |
| 2962 | return 0; | 2962 | return 0; |
| 2963 | } | 2963 | } |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17f3064b4a3e..4f9d16b70d3d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -5137,10 +5137,9 @@ unsigned char btrfs_filetype_table[] = { | |||
| 5137 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | 5137 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK |
| 5138 | }; | 5138 | }; |
| 5139 | 5139 | ||
| 5140 | static int btrfs_real_readdir(struct file *filp, void *dirent, | 5140 | static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) |
| 5141 | filldir_t filldir) | ||
| 5142 | { | 5141 | { |
| 5143 | struct inode *inode = file_inode(filp); | 5142 | struct inode *inode = file_inode(file); |
| 5144 | struct btrfs_root *root = BTRFS_I(inode)->root; | 5143 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 5145 | struct btrfs_item *item; | 5144 | struct btrfs_item *item; |
| 5146 | struct btrfs_dir_item *di; | 5145 | struct btrfs_dir_item *di; |
| @@ -5161,29 +5160,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
| 5161 | char tmp_name[32]; | 5160 | char tmp_name[32]; |
| 5162 | char *name_ptr; | 5161 | char *name_ptr; |
| 5163 | int name_len; | 5162 | int name_len; |
| 5164 | int is_curr = 0; /* filp->f_pos points to the current index? */ | 5163 | int is_curr = 0; /* ctx->pos points to the current index? */ |
| 5165 | 5164 | ||
| 5166 | /* FIXME, use a real flag for deciding about the key type */ | 5165 | /* FIXME, use a real flag for deciding about the key type */ |
| 5167 | if (root->fs_info->tree_root == root) | 5166 | if (root->fs_info->tree_root == root) |
| 5168 | key_type = BTRFS_DIR_ITEM_KEY; | 5167 | key_type = BTRFS_DIR_ITEM_KEY; |
| 5169 | 5168 | ||
| 5170 | /* special case for "." */ | 5169 | if (!dir_emit_dots(file, ctx)) |
| 5171 | if (filp->f_pos == 0) { | 5170 | return 0; |
| 5172 | over = filldir(dirent, ".", 1, | 5171 | |
| 5173 | filp->f_pos, btrfs_ino(inode), DT_DIR); | ||
| 5174 | if (over) | ||
| 5175 | return 0; | ||
| 5176 | filp->f_pos = 1; | ||
| 5177 | } | ||
| 5178 | /* special case for .., just use the back ref */ | ||
| 5179 | if (filp->f_pos == 1) { | ||
| 5180 | u64 pino = parent_ino(filp->f_path.dentry); | ||
| 5181 | over = filldir(dirent, "..", 2, | ||
| 5182 | filp->f_pos, pino, DT_DIR); | ||
| 5183 | if (over) | ||
| 5184 | return 0; | ||
| 5185 | filp->f_pos = 2; | ||
| 5186 | } | ||
| 5187 | path = btrfs_alloc_path(); | 5172 | path = btrfs_alloc_path(); |
| 5188 | if (!path) | 5173 | if (!path) |
| 5189 | return -ENOMEM; | 5174 | return -ENOMEM; |
| @@ -5197,7 +5182,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
| 5197 | } | 5182 | } |
| 5198 | 5183 | ||
| 5199 | btrfs_set_key_type(&key, key_type); | 5184 | btrfs_set_key_type(&key, key_type); |
| 5200 | key.offset = filp->f_pos; | 5185 | key.offset = ctx->pos; |
| 5201 | key.objectid = btrfs_ino(inode); | 5186 | key.objectid = btrfs_ino(inode); |
| 5202 | 5187 | ||
| 5203 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 5188 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); |
| @@ -5223,14 +5208,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
| 5223 | break; | 5208 | break; |
| 5224 | if (btrfs_key_type(&found_key) != key_type) | 5209 | if (btrfs_key_type(&found_key) != key_type) |
| 5225 | break; | 5210 | break; |
| 5226 | if (found_key.offset < filp->f_pos) | 5211 | if (found_key.offset < ctx->pos) |
| 5227 | goto next; | 5212 | goto next; |
| 5228 | if (key_type == BTRFS_DIR_INDEX_KEY && | 5213 | if (key_type == BTRFS_DIR_INDEX_KEY && |
| 5229 | btrfs_should_delete_dir_index(&del_list, | 5214 | btrfs_should_delete_dir_index(&del_list, |
| 5230 | found_key.offset)) | 5215 | found_key.offset)) |
| 5231 | goto next; | 5216 | goto next; |
| 5232 | 5217 | ||
| 5233 | filp->f_pos = found_key.offset; | 5218 | ctx->pos = found_key.offset; |
| 5234 | is_curr = 1; | 5219 | is_curr = 1; |
| 5235 | 5220 | ||
| 5236 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); | 5221 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); |
| @@ -5274,9 +5259,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
| 5274 | over = 0; | 5259 | over = 0; |
| 5275 | goto skip; | 5260 | goto skip; |
| 5276 | } | 5261 | } |
| 5277 | over = filldir(dirent, name_ptr, name_len, | 5262 | over = !dir_emit(ctx, name_ptr, name_len, |
| 5278 | found_key.offset, location.objectid, | 5263 | location.objectid, d_type); |
| 5279 | d_type); | ||
| 5280 | 5264 | ||
| 5281 | skip: | 5265 | skip: |
| 5282 | if (name_ptr != tmp_name) | 5266 | if (name_ptr != tmp_name) |
| @@ -5295,9 +5279,8 @@ next: | |||
| 5295 | 5279 | ||
| 5296 | if (key_type == BTRFS_DIR_INDEX_KEY) { | 5280 | if (key_type == BTRFS_DIR_INDEX_KEY) { |
| 5297 | if (is_curr) | 5281 | if (is_curr) |
| 5298 | filp->f_pos++; | 5282 | ctx->pos++; |
| 5299 | ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, | 5283 | ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); |
| 5300 | &ins_list); | ||
| 5301 | if (ret) | 5284 | if (ret) |
| 5302 | goto nopos; | 5285 | goto nopos; |
| 5303 | } | 5286 | } |
| @@ -5308,9 +5291,9 @@ next: | |||
| 5308 | * 32-bit glibc will use getdents64, but then strtol - | 5291 | * 32-bit glibc will use getdents64, but then strtol - |
| 5309 | * so the last number we can serve is this. | 5292 | * so the last number we can serve is this. |
| 5310 | */ | 5293 | */ |
| 5311 | filp->f_pos = 0x7fffffff; | 5294 | ctx->pos = 0x7fffffff; |
| 5312 | else | 5295 | else |
| 5313 | filp->f_pos++; | 5296 | ctx->pos++; |
| 5314 | nopos: | 5297 | nopos: |
| 5315 | ret = 0; | 5298 | ret = 0; |
| 5316 | err: | 5299 | err: |
| @@ -7510,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) | |||
| 7510 | return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); | 7493 | return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); |
| 7511 | } | 7494 | } |
| 7512 | 7495 | ||
| 7513 | static void btrfs_invalidatepage(struct page *page, unsigned long offset) | 7496 | static void btrfs_invalidatepage(struct page *page, unsigned int offset, |
| 7497 | unsigned int length) | ||
| 7514 | { | 7498 | { |
| 7515 | struct inode *inode = page->mapping->host; | 7499 | struct inode *inode = page->mapping->host; |
| 7516 | struct extent_io_tree *tree; | 7500 | struct extent_io_tree *tree; |
| @@ -8731,7 +8715,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { | |||
| 8731 | static const struct file_operations btrfs_dir_file_operations = { | 8715 | static const struct file_operations btrfs_dir_file_operations = { |
| 8732 | .llseek = generic_file_llseek, | 8716 | .llseek = generic_file_llseek, |
| 8733 | .read = generic_read_dir, | 8717 | .read = generic_read_dir, |
| 8734 | .readdir = btrfs_real_readdir, | 8718 | .iterate = btrfs_real_readdir, |
| 8735 | .unlocked_ioctl = btrfs_ioctl, | 8719 | .unlocked_ioctl = btrfs_ioctl, |
| 8736 | #ifdef CONFIG_COMPAT | 8720 | #ifdef CONFIG_COMPAT |
| 8737 | .compat_ioctl = btrfs_ioctl, | 8721 | .compat_ioctl = btrfs_ioctl, |
diff --git a/fs/buffer.c b/fs/buffer.c index d2a4d1bb2d57..f93392e2df12 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh) | |||
| 1454 | * block_invalidatepage - invalidate part or all of a buffer-backed page | 1454 | * block_invalidatepage - invalidate part or all of a buffer-backed page |
| 1455 | * | 1455 | * |
| 1456 | * @page: the page which is affected | 1456 | * @page: the page which is affected |
| 1457 | * @offset: the index of the truncation point | 1457 | * @offset: start of the range to invalidate |
| 1458 | * @length: length of the range to invalidate | ||
| 1458 | * | 1459 | * |
| 1459 | * block_invalidatepage() is called when all or part of the page has become | 1460 | * block_invalidatepage() is called when all or part of the page has become |
| 1460 | * invalidated by a truncate operation. | 1461 | * invalidated by a truncate operation. |
| @@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh) | |||
| 1465 | * point. Because the caller is about to free (and possibly reuse) those | 1466 | * point. Because the caller is about to free (and possibly reuse) those |
| 1466 | * blocks on-disk. | 1467 | * blocks on-disk. |
| 1467 | */ | 1468 | */ |
| 1468 | void block_invalidatepage(struct page *page, unsigned long offset) | 1469 | void block_invalidatepage(struct page *page, unsigned int offset, |
| 1470 | unsigned int length) | ||
| 1469 | { | 1471 | { |
| 1470 | struct buffer_head *head, *bh, *next; | 1472 | struct buffer_head *head, *bh, *next; |
| 1471 | unsigned int curr_off = 0; | 1473 | unsigned int curr_off = 0; |
| 1474 | unsigned int stop = length + offset; | ||
| 1472 | 1475 | ||
| 1473 | BUG_ON(!PageLocked(page)); | 1476 | BUG_ON(!PageLocked(page)); |
| 1474 | if (!page_has_buffers(page)) | 1477 | if (!page_has_buffers(page)) |
| 1475 | goto out; | 1478 | goto out; |
| 1476 | 1479 | ||
| 1480 | /* | ||
| 1481 | * Check for overflow | ||
| 1482 | */ | ||
| 1483 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
| 1484 | |||
| 1477 | head = page_buffers(page); | 1485 | head = page_buffers(page); |
| 1478 | bh = head; | 1486 | bh = head; |
| 1479 | do { | 1487 | do { |
| @@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset) | |||
| 1481 | next = bh->b_this_page; | 1489 | next = bh->b_this_page; |
| 1482 | 1490 | ||
| 1483 | /* | 1491 | /* |
| 1492 | * Are we still fully in range ? | ||
| 1493 | */ | ||
| 1494 | if (next_off > stop) | ||
| 1495 | goto out; | ||
| 1496 | |||
| 1497 | /* | ||
| 1484 | * is this block fully invalidated? | 1498 | * is this block fully invalidated? |
| 1485 | */ | 1499 | */ |
| 1486 | if (offset <= curr_off) | 1500 | if (offset <= curr_off) |
| @@ -1501,6 +1515,7 @@ out: | |||
| 1501 | } | 1515 | } |
| 1502 | EXPORT_SYMBOL(block_invalidatepage); | 1516 | EXPORT_SYMBOL(block_invalidatepage); |
| 1503 | 1517 | ||
| 1518 | |||
| 1504 | /* | 1519 | /* |
| 1505 | * We attach and possibly dirty the buffers atomically wrt | 1520 | * We attach and possibly dirty the buffers atomically wrt |
| 1506 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers | 1521 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers |
| @@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, | |||
| 2841 | * they may have been added in ext3_writepage(). Make them | 2856 | * they may have been added in ext3_writepage(). Make them |
| 2842 | * freeable here, so the page does not leak. | 2857 | * freeable here, so the page does not leak. |
| 2843 | */ | 2858 | */ |
| 2844 | do_invalidatepage(page, 0); | 2859 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 2845 | unlock_page(page); | 2860 | unlock_page(page); |
| 2846 | return 0; /* don't care */ | 2861 | return 0; /* don't care */ |
| 2847 | } | 2862 | } |
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 746ce532e130..d4c1206af9fc 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c | |||
| @@ -13,8 +13,6 @@ | |||
| 13 | #include <linux/mount.h> | 13 | #include <linux/mount.h> |
| 14 | #include "internal.h" | 14 | #include "internal.h" |
| 15 | 15 | ||
| 16 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | ||
| 17 | |||
| 18 | struct cachefiles_lookup_data { | 16 | struct cachefiles_lookup_data { |
| 19 | struct cachefiles_xattr *auxdata; /* auxiliary data */ | 17 | struct cachefiles_xattr *auxdata; /* auxiliary data */ |
| 20 | char *key; /* key path */ | 18 | char *key; /* key path */ |
| @@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object) | |||
| 212 | object = container_of(_object, struct cachefiles_object, fscache); | 210 | object = container_of(_object, struct cachefiles_object, fscache); |
| 213 | cache = container_of(object->fscache.cache, struct cachefiles_cache, | 211 | cache = container_of(object->fscache.cache, struct cachefiles_cache, |
| 214 | cache); | 212 | cache); |
| 213 | |||
| 214 | if (!fscache_use_cookie(_object)) { | ||
| 215 | _leave(" [relinq]"); | ||
| 216 | return; | ||
| 217 | } | ||
| 218 | |||
| 215 | cookie = object->fscache.cookie; | 219 | cookie = object->fscache.cookie; |
| 216 | 220 | ||
| 217 | if (!cookie->def->get_aux) { | 221 | if (!cookie->def->get_aux) { |
| 222 | fscache_unuse_cookie(_object); | ||
| 218 | _leave(" [no aux]"); | 223 | _leave(" [no aux]"); |
| 219 | return; | 224 | return; |
| 220 | } | 225 | } |
| 221 | 226 | ||
| 222 | auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); | 227 | auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); |
| 223 | if (!auxdata) { | 228 | if (!auxdata) { |
| 229 | fscache_unuse_cookie(_object); | ||
| 224 | _leave(" [nomem]"); | 230 | _leave(" [nomem]"); |
| 225 | return; | 231 | return; |
| 226 | } | 232 | } |
| 227 | 233 | ||
| 228 | auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); | 234 | auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); |
| 235 | fscache_unuse_cookie(_object); | ||
| 229 | ASSERTCMP(auxlen, <, 511); | 236 | ASSERTCMP(auxlen, <, 511); |
| 230 | 237 | ||
| 231 | auxdata->len = auxlen + 1; | 238 | auxdata->len = auxlen + 1; |
| @@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) | |||
| 263 | #endif | 270 | #endif |
| 264 | 271 | ||
| 265 | /* delete retired objects */ | 272 | /* delete retired objects */ |
| 266 | if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && | 273 | if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && |
| 267 | _object != cache->cache.fsdef | 274 | _object != cache->cache.fsdef |
| 268 | ) { | 275 | ) { |
| 269 | _debug("- retire object OBJ%x", object->fscache.debug_id); | 276 | _debug("- retire object OBJ%x", object->fscache.debug_id); |
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 8c01c5fcdf75..25badd1aec5c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c | |||
| @@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object, | |||
| 38 | printk(KERN_ERR "%sobject: OBJ%x\n", | 38 | printk(KERN_ERR "%sobject: OBJ%x\n", |
| 39 | prefix, object->fscache.debug_id); | 39 | prefix, object->fscache.debug_id); |
| 40 | printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", | 40 | printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", |
| 41 | prefix, fscache_object_states[object->fscache.state], | 41 | prefix, object->fscache.state->name, |
| 42 | object->fscache.flags, work_busy(&object->fscache.work), | 42 | object->fscache.flags, work_busy(&object->fscache.work), |
| 43 | object->fscache.events, object->fscache.event_mask); | 43 | object->fscache.events, object->fscache.event_mask); |
| 44 | printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", | 44 | printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", |
| @@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, | |||
| 127 | found_dentry: | 127 | found_dentry: |
| 128 | kdebug("preemptive burial: OBJ%x [%s] %p", | 128 | kdebug("preemptive burial: OBJ%x [%s] %p", |
| 129 | object->fscache.debug_id, | 129 | object->fscache.debug_id, |
| 130 | fscache_object_states[object->fscache.state], | 130 | object->fscache.state->name, |
| 131 | dentry); | 131 | dentry); |
| 132 | 132 | ||
| 133 | if (object->fscache.state < FSCACHE_OBJECT_DYING) { | 133 | if (fscache_object_is_live(&object->fscache)) { |
| 134 | printk(KERN_ERR "\n"); | 134 | printk(KERN_ERR "\n"); |
| 135 | printk(KERN_ERR "CacheFiles: Error:" | 135 | printk(KERN_ERR "CacheFiles: Error:" |
| 136 | " Can't preemptively bury live object\n"); | 136 | " Can't preemptively bury live object\n"); |
| @@ -192,7 +192,7 @@ try_again: | |||
| 192 | /* an old object from a previous incarnation is hogging the slot - we | 192 | /* an old object from a previous incarnation is hogging the slot - we |
| 193 | * need to wait for it to be destroyed */ | 193 | * need to wait for it to be destroyed */ |
| 194 | wait_for_old_object: | 194 | wait_for_old_object: |
| 195 | if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { | 195 | if (fscache_object_is_live(&object->fscache)) { |
| 196 | printk(KERN_ERR "\n"); | 196 | printk(KERN_ERR "\n"); |
| 197 | printk(KERN_ERR "CacheFiles: Error:" | 197 | printk(KERN_ERR "CacheFiles: Error:" |
| 198 | " Unexpected object collision\n"); | 198 | " Unexpected object collision\n"); |
| @@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, | |||
| 836 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); | 836 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); |
| 837 | 837 | ||
| 838 | /* look up the victim */ | 838 | /* look up the victim */ |
| 839 | mutex_lock_nested(&dir->d_inode->i_mutex, 1); | 839 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); |
| 840 | 840 | ||
| 841 | start = jiffies; | 841 | start = jiffies; |
| 842 | victim = lookup_one_len(filename, dir, strlen(filename)); | 842 | victim = lookup_one_len(filename, dir, strlen(filename)); |
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 73b46288b54b..2476e5162609 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c | |||
| @@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, | |||
| 109 | struct dentry *dentry = object->dentry; | 109 | struct dentry *dentry = object->dentry; |
| 110 | int ret; | 110 | int ret; |
| 111 | 111 | ||
| 112 | ASSERT(object->fscache.cookie); | ||
| 113 | ASSERT(dentry); | 112 | ASSERT(dentry); |
| 114 | 113 | ||
| 115 | _enter("%p,#%d", object, auxdata->len); | 114 | _enter("%p,#%d", object, auxdata->len); |
| 116 | 115 | ||
| 117 | /* attempt to install the cache metadata directly */ | 116 | /* attempt to install the cache metadata directly */ |
| 118 | _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); | 117 | _debug("SET #%u", auxdata->len); |
| 119 | 118 | ||
| 120 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, | 119 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, |
| 121 | &auxdata->type, auxdata->len, | 120 | &auxdata->type, auxdata->len, |
| @@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, | |||
| 138 | struct dentry *dentry = object->dentry; | 137 | struct dentry *dentry = object->dentry; |
| 139 | int ret; | 138 | int ret; |
| 140 | 139 | ||
| 141 | ASSERT(object->fscache.cookie); | ||
| 142 | ASSERT(dentry); | 140 | ASSERT(dentry); |
| 143 | 141 | ||
| 144 | _enter("%p,#%d", object, auxdata->len); | 142 | _enter("%p,#%d", object, auxdata->len); |
| 145 | 143 | ||
| 146 | /* attempt to install the cache metadata directly */ | 144 | /* attempt to install the cache metadata directly */ |
| 147 | _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); | 145 | _debug("SET #%u", auxdata->len); |
| 148 | 146 | ||
| 149 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, | 147 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, |
| 150 | &auxdata->type, auxdata->len, | 148 | &auxdata->type, auxdata->len, |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac101040..38b5c1bc6776 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
| @@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page) | |||
| 143 | * dirty page counters appropriately. Only called if there is private | 143 | * dirty page counters appropriately. Only called if there is private |
| 144 | * data on the page. | 144 | * data on the page. |
| 145 | */ | 145 | */ |
| 146 | static void ceph_invalidatepage(struct page *page, unsigned long offset) | 146 | static void ceph_invalidatepage(struct page *page, unsigned int offset, |
| 147 | unsigned int length) | ||
| 147 | { | 148 | { |
| 148 | struct inode *inode; | 149 | struct inode *inode; |
| 149 | struct ceph_inode_info *ci; | 150 | struct ceph_inode_info *ci; |
| @@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) | |||
| 163 | if (!PageDirty(page)) | 164 | if (!PageDirty(page)) |
| 164 | pr_err("%p invalidatepage %p page not dirty\n", inode, page); | 165 | pr_err("%p invalidatepage %p page not dirty\n", inode, page); |
| 165 | 166 | ||
| 166 | if (offset == 0) | 167 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
| 167 | ClearPageChecked(page); | 168 | ClearPageChecked(page); |
| 168 | 169 | ||
| 169 | ci = ceph_inode(inode); | 170 | ci = ceph_inode(inode); |
| 170 | if (offset == 0) { | 171 | if (offset == 0 && length == PAGE_CACHE_SIZE) { |
| 171 | dout("%p invalidatepage %p idx %lu full dirty page %lu\n", | 172 | dout("%p invalidatepage %p idx %lu full dirty page\n", |
| 172 | inode, page, page->index, offset); | 173 | inode, page, page->index); |
| 173 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); | 174 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); |
| 174 | ceph_put_snap_context(snapc); | 175 | ceph_put_snap_context(snapc); |
| 175 | page->private = 0; | 176 | page->private = 0; |
| 176 | ClearPagePrivate(page); | 177 | ClearPagePrivate(page); |
| 177 | } else { | 178 | } else { |
| 178 | dout("%p invalidatepage %p idx %lu partial dirty page\n", | 179 | dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", |
| 179 | inode, page, page->index); | 180 | inode, page, page->index, offset, length); |
| 180 | } | 181 | } |
| 181 | } | 182 | } |
| 182 | 183 | ||
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f02d82b7933e..a40ceda47a32 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
| @@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p) | |||
| 111 | * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by | 111 | * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by |
| 112 | * the MDS if/when the directory is modified). | 112 | * the MDS if/when the directory is modified). |
| 113 | */ | 113 | */ |
| 114 | static int __dcache_readdir(struct file *filp, | 114 | static int __dcache_readdir(struct file *file, struct dir_context *ctx) |
| 115 | void *dirent, filldir_t filldir) | ||
| 116 | { | 115 | { |
| 117 | struct ceph_file_info *fi = filp->private_data; | 116 | struct ceph_file_info *fi = file->private_data; |
| 118 | struct dentry *parent = filp->f_dentry; | 117 | struct dentry *parent = file->f_dentry; |
| 119 | struct inode *dir = parent->d_inode; | 118 | struct inode *dir = parent->d_inode; |
| 120 | struct list_head *p; | 119 | struct list_head *p; |
| 121 | struct dentry *dentry, *last; | 120 | struct dentry *dentry, *last; |
| @@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp, | |||
| 126 | last = fi->dentry; | 125 | last = fi->dentry; |
| 127 | fi->dentry = NULL; | 126 | fi->dentry = NULL; |
| 128 | 127 | ||
| 129 | dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, | 128 | dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, |
| 130 | last); | 129 | last); |
| 131 | 130 | ||
| 132 | spin_lock(&parent->d_lock); | 131 | spin_lock(&parent->d_lock); |
| 133 | 132 | ||
| 134 | /* start at beginning? */ | 133 | /* start at beginning? */ |
| 135 | if (filp->f_pos == 2 || last == NULL || | 134 | if (ctx->pos == 2 || last == NULL || |
| 136 | filp->f_pos < ceph_dentry(last)->offset) { | 135 | ctx->pos < ceph_dentry(last)->offset) { |
| 137 | if (list_empty(&parent->d_subdirs)) | 136 | if (list_empty(&parent->d_subdirs)) |
| 138 | goto out_unlock; | 137 | goto out_unlock; |
| 139 | p = parent->d_subdirs.prev; | 138 | p = parent->d_subdirs.prev; |
| @@ -157,11 +156,11 @@ more: | |||
| 157 | if (!d_unhashed(dentry) && dentry->d_inode && | 156 | if (!d_unhashed(dentry) && dentry->d_inode && |
| 158 | ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && | 157 | ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && |
| 159 | ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && | 158 | ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && |
| 160 | filp->f_pos <= di->offset) | 159 | ctx->pos <= di->offset) |
| 161 | break; | 160 | break; |
| 162 | dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, | 161 | dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, |
| 163 | dentry->d_name.len, dentry->d_name.name, di->offset, | 162 | dentry->d_name.len, dentry->d_name.name, di->offset, |
| 164 | filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", | 163 | ctx->pos, d_unhashed(dentry) ? " unhashed" : "", |
| 165 | !dentry->d_inode ? " null" : ""); | 164 | !dentry->d_inode ? " null" : ""); |
| 166 | spin_unlock(&dentry->d_lock); | 165 | spin_unlock(&dentry->d_lock); |
| 167 | p = p->prev; | 166 | p = p->prev; |
| @@ -173,29 +172,27 @@ more: | |||
| 173 | spin_unlock(&dentry->d_lock); | 172 | spin_unlock(&dentry->d_lock); |
| 174 | spin_unlock(&parent->d_lock); | 173 | spin_unlock(&parent->d_lock); |
| 175 | 174 | ||
| 176 | dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, | 175 | dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, |
| 177 | dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); | 176 | dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); |
| 178 | filp->f_pos = di->offset; | 177 | ctx->pos = di->offset; |
| 179 | err = filldir(dirent, dentry->d_name.name, | 178 | if (!dir_emit(ctx, dentry->d_name.name, |
| 180 | dentry->d_name.len, di->offset, | 179 | dentry->d_name.len, |
| 181 | ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), | 180 | ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), |
| 182 | dentry->d_inode->i_mode >> 12); | 181 | dentry->d_inode->i_mode >> 12)) { |
| 183 | 182 | if (last) { | |
| 184 | if (last) { | ||
| 185 | if (err < 0) { | ||
| 186 | /* remember our position */ | 183 | /* remember our position */ |
| 187 | fi->dentry = last; | 184 | fi->dentry = last; |
| 188 | fi->next_offset = di->offset; | 185 | fi->next_offset = di->offset; |
| 189 | } else { | ||
| 190 | dput(last); | ||
| 191 | } | 186 | } |
| 187 | dput(dentry); | ||
| 188 | return 0; | ||
| 192 | } | 189 | } |
| 193 | last = dentry; | ||
| 194 | 190 | ||
| 195 | if (err < 0) | 191 | if (last) |
| 196 | goto out; | 192 | dput(last); |
| 193 | last = dentry; | ||
| 197 | 194 | ||
| 198 | filp->f_pos++; | 195 | ctx->pos++; |
| 199 | 196 | ||
| 200 | /* make sure a dentry wasn't dropped while we didn't have parent lock */ | 197 | /* make sure a dentry wasn't dropped while we didn't have parent lock */ |
| 201 | if (!ceph_dir_is_complete(dir)) { | 198 | if (!ceph_dir_is_complete(dir)) { |
| @@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, | |||
| 235 | return 0; | 232 | return 0; |
| 236 | } | 233 | } |
| 237 | 234 | ||
| 238 | static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | 235 | static int ceph_readdir(struct file *file, struct dir_context *ctx) |
| 239 | { | 236 | { |
| 240 | struct ceph_file_info *fi = filp->private_data; | 237 | struct ceph_file_info *fi = file->private_data; |
| 241 | struct inode *inode = file_inode(filp); | 238 | struct inode *inode = file_inode(file); |
| 242 | struct ceph_inode_info *ci = ceph_inode(inode); | 239 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 243 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 240 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
| 244 | struct ceph_mds_client *mdsc = fsc->mdsc; | 241 | struct ceph_mds_client *mdsc = fsc->mdsc; |
| 245 | unsigned frag = fpos_frag(filp->f_pos); | 242 | unsigned frag = fpos_frag(ctx->pos); |
| 246 | int off = fpos_off(filp->f_pos); | 243 | int off = fpos_off(ctx->pos); |
| 247 | int err; | 244 | int err; |
| 248 | u32 ftype; | 245 | u32 ftype; |
| 249 | struct ceph_mds_reply_info_parsed *rinfo; | 246 | struct ceph_mds_reply_info_parsed *rinfo; |
| 250 | const int max_entries = fsc->mount_options->max_readdir; | 247 | const int max_entries = fsc->mount_options->max_readdir; |
| 251 | const int max_bytes = fsc->mount_options->max_readdir_bytes; | 248 | const int max_bytes = fsc->mount_options->max_readdir_bytes; |
| 252 | 249 | ||
| 253 | dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); | 250 | dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); |
| 254 | if (fi->flags & CEPH_F_ATEND) | 251 | if (fi->flags & CEPH_F_ATEND) |
| 255 | return 0; | 252 | return 0; |
| 256 | 253 | ||
| 257 | /* always start with . and .. */ | 254 | /* always start with . and .. */ |
| 258 | if (filp->f_pos == 0) { | 255 | if (ctx->pos == 0) { |
| 259 | /* note dir version at start of readdir so we can tell | 256 | /* note dir version at start of readdir so we can tell |
| 260 | * if any dentries get dropped */ | 257 | * if any dentries get dropped */ |
| 261 | fi->dir_release_count = atomic_read(&ci->i_release_count); | 258 | fi->dir_release_count = atomic_read(&ci->i_release_count); |
| 262 | 259 | ||
| 263 | dout("readdir off 0 -> '.'\n"); | 260 | dout("readdir off 0 -> '.'\n"); |
| 264 | if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), | 261 | if (!dir_emit(ctx, ".", 1, |
| 265 | ceph_translate_ino(inode->i_sb, inode->i_ino), | 262 | ceph_translate_ino(inode->i_sb, inode->i_ino), |
| 266 | inode->i_mode >> 12) < 0) | 263 | inode->i_mode >> 12)) |
| 267 | return 0; | 264 | return 0; |
| 268 | filp->f_pos = 1; | 265 | ctx->pos = 1; |
| 269 | off = 1; | 266 | off = 1; |
| 270 | } | 267 | } |
| 271 | if (filp->f_pos == 1) { | 268 | if (ctx->pos == 1) { |
| 272 | ino_t ino = parent_ino(filp->f_dentry); | 269 | ino_t ino = parent_ino(file->f_dentry); |
| 273 | dout("readdir off 1 -> '..'\n"); | 270 | dout("readdir off 1 -> '..'\n"); |
| 274 | if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), | 271 | if (!dir_emit(ctx, "..", 2, |
| 275 | ceph_translate_ino(inode->i_sb, ino), | 272 | ceph_translate_ino(inode->i_sb, ino), |
| 276 | inode->i_mode >> 12) < 0) | 273 | inode->i_mode >> 12)) |
| 277 | return 0; | 274 | return 0; |
| 278 | filp->f_pos = 2; | 275 | ctx->pos = 2; |
| 279 | off = 2; | 276 | off = 2; |
| 280 | } | 277 | } |
| 281 | 278 | ||
| 282 | /* can we use the dcache? */ | 279 | /* can we use the dcache? */ |
| 283 | spin_lock(&ci->i_ceph_lock); | 280 | spin_lock(&ci->i_ceph_lock); |
| 284 | if ((filp->f_pos == 2 || fi->dentry) && | 281 | if ((ctx->pos == 2 || fi->dentry) && |
| 285 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && | 282 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && |
| 286 | ceph_snap(inode) != CEPH_SNAPDIR && | 283 | ceph_snap(inode) != CEPH_SNAPDIR && |
| 287 | __ceph_dir_is_complete(ci) && | 284 | __ceph_dir_is_complete(ci) && |
| 288 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { | 285 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { |
| 289 | spin_unlock(&ci->i_ceph_lock); | 286 | spin_unlock(&ci->i_ceph_lock); |
| 290 | err = __dcache_readdir(filp, dirent, filldir); | 287 | err = __dcache_readdir(file, ctx); |
| 291 | if (err != -EAGAIN) | 288 | if (err != -EAGAIN) |
| 292 | return err; | 289 | return err; |
| 293 | } else { | 290 | } else { |
| @@ -327,7 +324,7 @@ more: | |||
| 327 | return PTR_ERR(req); | 324 | return PTR_ERR(req); |
| 328 | req->r_inode = inode; | 325 | req->r_inode = inode; |
| 329 | ihold(inode); | 326 | ihold(inode); |
| 330 | req->r_dentry = dget(filp->f_dentry); | 327 | req->r_dentry = dget(file->f_dentry); |
| 331 | /* hints to request -> mds selection code */ | 328 | /* hints to request -> mds selection code */ |
| 332 | req->r_direct_mode = USE_AUTH_MDS; | 329 | req->r_direct_mode = USE_AUTH_MDS; |
| 333 | req->r_direct_hash = ceph_frag_value(frag); | 330 | req->r_direct_hash = ceph_frag_value(frag); |
| @@ -379,15 +376,16 @@ more: | |||
| 379 | rinfo = &fi->last_readdir->r_reply_info; | 376 | rinfo = &fi->last_readdir->r_reply_info; |
| 380 | dout("readdir frag %x num %d off %d chunkoff %d\n", frag, | 377 | dout("readdir frag %x num %d off %d chunkoff %d\n", frag, |
| 381 | rinfo->dir_nr, off, fi->offset); | 378 | rinfo->dir_nr, off, fi->offset); |
| 379 | |||
| 380 | ctx->pos = ceph_make_fpos(frag, off); | ||
| 382 | while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { | 381 | while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { |
| 383 | u64 pos = ceph_make_fpos(frag, off); | ||
| 384 | struct ceph_mds_reply_inode *in = | 382 | struct ceph_mds_reply_inode *in = |
| 385 | rinfo->dir_in[off - fi->offset].in; | 383 | rinfo->dir_in[off - fi->offset].in; |
| 386 | struct ceph_vino vino; | 384 | struct ceph_vino vino; |
| 387 | ino_t ino; | 385 | ino_t ino; |
| 388 | 386 | ||
| 389 | dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", | 387 | dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", |
| 390 | off, off - fi->offset, rinfo->dir_nr, pos, | 388 | off, off - fi->offset, rinfo->dir_nr, ctx->pos, |
| 391 | rinfo->dir_dname_len[off - fi->offset], | 389 | rinfo->dir_dname_len[off - fi->offset], |
| 392 | rinfo->dir_dname[off - fi->offset], in); | 390 | rinfo->dir_dname[off - fi->offset], in); |
| 393 | BUG_ON(!in); | 391 | BUG_ON(!in); |
| @@ -395,16 +393,15 @@ more: | |||
| 395 | vino.ino = le64_to_cpu(in->ino); | 393 | vino.ino = le64_to_cpu(in->ino); |
| 396 | vino.snap = le64_to_cpu(in->snapid); | 394 | vino.snap = le64_to_cpu(in->snapid); |
| 397 | ino = ceph_vino_to_ino(vino); | 395 | ino = ceph_vino_to_ino(vino); |
| 398 | if (filldir(dirent, | 396 | if (!dir_emit(ctx, |
| 399 | rinfo->dir_dname[off - fi->offset], | 397 | rinfo->dir_dname[off - fi->offset], |
| 400 | rinfo->dir_dname_len[off - fi->offset], | 398 | rinfo->dir_dname_len[off - fi->offset], |
| 401 | pos, | 399 | ceph_translate_ino(inode->i_sb, ino), ftype)) { |
| 402 | ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { | ||
| 403 | dout("filldir stopping us...\n"); | 400 | dout("filldir stopping us...\n"); |
| 404 | return 0; | 401 | return 0; |
| 405 | } | 402 | } |
| 406 | off++; | 403 | off++; |
| 407 | filp->f_pos = pos + 1; | 404 | ctx->pos++; |
| 408 | } | 405 | } |
| 409 | 406 | ||
| 410 | if (fi->last_name) { | 407 | if (fi->last_name) { |
| @@ -417,7 +414,7 @@ more: | |||
| 417 | if (!ceph_frag_is_rightmost(frag)) { | 414 | if (!ceph_frag_is_rightmost(frag)) { |
| 418 | frag = ceph_frag_next(frag); | 415 | frag = ceph_frag_next(frag); |
| 419 | off = 0; | 416 | off = 0; |
| 420 | filp->f_pos = ceph_make_fpos(frag, off); | 417 | ctx->pos = ceph_make_fpos(frag, off); |
| 421 | dout("readdir next frag is %x\n", frag); | 418 | dout("readdir next frag is %x\n", frag); |
| 422 | goto more; | 419 | goto more; |
| 423 | } | 420 | } |
| @@ -432,11 +429,11 @@ more: | |||
| 432 | if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { | 429 | if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { |
| 433 | dout(" marking %p complete\n", inode); | 430 | dout(" marking %p complete\n", inode); |
| 434 | __ceph_dir_set_complete(ci, fi->dir_release_count); | 431 | __ceph_dir_set_complete(ci, fi->dir_release_count); |
| 435 | ci->i_max_offset = filp->f_pos; | 432 | ci->i_max_offset = ctx->pos; |
| 436 | } | 433 | } |
| 437 | spin_unlock(&ci->i_ceph_lock); | 434 | spin_unlock(&ci->i_ceph_lock); |
| 438 | 435 | ||
| 439 | dout("readdir %p filp %p done.\n", inode, filp); | 436 | dout("readdir %p file %p done.\n", inode, file); |
| 440 | return 0; | 437 | return 0; |
| 441 | } | 438 | } |
| 442 | 439 | ||
| @@ -1268,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) | |||
| 1268 | 1265 | ||
| 1269 | const struct file_operations ceph_dir_fops = { | 1266 | const struct file_operations ceph_dir_fops = { |
| 1270 | .read = ceph_read_dir, | 1267 | .read = ceph_read_dir, |
| 1271 | .readdir = ceph_readdir, | 1268 | .iterate = ceph_readdir, |
| 1272 | .llseek = ceph_dir_llseek, | 1269 | .llseek = ceph_dir_llseek, |
| 1273 | .open = ceph_open, | 1270 | .open = ceph_open, |
| 1274 | .release = ceph_release, | 1271 | .release = ceph_release, |
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3752b9f6d9e4..540c1ccfcdb2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
| @@ -968,7 +968,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { | |||
| 968 | }; | 968 | }; |
| 969 | 969 | ||
| 970 | const struct file_operations cifs_dir_ops = { | 970 | const struct file_operations cifs_dir_ops = { |
| 971 | .readdir = cifs_readdir, | 971 | .iterate = cifs_readdir, |
| 972 | .release = cifs_closedir, | 972 | .release = cifs_closedir, |
| 973 | .read = generic_read_dir, | 973 | .read = generic_read_dir, |
| 974 | .unlocked_ioctl = cifs_ioctl, | 974 | .unlocked_ioctl = cifs_ioctl, |
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0e32c3446ce9..d05b3028e3b9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
| @@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *); | |||
| 101 | extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); | 101 | extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); |
| 102 | extern const struct file_operations cifs_dir_ops; | 102 | extern const struct file_operations cifs_dir_ops; |
| 103 | extern int cifs_dir_open(struct inode *inode, struct file *file); | 103 | extern int cifs_dir_open(struct inode *inode, struct file *file); |
| 104 | extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); | 104 | extern int cifs_readdir(struct file *file, struct dir_context *ctx); |
| 105 | 105 | ||
| 106 | /* Functions related to dir entries */ | 106 | /* Functions related to dir entries */ |
| 107 | extern const struct dentry_operations cifs_dentry_ops; | 107 | extern const struct dentry_operations cifs_dentry_ops; |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 48b29d24c9f4..4d8ba8d491e5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -3546,11 +3546,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp) | |||
| 3546 | return cifs_fscache_release_page(page, gfp); | 3546 | return cifs_fscache_release_page(page, gfp); |
| 3547 | } | 3547 | } |
| 3548 | 3548 | ||
| 3549 | static void cifs_invalidate_page(struct page *page, unsigned long offset) | 3549 | static void cifs_invalidate_page(struct page *page, unsigned int offset, |
| 3550 | unsigned int length) | ||
| 3550 | { | 3551 | { |
| 3551 | struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); | 3552 | struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); |
| 3552 | 3553 | ||
| 3553 | if (offset == 0) | 3554 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
| 3554 | cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); | 3555 | cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); |
| 3555 | } | 3556 | } |
| 3556 | 3557 | ||
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 770d5a9781c1..f1213799de1a 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -537,14 +537,14 @@ static int cifs_save_resume_key(const char *current_entry, | |||
| 537 | * every entry (do not increment for . or .. entry). | 537 | * every entry (do not increment for . or .. entry). |
| 538 | */ | 538 | */ |
| 539 | static int | 539 | static int |
| 540 | find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, | 540 | find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, |
| 541 | struct file *file, char **current_entry, int *num_to_ret) | 541 | struct file *file, char **current_entry, int *num_to_ret) |
| 542 | { | 542 | { |
| 543 | __u16 search_flags; | 543 | __u16 search_flags; |
| 544 | int rc = 0; | 544 | int rc = 0; |
| 545 | int pos_in_buf = 0; | 545 | int pos_in_buf = 0; |
| 546 | loff_t first_entry_in_buffer; | 546 | loff_t first_entry_in_buffer; |
| 547 | loff_t index_to_find = file->f_pos; | 547 | loff_t index_to_find = pos; |
| 548 | struct cifsFileInfo *cfile = file->private_data; | 548 | struct cifsFileInfo *cfile = file->private_data; |
| 549 | struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 549 | struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); |
| 550 | struct TCP_Server_Info *server = tcon->ses->server; | 550 | struct TCP_Server_Info *server = tcon->ses->server; |
| @@ -659,8 +659,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 659 | return rc; | 659 | return rc; |
| 660 | } | 660 | } |
| 661 | 661 | ||
| 662 | static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, | 662 | static int cifs_filldir(char *find_entry, struct file *file, |
| 663 | void *dirent, char *scratch_buf, unsigned int max_len) | 663 | struct dir_context *ctx, |
| 664 | char *scratch_buf, unsigned int max_len) | ||
| 664 | { | 665 | { |
| 665 | struct cifsFileInfo *file_info = file->private_data; | 666 | struct cifsFileInfo *file_info = file->private_data; |
| 666 | struct super_block *sb = file->f_path.dentry->d_sb; | 667 | struct super_block *sb = file->f_path.dentry->d_sb; |
| @@ -740,13 +741,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, | |||
| 740 | cifs_prime_dcache(file->f_dentry, &name, &fattr); | 741 | cifs_prime_dcache(file->f_dentry, &name, &fattr); |
| 741 | 742 | ||
| 742 | ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); | 743 | ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); |
| 743 | rc = filldir(dirent, name.name, name.len, file->f_pos, ino, | 744 | return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); |
| 744 | fattr.cf_dtype); | ||
| 745 | return rc; | ||
| 746 | } | 745 | } |
| 747 | 746 | ||
| 748 | 747 | ||
| 749 | int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) | 748 | int cifs_readdir(struct file *file, struct dir_context *ctx) |
| 750 | { | 749 | { |
| 751 | int rc = 0; | 750 | int rc = 0; |
| 752 | unsigned int xid; | 751 | unsigned int xid; |
| @@ -772,103 +771,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) | |||
| 772 | goto rddir2_exit; | 771 | goto rddir2_exit; |
| 773 | } | 772 | } |
| 774 | 773 | ||
| 775 | switch ((int) file->f_pos) { | 774 | if (!dir_emit_dots(file, ctx)) |
| 776 | case 0: | 775 | goto rddir2_exit; |
| 777 | if (filldir(direntry, ".", 1, file->f_pos, | ||
| 778 | file_inode(file)->i_ino, DT_DIR) < 0) { | ||
| 779 | cifs_dbg(VFS, "Filldir for current dir failed\n"); | ||
| 780 | rc = -ENOMEM; | ||
| 781 | break; | ||
| 782 | } | ||
| 783 | file->f_pos++; | ||
| 784 | case 1: | ||
| 785 | if (filldir(direntry, "..", 2, file->f_pos, | ||
| 786 | parent_ino(file->f_path.dentry), DT_DIR) < 0) { | ||
| 787 | cifs_dbg(VFS, "Filldir for parent dir failed\n"); | ||
| 788 | rc = -ENOMEM; | ||
| 789 | break; | ||
| 790 | } | ||
| 791 | file->f_pos++; | ||
| 792 | default: | ||
| 793 | /* 1) If search is active, | ||
| 794 | is in current search buffer? | ||
| 795 | if it before then restart search | ||
| 796 | if after then keep searching till find it */ | ||
| 797 | |||
| 798 | if (file->private_data == NULL) { | ||
| 799 | rc = -EINVAL; | ||
| 800 | free_xid(xid); | ||
| 801 | return rc; | ||
| 802 | } | ||
| 803 | cifsFile = file->private_data; | ||
| 804 | if (cifsFile->srch_inf.endOfSearch) { | ||
| 805 | if (cifsFile->srch_inf.emptyDir) { | ||
| 806 | cifs_dbg(FYI, "End of search, empty dir\n"); | ||
| 807 | rc = 0; | ||
| 808 | break; | ||
| 809 | } | ||
| 810 | } /* else { | ||
| 811 | cifsFile->invalidHandle = true; | ||
| 812 | tcon->ses->server->close(xid, tcon, &cifsFile->fid); | ||
| 813 | } */ | ||
| 814 | 776 | ||
| 815 | tcon = tlink_tcon(cifsFile->tlink); | 777 | /* 1) If search is active, |
| 816 | rc = find_cifs_entry(xid, tcon, file, ¤t_entry, | 778 | is in current search buffer? |
| 817 | &num_to_fill); | 779 | if it before then restart search |
| 818 | if (rc) { | 780 | if after then keep searching till find it */ |
| 819 | cifs_dbg(FYI, "fce error %d\n", rc); | 781 | |
| 820 | goto rddir2_exit; | 782 | if (file->private_data == NULL) { |
| 821 | } else if (current_entry != NULL) { | 783 | rc = -EINVAL; |
| 822 | cifs_dbg(FYI, "entry %lld found\n", file->f_pos); | 784 | goto rddir2_exit; |
| 823 | } else { | 785 | } |
| 824 | cifs_dbg(FYI, "could not find entry\n"); | 786 | cifsFile = file->private_data; |
| 787 | if (cifsFile->srch_inf.endOfSearch) { | ||
| 788 | if (cifsFile->srch_inf.emptyDir) { | ||
| 789 | cifs_dbg(FYI, "End of search, empty dir\n"); | ||
| 790 | rc = 0; | ||
| 825 | goto rddir2_exit; | 791 | goto rddir2_exit; |
| 826 | } | 792 | } |
| 827 | cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", | 793 | } /* else { |
| 828 | num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); | 794 | cifsFile->invalidHandle = true; |
| 829 | max_len = tcon->ses->server->ops->calc_smb_size( | 795 | tcon->ses->server->close(xid, tcon, &cifsFile->fid); |
| 830 | cifsFile->srch_inf.ntwrk_buf_start); | 796 | } */ |
| 831 | end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; | 797 | |
| 832 | 798 | tcon = tlink_tcon(cifsFile->tlink); | |
| 833 | tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); | 799 | rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, |
| 834 | if (tmp_buf == NULL) { | 800 | &num_to_fill); |
| 835 | rc = -ENOMEM; | 801 | if (rc) { |
| 802 | cifs_dbg(FYI, "fce error %d\n", rc); | ||
| 803 | goto rddir2_exit; | ||
| 804 | } else if (current_entry != NULL) { | ||
| 805 | cifs_dbg(FYI, "entry %lld found\n", ctx->pos); | ||
| 806 | } else { | ||
| 807 | cifs_dbg(FYI, "could not find entry\n"); | ||
| 808 | goto rddir2_exit; | ||
| 809 | } | ||
| 810 | cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", | ||
| 811 | num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); | ||
| 812 | max_len = tcon->ses->server->ops->calc_smb_size( | ||
| 813 | cifsFile->srch_inf.ntwrk_buf_start); | ||
| 814 | end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; | ||
| 815 | |||
| 816 | tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); | ||
| 817 | if (tmp_buf == NULL) { | ||
| 818 | rc = -ENOMEM; | ||
| 819 | goto rddir2_exit; | ||
| 820 | } | ||
| 821 | |||
| 822 | for (i = 0; i < num_to_fill; i++) { | ||
| 823 | if (current_entry == NULL) { | ||
| 824 | /* evaluate whether this case is an error */ | ||
| 825 | cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", | ||
| 826 | num_to_fill, i); | ||
| 836 | break; | 827 | break; |
| 837 | } | 828 | } |
| 838 | 829 | /* | |
| 839 | for (i = 0; (i < num_to_fill) && (rc == 0); i++) { | 830 | * if buggy server returns . and .. late do we want to |
| 840 | if (current_entry == NULL) { | 831 | * check for that here? |
| 841 | /* evaluate whether this case is an error */ | 832 | */ |
| 842 | cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", | 833 | rc = cifs_filldir(current_entry, file, ctx, |
| 843 | num_to_fill, i); | 834 | tmp_buf, max_len); |
| 844 | break; | 835 | if (rc) { |
| 845 | } | 836 | if (rc > 0) |
| 846 | /* | ||
| 847 | * if buggy server returns . and .. late do we want to | ||
| 848 | * check for that here? | ||
| 849 | */ | ||
| 850 | rc = cifs_filldir(current_entry, file, filldir, | ||
| 851 | direntry, tmp_buf, max_len); | ||
| 852 | if (rc == -EOVERFLOW) { | ||
| 853 | rc = 0; | 837 | rc = 0; |
| 854 | break; | 838 | break; |
| 855 | } | ||
| 856 | |||
| 857 | file->f_pos++; | ||
| 858 | if (file->f_pos == | ||
| 859 | cifsFile->srch_inf.index_of_last_entry) { | ||
| 860 | cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", | ||
| 861 | file->f_pos, tmp_buf); | ||
| 862 | cifs_save_resume_key(current_entry, cifsFile); | ||
| 863 | break; | ||
| 864 | } else | ||
| 865 | current_entry = | ||
| 866 | nxt_dir_entry(current_entry, end_of_smb, | ||
| 867 | cifsFile->srch_inf.info_level); | ||
| 868 | } | 839 | } |
| 869 | kfree(tmp_buf); | 840 | |
| 870 | break; | 841 | ctx->pos++; |
| 871 | } /* end switch */ | 842 | if (ctx->pos == |
| 843 | cifsFile->srch_inf.index_of_last_entry) { | ||
| 844 | cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", | ||
| 845 | ctx->pos, tmp_buf); | ||
| 846 | cifs_save_resume_key(current_entry, cifsFile); | ||
| 847 | break; | ||
| 848 | } else | ||
| 849 | current_entry = | ||
| 850 | nxt_dir_entry(current_entry, end_of_smb, | ||
| 851 | cifsFile->srch_inf.info_level); | ||
| 852 | } | ||
| 853 | kfree(tmp_buf); | ||
| 872 | 854 | ||
| 873 | rddir2_exit: | 855 | rddir2_exit: |
| 874 | free_xid(xid); | 856 | free_xid(xid); |
diff --git a/fs/coda/dir.c b/fs/coda/dir.c index b7d3a05c062c..87e0ee9f4465 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c | |||
| @@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, | |||
| 43 | struct inode *new_inode, struct dentry *new_dentry); | 43 | struct inode *new_inode, struct dentry *new_dentry); |
| 44 | 44 | ||
| 45 | /* dir file-ops */ | 45 | /* dir file-ops */ |
| 46 | static int coda_readdir(struct file *file, void *buf, filldir_t filldir); | 46 | static int coda_readdir(struct file *file, struct dir_context *ctx); |
| 47 | 47 | ||
| 48 | /* dentry ops */ | 48 | /* dentry ops */ |
| 49 | static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); | 49 | static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); |
| 50 | static int coda_dentry_delete(const struct dentry *); | 50 | static int coda_dentry_delete(const struct dentry *); |
| 51 | 51 | ||
| 52 | /* support routines */ | 52 | /* support routines */ |
| 53 | static int coda_venus_readdir(struct file *coda_file, void *buf, | 53 | static int coda_venus_readdir(struct file *, struct dir_context *); |
| 54 | filldir_t filldir); | ||
| 55 | 54 | ||
| 56 | /* same as fs/bad_inode.c */ | 55 | /* same as fs/bad_inode.c */ |
| 57 | static int coda_return_EIO(void) | 56 | static int coda_return_EIO(void) |
| @@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations = | |||
| 85 | const struct file_operations coda_dir_operations = { | 84 | const struct file_operations coda_dir_operations = { |
| 86 | .llseek = generic_file_llseek, | 85 | .llseek = generic_file_llseek, |
| 87 | .read = generic_read_dir, | 86 | .read = generic_read_dir, |
| 88 | .readdir = coda_readdir, | 87 | .iterate = coda_readdir, |
| 89 | .open = coda_open, | 88 | .open = coda_open, |
| 90 | .release = coda_release, | 89 | .release = coda_release, |
| 91 | .fsync = coda_fsync, | 90 | .fsync = coda_fsync, |
| @@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 378 | 377 | ||
| 379 | 378 | ||
| 380 | /* file operations for directories */ | 379 | /* file operations for directories */ |
| 381 | static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) | 380 | static int coda_readdir(struct file *coda_file, struct dir_context *ctx) |
| 382 | { | 381 | { |
| 383 | struct coda_file_info *cfi; | 382 | struct coda_file_info *cfi; |
| 384 | struct file *host_file; | 383 | struct file *host_file; |
| @@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) | |||
| 391 | if (!host_file->f_op) | 390 | if (!host_file->f_op) |
| 392 | return -ENOTDIR; | 391 | return -ENOTDIR; |
| 393 | 392 | ||
| 394 | if (host_file->f_op->readdir) | 393 | if (host_file->f_op->iterate) { |
| 395 | { | ||
| 396 | /* potemkin case: we were handed a directory inode. | ||
| 397 | * We can't use vfs_readdir because we have to keep the file | ||
| 398 | * position in sync between the coda_file and the host_file. | ||
| 399 | * and as such we need grab the inode mutex. */ | ||
| 400 | struct inode *host_inode = file_inode(host_file); | 394 | struct inode *host_inode = file_inode(host_file); |
| 401 | |||
| 402 | mutex_lock(&host_inode->i_mutex); | 395 | mutex_lock(&host_inode->i_mutex); |
| 403 | host_file->f_pos = coda_file->f_pos; | ||
| 404 | |||
| 405 | ret = -ENOENT; | 396 | ret = -ENOENT; |
| 406 | if (!IS_DEADDIR(host_inode)) { | 397 | if (!IS_DEADDIR(host_inode)) { |
| 407 | ret = host_file->f_op->readdir(host_file, buf, filldir); | 398 | ret = host_file->f_op->iterate(host_file, ctx); |
| 408 | file_accessed(host_file); | 399 | file_accessed(host_file); |
| 409 | } | 400 | } |
| 410 | |||
| 411 | coda_file->f_pos = host_file->f_pos; | ||
| 412 | mutex_unlock(&host_inode->i_mutex); | 401 | mutex_unlock(&host_inode->i_mutex); |
| 402 | return ret; | ||
| 413 | } | 403 | } |
| 414 | else /* Venus: we must read Venus dirents from a file */ | 404 | /* Venus: we must read Venus dirents from a file */ |
| 415 | ret = coda_venus_readdir(coda_file, buf, filldir); | 405 | return coda_venus_readdir(coda_file, ctx); |
| 416 | |||
| 417 | return ret; | ||
| 418 | } | 406 | } |
| 419 | 407 | ||
| 420 | static inline unsigned int CDT2DT(unsigned char cdt) | 408 | static inline unsigned int CDT2DT(unsigned char cdt) |
| @@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt) | |||
| 437 | } | 425 | } |
| 438 | 426 | ||
| 439 | /* support routines */ | 427 | /* support routines */ |
| 440 | static int coda_venus_readdir(struct file *coda_file, void *buf, | 428 | static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) |
| 441 | filldir_t filldir) | ||
| 442 | { | 429 | { |
| 443 | int result = 0; /* # of entries returned */ | ||
| 444 | struct coda_file_info *cfi; | 430 | struct coda_file_info *cfi; |
| 445 | struct coda_inode_info *cii; | 431 | struct coda_inode_info *cii; |
| 446 | struct file *host_file; | 432 | struct file *host_file; |
| @@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, | |||
| 462 | vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); | 448 | vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); |
| 463 | if (!vdir) return -ENOMEM; | 449 | if (!vdir) return -ENOMEM; |
| 464 | 450 | ||
| 465 | if (coda_file->f_pos == 0) { | 451 | if (!dir_emit_dots(coda_file, ctx)) |
| 466 | ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); | 452 | goto out; |
| 467 | if (ret < 0) | 453 | |
| 468 | goto out; | ||
| 469 | result++; | ||
| 470 | coda_file->f_pos++; | ||
| 471 | } | ||
| 472 | if (coda_file->f_pos == 1) { | ||
| 473 | ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR); | ||
| 474 | if (ret < 0) | ||
| 475 | goto out; | ||
| 476 | result++; | ||
| 477 | coda_file->f_pos++; | ||
| 478 | } | ||
| 479 | while (1) { | 454 | while (1) { |
| 480 | /* read entries from the directory file */ | 455 | /* read entries from the directory file */ |
| 481 | ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, | 456 | ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, |
| 482 | sizeof(*vdir)); | 457 | sizeof(*vdir)); |
| 483 | if (ret < 0) { | 458 | if (ret < 0) { |
| 484 | printk(KERN_ERR "coda readdir: read dir %s failed %d\n", | 459 | printk(KERN_ERR "coda readdir: read dir %s failed %d\n", |
| @@ -507,7 +482,7 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, | |||
| 507 | 482 | ||
| 508 | /* Make sure we skip '.' and '..', we already got those */ | 483 | /* Make sure we skip '.' and '..', we already got those */ |
| 509 | if (name.name[0] == '.' && (name.len == 1 || | 484 | if (name.name[0] == '.' && (name.len == 1 || |
| 510 | (vdir->d_name[1] == '.' && name.len == 2))) | 485 | (name.name[1] == '.' && name.len == 2))) |
| 511 | vdir->d_fileno = name.len = 0; | 486 | vdir->d_fileno = name.len = 0; |
| 512 | 487 | ||
| 513 | /* skip null entries */ | 488 | /* skip null entries */ |
| @@ -520,19 +495,16 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, | |||
| 520 | if (!ino) ino = vdir->d_fileno; | 495 | if (!ino) ino = vdir->d_fileno; |
| 521 | 496 | ||
| 522 | type = CDT2DT(vdir->d_type); | 497 | type = CDT2DT(vdir->d_type); |
| 523 | ret = filldir(buf, name.name, name.len, | 498 | if (!dir_emit(ctx, name.name, name.len, ino, type)) |
| 524 | coda_file->f_pos, ino, type); | 499 | break; |
| 525 | /* failure means no space for filling in this round */ | ||
| 526 | if (ret < 0) break; | ||
| 527 | result++; | ||
| 528 | } | 500 | } |
| 529 | /* we'll always have progress because d_reclen is unsigned and | 501 | /* we'll always have progress because d_reclen is unsigned and |
| 530 | * we've already established it is non-zero. */ | 502 | * we've already established it is non-zero. */ |
| 531 | coda_file->f_pos += vdir->d_reclen; | 503 | ctx->pos += vdir->d_reclen; |
| 532 | } | 504 | } |
| 533 | out: | 505 | out: |
| 534 | kfree(vdir); | 506 | kfree(vdir); |
| 535 | return result ? result : ret; | 507 | return 0; |
| 536 | } | 508 | } |
| 537 | 509 | ||
| 538 | /* called when a cache lookup succeeds */ | 510 | /* called when a cache lookup succeeds */ |
diff --git a/fs/compat.c b/fs/compat.c index fc3b55dce184..6af20de2c1a3 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
| @@ -832,6 +832,7 @@ struct compat_old_linux_dirent { | |||
| 832 | }; | 832 | }; |
| 833 | 833 | ||
| 834 | struct compat_readdir_callback { | 834 | struct compat_readdir_callback { |
| 835 | struct dir_context ctx; | ||
| 835 | struct compat_old_linux_dirent __user *dirent; | 836 | struct compat_old_linux_dirent __user *dirent; |
| 836 | int result; | 837 | int result; |
| 837 | }; | 838 | }; |
| @@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, | |||
| 873 | { | 874 | { |
| 874 | int error; | 875 | int error; |
| 875 | struct fd f = fdget(fd); | 876 | struct fd f = fdget(fd); |
| 876 | struct compat_readdir_callback buf; | 877 | struct compat_readdir_callback buf = { |
| 878 | .ctx.actor = compat_fillonedir, | ||
| 879 | .dirent = dirent | ||
| 880 | }; | ||
| 877 | 881 | ||
| 878 | if (!f.file) | 882 | if (!f.file) |
| 879 | return -EBADF; | 883 | return -EBADF; |
| 880 | 884 | ||
| 881 | buf.result = 0; | 885 | error = iterate_dir(f.file, &buf.ctx); |
| 882 | buf.dirent = dirent; | ||
| 883 | |||
| 884 | error = vfs_readdir(f.file, compat_fillonedir, &buf); | ||
| 885 | if (buf.result) | 886 | if (buf.result) |
| 886 | error = buf.result; | 887 | error = buf.result; |
| 887 | 888 | ||
| @@ -897,6 +898,7 @@ struct compat_linux_dirent { | |||
| 897 | }; | 898 | }; |
| 898 | 899 | ||
| 899 | struct compat_getdents_callback { | 900 | struct compat_getdents_callback { |
| 901 | struct dir_context ctx; | ||
| 900 | struct compat_linux_dirent __user *current_dir; | 902 | struct compat_linux_dirent __user *current_dir; |
| 901 | struct compat_linux_dirent __user *previous; | 903 | struct compat_linux_dirent __user *previous; |
| 902 | int count; | 904 | int count; |
| @@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd, | |||
| 951 | { | 953 | { |
| 952 | struct fd f; | 954 | struct fd f; |
| 953 | struct compat_linux_dirent __user * lastdirent; | 955 | struct compat_linux_dirent __user * lastdirent; |
| 954 | struct compat_getdents_callback buf; | 956 | struct compat_getdents_callback buf = { |
| 957 | .ctx.actor = compat_filldir, | ||
| 958 | .current_dir = dirent, | ||
| 959 | .count = count | ||
| 960 | }; | ||
| 955 | int error; | 961 | int error; |
| 956 | 962 | ||
| 957 | if (!access_ok(VERIFY_WRITE, dirent, count)) | 963 | if (!access_ok(VERIFY_WRITE, dirent, count)) |
| @@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd, | |||
| 961 | if (!f.file) | 967 | if (!f.file) |
| 962 | return -EBADF; | 968 | return -EBADF; |
| 963 | 969 | ||
| 964 | buf.current_dir = dirent; | 970 | error = iterate_dir(f.file, &buf.ctx); |
| 965 | buf.previous = NULL; | ||
| 966 | buf.count = count; | ||
| 967 | buf.error = 0; | ||
| 968 | |||
| 969 | error = vfs_readdir(f.file, compat_filldir, &buf); | ||
| 970 | if (error >= 0) | 971 | if (error >= 0) |
| 971 | error = buf.error; | 972 | error = buf.error; |
| 972 | lastdirent = buf.previous; | 973 | lastdirent = buf.previous; |
| 973 | if (lastdirent) { | 974 | if (lastdirent) { |
| 974 | if (put_user(f.file->f_pos, &lastdirent->d_off)) | 975 | if (put_user(buf.ctx.pos, &lastdirent->d_off)) |
| 975 | error = -EFAULT; | 976 | error = -EFAULT; |
| 976 | else | 977 | else |
| 977 | error = count - buf.count; | 978 | error = count - buf.count; |
| @@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, | |||
| 983 | #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 | 984 | #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 |
| 984 | 985 | ||
| 985 | struct compat_getdents_callback64 { | 986 | struct compat_getdents_callback64 { |
| 987 | struct dir_context ctx; | ||
| 986 | struct linux_dirent64 __user *current_dir; | 988 | struct linux_dirent64 __user *current_dir; |
| 987 | struct linux_dirent64 __user *previous; | 989 | struct linux_dirent64 __user *previous; |
| 988 | int count; | 990 | int count; |
| @@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, | |||
| 1036 | { | 1038 | { |
| 1037 | struct fd f; | 1039 | struct fd f; |
| 1038 | struct linux_dirent64 __user * lastdirent; | 1040 | struct linux_dirent64 __user * lastdirent; |
| 1039 | struct compat_getdents_callback64 buf; | 1041 | struct compat_getdents_callback64 buf = { |
| 1042 | .ctx.actor = compat_filldir64, | ||
| 1043 | .current_dir = dirent, | ||
| 1044 | .count = count | ||
| 1045 | }; | ||
| 1040 | int error; | 1046 | int error; |
| 1041 | 1047 | ||
| 1042 | if (!access_ok(VERIFY_WRITE, dirent, count)) | 1048 | if (!access_ok(VERIFY_WRITE, dirent, count)) |
| @@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, | |||
| 1046 | if (!f.file) | 1052 | if (!f.file) |
| 1047 | return -EBADF; | 1053 | return -EBADF; |
| 1048 | 1054 | ||
| 1049 | buf.current_dir = dirent; | 1055 | error = iterate_dir(f.file, &buf.ctx); |
| 1050 | buf.previous = NULL; | ||
| 1051 | buf.count = count; | ||
| 1052 | buf.error = 0; | ||
| 1053 | |||
| 1054 | error = vfs_readdir(f.file, compat_filldir64, &buf); | ||
| 1055 | if (error >= 0) | 1056 | if (error >= 0) |
| 1056 | error = buf.error; | 1057 | error = buf.error; |
| 1057 | lastdirent = buf.previous; | 1058 | lastdirent = buf.previous; |
| 1058 | if (lastdirent) { | 1059 | if (lastdirent) { |
| 1059 | typeof(lastdirent->d_off) d_off = f.file->f_pos; | 1060 | typeof(lastdirent->d_off) d_off = buf.ctx.pos; |
| 1060 | if (__put_user_unaligned(d_off, &lastdirent->d_off)) | 1061 | if (__put_user_unaligned(d_off, &lastdirent->d_off)) |
| 1061 | error = -EFAULT; | 1062 | error = -EFAULT; |
| 1062 | else | 1063 | else |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 996cdc5abb85..5d19acfa7c6c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
| @@ -66,7 +66,6 @@ | |||
| 66 | #include <linux/gigaset_dev.h> | 66 | #include <linux/gigaset_dev.h> |
| 67 | 67 | ||
| 68 | #ifdef CONFIG_BLOCK | 68 | #ifdef CONFIG_BLOCK |
| 69 | #include <linux/loop.h> | ||
| 70 | #include <linux/cdrom.h> | 69 | #include <linux/cdrom.h> |
| 71 | #include <linux/fd.h> | 70 | #include <linux/fd.h> |
| 72 | #include <scsi/scsi.h> | 71 | #include <scsi/scsi.h> |
| @@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP) | |||
| 954 | /* Socket level stuff */ | 953 | /* Socket level stuff */ |
| 955 | COMPATIBLE_IOCTL(FIOQSIZE) | 954 | COMPATIBLE_IOCTL(FIOQSIZE) |
| 956 | #ifdef CONFIG_BLOCK | 955 | #ifdef CONFIG_BLOCK |
| 957 | /* loop */ | ||
| 958 | IGNORE_IOCTL(LOOP_CLR_FD) | ||
| 959 | /* md calls this on random blockdevs */ | 956 | /* md calls this on random blockdevs */ |
| 960 | IGNORE_IOCTL(RAID_VERSION) | 957 | IGNORE_IOCTL(RAID_VERSION) |
| 961 | /* qemu/qemu-img might call these two on plain files for probing */ | 958 | /* qemu/qemu-img might call these two on plain files for probing */ |
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7aabc6ad4e9b..64e5323cbbb0 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c | |||
| @@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd) | |||
| 1532 | return (sd->s_mode >> 12) & 15; | 1532 | return (sd->s_mode >> 12) & 15; |
| 1533 | } | 1533 | } |
| 1534 | 1534 | ||
| 1535 | static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) | 1535 | static int configfs_readdir(struct file *file, struct dir_context *ctx) |
| 1536 | { | 1536 | { |
| 1537 | struct dentry *dentry = filp->f_path.dentry; | 1537 | struct dentry *dentry = file->f_path.dentry; |
| 1538 | struct super_block *sb = dentry->d_sb; | 1538 | struct super_block *sb = dentry->d_sb; |
| 1539 | struct configfs_dirent * parent_sd = dentry->d_fsdata; | 1539 | struct configfs_dirent * parent_sd = dentry->d_fsdata; |
| 1540 | struct configfs_dirent *cursor = filp->private_data; | 1540 | struct configfs_dirent *cursor = file->private_data; |
| 1541 | struct list_head *p, *q = &cursor->s_sibling; | 1541 | struct list_head *p, *q = &cursor->s_sibling; |
| 1542 | ino_t ino = 0; | 1542 | ino_t ino = 0; |
| 1543 | int i = filp->f_pos; | ||
| 1544 | 1543 | ||
| 1545 | switch (i) { | 1544 | if (!dir_emit_dots(file, ctx)) |
| 1546 | case 0: | 1545 | return 0; |
| 1547 | ino = dentry->d_inode->i_ino; | 1546 | if (ctx->pos == 2) { |
| 1548 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | 1547 | spin_lock(&configfs_dirent_lock); |
| 1549 | break; | 1548 | list_move(q, &parent_sd->s_children); |
| 1550 | filp->f_pos++; | 1549 | spin_unlock(&configfs_dirent_lock); |
| 1551 | i++; | 1550 | } |
| 1552 | /* fallthrough */ | 1551 | for (p = q->next; p != &parent_sd->s_children; p = p->next) { |
| 1553 | case 1: | 1552 | struct configfs_dirent *next; |
| 1554 | ino = parent_ino(dentry); | 1553 | const char *name; |
| 1555 | if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) | 1554 | int len; |
| 1556 | break; | 1555 | struct inode *inode = NULL; |
| 1557 | filp->f_pos++; | 1556 | |
| 1558 | i++; | 1557 | next = list_entry(p, struct configfs_dirent, s_sibling); |
| 1559 | /* fallthrough */ | 1558 | if (!next->s_element) |
| 1560 | default: | 1559 | continue; |
| 1561 | if (filp->f_pos == 2) { | ||
| 1562 | spin_lock(&configfs_dirent_lock); | ||
| 1563 | list_move(q, &parent_sd->s_children); | ||
| 1564 | spin_unlock(&configfs_dirent_lock); | ||
| 1565 | } | ||
| 1566 | for (p=q->next; p!= &parent_sd->s_children; p=p->next) { | ||
| 1567 | struct configfs_dirent *next; | ||
| 1568 | const char * name; | ||
| 1569 | int len; | ||
| 1570 | struct inode *inode = NULL; | ||
| 1571 | 1560 | ||
| 1572 | next = list_entry(p, struct configfs_dirent, | 1561 | name = configfs_get_name(next); |
| 1573 | s_sibling); | 1562 | len = strlen(name); |
| 1574 | if (!next->s_element) | 1563 | |
| 1575 | continue; | 1564 | /* |
| 1576 | 1565 | * We'll have a dentry and an inode for | |
| 1577 | name = configfs_get_name(next); | 1566 | * PINNED items and for open attribute |
| 1578 | len = strlen(name); | 1567 | * files. We lock here to prevent a race |
| 1579 | 1568 | * with configfs_d_iput() clearing | |
| 1580 | /* | 1569 | * s_dentry before calling iput(). |
| 1581 | * We'll have a dentry and an inode for | 1570 | * |
| 1582 | * PINNED items and for open attribute | 1571 | * Why do we go to the trouble? If |
| 1583 | * files. We lock here to prevent a race | 1572 | * someone has an attribute file open, |
| 1584 | * with configfs_d_iput() clearing | 1573 | * the inode number should match until |
| 1585 | * s_dentry before calling iput(). | 1574 | * they close it. Beyond that, we don't |
| 1586 | * | 1575 | * care. |
| 1587 | * Why do we go to the trouble? If | 1576 | */ |
| 1588 | * someone has an attribute file open, | 1577 | spin_lock(&configfs_dirent_lock); |
| 1589 | * the inode number should match until | 1578 | dentry = next->s_dentry; |
| 1590 | * they close it. Beyond that, we don't | 1579 | if (dentry) |
| 1591 | * care. | 1580 | inode = dentry->d_inode; |
| 1592 | */ | 1581 | if (inode) |
| 1593 | spin_lock(&configfs_dirent_lock); | 1582 | ino = inode->i_ino; |
| 1594 | dentry = next->s_dentry; | 1583 | spin_unlock(&configfs_dirent_lock); |
| 1595 | if (dentry) | 1584 | if (!inode) |
| 1596 | inode = dentry->d_inode; | 1585 | ino = iunique(sb, 2); |
| 1597 | if (inode) | ||
| 1598 | ino = inode->i_ino; | ||
| 1599 | spin_unlock(&configfs_dirent_lock); | ||
| 1600 | if (!inode) | ||
| 1601 | ino = iunique(sb, 2); | ||
| 1602 | 1586 | ||
| 1603 | if (filldir(dirent, name, len, filp->f_pos, ino, | 1587 | if (!dir_emit(ctx, name, len, ino, dt_type(next))) |
| 1604 | dt_type(next)) < 0) | 1588 | return 0; |
| 1605 | return 0; | ||
| 1606 | 1589 | ||
| 1607 | spin_lock(&configfs_dirent_lock); | 1590 | spin_lock(&configfs_dirent_lock); |
| 1608 | list_move(q, p); | 1591 | list_move(q, p); |
| 1609 | spin_unlock(&configfs_dirent_lock); | 1592 | spin_unlock(&configfs_dirent_lock); |
| 1610 | p = q; | 1593 | p = q; |
| 1611 | filp->f_pos++; | 1594 | ctx->pos++; |
| 1612 | } | ||
| 1613 | } | 1595 | } |
| 1614 | return 0; | 1596 | return 0; |
| 1615 | } | 1597 | } |
| @@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = { | |||
| 1661 | .release = configfs_dir_close, | 1643 | .release = configfs_dir_close, |
| 1662 | .llseek = configfs_dir_lseek, | 1644 | .llseek = configfs_dir_lseek, |
| 1663 | .read = generic_read_dir, | 1645 | .read = generic_read_dir, |
| 1664 | .readdir = configfs_readdir, | 1646 | .iterate = configfs_readdir, |
| 1665 | }; | 1647 | }; |
| 1666 | 1648 | ||
| 1667 | int configfs_register_subsystem(struct configfs_subsystem *subsys) | 1649 | int configfs_register_subsystem(struct configfs_subsystem *subsys) |
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 35b1c7bd18b7..e501ac3a49ff 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c | |||
| @@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 349 | /* | 349 | /* |
| 350 | * Read a cramfs directory entry. | 350 | * Read a cramfs directory entry. |
| 351 | */ | 351 | */ |
| 352 | static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 352 | static int cramfs_readdir(struct file *file, struct dir_context *ctx) |
| 353 | { | 353 | { |
| 354 | struct inode *inode = file_inode(filp); | 354 | struct inode *inode = file_inode(file); |
| 355 | struct super_block *sb = inode->i_sb; | 355 | struct super_block *sb = inode->i_sb; |
| 356 | char *buf; | 356 | char *buf; |
| 357 | unsigned int offset; | 357 | unsigned int offset; |
| 358 | int copied; | ||
| 359 | 358 | ||
| 360 | /* Offset within the thing. */ | 359 | /* Offset within the thing. */ |
| 361 | offset = filp->f_pos; | 360 | if (ctx->pos >= inode->i_size) |
| 362 | if (offset >= inode->i_size) | ||
| 363 | return 0; | 361 | return 0; |
| 362 | offset = ctx->pos; | ||
| 364 | /* Directory entries are always 4-byte aligned */ | 363 | /* Directory entries are always 4-byte aligned */ |
| 365 | if (offset & 3) | 364 | if (offset & 3) |
| 366 | return -EINVAL; | 365 | return -EINVAL; |
| @@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 369 | if (!buf) | 368 | if (!buf) |
| 370 | return -ENOMEM; | 369 | return -ENOMEM; |
| 371 | 370 | ||
| 372 | copied = 0; | ||
| 373 | while (offset < inode->i_size) { | 371 | while (offset < inode->i_size) { |
| 374 | struct cramfs_inode *de; | 372 | struct cramfs_inode *de; |
| 375 | unsigned long nextoffset; | 373 | unsigned long nextoffset; |
| 376 | char *name; | 374 | char *name; |
| 377 | ino_t ino; | 375 | ino_t ino; |
| 378 | umode_t mode; | 376 | umode_t mode; |
| 379 | int namelen, error; | 377 | int namelen; |
| 380 | 378 | ||
| 381 | mutex_lock(&read_mutex); | 379 | mutex_lock(&read_mutex); |
| 382 | de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); | 380 | de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); |
| @@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 402 | break; | 400 | break; |
| 403 | namelen--; | 401 | namelen--; |
| 404 | } | 402 | } |
| 405 | error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); | 403 | if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) |
| 406 | if (error) | ||
| 407 | break; | 404 | break; |
| 408 | 405 | ||
| 409 | offset = nextoffset; | 406 | ctx->pos = offset = nextoffset; |
| 410 | filp->f_pos = offset; | ||
| 411 | copied++; | ||
| 412 | } | 407 | } |
| 413 | kfree(buf); | 408 | kfree(buf); |
| 414 | return 0; | 409 | return 0; |
| @@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = { | |||
| 547 | static const struct file_operations cramfs_directory_operations = { | 542 | static const struct file_operations cramfs_directory_operations = { |
| 548 | .llseek = generic_file_llseek, | 543 | .llseek = generic_file_llseek, |
| 549 | .read = generic_read_dir, | 544 | .read = generic_read_dir, |
| 550 | .readdir = cramfs_readdir, | 545 | .iterate = cramfs_readdir, |
| 551 | }; | 546 | }; |
| 552 | 547 | ||
| 553 | static const struct inode_operations cramfs_dir_inode_operations = { | 548 | static const struct inode_operations cramfs_dir_inode_operations = { |
diff --git a/fs/dcache.c b/fs/dcache.c index f09b9085f7d8..5a23073138df 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias); | |||
| 1612 | * If a dentry was found and moved, then it is returned. Otherwise NULL | 1612 | * If a dentry was found and moved, then it is returned. Otherwise NULL |
| 1613 | * is returned. This matches the expected return value of ->lookup. | 1613 | * is returned. This matches the expected return value of ->lookup. |
| 1614 | * | 1614 | * |
| 1615 | * Cluster filesystems may call this function with a negative, hashed dentry. | ||
| 1616 | * In that case, we know that the inode will be a regular file, and also this | ||
| 1617 | * will only occur during atomic_open. So we need to check for the dentry | ||
| 1618 | * being already hashed only in the final case. | ||
| 1615 | */ | 1619 | */ |
| 1616 | struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) | 1620 | struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) |
| 1617 | { | 1621 | { |
| @@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) | |||
| 1636 | security_d_instantiate(dentry, inode); | 1640 | security_d_instantiate(dentry, inode); |
| 1637 | d_rehash(dentry); | 1641 | d_rehash(dentry); |
| 1638 | } | 1642 | } |
| 1639 | } else | 1643 | } else { |
| 1640 | d_add(dentry, inode); | 1644 | d_instantiate(dentry, inode); |
| 1645 | if (d_unhashed(dentry)) | ||
| 1646 | d_rehash(dentry); | ||
| 1647 | } | ||
| 1641 | return new; | 1648 | return new; |
| 1642 | } | 1649 | } |
| 1643 | EXPORT_SYMBOL(d_splice_alias); | 1650 | EXPORT_SYMBOL(d_splice_alias); |
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index c5ca6ae5a30c..63146295153b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/debugfs.h> | 21 | #include <linux/debugfs.h> |
| 22 | #include <linux/io.h> | 22 | #include <linux/io.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <linux/atomic.h> | ||
| 24 | 25 | ||
| 25 | static ssize_t default_read_file(struct file *file, char __user *buf, | 26 | static ssize_t default_read_file(struct file *file, char __user *buf, |
| 26 | size_t count, loff_t *ppos) | 27 | size_t count, loff_t *ppos) |
| @@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode, | |||
| 403 | } | 404 | } |
| 404 | EXPORT_SYMBOL_GPL(debugfs_create_size_t); | 405 | EXPORT_SYMBOL_GPL(debugfs_create_size_t); |
| 405 | 406 | ||
| 407 | static int debugfs_atomic_t_set(void *data, u64 val) | ||
| 408 | { | ||
| 409 | atomic_set((atomic_t *)data, val); | ||
| 410 | return 0; | ||
| 411 | } | ||
| 412 | static int debugfs_atomic_t_get(void *data, u64 *val) | ||
| 413 | { | ||
| 414 | *val = atomic_read((atomic_t *)data); | ||
| 415 | return 0; | ||
| 416 | } | ||
| 417 | DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, | ||
| 418 | debugfs_atomic_t_set, "%lld\n"); | ||
| 419 | DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); | ||
| 420 | DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); | ||
| 421 | |||
| 422 | /** | ||
| 423 | * debugfs_create_atomic_t - create a debugfs file that is used to read and | ||
| 424 | * write an atomic_t value | ||
| 425 | * @name: a pointer to a string containing the name of the file to create. | ||
| 426 | * @mode: the permission that the file should have | ||
| 427 | * @parent: a pointer to the parent dentry for this file. This should be a | ||
| 428 | * directory dentry if set. If this parameter is %NULL, then the | ||
| 429 | * file will be created in the root of the debugfs filesystem. | ||
| 430 | * @value: a pointer to the variable that the file should read to and write | ||
| 431 | * from. | ||
| 432 | */ | ||
| 433 | struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, | ||
| 434 | struct dentry *parent, atomic_t *value) | ||
| 435 | { | ||
| 436 | /* if there are no write bits set, make read only */ | ||
| 437 | if (!(mode & S_IWUGO)) | ||
| 438 | return debugfs_create_file(name, mode, parent, value, | ||
| 439 | &fops_atomic_t_ro); | ||
| 440 | /* if there are no read bits set, make write only */ | ||
| 441 | if (!(mode & S_IRUGO)) | ||
| 442 | return debugfs_create_file(name, mode, parent, value, | ||
| 443 | &fops_atomic_t_wo); | ||
| 444 | |||
| 445 | return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); | ||
| 446 | } | ||
| 447 | EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); | ||
| 406 | 448 | ||
| 407 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, | 449 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, |
| 408 | size_t count, loff_t *ppos) | 450 | size_t count, loff_t *ppos) |
| @@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, | |||
| 431 | if (copy_from_user(buf, user_buf, buf_size)) | 473 | if (copy_from_user(buf, user_buf, buf_size)) |
| 432 | return -EFAULT; | 474 | return -EFAULT; |
| 433 | 475 | ||
| 476 | buf[buf_size] = '\0'; | ||
| 434 | if (strtobool(buf, &bv) == 0) | 477 | if (strtobool(buf, &bv) == 0) |
| 435 | *val = bv; | 478 | *val = bv; |
| 436 | 479 | ||
diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 7d58d5b112b5..76feb4b60fa6 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c | |||
| @@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) | |||
| 138 | static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, | 138 | static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, |
| 139 | const char *buf, size_t len) | 139 | const char *buf, size_t len) |
| 140 | { | 140 | { |
| 141 | strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); | 141 | strlcpy(dlm_config.ci_cluster_name, buf, |
| 142 | strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); | 142 | sizeof(dlm_config.ci_cluster_name)); |
| 143 | strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); | ||
| 143 | return len; | 144 | return len; |
| 144 | } | 145 | } |
| 145 | 146 | ||
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 1b1146670c4b..e223a911a834 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
| @@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, | |||
| 2038 | b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; | 2038 | b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; |
| 2039 | if (b == 1) { | 2039 | if (b == 1) { |
| 2040 | int len = receive_extralen(ms); | 2040 | int len = receive_extralen(ms); |
| 2041 | if (len > DLM_RESNAME_MAXLEN) | 2041 | if (len > r->res_ls->ls_lvblen) |
| 2042 | len = DLM_RESNAME_MAXLEN; | 2042 | len = r->res_ls->ls_lvblen; |
| 2043 | memcpy(lkb->lkb_lvbptr, ms->m_extra, len); | 2043 | memcpy(lkb->lkb_lvbptr, ms->m_extra, len); |
| 2044 | lkb->lkb_lvbseq = ms->m_lvbseq; | 2044 | lkb->lkb_lvbseq = ms->m_lvbseq; |
| 2045 | } | 2045 | } |
| @@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
| 3893 | if (!lkb->lkb_lvbptr) | 3893 | if (!lkb->lkb_lvbptr) |
| 3894 | return -ENOMEM; | 3894 | return -ENOMEM; |
| 3895 | len = receive_extralen(ms); | 3895 | len = receive_extralen(ms); |
| 3896 | if (len > DLM_RESNAME_MAXLEN) | 3896 | if (len > ls->ls_lvblen) |
| 3897 | len = DLM_RESNAME_MAXLEN; | 3897 | len = ls->ls_lvblen; |
| 3898 | memcpy(lkb->lkb_lvbptr, ms->m_extra, len); | 3898 | memcpy(lkb->lkb_lvbptr, ms->m_extra, len); |
| 3899 | } | 3899 | } |
| 3900 | return 0; | 3900 | return 0; |
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3ca79d3253b9..88556dc0458e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c | |||
| @@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force) | |||
| 883 | void dlm_stop_lockspaces(void) | 883 | void dlm_stop_lockspaces(void) |
| 884 | { | 884 | { |
| 885 | struct dlm_ls *ls; | 885 | struct dlm_ls *ls; |
| 886 | int count; | ||
| 886 | 887 | ||
| 887 | restart: | 888 | restart: |
| 889 | count = 0; | ||
| 888 | spin_lock(&lslist_lock); | 890 | spin_lock(&lslist_lock); |
| 889 | list_for_each_entry(ls, &lslist, ls_list) { | 891 | list_for_each_entry(ls, &lslist, ls_list) { |
| 890 | if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) | 892 | if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { |
| 893 | count++; | ||
| 891 | continue; | 894 | continue; |
| 895 | } | ||
| 892 | spin_unlock(&lslist_lock); | 896 | spin_unlock(&lslist_lock); |
| 893 | log_error(ls, "no userland control daemon, stopping lockspace"); | 897 | log_error(ls, "no userland control daemon, stopping lockspace"); |
| 894 | dlm_ls_stop(ls); | 898 | dlm_ls_stop(ls); |
| 895 | goto restart; | 899 | goto restart; |
| 896 | } | 900 | } |
| 897 | spin_unlock(&lslist_lock); | 901 | spin_unlock(&lslist_lock); |
| 902 | |||
| 903 | if (count) | ||
| 904 | log_print("dlm user daemon left %d lockspaces", count); | ||
| 898 | } | 905 | } |
| 899 | 906 | ||
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d0ccd2fd79eb..d90909ec6aa6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
| @@ -52,7 +52,6 @@ | |||
| 52 | #include <linux/mutex.h> | 52 | #include <linux/mutex.h> |
| 53 | #include <linux/sctp.h> | 53 | #include <linux/sctp.h> |
| 54 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
| 55 | #include <linux/sctp.h> | ||
| 56 | #include <net/sctp/sctp.h> | 55 | #include <net/sctp/sctp.h> |
| 57 | #include <net/ipv6.h> | 56 | #include <net/ipv6.h> |
| 58 | 57 | ||
| @@ -126,6 +125,7 @@ struct connection { | |||
| 126 | struct connection *othercon; | 125 | struct connection *othercon; |
| 127 | struct work_struct rwork; /* Receive workqueue */ | 126 | struct work_struct rwork; /* Receive workqueue */ |
| 128 | struct work_struct swork; /* Send workqueue */ | 127 | struct work_struct swork; /* Send workqueue */ |
| 128 | bool try_new_addr; | ||
| 129 | }; | 129 | }; |
| 130 | #define sock2con(x) ((struct connection *)(x)->sk_user_data) | 130 | #define sock2con(x) ((struct connection *)(x)->sk_user_data) |
| 131 | 131 | ||
| @@ -144,6 +144,7 @@ struct dlm_node_addr { | |||
| 144 | struct list_head list; | 144 | struct list_head list; |
| 145 | int nodeid; | 145 | int nodeid; |
| 146 | int addr_count; | 146 | int addr_count; |
| 147 | int curr_addr_index; | ||
| 147 | struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; | 148 | struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; |
| 148 | }; | 149 | }; |
| 149 | 150 | ||
| @@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) | |||
| 310 | } | 311 | } |
| 311 | 312 | ||
| 312 | static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, | 313 | static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, |
| 313 | struct sockaddr *sa_out) | 314 | struct sockaddr *sa_out, bool try_new_addr) |
| 314 | { | 315 | { |
| 315 | struct sockaddr_storage sas; | 316 | struct sockaddr_storage sas; |
| 316 | struct dlm_node_addr *na; | 317 | struct dlm_node_addr *na; |
| @@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, | |||
| 320 | 321 | ||
| 321 | spin_lock(&dlm_node_addrs_spin); | 322 | spin_lock(&dlm_node_addrs_spin); |
| 322 | na = find_node_addr(nodeid); | 323 | na = find_node_addr(nodeid); |
| 323 | if (na && na->addr_count) | 324 | if (na && na->addr_count) { |
| 324 | memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); | 325 | if (try_new_addr) { |
| 326 | na->curr_addr_index++; | ||
| 327 | if (na->curr_addr_index == na->addr_count) | ||
| 328 | na->curr_addr_index = 0; | ||
| 329 | } | ||
| 330 | |||
| 331 | memcpy(&sas, na->addr[na->curr_addr_index ], | ||
| 332 | sizeof(struct sockaddr_storage)); | ||
| 333 | } | ||
| 325 | spin_unlock(&dlm_node_addrs_spin); | 334 | spin_unlock(&dlm_node_addrs_spin); |
| 326 | 335 | ||
| 327 | if (!na) | 336 | if (!na) |
| @@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) | |||
| 353 | { | 362 | { |
| 354 | struct dlm_node_addr *na; | 363 | struct dlm_node_addr *na; |
| 355 | int rv = -EEXIST; | 364 | int rv = -EEXIST; |
| 365 | int addr_i; | ||
| 356 | 366 | ||
| 357 | spin_lock(&dlm_node_addrs_spin); | 367 | spin_lock(&dlm_node_addrs_spin); |
| 358 | list_for_each_entry(na, &dlm_node_addrs, list) { | 368 | list_for_each_entry(na, &dlm_node_addrs, list) { |
| 359 | if (!na->addr_count) | 369 | if (!na->addr_count) |
| 360 | continue; | 370 | continue; |
| 361 | 371 | ||
| 362 | if (!addr_compare(na->addr[0], addr)) | 372 | for (addr_i = 0; addr_i < na->addr_count; addr_i++) { |
| 363 | continue; | 373 | if (addr_compare(na->addr[addr_i], addr)) { |
| 364 | 374 | *nodeid = na->nodeid; | |
| 365 | *nodeid = na->nodeid; | 375 | rv = 0; |
| 366 | rv = 0; | 376 | goto unlock; |
| 367 | break; | 377 | } |
| 378 | } | ||
| 368 | } | 379 | } |
| 380 | unlock: | ||
| 369 | spin_unlock(&dlm_node_addrs_spin); | 381 | spin_unlock(&dlm_node_addrs_spin); |
| 370 | return rv; | 382 | return rv; |
| 371 | } | 383 | } |
| @@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd) | |||
| 561 | 573 | ||
| 562 | static void sctp_init_failed_foreach(struct connection *con) | 574 | static void sctp_init_failed_foreach(struct connection *con) |
| 563 | { | 575 | { |
| 576 | |||
| 577 | /* | ||
| 578 | * Don't try to recover base con and handle race where the | ||
| 579 | * other node's assoc init creates a assoc and we get that | ||
| 580 | * notification, then we get a notification that our attempt | ||
| 581 | * failed due. This happens when we are still trying the primary | ||
| 582 | * address, but the other node has already tried secondary addrs | ||
| 583 | * and found one that worked. | ||
| 584 | */ | ||
| 585 | if (!con->nodeid || con->sctp_assoc) | ||
| 586 | return; | ||
| 587 | |||
| 588 | log_print("Retrying SCTP association init for node %d\n", con->nodeid); | ||
| 589 | |||
| 590 | con->try_new_addr = true; | ||
| 564 | con->sctp_assoc = 0; | 591 | con->sctp_assoc = 0; |
| 565 | if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { | 592 | if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { |
| 566 | if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) | 593 | if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) |
| 567 | queue_work(send_workqueue, &con->swork); | 594 | queue_work(send_workqueue, &con->swork); |
| 568 | } | 595 | } |
| @@ -579,15 +606,56 @@ static void sctp_init_failed(void) | |||
| 579 | mutex_unlock(&connections_lock); | 606 | mutex_unlock(&connections_lock); |
| 580 | } | 607 | } |
| 581 | 608 | ||
| 609 | static void retry_failed_sctp_send(struct connection *recv_con, | ||
| 610 | struct sctp_send_failed *sn_send_failed, | ||
| 611 | char *buf) | ||
| 612 | { | ||
| 613 | int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); | ||
| 614 | struct dlm_mhandle *mh; | ||
| 615 | struct connection *con; | ||
| 616 | char *retry_buf; | ||
| 617 | int nodeid = sn_send_failed->ssf_info.sinfo_ppid; | ||
| 618 | |||
| 619 | log_print("Retry sending %d bytes to node id %d", len, nodeid); | ||
| 620 | |||
| 621 | con = nodeid2con(nodeid, 0); | ||
| 622 | if (!con) { | ||
| 623 | log_print("Could not look up con for nodeid %d\n", | ||
| 624 | nodeid); | ||
| 625 | return; | ||
| 626 | } | ||
| 627 | |||
| 628 | mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); | ||
| 629 | if (!mh) { | ||
| 630 | log_print("Could not allocate buf for retry."); | ||
| 631 | return; | ||
| 632 | } | ||
| 633 | memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); | ||
| 634 | dlm_lowcomms_commit_buffer(mh); | ||
| 635 | |||
| 636 | /* | ||
| 637 | * If we got a assoc changed event before the send failed event then | ||
| 638 | * we only need to retry the send. | ||
| 639 | */ | ||
| 640 | if (con->sctp_assoc) { | ||
| 641 | if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) | ||
| 642 | queue_work(send_workqueue, &con->swork); | ||
| 643 | } else | ||
| 644 | sctp_init_failed_foreach(con); | ||
| 645 | } | ||
| 646 | |||
| 582 | /* Something happened to an association */ | 647 | /* Something happened to an association */ |
| 583 | static void process_sctp_notification(struct connection *con, | 648 | static void process_sctp_notification(struct connection *con, |
| 584 | struct msghdr *msg, char *buf) | 649 | struct msghdr *msg, char *buf) |
| 585 | { | 650 | { |
| 586 | union sctp_notification *sn = (union sctp_notification *)buf; | 651 | union sctp_notification *sn = (union sctp_notification *)buf; |
| 587 | 652 | ||
| 588 | if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { | 653 | switch (sn->sn_header.sn_type) { |
| 654 | case SCTP_SEND_FAILED: | ||
| 655 | retry_failed_sctp_send(con, &sn->sn_send_failed, buf); | ||
| 656 | break; | ||
| 657 | case SCTP_ASSOC_CHANGE: | ||
| 589 | switch (sn->sn_assoc_change.sac_state) { | 658 | switch (sn->sn_assoc_change.sac_state) { |
| 590 | |||
| 591 | case SCTP_COMM_UP: | 659 | case SCTP_COMM_UP: |
| 592 | case SCTP_RESTART: | 660 | case SCTP_RESTART: |
| 593 | { | 661 | { |
| @@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con, | |||
| 662 | log_print("connecting to %d sctp association %d", | 730 | log_print("connecting to %d sctp association %d", |
| 663 | nodeid, (int)sn->sn_assoc_change.sac_assoc_id); | 731 | nodeid, (int)sn->sn_assoc_change.sac_assoc_id); |
| 664 | 732 | ||
| 733 | new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; | ||
| 734 | new_con->try_new_addr = false; | ||
| 665 | /* Send any pending writes */ | 735 | /* Send any pending writes */ |
| 666 | clear_bit(CF_CONNECT_PENDING, &new_con->flags); | 736 | clear_bit(CF_CONNECT_PENDING, &new_con->flags); |
| 667 | clear_bit(CF_INIT_PENDING, &con->flags); | 737 | clear_bit(CF_INIT_PENDING, &new_con->flags); |
| 668 | if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { | 738 | if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { |
| 669 | queue_work(send_workqueue, &new_con->swork); | 739 | queue_work(send_workqueue, &new_con->swork); |
| 670 | } | 740 | } |
| @@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con, | |||
| 683 | } | 753 | } |
| 684 | break; | 754 | break; |
| 685 | 755 | ||
| 686 | /* We don't know which INIT failed, so clear the PENDING flags | ||
| 687 | * on them all. if assoc_id is zero then it will then try | ||
| 688 | * again */ | ||
| 689 | |||
| 690 | case SCTP_CANT_STR_ASSOC: | 756 | case SCTP_CANT_STR_ASSOC: |
| 691 | { | 757 | { |
| 758 | /* Will retry init when we get the send failed notification */ | ||
| 692 | log_print("Can't start SCTP association - retrying"); | 759 | log_print("Can't start SCTP association - retrying"); |
| 693 | sctp_init_failed(); | ||
| 694 | } | 760 | } |
| 695 | break; | 761 | break; |
| 696 | 762 | ||
| @@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con, | |||
| 699 | (int)sn->sn_assoc_change.sac_assoc_id, | 765 | (int)sn->sn_assoc_change.sac_assoc_id, |
| 700 | sn->sn_assoc_change.sac_state); | 766 | sn->sn_assoc_change.sac_state); |
| 701 | } | 767 | } |
| 768 | default: | ||
| 769 | ; /* fall through */ | ||
| 702 | } | 770 | } |
| 703 | } | 771 | } |
| 704 | 772 | ||
| @@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e) | |||
| 958 | kfree(e); | 1026 | kfree(e); |
| 959 | } | 1027 | } |
| 960 | 1028 | ||
| 1029 | /* | ||
| 1030 | * writequeue_entry_complete - try to delete and free write queue entry | ||
| 1031 | * @e: write queue entry to try to delete | ||
| 1032 | * @completed: bytes completed | ||
| 1033 | * | ||
| 1034 | * writequeue_lock must be held. | ||
| 1035 | */ | ||
| 1036 | static void writequeue_entry_complete(struct writequeue_entry *e, int completed) | ||
| 1037 | { | ||
| 1038 | e->offset += completed; | ||
| 1039 | e->len -= completed; | ||
| 1040 | |||
| 1041 | if (e->len == 0 && e->users == 0) { | ||
| 1042 | list_del(&e->list); | ||
| 1043 | free_entry(e); | ||
| 1044 | } | ||
| 1045 | } | ||
| 1046 | |||
| 961 | /* Initiate an SCTP association. | 1047 | /* Initiate an SCTP association. |
| 962 | This is a special case of send_to_sock() in that we don't yet have a | 1048 | This is a special case of send_to_sock() in that we don't yet have a |
| 963 | peeled-off socket for this association, so we use the listening socket | 1049 | peeled-off socket for this association, so we use the listening socket |
| @@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con) | |||
| 977 | int addrlen; | 1063 | int addrlen; |
| 978 | struct kvec iov[1]; | 1064 | struct kvec iov[1]; |
| 979 | 1065 | ||
| 1066 | mutex_lock(&con->sock_mutex); | ||
| 980 | if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) | 1067 | if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) |
| 981 | return; | 1068 | goto unlock; |
| 982 | |||
| 983 | if (con->retries++ > MAX_CONNECT_RETRIES) | ||
| 984 | return; | ||
| 985 | 1069 | ||
| 986 | if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { | 1070 | if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, |
| 1071 | con->try_new_addr)) { | ||
| 987 | log_print("no address for nodeid %d", con->nodeid); | 1072 | log_print("no address for nodeid %d", con->nodeid); |
| 988 | return; | 1073 | goto unlock; |
| 989 | } | 1074 | } |
| 990 | base_con = nodeid2con(0, 0); | 1075 | base_con = nodeid2con(0, 0); |
| 991 | BUG_ON(base_con == NULL); | 1076 | BUG_ON(base_con == NULL); |
| @@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con) | |||
| 1003 | if (list_empty(&con->writequeue)) { | 1088 | if (list_empty(&con->writequeue)) { |
| 1004 | spin_unlock(&con->writequeue_lock); | 1089 | spin_unlock(&con->writequeue_lock); |
| 1005 | log_print("writequeue empty for nodeid %d", con->nodeid); | 1090 | log_print("writequeue empty for nodeid %d", con->nodeid); |
| 1006 | return; | 1091 | goto unlock; |
| 1007 | } | 1092 | } |
| 1008 | 1093 | ||
| 1009 | e = list_first_entry(&con->writequeue, struct writequeue_entry, list); | 1094 | e = list_first_entry(&con->writequeue, struct writequeue_entry, list); |
| 1010 | len = e->len; | 1095 | len = e->len; |
| 1011 | offset = e->offset; | 1096 | offset = e->offset; |
| 1012 | spin_unlock(&con->writequeue_lock); | ||
| 1013 | 1097 | ||
| 1014 | /* Send the first block off the write queue */ | 1098 | /* Send the first block off the write queue */ |
| 1015 | iov[0].iov_base = page_address(e->page)+offset; | 1099 | iov[0].iov_base = page_address(e->page)+offset; |
| 1016 | iov[0].iov_len = len; | 1100 | iov[0].iov_len = len; |
| 1101 | spin_unlock(&con->writequeue_lock); | ||
| 1102 | |||
| 1103 | if (rem_addr.ss_family == AF_INET) { | ||
| 1104 | struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; | ||
| 1105 | log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); | ||
| 1106 | } else { | ||
| 1107 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; | ||
| 1108 | log_print("Trying to connect to %pI6", &sin6->sin6_addr); | ||
| 1109 | } | ||
| 1017 | 1110 | ||
| 1018 | cmsg = CMSG_FIRSTHDR(&outmessage); | 1111 | cmsg = CMSG_FIRSTHDR(&outmessage); |
| 1019 | cmsg->cmsg_level = IPPROTO_SCTP; | 1112 | cmsg->cmsg_level = IPPROTO_SCTP; |
| @@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con) | |||
| 1021 | cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); | 1114 | cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); |
| 1022 | sinfo = CMSG_DATA(cmsg); | 1115 | sinfo = CMSG_DATA(cmsg); |
| 1023 | memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); | 1116 | memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); |
| 1024 | sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); | 1117 | sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); |
| 1025 | outmessage.msg_controllen = cmsg->cmsg_len; | 1118 | outmessage.msg_controllen = cmsg->cmsg_len; |
| 1119 | sinfo->sinfo_flags |= SCTP_ADDR_OVER; | ||
| 1026 | 1120 | ||
| 1027 | ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); | 1121 | ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); |
| 1028 | if (ret < 0) { | 1122 | if (ret < 0) { |
| @@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con) | |||
| 1035 | } | 1129 | } |
| 1036 | else { | 1130 | else { |
| 1037 | spin_lock(&con->writequeue_lock); | 1131 | spin_lock(&con->writequeue_lock); |
| 1038 | e->offset += ret; | 1132 | writequeue_entry_complete(e, ret); |
| 1039 | e->len -= ret; | ||
| 1040 | |||
| 1041 | if (e->len == 0 && e->users == 0) { | ||
| 1042 | list_del(&e->list); | ||
| 1043 | free_entry(e); | ||
| 1044 | } | ||
| 1045 | spin_unlock(&con->writequeue_lock); | 1133 | spin_unlock(&con->writequeue_lock); |
| 1046 | } | 1134 | } |
| 1135 | |||
| 1136 | unlock: | ||
| 1137 | mutex_unlock(&con->sock_mutex); | ||
| 1047 | } | 1138 | } |
| 1048 | 1139 | ||
| 1049 | /* Connect a new socket to its peer */ | 1140 | /* Connect a new socket to its peer */ |
| @@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con) | |||
| 1075 | goto out_err; | 1166 | goto out_err; |
| 1076 | 1167 | ||
| 1077 | memset(&saddr, 0, sizeof(saddr)); | 1168 | memset(&saddr, 0, sizeof(saddr)); |
| 1078 | result = nodeid_to_addr(con->nodeid, &saddr, NULL); | 1169 | result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); |
| 1079 | if (result < 0) { | 1170 | if (result < 0) { |
| 1080 | log_print("no address for nodeid %d", con->nodeid); | 1171 | log_print("no address for nodeid %d", con->nodeid); |
| 1081 | goto out_err; | 1172 | goto out_err; |
| @@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void) | |||
| 1254 | int result = -EINVAL, num = 1, i, addr_len; | 1345 | int result = -EINVAL, num = 1, i, addr_len; |
| 1255 | struct connection *con = nodeid2con(0, GFP_NOFS); | 1346 | struct connection *con = nodeid2con(0, GFP_NOFS); |
| 1256 | int bufsize = NEEDED_RMEM; | 1347 | int bufsize = NEEDED_RMEM; |
| 1348 | int one = 1; | ||
| 1257 | 1349 | ||
| 1258 | if (!con) | 1350 | if (!con) |
| 1259 | return -ENOMEM; | 1351 | return -ENOMEM; |
| @@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void) | |||
| 1288 | goto create_delsock; | 1380 | goto create_delsock; |
| 1289 | } | 1381 | } |
| 1290 | 1382 | ||
| 1383 | result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, | ||
| 1384 | sizeof(one)); | ||
| 1385 | if (result < 0) | ||
| 1386 | log_print("Could not set SCTP NODELAY error %d\n", result); | ||
| 1387 | |||
| 1291 | /* Init con struct */ | 1388 | /* Init con struct */ |
| 1292 | sock->sk->sk_user_data = con; | 1389 | sock->sk->sk_user_data = con; |
| 1293 | con->sock = sock; | 1390 | con->sock = sock; |
| @@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con) | |||
| 1493 | } | 1590 | } |
| 1494 | 1591 | ||
| 1495 | spin_lock(&con->writequeue_lock); | 1592 | spin_lock(&con->writequeue_lock); |
| 1496 | e->offset += ret; | 1593 | writequeue_entry_complete(e, ret); |
| 1497 | e->len -= ret; | ||
| 1498 | |||
| 1499 | if (e->len == 0 && e->users == 0) { | ||
| 1500 | list_del(&e->list); | ||
| 1501 | free_entry(e); | ||
| 1502 | } | ||
| 1503 | } | 1594 | } |
| 1504 | spin_unlock(&con->writequeue_lock); | 1595 | spin_unlock(&con->writequeue_lock); |
| 1505 | out: | 1596 | out: |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a7abbea2c096..9aa05e08060b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
| @@ -68,9 +68,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, | |||
| 68 | } | 68 | } |
| 69 | 69 | ||
| 70 | struct ecryptfs_getdents_callback { | 70 | struct ecryptfs_getdents_callback { |
| 71 | void *dirent; | 71 | struct dir_context ctx; |
| 72 | struct dir_context *caller; | ||
| 72 | struct dentry *dentry; | 73 | struct dentry *dentry; |
| 73 | filldir_t filldir; | ||
| 74 | int filldir_called; | 74 | int filldir_called; |
| 75 | int entries_written; | 75 | int entries_written; |
| 76 | }; | 76 | }; |
| @@ -96,9 +96,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, | |||
| 96 | rc); | 96 | rc); |
| 97 | goto out; | 97 | goto out; |
| 98 | } | 98 | } |
| 99 | rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); | 99 | buf->caller->pos = buf->ctx.pos; |
| 100 | rc = !dir_emit(buf->caller, name, name_size, ino, d_type); | ||
| 100 | kfree(name); | 101 | kfree(name); |
| 101 | if (rc >= 0) | 102 | if (!rc) |
| 102 | buf->entries_written++; | 103 | buf->entries_written++; |
| 103 | out: | 104 | out: |
| 104 | return rc; | 105 | return rc; |
| @@ -107,27 +108,23 @@ out: | |||
| 107 | /** | 108 | /** |
| 108 | * ecryptfs_readdir | 109 | * ecryptfs_readdir |
| 109 | * @file: The eCryptfs directory file | 110 | * @file: The eCryptfs directory file |
| 110 | * @dirent: Directory entry handle | 111 | * @ctx: The actor to feed the entries to |
| 111 | * @filldir: The filldir callback function | ||
| 112 | */ | 112 | */ |
| 113 | static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) | 113 | static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) |
| 114 | { | 114 | { |
| 115 | int rc; | 115 | int rc; |
| 116 | struct file *lower_file; | 116 | struct file *lower_file; |
| 117 | struct inode *inode; | 117 | struct inode *inode; |
| 118 | struct ecryptfs_getdents_callback buf; | 118 | struct ecryptfs_getdents_callback buf = { |
| 119 | 119 | .ctx.actor = ecryptfs_filldir, | |
| 120 | .caller = ctx, | ||
| 121 | .dentry = file->f_path.dentry | ||
| 122 | }; | ||
| 120 | lower_file = ecryptfs_file_to_lower(file); | 123 | lower_file = ecryptfs_file_to_lower(file); |
| 121 | lower_file->f_pos = file->f_pos; | 124 | lower_file->f_pos = ctx->pos; |
| 122 | inode = file_inode(file); | 125 | inode = file_inode(file); |
| 123 | memset(&buf, 0, sizeof(buf)); | 126 | rc = iterate_dir(lower_file, &buf.ctx); |
| 124 | buf.dirent = dirent; | 127 | ctx->pos = buf.ctx.pos; |
| 125 | buf.dentry = file->f_path.dentry; | ||
| 126 | buf.filldir = filldir; | ||
| 127 | buf.filldir_called = 0; | ||
| 128 | buf.entries_written = 0; | ||
| 129 | rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); | ||
| 130 | file->f_pos = lower_file->f_pos; | ||
| 131 | if (rc < 0) | 128 | if (rc < 0) |
| 132 | goto out; | 129 | goto out; |
| 133 | if (buf.filldir_called && !buf.entries_written) | 130 | if (buf.filldir_called && !buf.entries_written) |
| @@ -344,7 +341,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 344 | #endif | 341 | #endif |
| 345 | 342 | ||
| 346 | const struct file_operations ecryptfs_dir_fops = { | 343 | const struct file_operations ecryptfs_dir_fops = { |
| 347 | .readdir = ecryptfs_readdir, | 344 | .iterate = ecryptfs_readdir, |
| 348 | .read = generic_read_dir, | 345 | .read = generic_read_dir, |
| 349 | .unlocked_ioctl = ecryptfs_unlocked_ioctl, | 346 | .unlocked_ioctl = ecryptfs_unlocked_ioctl, |
| 350 | #ifdef CONFIG_COMPAT | 347 | #ifdef CONFIG_COMPAT |
| @@ -365,7 +362,7 @@ const struct file_operations ecryptfs_main_fops = { | |||
| 365 | .aio_read = ecryptfs_read_update_atime, | 362 | .aio_read = ecryptfs_read_update_atime, |
| 366 | .write = do_sync_write, | 363 | .write = do_sync_write, |
| 367 | .aio_write = generic_file_aio_write, | 364 | .aio_write = generic_file_aio_write, |
| 368 | .readdir = ecryptfs_readdir, | 365 | .iterate = ecryptfs_readdir, |
| 369 | .unlocked_ioctl = ecryptfs_unlocked_ioctl, | 366 | .unlocked_ioctl = ecryptfs_unlocked_ioctl, |
| 370 | #ifdef CONFIG_COMPAT | 367 | #ifdef CONFIG_COMPAT |
| 371 | .compat_ioctl = ecryptfs_compat_ioctl, | 368 | .compat_ioctl = ecryptfs_compat_ioctl, |
diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 055a9e9ca747..b72307ccdf7a 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c | |||
| @@ -7,40 +7,38 @@ | |||
| 7 | #include <linux/buffer_head.h> | 7 | #include <linux/buffer_head.h> |
| 8 | #include "efs.h" | 8 | #include "efs.h" |
| 9 | 9 | ||
| 10 | static int efs_readdir(struct file *, void *, filldir_t); | 10 | static int efs_readdir(struct file *, struct dir_context *); |
| 11 | 11 | ||
| 12 | const struct file_operations efs_dir_operations = { | 12 | const struct file_operations efs_dir_operations = { |
| 13 | .llseek = generic_file_llseek, | 13 | .llseek = generic_file_llseek, |
| 14 | .read = generic_read_dir, | 14 | .read = generic_read_dir, |
| 15 | .readdir = efs_readdir, | 15 | .iterate = efs_readdir, |
| 16 | }; | 16 | }; |
| 17 | 17 | ||
| 18 | const struct inode_operations efs_dir_inode_operations = { | 18 | const struct inode_operations efs_dir_inode_operations = { |
| 19 | .lookup = efs_lookup, | 19 | .lookup = efs_lookup, |
| 20 | }; | 20 | }; |
| 21 | 21 | ||
| 22 | static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { | 22 | static int efs_readdir(struct file *file, struct dir_context *ctx) |
| 23 | struct inode *inode = file_inode(filp); | 23 | { |
| 24 | struct buffer_head *bh; | 24 | struct inode *inode = file_inode(file); |
| 25 | |||
| 26 | struct efs_dir *dirblock; | ||
| 27 | struct efs_dentry *dirslot; | ||
| 28 | efs_ino_t inodenum; | ||
| 29 | efs_block_t block; | 25 | efs_block_t block; |
| 30 | int slot, namelen; | 26 | int slot; |
| 31 | char *nameptr; | ||
| 32 | 27 | ||
| 33 | if (inode->i_size & (EFS_DIRBSIZE-1)) | 28 | if (inode->i_size & (EFS_DIRBSIZE-1)) |
| 34 | printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); | 29 | printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); |
| 35 | 30 | ||
| 36 | /* work out where this entry can be found */ | 31 | /* work out where this entry can be found */ |
| 37 | block = filp->f_pos >> EFS_DIRBSIZE_BITS; | 32 | block = ctx->pos >> EFS_DIRBSIZE_BITS; |
| 38 | 33 | ||
| 39 | /* each block contains at most 256 slots */ | 34 | /* each block contains at most 256 slots */ |
| 40 | slot = filp->f_pos & 0xff; | 35 | slot = ctx->pos & 0xff; |
| 41 | 36 | ||
| 42 | /* look at all blocks */ | 37 | /* look at all blocks */ |
| 43 | while (block < inode->i_blocks) { | 38 | while (block < inode->i_blocks) { |
| 39 | struct efs_dir *dirblock; | ||
| 40 | struct buffer_head *bh; | ||
| 41 | |||
| 44 | /* read the dir block */ | 42 | /* read the dir block */ |
| 45 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); | 43 | bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); |
| 46 | 44 | ||
| @@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { | |||
| 57 | break; | 55 | break; |
| 58 | } | 56 | } |
| 59 | 57 | ||
| 60 | while (slot < dirblock->slots) { | 58 | for (; slot < dirblock->slots; slot++) { |
| 61 | if (dirblock->space[slot] == 0) { | 59 | struct efs_dentry *dirslot; |
| 62 | slot++; | 60 | efs_ino_t inodenum; |
| 61 | const char *nameptr; | ||
| 62 | int namelen; | ||
| 63 | |||
| 64 | if (dirblock->space[slot] == 0) | ||
| 63 | continue; | 65 | continue; |
| 64 | } | ||
| 65 | 66 | ||
| 66 | dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); | 67 | dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); |
| 67 | 68 | ||
| @@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { | |||
| 72 | #ifdef DEBUG | 73 | #ifdef DEBUG |
| 73 | printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); | 74 | printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); |
| 74 | #endif | 75 | #endif |
| 75 | if (namelen > 0) { | 76 | if (!namelen) |
| 76 | /* found the next entry */ | 77 | continue; |
| 77 | filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; | 78 | /* found the next entry */ |
| 78 | 79 | ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; | |
| 79 | /* copy filename and data in dirslot */ | 80 | |
| 80 | filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); | 81 | /* sanity check */ |
| 81 | 82 | if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { | |
| 82 | /* sanity check */ | 83 | printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); |
| 83 | if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { | 84 | continue; |
| 84 | printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); | 85 | } |
| 85 | slot++; | 86 | |
| 86 | continue; | 87 | /* copy filename and data in dirslot */ |
| 87 | } | 88 | if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) { |
| 88 | |||
| 89 | /* store position of next slot */ | ||
| 90 | if (++slot == dirblock->slots) { | ||
| 91 | slot = 0; | ||
| 92 | block++; | ||
| 93 | } | ||
| 94 | brelse(bh); | 89 | brelse(bh); |
| 95 | filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; | 90 | return 0; |
| 96 | goto out; | ||
| 97 | } | 91 | } |
| 98 | slot++; | ||
| 99 | } | 92 | } |
| 100 | brelse(bh); | 93 | brelse(bh); |
| 101 | 94 | ||
| 102 | slot = 0; | 95 | slot = 0; |
| 103 | block++; | 96 | block++; |
| 104 | } | 97 | } |
| 105 | 98 | ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; | |
| 106 | filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; | ||
| 107 | out: | ||
| 108 | return 0; | 99 | return 0; |
| 109 | } | 100 | } |
| 110 | 101 | ||
| @@ -1135,13 +1135,6 @@ void setup_new_exec(struct linux_binprm * bprm) | |||
| 1135 | set_dumpable(current->mm, suid_dumpable); | 1135 | set_dumpable(current->mm, suid_dumpable); |
| 1136 | } | 1136 | } |
| 1137 | 1137 | ||
| 1138 | /* | ||
| 1139 | * Flush performance counters when crossing a | ||
| 1140 | * security domain: | ||
| 1141 | */ | ||
| 1142 | if (!get_dumpable(current->mm)) | ||
| 1143 | perf_event_exit_task(current); | ||
| 1144 | |||
| 1145 | /* An exec changes our domain. We are no longer part of the thread | 1138 | /* An exec changes our domain. We are no longer part of the thread |
| 1146 | group */ | 1139 | group */ |
| 1147 | 1140 | ||
| @@ -1205,6 +1198,15 @@ void install_exec_creds(struct linux_binprm *bprm) | |||
| 1205 | 1198 | ||
| 1206 | commit_creds(bprm->cred); | 1199 | commit_creds(bprm->cred); |
| 1207 | bprm->cred = NULL; | 1200 | bprm->cred = NULL; |
| 1201 | |||
| 1202 | /* | ||
| 1203 | * Disable monitoring for regular users | ||
| 1204 | * when executing setuid binaries. Must | ||
| 1205 | * wait until new credentials are committed | ||
| 1206 | * by commit_creds() above | ||
| 1207 | */ | ||
| 1208 | if (get_dumpable(current->mm) != SUID_DUMP_USER) | ||
| 1209 | perf_event_exit_task(current); | ||
| 1208 | /* | 1210 | /* |
| 1209 | * cred_guard_mutex must be held at least to this point to prevent | 1211 | * cred_guard_mutex must be held at least to this point to prevent |
| 1210 | * ptrace_attach() from altering our determination of the task's | 1212 | * ptrace_attach() from altering our determination of the task's |
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 46375896cfc0..49f51ab4caac 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c | |||
| @@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) | |||
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | static int | 241 | static int |
| 242 | exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 242 | exofs_readdir(struct file *file, struct dir_context *ctx) |
| 243 | { | 243 | { |
| 244 | loff_t pos = filp->f_pos; | 244 | loff_t pos = ctx->pos; |
| 245 | struct inode *inode = file_inode(filp); | 245 | struct inode *inode = file_inode(file); |
| 246 | unsigned int offset = pos & ~PAGE_CACHE_MASK; | 246 | unsigned int offset = pos & ~PAGE_CACHE_MASK; |
| 247 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | 247 | unsigned long n = pos >> PAGE_CACHE_SHIFT; |
| 248 | unsigned long npages = dir_pages(inode); | 248 | unsigned long npages = dir_pages(inode); |
| 249 | unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); | 249 | unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); |
| 250 | unsigned char *types = NULL; | 250 | int need_revalidate = (file->f_version != inode->i_version); |
| 251 | int need_revalidate = (filp->f_version != inode->i_version); | ||
| 252 | 251 | ||
| 253 | if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) | 252 | if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) |
| 254 | return 0; | 253 | return 0; |
| 255 | 254 | ||
| 256 | types = exofs_filetype_table; | ||
| 257 | |||
| 258 | for ( ; n < npages; n++, offset = 0) { | 255 | for ( ; n < npages; n++, offset = 0) { |
| 259 | char *kaddr, *limit; | 256 | char *kaddr, *limit; |
| 260 | struct exofs_dir_entry *de; | 257 | struct exofs_dir_entry *de; |
| @@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 263 | if (IS_ERR(page)) { | 260 | if (IS_ERR(page)) { |
| 264 | EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", | 261 | EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", |
| 265 | inode->i_ino); | 262 | inode->i_ino); |
| 266 | filp->f_pos += PAGE_CACHE_SIZE - offset; | 263 | ctx->pos += PAGE_CACHE_SIZE - offset; |
| 267 | return PTR_ERR(page); | 264 | return PTR_ERR(page); |
| 268 | } | 265 | } |
| 269 | kaddr = page_address(page); | 266 | kaddr = page_address(page); |
| @@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 271 | if (offset) { | 268 | if (offset) { |
| 272 | offset = exofs_validate_entry(kaddr, offset, | 269 | offset = exofs_validate_entry(kaddr, offset, |
| 273 | chunk_mask); | 270 | chunk_mask); |
| 274 | filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; | 271 | ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; |
| 275 | } | 272 | } |
| 276 | filp->f_version = inode->i_version; | 273 | file->f_version = inode->i_version; |
| 277 | need_revalidate = 0; | 274 | need_revalidate = 0; |
| 278 | } | 275 | } |
| 279 | de = (struct exofs_dir_entry *)(kaddr + offset); | 276 | de = (struct exofs_dir_entry *)(kaddr + offset); |
| @@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 288 | return -EIO; | 285 | return -EIO; |
| 289 | } | 286 | } |
| 290 | if (de->inode_no) { | 287 | if (de->inode_no) { |
| 291 | int over; | 288 | unsigned char t; |
| 292 | unsigned char d_type = DT_UNKNOWN; | ||
| 293 | 289 | ||
| 294 | if (types && de->file_type < EXOFS_FT_MAX) | 290 | if (de->file_type < EXOFS_FT_MAX) |
| 295 | d_type = types[de->file_type]; | 291 | t = exofs_filetype_table[de->file_type]; |
| 292 | else | ||
| 293 | t = DT_UNKNOWN; | ||
| 296 | 294 | ||
| 297 | offset = (char *)de - kaddr; | 295 | if (!dir_emit(ctx, de->name, de->name_len, |
| 298 | over = filldir(dirent, de->name, de->name_len, | ||
| 299 | (n<<PAGE_CACHE_SHIFT) | offset, | ||
| 300 | le64_to_cpu(de->inode_no), | 296 | le64_to_cpu(de->inode_no), |
| 301 | d_type); | 297 | t)) { |
| 302 | if (over) { | ||
| 303 | exofs_put_page(page); | 298 | exofs_put_page(page); |
| 304 | return 0; | 299 | return 0; |
| 305 | } | 300 | } |
| 306 | } | 301 | } |
| 307 | filp->f_pos += le16_to_cpu(de->rec_len); | 302 | ctx->pos += le16_to_cpu(de->rec_len); |
| 308 | } | 303 | } |
| 309 | exofs_put_page(page); | 304 | exofs_put_page(page); |
| 310 | } | 305 | } |
| 311 | |||
| 312 | return 0; | 306 | return 0; |
| 313 | } | 307 | } |
| 314 | 308 | ||
| @@ -669,5 +663,5 @@ not_empty: | |||
| 669 | const struct file_operations exofs_dir_operations = { | 663 | const struct file_operations exofs_dir_operations = { |
| 670 | .llseek = generic_file_llseek, | 664 | .llseek = generic_file_llseek, |
| 671 | .read = generic_read_dir, | 665 | .read = generic_read_dir, |
| 672 | .readdir = exofs_readdir, | 666 | .iterate = exofs_readdir, |
| 673 | }; | 667 | }; |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1f80abd8828..2ec8eb1ab269 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
| @@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) | |||
| 953 | return 0; | 953 | return 0; |
| 954 | } | 954 | } |
| 955 | 955 | ||
| 956 | static void exofs_invalidatepage(struct page *page, unsigned long offset) | 956 | static void exofs_invalidatepage(struct page *page, unsigned int offset, |
| 957 | unsigned int length) | ||
| 957 | { | 958 | { |
| 958 | EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); | 959 | EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", |
| 960 | page->index, offset, length); | ||
| 959 | WARN_ON(1); | 961 | WARN_ON(1); |
| 960 | } | 962 | } |
| 961 | 963 | ||
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 262fc9940982..293bc2e47a73 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c | |||
| @@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) | |||
| 212 | } | 212 | } |
| 213 | 213 | ||
| 214 | struct getdents_callback { | 214 | struct getdents_callback { |
| 215 | struct dir_context ctx; | ||
| 215 | char *name; /* name that was found. It already points to a | 216 | char *name; /* name that was found. It already points to a |
| 216 | buffer NAME_MAX+1 is size */ | 217 | buffer NAME_MAX+1 is size */ |
| 217 | unsigned long ino; /* the inum we are looking for */ | 218 | unsigned long ino; /* the inum we are looking for */ |
| @@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child) | |||
| 254 | struct inode *dir = path->dentry->d_inode; | 255 | struct inode *dir = path->dentry->d_inode; |
| 255 | int error; | 256 | int error; |
| 256 | struct file *file; | 257 | struct file *file; |
| 257 | struct getdents_callback buffer; | 258 | struct getdents_callback buffer = { |
| 259 | .ctx.actor = filldir_one, | ||
| 260 | .name = name, | ||
| 261 | .ino = child->d_inode->i_ino | ||
| 262 | }; | ||
| 258 | 263 | ||
| 259 | error = -ENOTDIR; | 264 | error = -ENOTDIR; |
| 260 | if (!dir || !S_ISDIR(dir->i_mode)) | 265 | if (!dir || !S_ISDIR(dir->i_mode)) |
| @@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child) | |||
| 271 | goto out; | 276 | goto out; |
| 272 | 277 | ||
| 273 | error = -EINVAL; | 278 | error = -EINVAL; |
| 274 | if (!file->f_op->readdir) | 279 | if (!file->f_op->iterate) |
| 275 | goto out_close; | 280 | goto out_close; |
| 276 | 281 | ||
| 277 | buffer.name = name; | ||
| 278 | buffer.ino = child->d_inode->i_ino; | ||
| 279 | buffer.found = 0; | ||
| 280 | buffer.sequence = 0; | 282 | buffer.sequence = 0; |
| 281 | while (1) { | 283 | while (1) { |
| 282 | int old_seq = buffer.sequence; | 284 | int old_seq = buffer.sequence; |
| 283 | 285 | ||
| 284 | error = vfs_readdir(file, filldir_one, &buffer); | 286 | error = iterate_dir(file, &buffer.ctx); |
| 285 | if (buffer.found) { | 287 | if (buffer.found) { |
| 286 | error = 0; | 288 | error = 0; |
| 287 | break; | 289 | break; |
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4237722bfd27..6e1d4ab09d72 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c | |||
| @@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) | |||
| 287 | } | 287 | } |
| 288 | 288 | ||
| 289 | static int | 289 | static int |
| 290 | ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) | 290 | ext2_readdir(struct file *file, struct dir_context *ctx) |
| 291 | { | 291 | { |
| 292 | loff_t pos = filp->f_pos; | 292 | loff_t pos = ctx->pos; |
| 293 | struct inode *inode = file_inode(filp); | 293 | struct inode *inode = file_inode(file); |
| 294 | struct super_block *sb = inode->i_sb; | 294 | struct super_block *sb = inode->i_sb; |
| 295 | unsigned int offset = pos & ~PAGE_CACHE_MASK; | 295 | unsigned int offset = pos & ~PAGE_CACHE_MASK; |
| 296 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | 296 | unsigned long n = pos >> PAGE_CACHE_SHIFT; |
| 297 | unsigned long npages = dir_pages(inode); | 297 | unsigned long npages = dir_pages(inode); |
| 298 | unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); | 298 | unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); |
| 299 | unsigned char *types = NULL; | 299 | unsigned char *types = NULL; |
| 300 | int need_revalidate = filp->f_version != inode->i_version; | 300 | int need_revalidate = file->f_version != inode->i_version; |
| 301 | 301 | ||
| 302 | if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) | 302 | if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) |
| 303 | return 0; | 303 | return 0; |
| @@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) | |||
| 314 | ext2_error(sb, __func__, | 314 | ext2_error(sb, __func__, |
| 315 | "bad page in #%lu", | 315 | "bad page in #%lu", |
| 316 | inode->i_ino); | 316 | inode->i_ino); |
| 317 | filp->f_pos += PAGE_CACHE_SIZE - offset; | 317 | ctx->pos += PAGE_CACHE_SIZE - offset; |
| 318 | return PTR_ERR(page); | 318 | return PTR_ERR(page); |
| 319 | } | 319 | } |
| 320 | kaddr = page_address(page); | 320 | kaddr = page_address(page); |
| 321 | if (unlikely(need_revalidate)) { | 321 | if (unlikely(need_revalidate)) { |
| 322 | if (offset) { | 322 | if (offset) { |
| 323 | offset = ext2_validate_entry(kaddr, offset, chunk_mask); | 323 | offset = ext2_validate_entry(kaddr, offset, chunk_mask); |
| 324 | filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; | 324 | ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; |
| 325 | } | 325 | } |
| 326 | filp->f_version = inode->i_version; | 326 | file->f_version = inode->i_version; |
| 327 | need_revalidate = 0; | 327 | need_revalidate = 0; |
| 328 | } | 328 | } |
| 329 | de = (ext2_dirent *)(kaddr+offset); | 329 | de = (ext2_dirent *)(kaddr+offset); |
| @@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) | |||
| 336 | return -EIO; | 336 | return -EIO; |
| 337 | } | 337 | } |
| 338 | if (de->inode) { | 338 | if (de->inode) { |
| 339 | int over; | ||
| 340 | unsigned char d_type = DT_UNKNOWN; | 339 | unsigned char d_type = DT_UNKNOWN; |
| 341 | 340 | ||
| 342 | if (types && de->file_type < EXT2_FT_MAX) | 341 | if (types && de->file_type < EXT2_FT_MAX) |
| 343 | d_type = types[de->file_type]; | 342 | d_type = types[de->file_type]; |
| 344 | 343 | ||
| 345 | offset = (char *)de - kaddr; | 344 | if (!dir_emit(ctx, de->name, de->name_len, |
| 346 | over = filldir(dirent, de->name, de->name_len, | 345 | le32_to_cpu(de->inode), |
| 347 | (n<<PAGE_CACHE_SHIFT) | offset, | 346 | d_type)) { |
| 348 | le32_to_cpu(de->inode), d_type); | ||
| 349 | if (over) { | ||
| 350 | ext2_put_page(page); | 347 | ext2_put_page(page); |
| 351 | return 0; | 348 | return 0; |
| 352 | } | 349 | } |
| 353 | } | 350 | } |
| 354 | filp->f_pos += ext2_rec_len_from_disk(de->rec_len); | 351 | ctx->pos += ext2_rec_len_from_disk(de->rec_len); |
| 355 | } | 352 | } |
| 356 | ext2_put_page(page); | 353 | ext2_put_page(page); |
| 357 | } | 354 | } |
| @@ -724,7 +721,7 @@ not_empty: | |||
| 724 | const struct file_operations ext2_dir_operations = { | 721 | const struct file_operations ext2_dir_operations = { |
| 725 | .llseek = generic_file_llseek, | 722 | .llseek = generic_file_llseek, |
| 726 | .read = generic_read_dir, | 723 | .read = generic_read_dir, |
| 727 | .readdir = ext2_readdir, | 724 | .iterate = ext2_readdir, |
| 728 | .unlocked_ioctl = ext2_ioctl, | 725 | .unlocked_ioctl = ext2_ioctl, |
| 729 | #ifdef CONFIG_COMPAT | 726 | #ifdef CONFIG_COMPAT |
| 730 | .compat_ioctl = ext2_compat_ioctl, | 727 | .compat_ioctl = ext2_compat_ioctl, |
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 87eccbbca255..f522425aaa24 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c | |||
| @@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = { | |||
| 28 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | 28 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK |
| 29 | }; | 29 | }; |
| 30 | 30 | ||
| 31 | static int ext3_dx_readdir(struct file * filp, | 31 | static int ext3_dx_readdir(struct file *, struct dir_context *); |
| 32 | void * dirent, filldir_t filldir); | ||
| 33 | 32 | ||
| 34 | static unsigned char get_dtype(struct super_block *sb, int filetype) | 33 | static unsigned char get_dtype(struct super_block *sb, int filetype) |
| 35 | { | 34 | { |
| @@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, | |||
| 91 | return error_msg == NULL ? 1 : 0; | 90 | return error_msg == NULL ? 1 : 0; |
| 92 | } | 91 | } |
| 93 | 92 | ||
| 94 | static int ext3_readdir(struct file * filp, | 93 | static int ext3_readdir(struct file *file, struct dir_context *ctx) |
| 95 | void * dirent, filldir_t filldir) | ||
| 96 | { | 94 | { |
| 97 | int error = 0; | ||
| 98 | unsigned long offset; | 95 | unsigned long offset; |
| 99 | int i, stored; | 96 | int i; |
| 100 | struct ext3_dir_entry_2 *de; | 97 | struct ext3_dir_entry_2 *de; |
| 101 | int err; | 98 | int err; |
| 102 | struct inode *inode = file_inode(filp); | 99 | struct inode *inode = file_inode(file); |
| 103 | struct super_block *sb = inode->i_sb; | 100 | struct super_block *sb = inode->i_sb; |
| 104 | int ret = 0; | ||
| 105 | int dir_has_error = 0; | 101 | int dir_has_error = 0; |
| 106 | 102 | ||
| 107 | if (is_dx_dir(inode)) { | 103 | if (is_dx_dir(inode)) { |
| 108 | err = ext3_dx_readdir(filp, dirent, filldir); | 104 | err = ext3_dx_readdir(file, ctx); |
| 109 | if (err != ERR_BAD_DX_DIR) { | 105 | if (err != ERR_BAD_DX_DIR) |
| 110 | ret = err; | 106 | return err; |
| 111 | goto out; | ||
| 112 | } | ||
| 113 | /* | 107 | /* |
| 114 | * We don't set the inode dirty flag since it's not | 108 | * We don't set the inode dirty flag since it's not |
| 115 | * critical that it get flushed back to the disk. | 109 | * critical that it get flushed back to the disk. |
| 116 | */ | 110 | */ |
| 117 | EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL; | 111 | EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; |
| 118 | } | 112 | } |
| 119 | stored = 0; | 113 | offset = ctx->pos & (sb->s_blocksize - 1); |
| 120 | offset = filp->f_pos & (sb->s_blocksize - 1); | ||
| 121 | 114 | ||
| 122 | while (!error && !stored && filp->f_pos < inode->i_size) { | 115 | while (ctx->pos < inode->i_size) { |
| 123 | unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); | 116 | unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); |
| 124 | struct buffer_head map_bh; | 117 | struct buffer_head map_bh; |
| 125 | struct buffer_head *bh = NULL; | 118 | struct buffer_head *bh = NULL; |
| 126 | 119 | ||
| @@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp, | |||
| 129 | if (err > 0) { | 122 | if (err > 0) { |
| 130 | pgoff_t index = map_bh.b_blocknr >> | 123 | pgoff_t index = map_bh.b_blocknr >> |
| 131 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 124 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
| 132 | if (!ra_has_index(&filp->f_ra, index)) | 125 | if (!ra_has_index(&file->f_ra, index)) |
| 133 | page_cache_sync_readahead( | 126 | page_cache_sync_readahead( |
| 134 | sb->s_bdev->bd_inode->i_mapping, | 127 | sb->s_bdev->bd_inode->i_mapping, |
| 135 | &filp->f_ra, filp, | 128 | &file->f_ra, file, |
| 136 | index, 1); | 129 | index, 1); |
| 137 | filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; | 130 | file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; |
| 138 | bh = ext3_bread(NULL, inode, blk, 0, &err); | 131 | bh = ext3_bread(NULL, inode, blk, 0, &err); |
| 139 | } | 132 | } |
| 140 | 133 | ||
| @@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp, | |||
| 146 | if (!dir_has_error) { | 139 | if (!dir_has_error) { |
| 147 | ext3_error(sb, __func__, "directory #%lu " | 140 | ext3_error(sb, __func__, "directory #%lu " |
| 148 | "contains a hole at offset %lld", | 141 | "contains a hole at offset %lld", |
| 149 | inode->i_ino, filp->f_pos); | 142 | inode->i_ino, ctx->pos); |
| 150 | dir_has_error = 1; | 143 | dir_has_error = 1; |
| 151 | } | 144 | } |
| 152 | /* corrupt size? Maybe no more blocks to read */ | 145 | /* corrupt size? Maybe no more blocks to read */ |
| 153 | if (filp->f_pos > inode->i_blocks << 9) | 146 | if (ctx->pos > inode->i_blocks << 9) |
| 154 | break; | 147 | break; |
| 155 | filp->f_pos += sb->s_blocksize - offset; | 148 | ctx->pos += sb->s_blocksize - offset; |
| 156 | continue; | 149 | continue; |
| 157 | } | 150 | } |
| 158 | 151 | ||
| 159 | revalidate: | ||
| 160 | /* If the dir block has changed since the last call to | 152 | /* If the dir block has changed since the last call to |
| 161 | * readdir(2), then we might be pointing to an invalid | 153 | * readdir(2), then we might be pointing to an invalid |
| 162 | * dirent right now. Scan from the start of the block | 154 | * dirent right now. Scan from the start of the block |
| 163 | * to make sure. */ | 155 | * to make sure. */ |
| 164 | if (filp->f_version != inode->i_version) { | 156 | if (offset && file->f_version != inode->i_version) { |
| 165 | for (i = 0; i < sb->s_blocksize && i < offset; ) { | 157 | for (i = 0; i < sb->s_blocksize && i < offset; ) { |
| 166 | de = (struct ext3_dir_entry_2 *) | 158 | de = (struct ext3_dir_entry_2 *) |
| 167 | (bh->b_data + i); | 159 | (bh->b_data + i); |
| @@ -177,53 +169,40 @@ revalidate: | |||
| 177 | i += ext3_rec_len_from_disk(de->rec_len); | 169 | i += ext3_rec_len_from_disk(de->rec_len); |
| 178 | } | 170 | } |
| 179 | offset = i; | 171 | offset = i; |
| 180 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) | 172 | ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) |
| 181 | | offset; | 173 | | offset; |
| 182 | filp->f_version = inode->i_version; | 174 | file->f_version = inode->i_version; |
| 183 | } | 175 | } |
| 184 | 176 | ||
| 185 | while (!error && filp->f_pos < inode->i_size | 177 | while (ctx->pos < inode->i_size |
| 186 | && offset < sb->s_blocksize) { | 178 | && offset < sb->s_blocksize) { |
| 187 | de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); | 179 | de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); |
| 188 | if (!ext3_check_dir_entry ("ext3_readdir", inode, de, | 180 | if (!ext3_check_dir_entry ("ext3_readdir", inode, de, |
| 189 | bh, offset)) { | 181 | bh, offset)) { |
| 190 | /* On error, skip the f_pos to the | 182 | /* On error, skip the to the |
| 191 | next block. */ | 183 | next block. */ |
| 192 | filp->f_pos = (filp->f_pos | | 184 | ctx->pos = (ctx->pos | |
| 193 | (sb->s_blocksize - 1)) + 1; | 185 | (sb->s_blocksize - 1)) + 1; |
| 194 | brelse (bh); | 186 | break; |
| 195 | ret = stored; | ||
| 196 | goto out; | ||
| 197 | } | 187 | } |
| 198 | offset += ext3_rec_len_from_disk(de->rec_len); | 188 | offset += ext3_rec_len_from_disk(de->rec_len); |
| 199 | if (le32_to_cpu(de->inode)) { | 189 | if (le32_to_cpu(de->inode)) { |
| 200 | /* We might block in the next section | 190 | if (!dir_emit(ctx, de->name, de->name_len, |
| 201 | * if the data destination is | 191 | le32_to_cpu(de->inode), |
| 202 | * currently swapped out. So, use a | 192 | get_dtype(sb, de->file_type))) { |
| 203 | * version stamp to detect whether or | 193 | brelse(bh); |
| 204 | * not the directory has been modified | 194 | return 0; |
| 205 | * during the copy operation. | 195 | } |
| 206 | */ | ||
| 207 | u64 version = filp->f_version; | ||
| 208 | |||
| 209 | error = filldir(dirent, de->name, | ||
| 210 | de->name_len, | ||
| 211 | filp->f_pos, | ||
| 212 | le32_to_cpu(de->inode), | ||
| 213 | get_dtype(sb, de->file_type)); | ||
| 214 | if (error) | ||
| 215 | break; | ||
| 216 | if (version != filp->f_version) | ||
| 217 | goto revalidate; | ||
| 218 | stored ++; | ||
| 219 | } | 196 | } |
| 220 | filp->f_pos += ext3_rec_len_from_disk(de->rec_len); | 197 | ctx->pos += ext3_rec_len_from_disk(de->rec_len); |
| 221 | } | 198 | } |
| 222 | offset = 0; | 199 | offset = 0; |
| 223 | brelse (bh); | 200 | brelse (bh); |
| 201 | if (ctx->pos < inode->i_size) | ||
| 202 | if (!dir_relax(inode)) | ||
| 203 | return 0; | ||
| 224 | } | 204 | } |
| 225 | out: | 205 | return 0; |
| 226 | return ret; | ||
| 227 | } | 206 | } |
| 228 | 207 | ||
| 229 | static inline int is_32bit_api(void) | 208 | static inline int is_32bit_api(void) |
| @@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, | |||
| 452 | * for all entres on the fname linked list. (Normally there is only | 431 | * for all entres on the fname linked list. (Normally there is only |
| 453 | * one entry on the linked list, unless there are 62 bit hash collisions.) | 432 | * one entry on the linked list, unless there are 62 bit hash collisions.) |
| 454 | */ | 433 | */ |
| 455 | static int call_filldir(struct file * filp, void * dirent, | 434 | static bool call_filldir(struct file *file, struct dir_context *ctx, |
| 456 | filldir_t filldir, struct fname *fname) | 435 | struct fname *fname) |
| 457 | { | 436 | { |
| 458 | struct dir_private_info *info = filp->private_data; | 437 | struct dir_private_info *info = file->private_data; |
| 459 | loff_t curr_pos; | 438 | struct inode *inode = file_inode(file); |
| 460 | struct inode *inode = file_inode(filp); | 439 | struct super_block *sb = inode->i_sb; |
| 461 | struct super_block * sb; | ||
| 462 | int error; | ||
| 463 | |||
| 464 | sb = inode->i_sb; | ||
| 465 | 440 | ||
| 466 | if (!fname) { | 441 | if (!fname) { |
| 467 | printk("call_filldir: called with null fname?!?\n"); | 442 | printk("call_filldir: called with null fname?!?\n"); |
| 468 | return 0; | 443 | return true; |
| 469 | } | 444 | } |
| 470 | curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); | 445 | ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); |
| 471 | while (fname) { | 446 | while (fname) { |
| 472 | error = filldir(dirent, fname->name, | 447 | if (!dir_emit(ctx, fname->name, fname->name_len, |
| 473 | fname->name_len, curr_pos, | ||
| 474 | fname->inode, | 448 | fname->inode, |
| 475 | get_dtype(sb, fname->file_type)); | 449 | get_dtype(sb, fname->file_type))) { |
| 476 | if (error) { | ||
| 477 | filp->f_pos = curr_pos; | ||
| 478 | info->extra_fname = fname; | 450 | info->extra_fname = fname; |
| 479 | return error; | 451 | return false; |
| 480 | } | 452 | } |
| 481 | fname = fname->next; | 453 | fname = fname->next; |
| 482 | } | 454 | } |
| 483 | return 0; | 455 | return true; |
| 484 | } | 456 | } |
| 485 | 457 | ||
| 486 | static int ext3_dx_readdir(struct file * filp, | 458 | static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) |
| 487 | void * dirent, filldir_t filldir) | ||
| 488 | { | 459 | { |
| 489 | struct dir_private_info *info = filp->private_data; | 460 | struct dir_private_info *info = file->private_data; |
| 490 | struct inode *inode = file_inode(filp); | 461 | struct inode *inode = file_inode(file); |
| 491 | struct fname *fname; | 462 | struct fname *fname; |
| 492 | int ret; | 463 | int ret; |
| 493 | 464 | ||
| 494 | if (!info) { | 465 | if (!info) { |
| 495 | info = ext3_htree_create_dir_info(filp, filp->f_pos); | 466 | info = ext3_htree_create_dir_info(file, ctx->pos); |
| 496 | if (!info) | 467 | if (!info) |
| 497 | return -ENOMEM; | 468 | return -ENOMEM; |
| 498 | filp->private_data = info; | 469 | file->private_data = info; |
| 499 | } | 470 | } |
| 500 | 471 | ||
| 501 | if (filp->f_pos == ext3_get_htree_eof(filp)) | 472 | if (ctx->pos == ext3_get_htree_eof(file)) |
| 502 | return 0; /* EOF */ | 473 | return 0; /* EOF */ |
| 503 | 474 | ||
| 504 | /* Some one has messed with f_pos; reset the world */ | 475 | /* Some one has messed with f_pos; reset the world */ |
| 505 | if (info->last_pos != filp->f_pos) { | 476 | if (info->last_pos != ctx->pos) { |
| 506 | free_rb_tree_fname(&info->root); | 477 | free_rb_tree_fname(&info->root); |
| 507 | info->curr_node = NULL; | 478 | info->curr_node = NULL; |
| 508 | info->extra_fname = NULL; | 479 | info->extra_fname = NULL; |
| 509 | info->curr_hash = pos2maj_hash(filp, filp->f_pos); | 480 | info->curr_hash = pos2maj_hash(file, ctx->pos); |
| 510 | info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); | 481 | info->curr_minor_hash = pos2min_hash(file, ctx->pos); |
| 511 | } | 482 | } |
| 512 | 483 | ||
| 513 | /* | 484 | /* |
| @@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 515 | * chain, return them first. | 486 | * chain, return them first. |
| 516 | */ | 487 | */ |
| 517 | if (info->extra_fname) { | 488 | if (info->extra_fname) { |
| 518 | if (call_filldir(filp, dirent, filldir, info->extra_fname)) | 489 | if (!call_filldir(file, ctx, info->extra_fname)) |
| 519 | goto finished; | 490 | goto finished; |
| 520 | info->extra_fname = NULL; | 491 | info->extra_fname = NULL; |
| 521 | goto next_node; | 492 | goto next_node; |
| @@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 529 | * cached entries. | 500 | * cached entries. |
| 530 | */ | 501 | */ |
| 531 | if ((!info->curr_node) || | 502 | if ((!info->curr_node) || |
| 532 | (filp->f_version != inode->i_version)) { | 503 | (file->f_version != inode->i_version)) { |
| 533 | info->curr_node = NULL; | 504 | info->curr_node = NULL; |
| 534 | free_rb_tree_fname(&info->root); | 505 | free_rb_tree_fname(&info->root); |
| 535 | filp->f_version = inode->i_version; | 506 | file->f_version = inode->i_version; |
| 536 | ret = ext3_htree_fill_tree(filp, info->curr_hash, | 507 | ret = ext3_htree_fill_tree(file, info->curr_hash, |
| 537 | info->curr_minor_hash, | 508 | info->curr_minor_hash, |
| 538 | &info->next_hash); | 509 | &info->next_hash); |
| 539 | if (ret < 0) | 510 | if (ret < 0) |
| 540 | return ret; | 511 | return ret; |
| 541 | if (ret == 0) { | 512 | if (ret == 0) { |
| 542 | filp->f_pos = ext3_get_htree_eof(filp); | 513 | ctx->pos = ext3_get_htree_eof(file); |
| 543 | break; | 514 | break; |
| 544 | } | 515 | } |
| 545 | info->curr_node = rb_first(&info->root); | 516 | info->curr_node = rb_first(&info->root); |
| @@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 548 | fname = rb_entry(info->curr_node, struct fname, rb_hash); | 519 | fname = rb_entry(info->curr_node, struct fname, rb_hash); |
| 549 | info->curr_hash = fname->hash; | 520 | info->curr_hash = fname->hash; |
| 550 | info->curr_minor_hash = fname->minor_hash; | 521 | info->curr_minor_hash = fname->minor_hash; |
| 551 | if (call_filldir(filp, dirent, filldir, fname)) | 522 | if (!call_filldir(file, ctx, fname)) |
| 552 | break; | 523 | break; |
| 553 | next_node: | 524 | next_node: |
| 554 | info->curr_node = rb_next(info->curr_node); | 525 | info->curr_node = rb_next(info->curr_node); |
| @@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 559 | info->curr_minor_hash = fname->minor_hash; | 530 | info->curr_minor_hash = fname->minor_hash; |
| 560 | } else { | 531 | } else { |
| 561 | if (info->next_hash == ~0) { | 532 | if (info->next_hash == ~0) { |
| 562 | filp->f_pos = ext3_get_htree_eof(filp); | 533 | ctx->pos = ext3_get_htree_eof(file); |
| 563 | break; | 534 | break; |
| 564 | } | 535 | } |
| 565 | info->curr_hash = info->next_hash; | 536 | info->curr_hash = info->next_hash; |
| @@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 567 | } | 538 | } |
| 568 | } | 539 | } |
| 569 | finished: | 540 | finished: |
| 570 | info->last_pos = filp->f_pos; | 541 | info->last_pos = ctx->pos; |
| 571 | return 0; | 542 | return 0; |
| 572 | } | 543 | } |
| 573 | 544 | ||
| @@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) | |||
| 582 | const struct file_operations ext3_dir_operations = { | 553 | const struct file_operations ext3_dir_operations = { |
| 583 | .llseek = ext3_dir_llseek, | 554 | .llseek = ext3_dir_llseek, |
| 584 | .read = generic_read_dir, | 555 | .read = generic_read_dir, |
| 585 | .readdir = ext3_readdir, | 556 | .iterate = ext3_readdir, |
| 586 | .unlocked_ioctl = ext3_ioctl, | 557 | .unlocked_ioctl = ext3_ioctl, |
| 587 | #ifdef CONFIG_COMPAT | 558 | #ifdef CONFIG_COMPAT |
| 588 | .compat_ioctl = ext3_compat_ioctl, | 559 | .compat_ioctl = ext3_compat_ioctl, |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 23c712825640..f67668f724ba 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
| @@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping, | |||
| 1825 | return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); | 1825 | return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); |
| 1826 | } | 1826 | } |
| 1827 | 1827 | ||
| 1828 | static void ext3_invalidatepage(struct page *page, unsigned long offset) | 1828 | static void ext3_invalidatepage(struct page *page, unsigned int offset, |
| 1829 | unsigned int length) | ||
| 1829 | { | 1830 | { |
| 1830 | journal_t *journal = EXT3_JOURNAL(page->mapping->host); | 1831 | journal_t *journal = EXT3_JOURNAL(page->mapping->host); |
| 1831 | 1832 | ||
| 1832 | trace_ext3_invalidatepage(page, offset); | 1833 | trace_ext3_invalidatepage(page, offset, length); |
| 1833 | 1834 | ||
| 1834 | /* | 1835 | /* |
| 1835 | * If it's a full truncate we just forget about the pending dirtying | 1836 | * If it's a full truncate we just forget about the pending dirtying |
| 1836 | */ | 1837 | */ |
| 1837 | if (offset == 0) | 1838 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
| 1838 | ClearPageChecked(page); | 1839 | ClearPageChecked(page); |
| 1839 | 1840 | ||
| 1840 | journal_invalidatepage(journal, page, offset); | 1841 | journal_invalidatepage(journal, page, offset, length); |
| 1841 | } | 1842 | } |
| 1842 | 1843 | ||
| 1843 | static int ext3_releasepage(struct page *page, gfp_t wait) | 1844 | static int ext3_releasepage(struct page *page, gfp_t wait) |
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 692de13e3596..cea8ecf3e76e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
| @@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, | |||
| 576 | if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, | 576 | if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, |
| 577 | (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) | 577 | (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) |
| 578 | +((char *)de - bh->b_data))) { | 578 | +((char *)de - bh->b_data))) { |
| 579 | /* On error, skip the f_pos to the next block. */ | 579 | /* silently ignore the rest of the block */ |
| 580 | dir_file->f_pos = (dir_file->f_pos | | 580 | break; |
| 581 | (dir->i_sb->s_blocksize - 1)) + 1; | ||
| 582 | brelse (bh); | ||
| 583 | return count; | ||
| 584 | } | 581 | } |
| 585 | ext3fs_dirhash(de->name, de->name_len, hinfo); | 582 | ext3fs_dirhash(de->name, de->name_len, hinfo); |
| 586 | if ((hinfo->hash < start_hash) || | 583 | if ((hinfo->hash < start_hash) || |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d0f13eada0ed..58339393fa6e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
| @@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) | |||
| 682 | 682 | ||
| 683 | static inline int test_root(ext4_group_t a, int b) | 683 | static inline int test_root(ext4_group_t a, int b) |
| 684 | { | 684 | { |
| 685 | int num = b; | 685 | while (1) { |
| 686 | 686 | if (a < b) | |
| 687 | while (a > num) | 687 | return 0; |
| 688 | num *= b; | 688 | if (a == b) |
| 689 | return num == a; | 689 | return 1; |
| 690 | if ((a % b) != 0) | ||
| 691 | return 0; | ||
| 692 | a = a / b; | ||
| 693 | } | ||
| 690 | } | 694 | } |
| 691 | 695 | ||
| 692 | static int ext4_group_sparse(ext4_group_t group) | 696 | static int ext4_group_sparse(ext4_group_t group) |
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f8d56e4254e0..3c7d288ae94c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
| @@ -29,8 +29,7 @@ | |||
| 29 | #include "ext4.h" | 29 | #include "ext4.h" |
| 30 | #include "xattr.h" | 30 | #include "xattr.h" |
| 31 | 31 | ||
| 32 | static int ext4_dx_readdir(struct file *filp, | 32 | static int ext4_dx_readdir(struct file *, struct dir_context *); |
| 33 | void *dirent, filldir_t filldir); | ||
| 34 | 33 | ||
| 35 | /** | 34 | /** |
| 36 | * Check if the given dir-inode refers to an htree-indexed directory | 35 | * Check if the given dir-inode refers to an htree-indexed directory |
| @@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, | |||
| 103 | return 1; | 102 | return 1; |
| 104 | } | 103 | } |
| 105 | 104 | ||
| 106 | static int ext4_readdir(struct file *filp, | 105 | static int ext4_readdir(struct file *file, struct dir_context *ctx) |
| 107 | void *dirent, filldir_t filldir) | ||
| 108 | { | 106 | { |
| 109 | int error = 0; | ||
| 110 | unsigned int offset; | 107 | unsigned int offset; |
| 111 | int i, stored; | 108 | int i, stored; |
| 112 | struct ext4_dir_entry_2 *de; | 109 | struct ext4_dir_entry_2 *de; |
| 113 | int err; | 110 | int err; |
| 114 | struct inode *inode = file_inode(filp); | 111 | struct inode *inode = file_inode(file); |
| 115 | struct super_block *sb = inode->i_sb; | 112 | struct super_block *sb = inode->i_sb; |
| 116 | int ret = 0; | ||
| 117 | int dir_has_error = 0; | 113 | int dir_has_error = 0; |
| 118 | 114 | ||
| 119 | if (is_dx_dir(inode)) { | 115 | if (is_dx_dir(inode)) { |
| 120 | err = ext4_dx_readdir(filp, dirent, filldir); | 116 | err = ext4_dx_readdir(file, ctx); |
| 121 | if (err != ERR_BAD_DX_DIR) { | 117 | if (err != ERR_BAD_DX_DIR) { |
| 122 | ret = err; | 118 | return err; |
| 123 | goto out; | ||
| 124 | } | 119 | } |
| 125 | /* | 120 | /* |
| 126 | * We don't set the inode dirty flag since it's not | 121 | * We don't set the inode dirty flag since it's not |
| 127 | * critical that it get flushed back to the disk. | 122 | * critical that it get flushed back to the disk. |
| 128 | */ | 123 | */ |
| 129 | ext4_clear_inode_flag(file_inode(filp), | 124 | ext4_clear_inode_flag(file_inode(file), |
| 130 | EXT4_INODE_INDEX); | 125 | EXT4_INODE_INDEX); |
| 131 | } | 126 | } |
| 132 | 127 | ||
| 133 | if (ext4_has_inline_data(inode)) { | 128 | if (ext4_has_inline_data(inode)) { |
| 134 | int has_inline_data = 1; | 129 | int has_inline_data = 1; |
| 135 | ret = ext4_read_inline_dir(filp, dirent, filldir, | 130 | int ret = ext4_read_inline_dir(file, ctx, |
| 136 | &has_inline_data); | 131 | &has_inline_data); |
| 137 | if (has_inline_data) | 132 | if (has_inline_data) |
| 138 | return ret; | 133 | return ret; |
| 139 | } | 134 | } |
| 140 | 135 | ||
| 141 | stored = 0; | 136 | stored = 0; |
| 142 | offset = filp->f_pos & (sb->s_blocksize - 1); | 137 | offset = ctx->pos & (sb->s_blocksize - 1); |
| 143 | 138 | ||
| 144 | while (!error && !stored && filp->f_pos < inode->i_size) { | 139 | while (ctx->pos < inode->i_size) { |
| 145 | struct ext4_map_blocks map; | 140 | struct ext4_map_blocks map; |
| 146 | struct buffer_head *bh = NULL; | 141 | struct buffer_head *bh = NULL; |
| 147 | 142 | ||
| 148 | map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); | 143 | map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); |
| 149 | map.m_len = 1; | 144 | map.m_len = 1; |
| 150 | err = ext4_map_blocks(NULL, inode, &map, 0); | 145 | err = ext4_map_blocks(NULL, inode, &map, 0); |
| 151 | if (err > 0) { | 146 | if (err > 0) { |
| 152 | pgoff_t index = map.m_pblk >> | 147 | pgoff_t index = map.m_pblk >> |
| 153 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 148 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
| 154 | if (!ra_has_index(&filp->f_ra, index)) | 149 | if (!ra_has_index(&file->f_ra, index)) |
| 155 | page_cache_sync_readahead( | 150 | page_cache_sync_readahead( |
| 156 | sb->s_bdev->bd_inode->i_mapping, | 151 | sb->s_bdev->bd_inode->i_mapping, |
| 157 | &filp->f_ra, filp, | 152 | &file->f_ra, file, |
| 158 | index, 1); | 153 | index, 1); |
| 159 | filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; | 154 | file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; |
| 160 | bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); | 155 | bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); |
| 161 | } | 156 | } |
| 162 | 157 | ||
| @@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp, | |||
| 166 | */ | 161 | */ |
| 167 | if (!bh) { | 162 | if (!bh) { |
| 168 | if (!dir_has_error) { | 163 | if (!dir_has_error) { |
| 169 | EXT4_ERROR_FILE(filp, 0, | 164 | EXT4_ERROR_FILE(file, 0, |
| 170 | "directory contains a " | 165 | "directory contains a " |
| 171 | "hole at offset %llu", | 166 | "hole at offset %llu", |
| 172 | (unsigned long long) filp->f_pos); | 167 | (unsigned long long) ctx->pos); |
| 173 | dir_has_error = 1; | 168 | dir_has_error = 1; |
| 174 | } | 169 | } |
| 175 | /* corrupt size? Maybe no more blocks to read */ | 170 | /* corrupt size? Maybe no more blocks to read */ |
| 176 | if (filp->f_pos > inode->i_blocks << 9) | 171 | if (ctx->pos > inode->i_blocks << 9) |
| 177 | break; | 172 | break; |
| 178 | filp->f_pos += sb->s_blocksize - offset; | 173 | ctx->pos += sb->s_blocksize - offset; |
| 179 | continue; | 174 | continue; |
| 180 | } | 175 | } |
| 181 | 176 | ||
| @@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp, | |||
| 183 | if (!buffer_verified(bh) && | 178 | if (!buffer_verified(bh) && |
| 184 | !ext4_dirent_csum_verify(inode, | 179 | !ext4_dirent_csum_verify(inode, |
| 185 | (struct ext4_dir_entry *)bh->b_data)) { | 180 | (struct ext4_dir_entry *)bh->b_data)) { |
| 186 | EXT4_ERROR_FILE(filp, 0, "directory fails checksum " | 181 | EXT4_ERROR_FILE(file, 0, "directory fails checksum " |
| 187 | "at offset %llu", | 182 | "at offset %llu", |
| 188 | (unsigned long long)filp->f_pos); | 183 | (unsigned long long)ctx->pos); |
| 189 | filp->f_pos += sb->s_blocksize - offset; | 184 | ctx->pos += sb->s_blocksize - offset; |
| 190 | brelse(bh); | 185 | brelse(bh); |
| 191 | continue; | 186 | continue; |
| 192 | } | 187 | } |
| 193 | set_buffer_verified(bh); | 188 | set_buffer_verified(bh); |
| 194 | 189 | ||
| 195 | revalidate: | ||
| 196 | /* If the dir block has changed since the last call to | 190 | /* If the dir block has changed since the last call to |
| 197 | * readdir(2), then we might be pointing to an invalid | 191 | * readdir(2), then we might be pointing to an invalid |
| 198 | * dirent right now. Scan from the start of the block | 192 | * dirent right now. Scan from the start of the block |
| 199 | * to make sure. */ | 193 | * to make sure. */ |
| 200 | if (filp->f_version != inode->i_version) { | 194 | if (file->f_version != inode->i_version) { |
| 201 | for (i = 0; i < sb->s_blocksize && i < offset; ) { | 195 | for (i = 0; i < sb->s_blocksize && i < offset; ) { |
| 202 | de = (struct ext4_dir_entry_2 *) | 196 | de = (struct ext4_dir_entry_2 *) |
| 203 | (bh->b_data + i); | 197 | (bh->b_data + i); |
| @@ -214,57 +208,46 @@ revalidate: | |||
| 214 | sb->s_blocksize); | 208 | sb->s_blocksize); |
| 215 | } | 209 | } |
| 216 | offset = i; | 210 | offset = i; |
| 217 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) | 211 | ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) |
| 218 | | offset; | 212 | | offset; |
| 219 | filp->f_version = inode->i_version; | 213 | file->f_version = inode->i_version; |
| 220 | } | 214 | } |
| 221 | 215 | ||
| 222 | while (!error && filp->f_pos < inode->i_size | 216 | while (ctx->pos < inode->i_size |
| 223 | && offset < sb->s_blocksize) { | 217 | && offset < sb->s_blocksize) { |
| 224 | de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); | 218 | de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); |
| 225 | if (ext4_check_dir_entry(inode, filp, de, bh, | 219 | if (ext4_check_dir_entry(inode, file, de, bh, |
| 226 | bh->b_data, bh->b_size, | 220 | bh->b_data, bh->b_size, |
| 227 | offset)) { | 221 | offset)) { |
| 228 | /* | 222 | /* |
| 229 | * On error, skip the f_pos to the next block | 223 | * On error, skip to the next block |
| 230 | */ | 224 | */ |
| 231 | filp->f_pos = (filp->f_pos | | 225 | ctx->pos = (ctx->pos | |
| 232 | (sb->s_blocksize - 1)) + 1; | 226 | (sb->s_blocksize - 1)) + 1; |
| 233 | brelse(bh); | 227 | break; |
| 234 | ret = stored; | ||
| 235 | goto out; | ||
| 236 | } | 228 | } |
| 237 | offset += ext4_rec_len_from_disk(de->rec_len, | 229 | offset += ext4_rec_len_from_disk(de->rec_len, |
| 238 | sb->s_blocksize); | 230 | sb->s_blocksize); |
| 239 | if (le32_to_cpu(de->inode)) { | 231 | if (le32_to_cpu(de->inode)) { |
| 240 | /* We might block in the next section | 232 | if (!dir_emit(ctx, de->name, |
| 241 | * if the data destination is | ||
| 242 | * currently swapped out. So, use a | ||
| 243 | * version stamp to detect whether or | ||
| 244 | * not the directory has been modified | ||
| 245 | * during the copy operation. | ||
| 246 | */ | ||
| 247 | u64 version = filp->f_version; | ||
| 248 | |||
| 249 | error = filldir(dirent, de->name, | ||
| 250 | de->name_len, | 233 | de->name_len, |
| 251 | filp->f_pos, | ||
| 252 | le32_to_cpu(de->inode), | 234 | le32_to_cpu(de->inode), |
| 253 | get_dtype(sb, de->file_type)); | 235 | get_dtype(sb, de->file_type))) { |
| 254 | if (error) | 236 | brelse(bh); |
| 255 | break; | 237 | return 0; |
| 256 | if (version != filp->f_version) | 238 | } |
| 257 | goto revalidate; | ||
| 258 | stored++; | ||
| 259 | } | 239 | } |
| 260 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, | 240 | ctx->pos += ext4_rec_len_from_disk(de->rec_len, |
| 261 | sb->s_blocksize); | 241 | sb->s_blocksize); |
| 262 | } | 242 | } |
| 263 | offset = 0; | 243 | offset = 0; |
| 264 | brelse(bh); | 244 | brelse(bh); |
| 245 | if (ctx->pos < inode->i_size) { | ||
| 246 | if (!dir_relax(inode)) | ||
| 247 | return 0; | ||
| 248 | } | ||
| 265 | } | 249 | } |
| 266 | out: | 250 | return 0; |
| 267 | return ret; | ||
| 268 | } | 251 | } |
| 269 | 252 | ||
| 270 | static inline int is_32bit_api(void) | 253 | static inline int is_32bit_api(void) |
| @@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, | |||
| 492 | * for all entres on the fname linked list. (Normally there is only | 475 | * for all entres on the fname linked list. (Normally there is only |
| 493 | * one entry on the linked list, unless there are 62 bit hash collisions.) | 476 | * one entry on the linked list, unless there are 62 bit hash collisions.) |
| 494 | */ | 477 | */ |
| 495 | static int call_filldir(struct file *filp, void *dirent, | 478 | static int call_filldir(struct file *file, struct dir_context *ctx, |
| 496 | filldir_t filldir, struct fname *fname) | 479 | struct fname *fname) |
| 497 | { | 480 | { |
| 498 | struct dir_private_info *info = filp->private_data; | 481 | struct dir_private_info *info = file->private_data; |
| 499 | loff_t curr_pos; | 482 | struct inode *inode = file_inode(file); |
| 500 | struct inode *inode = file_inode(filp); | 483 | struct super_block *sb = inode->i_sb; |
| 501 | struct super_block *sb; | ||
| 502 | int error; | ||
| 503 | |||
| 504 | sb = inode->i_sb; | ||
| 505 | 484 | ||
| 506 | if (!fname) { | 485 | if (!fname) { |
| 507 | ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " | 486 | ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " |
| @@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent, | |||
| 509 | inode->i_ino, current->comm); | 488 | inode->i_ino, current->comm); |
| 510 | return 0; | 489 | return 0; |
| 511 | } | 490 | } |
| 512 | curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); | 491 | ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); |
| 513 | while (fname) { | 492 | while (fname) { |
| 514 | error = filldir(dirent, fname->name, | 493 | if (!dir_emit(ctx, fname->name, |
| 515 | fname->name_len, curr_pos, | 494 | fname->name_len, |
| 516 | fname->inode, | 495 | fname->inode, |
| 517 | get_dtype(sb, fname->file_type)); | 496 | get_dtype(sb, fname->file_type))) { |
| 518 | if (error) { | ||
| 519 | filp->f_pos = curr_pos; | ||
| 520 | info->extra_fname = fname; | 497 | info->extra_fname = fname; |
| 521 | return error; | 498 | return 1; |
| 522 | } | 499 | } |
| 523 | fname = fname->next; | 500 | fname = fname->next; |
| 524 | } | 501 | } |
| 525 | return 0; | 502 | return 0; |
| 526 | } | 503 | } |
| 527 | 504 | ||
| 528 | static int ext4_dx_readdir(struct file *filp, | 505 | static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) |
| 529 | void *dirent, filldir_t filldir) | ||
| 530 | { | 506 | { |
| 531 | struct dir_private_info *info = filp->private_data; | 507 | struct dir_private_info *info = file->private_data; |
| 532 | struct inode *inode = file_inode(filp); | 508 | struct inode *inode = file_inode(file); |
| 533 | struct fname *fname; | 509 | struct fname *fname; |
| 534 | int ret; | 510 | int ret; |
| 535 | 511 | ||
| 536 | if (!info) { | 512 | if (!info) { |
| 537 | info = ext4_htree_create_dir_info(filp, filp->f_pos); | 513 | info = ext4_htree_create_dir_info(file, ctx->pos); |
| 538 | if (!info) | 514 | if (!info) |
| 539 | return -ENOMEM; | 515 | return -ENOMEM; |
| 540 | filp->private_data = info; | 516 | file->private_data = info; |
| 541 | } | 517 | } |
| 542 | 518 | ||
| 543 | if (filp->f_pos == ext4_get_htree_eof(filp)) | 519 | if (ctx->pos == ext4_get_htree_eof(file)) |
| 544 | return 0; /* EOF */ | 520 | return 0; /* EOF */ |
| 545 | 521 | ||
| 546 | /* Some one has messed with f_pos; reset the world */ | 522 | /* Some one has messed with f_pos; reset the world */ |
| 547 | if (info->last_pos != filp->f_pos) { | 523 | if (info->last_pos != ctx->pos) { |
| 548 | free_rb_tree_fname(&info->root); | 524 | free_rb_tree_fname(&info->root); |
| 549 | info->curr_node = NULL; | 525 | info->curr_node = NULL; |
| 550 | info->extra_fname = NULL; | 526 | info->extra_fname = NULL; |
| 551 | info->curr_hash = pos2maj_hash(filp, filp->f_pos); | 527 | info->curr_hash = pos2maj_hash(file, ctx->pos); |
| 552 | info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); | 528 | info->curr_minor_hash = pos2min_hash(file, ctx->pos); |
| 553 | } | 529 | } |
| 554 | 530 | ||
| 555 | /* | 531 | /* |
| @@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp, | |||
| 557 | * chain, return them first. | 533 | * chain, return them first. |
| 558 | */ | 534 | */ |
| 559 | if (info->extra_fname) { | 535 | if (info->extra_fname) { |
| 560 | if (call_filldir(filp, dirent, filldir, info->extra_fname)) | 536 | if (call_filldir(file, ctx, info->extra_fname)) |
| 561 | goto finished; | 537 | goto finished; |
| 562 | info->extra_fname = NULL; | 538 | info->extra_fname = NULL; |
| 563 | goto next_node; | 539 | goto next_node; |
| @@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp, | |||
| 571 | * cached entries. | 547 | * cached entries. |
| 572 | */ | 548 | */ |
| 573 | if ((!info->curr_node) || | 549 | if ((!info->curr_node) || |
| 574 | (filp->f_version != inode->i_version)) { | 550 | (file->f_version != inode->i_version)) { |
| 575 | info->curr_node = NULL; | 551 | info->curr_node = NULL; |
| 576 | free_rb_tree_fname(&info->root); | 552 | free_rb_tree_fname(&info->root); |
| 577 | filp->f_version = inode->i_version; | 553 | file->f_version = inode->i_version; |
| 578 | ret = ext4_htree_fill_tree(filp, info->curr_hash, | 554 | ret = ext4_htree_fill_tree(file, info->curr_hash, |
| 579 | info->curr_minor_hash, | 555 | info->curr_minor_hash, |
| 580 | &info->next_hash); | 556 | &info->next_hash); |
| 581 | if (ret < 0) | 557 | if (ret < 0) |
| 582 | return ret; | 558 | return ret; |
| 583 | if (ret == 0) { | 559 | if (ret == 0) { |
| 584 | filp->f_pos = ext4_get_htree_eof(filp); | 560 | ctx->pos = ext4_get_htree_eof(file); |
| 585 | break; | 561 | break; |
| 586 | } | 562 | } |
| 587 | info->curr_node = rb_first(&info->root); | 563 | info->curr_node = rb_first(&info->root); |
| @@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp, | |||
| 590 | fname = rb_entry(info->curr_node, struct fname, rb_hash); | 566 | fname = rb_entry(info->curr_node, struct fname, rb_hash); |
| 591 | info->curr_hash = fname->hash; | 567 | info->curr_hash = fname->hash; |
| 592 | info->curr_minor_hash = fname->minor_hash; | 568 | info->curr_minor_hash = fname->minor_hash; |
| 593 | if (call_filldir(filp, dirent, filldir, fname)) | 569 | if (call_filldir(file, ctx, fname)) |
| 594 | break; | 570 | break; |
| 595 | next_node: | 571 | next_node: |
| 596 | info->curr_node = rb_next(info->curr_node); | 572 | info->curr_node = rb_next(info->curr_node); |
| @@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp, | |||
| 601 | info->curr_minor_hash = fname->minor_hash; | 577 | info->curr_minor_hash = fname->minor_hash; |
| 602 | } else { | 578 | } else { |
| 603 | if (info->next_hash == ~0) { | 579 | if (info->next_hash == ~0) { |
| 604 | filp->f_pos = ext4_get_htree_eof(filp); | 580 | ctx->pos = ext4_get_htree_eof(file); |
| 605 | break; | 581 | break; |
| 606 | } | 582 | } |
| 607 | info->curr_hash = info->next_hash; | 583 | info->curr_hash = info->next_hash; |
| @@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp, | |||
| 609 | } | 585 | } |
| 610 | } | 586 | } |
| 611 | finished: | 587 | finished: |
| 612 | info->last_pos = filp->f_pos; | 588 | info->last_pos = ctx->pos; |
| 613 | return 0; | 589 | return 0; |
| 614 | } | 590 | } |
| 615 | 591 | ||
| @@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) | |||
| 624 | const struct file_operations ext4_dir_operations = { | 600 | const struct file_operations ext4_dir_operations = { |
| 625 | .llseek = ext4_dir_llseek, | 601 | .llseek = ext4_dir_llseek, |
| 626 | .read = generic_read_dir, | 602 | .read = generic_read_dir, |
| 627 | .readdir = ext4_readdir, | 603 | .iterate = ext4_readdir, |
| 628 | .unlocked_ioctl = ext4_ioctl, | 604 | .unlocked_ioctl = ext4_ioctl, |
| 629 | #ifdef CONFIG_COMPAT | 605 | #ifdef CONFIG_COMPAT |
| 630 | .compat_ioctl = ext4_compat_ioctl, | 606 | .compat_ioctl = ext4_compat_ioctl, |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5aae3d12d400..b577e45425b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -177,38 +177,28 @@ struct ext4_map_blocks { | |||
| 177 | }; | 177 | }; |
| 178 | 178 | ||
| 179 | /* | 179 | /* |
| 180 | * For delayed allocation tracking | ||
| 181 | */ | ||
| 182 | struct mpage_da_data { | ||
| 183 | struct inode *inode; | ||
| 184 | sector_t b_blocknr; /* start block number of extent */ | ||
| 185 | size_t b_size; /* size of extent */ | ||
| 186 | unsigned long b_state; /* state of the extent */ | ||
| 187 | unsigned long first_page, next_page; /* extent of pages */ | ||
| 188 | struct writeback_control *wbc; | ||
| 189 | int io_done; | ||
| 190 | int pages_written; | ||
| 191 | int retval; | ||
| 192 | }; | ||
| 193 | |||
| 194 | /* | ||
| 195 | * Flags for ext4_io_end->flags | 180 | * Flags for ext4_io_end->flags |
| 196 | */ | 181 | */ |
| 197 | #define EXT4_IO_END_UNWRITTEN 0x0001 | 182 | #define EXT4_IO_END_UNWRITTEN 0x0001 |
| 198 | #define EXT4_IO_END_ERROR 0x0002 | 183 | #define EXT4_IO_END_DIRECT 0x0002 |
| 199 | #define EXT4_IO_END_DIRECT 0x0004 | ||
| 200 | 184 | ||
| 201 | /* | 185 | /* |
| 202 | * For converting uninitialized extents on a work queue. | 186 | * For converting uninitialized extents on a work queue. 'handle' is used for |
| 187 | * buffered writeback. | ||
| 203 | */ | 188 | */ |
| 204 | typedef struct ext4_io_end { | 189 | typedef struct ext4_io_end { |
| 205 | struct list_head list; /* per-file finished IO list */ | 190 | struct list_head list; /* per-file finished IO list */ |
| 191 | handle_t *handle; /* handle reserved for extent | ||
| 192 | * conversion */ | ||
| 206 | struct inode *inode; /* file being written to */ | 193 | struct inode *inode; /* file being written to */ |
| 194 | struct bio *bio; /* Linked list of completed | ||
| 195 | * bios covering the extent */ | ||
| 207 | unsigned int flag; /* unwritten or not */ | 196 | unsigned int flag; /* unwritten or not */ |
| 208 | loff_t offset; /* offset in the file */ | 197 | loff_t offset; /* offset in the file */ |
| 209 | ssize_t size; /* size of the extent */ | 198 | ssize_t size; /* size of the extent */ |
| 210 | struct kiocb *iocb; /* iocb struct for AIO */ | 199 | struct kiocb *iocb; /* iocb struct for AIO */ |
| 211 | int result; /* error value for AIO */ | 200 | int result; /* error value for AIO */ |
| 201 | atomic_t count; /* reference counter */ | ||
| 212 | } ext4_io_end_t; | 202 | } ext4_io_end_t; |
| 213 | 203 | ||
| 214 | struct ext4_io_submit { | 204 | struct ext4_io_submit { |
| @@ -581,11 +571,6 @@ enum { | |||
| 581 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 | 571 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 |
| 582 | 572 | ||
| 583 | /* | 573 | /* |
| 584 | * Flags used by ext4_discard_partial_page_buffers | ||
| 585 | */ | ||
| 586 | #define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 | ||
| 587 | |||
| 588 | /* | ||
| 589 | * ioctl commands | 574 | * ioctl commands |
| 590 | */ | 575 | */ |
| 591 | #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS | 576 | #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS |
| @@ -879,6 +864,7 @@ struct ext4_inode_info { | |||
| 879 | rwlock_t i_es_lock; | 864 | rwlock_t i_es_lock; |
| 880 | struct list_head i_es_lru; | 865 | struct list_head i_es_lru; |
| 881 | unsigned int i_es_lru_nr; /* protected by i_es_lock */ | 866 | unsigned int i_es_lru_nr; /* protected by i_es_lock */ |
| 867 | unsigned long i_touch_when; /* jiffies of last accessing */ | ||
| 882 | 868 | ||
| 883 | /* ialloc */ | 869 | /* ialloc */ |
| 884 | ext4_group_t i_last_alloc_group; | 870 | ext4_group_t i_last_alloc_group; |
| @@ -903,12 +889,22 @@ struct ext4_inode_info { | |||
| 903 | qsize_t i_reserved_quota; | 889 | qsize_t i_reserved_quota; |
| 904 | #endif | 890 | #endif |
| 905 | 891 | ||
| 906 | /* completed IOs that might need unwritten extents handling */ | 892 | /* Lock protecting lists below */ |
| 907 | struct list_head i_completed_io_list; | ||
| 908 | spinlock_t i_completed_io_lock; | 893 | spinlock_t i_completed_io_lock; |
| 894 | /* | ||
| 895 | * Completed IOs that need unwritten extents handling and have | ||
| 896 | * transaction reserved | ||
| 897 | */ | ||
| 898 | struct list_head i_rsv_conversion_list; | ||
| 899 | /* | ||
| 900 | * Completed IOs that need unwritten extents handling and don't have | ||
| 901 | * transaction reserved | ||
| 902 | */ | ||
| 903 | struct list_head i_unrsv_conversion_list; | ||
| 909 | atomic_t i_ioend_count; /* Number of outstanding io_end structs */ | 904 | atomic_t i_ioend_count; /* Number of outstanding io_end structs */ |
| 910 | atomic_t i_unwritten; /* Nr. of inflight conversions pending */ | 905 | atomic_t i_unwritten; /* Nr. of inflight conversions pending */ |
| 911 | struct work_struct i_unwritten_work; /* deferred extent conversion */ | 906 | struct work_struct i_rsv_conversion_work; |
| 907 | struct work_struct i_unrsv_conversion_work; | ||
| 912 | 908 | ||
| 913 | spinlock_t i_block_reservation_lock; | 909 | spinlock_t i_block_reservation_lock; |
| 914 | 910 | ||
| @@ -1245,7 +1241,6 @@ struct ext4_sb_info { | |||
| 1245 | unsigned int s_mb_stats; | 1241 | unsigned int s_mb_stats; |
| 1246 | unsigned int s_mb_order2_reqs; | 1242 | unsigned int s_mb_order2_reqs; |
| 1247 | unsigned int s_mb_group_prealloc; | 1243 | unsigned int s_mb_group_prealloc; |
| 1248 | unsigned int s_max_writeback_mb_bump; | ||
| 1249 | unsigned int s_max_dir_size_kb; | 1244 | unsigned int s_max_dir_size_kb; |
| 1250 | /* where last allocation was done - for stream allocation */ | 1245 | /* where last allocation was done - for stream allocation */ |
| 1251 | unsigned long s_mb_last_group; | 1246 | unsigned long s_mb_last_group; |
| @@ -1281,8 +1276,10 @@ struct ext4_sb_info { | |||
| 1281 | struct flex_groups *s_flex_groups; | 1276 | struct flex_groups *s_flex_groups; |
| 1282 | ext4_group_t s_flex_groups_allocated; | 1277 | ext4_group_t s_flex_groups_allocated; |
| 1283 | 1278 | ||
| 1284 | /* workqueue for dio unwritten */ | 1279 | /* workqueue for unreserved extent convertions (dio) */ |
| 1285 | struct workqueue_struct *dio_unwritten_wq; | 1280 | struct workqueue_struct *unrsv_conversion_wq; |
| 1281 | /* workqueue for reserved extent conversions (buffered io) */ | ||
| 1282 | struct workqueue_struct *rsv_conversion_wq; | ||
| 1286 | 1283 | ||
| 1287 | /* timer for periodic error stats printing */ | 1284 | /* timer for periodic error stats printing */ |
| 1288 | struct timer_list s_err_report; | 1285 | struct timer_list s_err_report; |
| @@ -1307,6 +1304,7 @@ struct ext4_sb_info { | |||
| 1307 | /* Reclaim extents from extent status tree */ | 1304 | /* Reclaim extents from extent status tree */ |
| 1308 | struct shrinker s_es_shrinker; | 1305 | struct shrinker s_es_shrinker; |
| 1309 | struct list_head s_es_lru; | 1306 | struct list_head s_es_lru; |
| 1307 | unsigned long s_es_last_sorted; | ||
| 1310 | struct percpu_counter s_extent_cache_cnt; | 1308 | struct percpu_counter s_extent_cache_cnt; |
| 1311 | spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; | 1309 | spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; |
| 1312 | }; | 1310 | }; |
| @@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, | |||
| 1342 | struct ext4_io_end *io_end) | 1340 | struct ext4_io_end *io_end) |
| 1343 | { | 1341 | { |
| 1344 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 1342 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
| 1343 | /* Writeback has to have coversion transaction reserved */ | ||
| 1344 | WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && | ||
| 1345 | !(io_end->flag & EXT4_IO_END_DIRECT)); | ||
| 1345 | io_end->flag |= EXT4_IO_END_UNWRITTEN; | 1346 | io_end->flag |= EXT4_IO_END_UNWRITTEN; |
| 1346 | atomic_inc(&EXT4_I(inode)->i_unwritten); | 1347 | atomic_inc(&EXT4_I(inode)->i_unwritten); |
| 1347 | } | 1348 | } |
| @@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) | |||
| 1999 | 2000 | ||
| 2000 | /* fsync.c */ | 2001 | /* fsync.c */ |
| 2001 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); | 2002 | extern int ext4_sync_file(struct file *, loff_t, loff_t, int); |
| 2002 | extern int ext4_flush_unwritten_io(struct inode *); | ||
| 2003 | 2003 | ||
| 2004 | /* hash.c */ | 2004 | /* hash.c */ |
| 2005 | extern int ext4fs_dirhash(const char *name, int len, struct | 2005 | extern int ext4fs_dirhash(const char *name, int len, struct |
| @@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); | |||
| 2088 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | 2088 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
| 2089 | extern int ext4_can_truncate(struct inode *inode); | 2089 | extern int ext4_can_truncate(struct inode *inode); |
| 2090 | extern void ext4_truncate(struct inode *); | 2090 | extern void ext4_truncate(struct inode *); |
| 2091 | extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); | 2091 | extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); |
| 2092 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | 2092 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); |
| 2093 | extern void ext4_set_inode_flags(struct inode *); | 2093 | extern void ext4_set_inode_flags(struct inode *); |
| 2094 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 2094 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
| @@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode); | |||
| 2096 | extern void ext4_set_aops(struct inode *inode); | 2096 | extern void ext4_set_aops(struct inode *inode); |
| 2097 | extern int ext4_writepage_trans_blocks(struct inode *); | 2097 | extern int ext4_writepage_trans_blocks(struct inode *); |
| 2098 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | 2098 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); |
| 2099 | extern int ext4_discard_partial_page_buffers(handle_t *handle, | 2099 | extern int ext4_block_truncate_page(handle_t *handle, |
| 2100 | struct address_space *mapping, loff_t from, | 2100 | struct address_space *mapping, loff_t from); |
| 2101 | loff_t length, int flags); | 2101 | extern int ext4_block_zero_page_range(handle_t *handle, |
| 2102 | struct address_space *mapping, loff_t from, loff_t length); | ||
| 2103 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | ||
| 2104 | loff_t lstart, loff_t lend); | ||
| 2102 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2105 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
| 2103 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); | 2106 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); |
| 2104 | extern void ext4_da_update_reserve_space(struct inode *inode, | 2107 | extern void ext4_da_update_reserve_space(struct inode *inode, |
| @@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
| 2111 | const struct iovec *iov, loff_t offset, | 2114 | const struct iovec *iov, loff_t offset, |
| 2112 | unsigned long nr_segs); | 2115 | unsigned long nr_segs); |
| 2113 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); | 2116 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); |
| 2114 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); | 2117 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); |
| 2115 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); | 2118 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); |
| 2116 | extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, | 2119 | extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, |
| 2117 | ext4_lblk_t first, ext4_lblk_t stop); | 2120 | ext4_lblk_t first, ext4_lblk_t stop); |
| @@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, | |||
| 2166 | ext4_group_t ngroup); | 2169 | ext4_group_t ngroup); |
| 2167 | extern const char *ext4_decode_error(struct super_block *sb, int errno, | 2170 | extern const char *ext4_decode_error(struct super_block *sb, int errno, |
| 2168 | char nbuf[16]); | 2171 | char nbuf[16]); |
| 2172 | |||
| 2169 | extern __printf(4, 5) | 2173 | extern __printf(4, 5) |
| 2170 | void __ext4_error(struct super_block *, const char *, unsigned int, | 2174 | void __ext4_error(struct super_block *, const char *, unsigned int, |
| 2171 | const char *, ...); | 2175 | const char *, ...); |
| 2172 | #define ext4_error(sb, message...) __ext4_error(sb, __func__, \ | ||
| 2173 | __LINE__, ## message) | ||
| 2174 | extern __printf(5, 6) | 2176 | extern __printf(5, 6) |
| 2175 | void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, | 2177 | void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, |
| 2176 | const char *, ...); | 2178 | const char *, ...); |
| 2177 | extern __printf(5, 6) | 2179 | extern __printf(5, 6) |
| 2178 | void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, | 2180 | void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, |
| 2179 | const char *, ...); | 2181 | const char *, ...); |
| 2180 | extern void __ext4_std_error(struct super_block *, const char *, | 2182 | extern void __ext4_std_error(struct super_block *, const char *, |
| 2181 | unsigned int, int); | 2183 | unsigned int, int); |
| 2182 | extern __printf(4, 5) | 2184 | extern __printf(4, 5) |
| 2183 | void __ext4_abort(struct super_block *, const char *, unsigned int, | 2185 | void __ext4_abort(struct super_block *, const char *, unsigned int, |
| 2184 | const char *, ...); | 2186 | const char *, ...); |
| 2185 | #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ | ||
| 2186 | __LINE__, ## message) | ||
| 2187 | extern __printf(4, 5) | 2187 | extern __printf(4, 5) |
| 2188 | void __ext4_warning(struct super_block *, const char *, unsigned int, | 2188 | void __ext4_warning(struct super_block *, const char *, unsigned int, |
| 2189 | const char *, ...); | 2189 | const char *, ...); |
| 2190 | #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ | ||
| 2191 | __LINE__, ## message) | ||
| 2192 | extern __printf(3, 4) | 2190 | extern __printf(3, 4) |
| 2193 | void ext4_msg(struct super_block *, const char *, const char *, ...); | 2191 | void __ext4_msg(struct super_block *, const char *, const char *, ...); |
| 2194 | extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, | 2192 | extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, |
| 2195 | const char *, unsigned int, const char *); | 2193 | const char *, unsigned int, const char *); |
| 2196 | #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ | ||
| 2197 | __LINE__, msg) | ||
| 2198 | extern __printf(7, 8) | 2194 | extern __printf(7, 8) |
| 2199 | void __ext4_grp_locked_error(const char *, unsigned int, | 2195 | void __ext4_grp_locked_error(const char *, unsigned int, |
| 2200 | struct super_block *, ext4_group_t, | 2196 | struct super_block *, ext4_group_t, |
| 2201 | unsigned long, ext4_fsblk_t, | 2197 | unsigned long, ext4_fsblk_t, |
| 2202 | const char *, ...); | 2198 | const char *, ...); |
| 2203 | #define ext4_grp_locked_error(sb, grp, message...) \ | 2199 | |
| 2204 | __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) | 2200 | #ifdef CONFIG_PRINTK |
| 2201 | |||
| 2202 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ | ||
| 2203 | __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) | ||
| 2204 | #define ext4_error_file(file, func, line, block, fmt, ...) \ | ||
| 2205 | __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) | ||
| 2206 | #define ext4_error(sb, fmt, ...) \ | ||
| 2207 | __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | ||
| 2208 | #define ext4_abort(sb, fmt, ...) \ | ||
| 2209 | __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | ||
| 2210 | #define ext4_warning(sb, fmt, ...) \ | ||
| 2211 | __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) | ||
| 2212 | #define ext4_msg(sb, level, fmt, ...) \ | ||
| 2213 | __ext4_msg(sb, level, fmt, ##__VA_ARGS__) | ||
| 2214 | #define dump_mmp_msg(sb, mmp, msg) \ | ||
| 2215 | __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) | ||
| 2216 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ | ||
| 2217 | __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ | ||
| 2218 | fmt, ##__VA_ARGS__) | ||
| 2219 | |||
| 2220 | #else | ||
| 2221 | |||
| 2222 | #define ext4_error_inode(inode, func, line, block, fmt, ...) \ | ||
| 2223 | do { \ | ||
| 2224 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2225 | __ext4_error_inode(inode, "", 0, block, " "); \ | ||
| 2226 | } while (0) | ||
| 2227 | #define ext4_error_file(file, func, line, block, fmt, ...) \ | ||
| 2228 | do { \ | ||
| 2229 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2230 | __ext4_error_file(file, "", 0, block, " "); \ | ||
| 2231 | } while (0) | ||
| 2232 | #define ext4_error(sb, fmt, ...) \ | ||
| 2233 | do { \ | ||
| 2234 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2235 | __ext4_error(sb, "", 0, " "); \ | ||
| 2236 | } while (0) | ||
| 2237 | #define ext4_abort(sb, fmt, ...) \ | ||
| 2238 | do { \ | ||
| 2239 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2240 | __ext4_abort(sb, "", 0, " "); \ | ||
| 2241 | } while (0) | ||
| 2242 | #define ext4_warning(sb, fmt, ...) \ | ||
| 2243 | do { \ | ||
| 2244 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2245 | __ext4_warning(sb, "", 0, " "); \ | ||
| 2246 | } while (0) | ||
| 2247 | #define ext4_msg(sb, level, fmt, ...) \ | ||
| 2248 | do { \ | ||
| 2249 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2250 | __ext4_msg(sb, "", " "); \ | ||
| 2251 | } while (0) | ||
| 2252 | #define dump_mmp_msg(sb, mmp, msg) \ | ||
| 2253 | __dump_mmp_msg(sb, mmp, "", 0, "") | ||
| 2254 | #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ | ||
| 2255 | do { \ | ||
| 2256 | no_printk(fmt, ##__VA_ARGS__); \ | ||
| 2257 | __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ | ||
| 2258 | } while (0) | ||
| 2259 | |||
| 2260 | #endif | ||
| 2261 | |||
| 2205 | extern void ext4_update_dynamic_rev(struct super_block *sb); | 2262 | extern void ext4_update_dynamic_rev(struct super_block *sb); |
| 2206 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, | 2263 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, |
| 2207 | __u32 compat); | 2264 | __u32 compat); |
| @@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | |||
| 2312 | { | 2369 | { |
| 2313 | struct ext4_group_info ***grp_info; | 2370 | struct ext4_group_info ***grp_info; |
| 2314 | long indexv, indexh; | 2371 | long indexv, indexh; |
| 2372 | BUG_ON(group >= EXT4_SB(sb)->s_groups_count); | ||
| 2315 | grp_info = EXT4_SB(sb)->s_group_info; | 2373 | grp_info = EXT4_SB(sb)->s_group_info; |
| 2316 | indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); | 2374 | indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); |
| 2317 | indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); | 2375 | indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); |
| @@ -2515,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle, | |||
| 2515 | struct inode *parent, | 2573 | struct inode *parent, |
| 2516 | struct inode *inode); | 2574 | struct inode *inode); |
| 2517 | extern int ext4_read_inline_dir(struct file *filp, | 2575 | extern int ext4_read_inline_dir(struct file *filp, |
| 2518 | void *dirent, filldir_t filldir, | 2576 | struct dir_context *ctx, |
| 2519 | int *has_inline_data); | 2577 | int *has_inline_data); |
| 2520 | extern int htree_inlinedir_to_tree(struct file *dir_file, | 2578 | extern int htree_inlinedir_to_tree(struct file *dir_file, |
| 2521 | struct inode *dir, ext4_lblk_t block, | 2579 | struct inode *dir, ext4_lblk_t block, |
| @@ -2598,8 +2656,7 @@ struct ext4_extent; | |||
| 2598 | 2656 | ||
| 2599 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 2657 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
| 2600 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | 2658 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); |
| 2601 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, | 2659 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); |
| 2602 | int chunk); | ||
| 2603 | extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | 2660 | extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, |
| 2604 | struct ext4_map_blocks *map, int flags); | 2661 | struct ext4_map_blocks *map, int flags); |
| 2605 | extern void ext4_ext_truncate(handle_t *, struct inode *); | 2662 | extern void ext4_ext_truncate(handle_t *, struct inode *); |
| @@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *); | |||
| 2609 | extern void ext4_ext_release(struct super_block *); | 2666 | extern void ext4_ext_release(struct super_block *); |
| 2610 | extern long ext4_fallocate(struct file *file, int mode, loff_t offset, | 2667 | extern long ext4_fallocate(struct file *file, int mode, loff_t offset, |
| 2611 | loff_t len); | 2668 | loff_t len); |
| 2612 | extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | 2669 | extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, |
| 2613 | ssize_t len); | 2670 | loff_t offset, ssize_t len); |
| 2614 | extern int ext4_map_blocks(handle_t *handle, struct inode *inode, | 2671 | extern int ext4_map_blocks(handle_t *handle, struct inode *inode, |
| 2615 | struct ext4_map_blocks *map, int flags); | 2672 | struct ext4_map_blocks *map, int flags); |
| 2616 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, | 2673 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, |
| @@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
| 2650 | 2707 | ||
| 2651 | /* page-io.c */ | 2708 | /* page-io.c */ |
| 2652 | extern int __init ext4_init_pageio(void); | 2709 | extern int __init ext4_init_pageio(void); |
| 2653 | extern void ext4_add_complete_io(ext4_io_end_t *io_end); | ||
| 2654 | extern void ext4_exit_pageio(void); | 2710 | extern void ext4_exit_pageio(void); |
| 2655 | extern void ext4_ioend_shutdown(struct inode *); | ||
| 2656 | extern void ext4_free_io_end(ext4_io_end_t *io); | ||
| 2657 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); | 2711 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); |
| 2658 | extern void ext4_end_io_work(struct work_struct *work); | 2712 | extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); |
| 2713 | extern int ext4_put_io_end(ext4_io_end_t *io_end); | ||
| 2714 | extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); | ||
| 2715 | extern void ext4_io_submit_init(struct ext4_io_submit *io, | ||
| 2716 | struct writeback_control *wbc); | ||
| 2717 | extern void ext4_end_io_rsv_work(struct work_struct *work); | ||
| 2718 | extern void ext4_end_io_unrsv_work(struct work_struct *work); | ||
| 2659 | extern void ext4_io_submit(struct ext4_io_submit *io); | 2719 | extern void ext4_io_submit(struct ext4_io_submit *io); |
| 2660 | extern int ext4_bio_write_page(struct ext4_io_submit *io, | 2720 | extern int ext4_bio_write_page(struct ext4_io_submit *io, |
| 2661 | struct page *page, | 2721 | struct page *page, |
| @@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); | |||
| 2668 | extern int ext4_mmp_csum_verify(struct super_block *sb, | 2728 | extern int ext4_mmp_csum_verify(struct super_block *sb, |
| 2669 | struct mmp_struct *mmp); | 2729 | struct mmp_struct *mmp); |
| 2670 | 2730 | ||
| 2671 | /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ | 2731 | /* |
| 2732 | * Note that these flags will never ever appear in a buffer_head's state flag. | ||
| 2733 | * See EXT4_MAP_... to see where this is used. | ||
| 2734 | */ | ||
| 2672 | enum ext4_state_bits { | 2735 | enum ext4_state_bits { |
| 2673 | BH_Uninit /* blocks are allocated but uninitialized on disk */ | 2736 | BH_Uninit /* blocks are allocated but uninitialized on disk */ |
| 2674 | = BH_JBDPrivateStart, | 2737 | = BH_JBDPrivateStart, |
| 2675 | BH_AllocFromCluster, /* allocated blocks were part of already | 2738 | BH_AllocFromCluster, /* allocated blocks were part of already |
| 2676 | * allocated cluster. Note that this flag will | 2739 | * allocated cluster. */ |
| 2677 | * never, ever appear in a buffer_head's state | ||
| 2678 | * flag. See EXT4_MAP_FROM_CLUSTER to see where | ||
| 2679 | * this is used. */ | ||
| 2680 | }; | 2740 | }; |
| 2681 | 2741 | ||
| 2682 | BUFFER_FNS(Uninit, uninit) | ||
| 2683 | TAS_BUFFER_FNS(Uninit, uninit) | ||
| 2684 | |||
| 2685 | /* | 2742 | /* |
| 2686 | * Add new method to test whether block and inode bitmaps are properly | 2743 | * Add new method to test whether block and inode bitmaps are properly |
| 2687 | * initialized. With uninit_bg reading the block from disk is not enough | 2744 | * initialized. With uninit_bg reading the block from disk is not enough |
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 451eb4045330..72a3600aedbd 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c | |||
| @@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle) | |||
| 38 | /* | 38 | /* |
| 39 | * Wrappers for jbd2_journal_start/end. | 39 | * Wrappers for jbd2_journal_start/end. |
| 40 | */ | 40 | */ |
| 41 | handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, | 41 | static int ext4_journal_check_start(struct super_block *sb) |
| 42 | int type, int nblocks) | ||
| 43 | { | 42 | { |
| 44 | journal_t *journal; | 43 | journal_t *journal; |
| 45 | 44 | ||
| 46 | might_sleep(); | 45 | might_sleep(); |
| 47 | |||
| 48 | trace_ext4_journal_start(sb, nblocks, _RET_IP_); | ||
| 49 | if (sb->s_flags & MS_RDONLY) | 46 | if (sb->s_flags & MS_RDONLY) |
| 50 | return ERR_PTR(-EROFS); | 47 | return -EROFS; |
| 51 | |||
| 52 | WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); | 48 | WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); |
| 53 | journal = EXT4_SB(sb)->s_journal; | 49 | journal = EXT4_SB(sb)->s_journal; |
| 54 | if (!journal) | ||
| 55 | return ext4_get_nojournal(); | ||
| 56 | /* | 50 | /* |
| 57 | * Special case here: if the journal has aborted behind our | 51 | * Special case here: if the journal has aborted behind our |
| 58 | * backs (eg. EIO in the commit thread), then we still need to | 52 | * backs (eg. EIO in the commit thread), then we still need to |
| 59 | * take the FS itself readonly cleanly. | 53 | * take the FS itself readonly cleanly. |
| 60 | */ | 54 | */ |
| 61 | if (is_journal_aborted(journal)) { | 55 | if (journal && is_journal_aborted(journal)) { |
| 62 | ext4_abort(sb, "Detected aborted journal"); | 56 | ext4_abort(sb, "Detected aborted journal"); |
| 63 | return ERR_PTR(-EROFS); | 57 | return -EROFS; |
| 64 | } | 58 | } |
| 65 | return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); | 59 | return 0; |
| 60 | } | ||
| 61 | |||
| 62 | handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, | ||
| 63 | int type, int blocks, int rsv_blocks) | ||
| 64 | { | ||
| 65 | journal_t *journal; | ||
| 66 | int err; | ||
| 67 | |||
| 68 | trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); | ||
| 69 | err = ext4_journal_check_start(sb); | ||
| 70 | if (err < 0) | ||
| 71 | return ERR_PTR(err); | ||
| 72 | |||
| 73 | journal = EXT4_SB(sb)->s_journal; | ||
| 74 | if (!journal) | ||
| 75 | return ext4_get_nojournal(); | ||
| 76 | return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, | ||
| 77 | type, line); | ||
| 66 | } | 78 | } |
| 67 | 79 | ||
| 68 | int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) | 80 | int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) |
| @@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) | |||
| 86 | return err; | 98 | return err; |
| 87 | } | 99 | } |
| 88 | 100 | ||
| 101 | handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, | ||
| 102 | int type) | ||
| 103 | { | ||
| 104 | struct super_block *sb; | ||
| 105 | int err; | ||
| 106 | |||
| 107 | if (!ext4_handle_valid(handle)) | ||
| 108 | return ext4_get_nojournal(); | ||
| 109 | |||
| 110 | sb = handle->h_journal->j_private; | ||
| 111 | trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, | ||
| 112 | _RET_IP_); | ||
| 113 | err = ext4_journal_check_start(sb); | ||
| 114 | if (err < 0) { | ||
| 115 | jbd2_journal_free_reserved(handle); | ||
| 116 | return ERR_PTR(err); | ||
| 117 | } | ||
| 118 | |||
| 119 | err = jbd2_journal_start_reserved(handle, type, line); | ||
| 120 | if (err < 0) | ||
| 121 | return ERR_PTR(err); | ||
| 122 | return handle; | ||
| 123 | } | ||
| 124 | |||
| 89 | void ext4_journal_abort_handle(const char *caller, unsigned int line, | 125 | void ext4_journal_abort_handle(const char *caller, unsigned int line, |
| 90 | const char *err_fn, struct buffer_head *bh, | 126 | const char *err_fn, struct buffer_head *bh, |
| 91 | handle_t *handle, int err) | 127 | handle_t *handle, int err) |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index c8c6885406db..2877258d9497 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
| @@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode) | |||
| 134 | #define EXT4_HT_MIGRATE 8 | 134 | #define EXT4_HT_MIGRATE 8 |
| 135 | #define EXT4_HT_MOVE_EXTENTS 9 | 135 | #define EXT4_HT_MOVE_EXTENTS 9 |
| 136 | #define EXT4_HT_XATTR 10 | 136 | #define EXT4_HT_XATTR 10 |
| 137 | #define EXT4_HT_MAX 11 | 137 | #define EXT4_HT_EXT_CONVERT 11 |
| 138 | #define EXT4_HT_MAX 12 | ||
| 138 | 139 | ||
| 139 | /** | 140 | /** |
| 140 | * struct ext4_journal_cb_entry - Base structure for callback information. | 141 | * struct ext4_journal_cb_entry - Base structure for callback information. |
| @@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, | |||
| 265 | __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) | 266 | __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) |
| 266 | 267 | ||
| 267 | handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, | 268 | handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, |
| 268 | int type, int nblocks); | 269 | int type, int blocks, int rsv_blocks); |
| 269 | int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); | 270 | int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); |
| 270 | 271 | ||
| 271 | #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) | 272 | #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) |
| @@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) | |||
| 300 | } | 301 | } |
| 301 | 302 | ||
| 302 | #define ext4_journal_start_sb(sb, type, nblocks) \ | 303 | #define ext4_journal_start_sb(sb, type, nblocks) \ |
| 303 | __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) | 304 | __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) |
| 304 | 305 | ||
| 305 | #define ext4_journal_start(inode, type, nblocks) \ | 306 | #define ext4_journal_start(inode, type, nblocks) \ |
| 306 | __ext4_journal_start((inode), __LINE__, (type), (nblocks)) | 307 | __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) |
| 308 | |||
| 309 | #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ | ||
| 310 | __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) | ||
| 307 | 311 | ||
| 308 | static inline handle_t *__ext4_journal_start(struct inode *inode, | 312 | static inline handle_t *__ext4_journal_start(struct inode *inode, |
| 309 | unsigned int line, int type, | 313 | unsigned int line, int type, |
| 310 | int nblocks) | 314 | int blocks, int rsv_blocks) |
| 311 | { | 315 | { |
| 312 | return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); | 316 | return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, |
| 317 | rsv_blocks); | ||
| 313 | } | 318 | } |
| 314 | 319 | ||
| 315 | #define ext4_journal_stop(handle) \ | 320 | #define ext4_journal_stop(handle) \ |
| 316 | __ext4_journal_stop(__func__, __LINE__, (handle)) | 321 | __ext4_journal_stop(__func__, __LINE__, (handle)) |
| 317 | 322 | ||
| 323 | #define ext4_journal_start_reserved(handle, type) \ | ||
| 324 | __ext4_journal_start_reserved((handle), __LINE__, (type)) | ||
| 325 | |||
| 326 | handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, | ||
| 327 | int type); | ||
| 328 | |||
| 329 | static inline void ext4_journal_free_reserved(handle_t *handle) | ||
| 330 | { | ||
| 331 | if (ext4_handle_valid(handle)) | ||
| 332 | jbd2_journal_free_reserved(handle); | ||
| 333 | } | ||
| 334 | |||
| 318 | static inline handle_t *ext4_journal_current_handle(void) | 335 | static inline handle_t *ext4_journal_current_handle(void) |
| 319 | { | 336 | { |
| 320 | return journal_current_handle(); | 337 | return journal_current_handle(); |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc0f1910b9cf..7097b0f680e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, | |||
| 2125 | next_del = ext4_find_delayed_extent(inode, &es); | 2125 | next_del = ext4_find_delayed_extent(inode, &es); |
| 2126 | if (!exists && next_del) { | 2126 | if (!exists && next_del) { |
| 2127 | exists = 1; | 2127 | exists = 1; |
| 2128 | flags |= FIEMAP_EXTENT_DELALLOC; | 2128 | flags |= (FIEMAP_EXTENT_DELALLOC | |
| 2129 | FIEMAP_EXTENT_UNKNOWN); | ||
| 2129 | } | 2130 | } |
| 2130 | up_read(&EXT4_I(inode)->i_data_sem); | 2131 | up_read(&EXT4_I(inode)->i_data_sem); |
| 2131 | 2132 | ||
| @@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, | |||
| 2328 | } | 2329 | } |
| 2329 | 2330 | ||
| 2330 | /* | 2331 | /* |
| 2331 | * How many index/leaf blocks need to change/allocate to modify nrblocks? | 2332 | * How many index/leaf blocks need to change/allocate to add @extents extents? |
| 2332 | * | 2333 | * |
| 2333 | * if nrblocks are fit in a single extent (chunk flag is 1), then | 2334 | * If we add a single extent, then in the worse case, each tree level |
| 2334 | * in the worse case, each tree level index/leaf need to be changed | 2335 | * index/leaf need to be changed in case of the tree split. |
| 2335 | * if the tree split due to insert a new extent, then the old tree | ||
| 2336 | * index/leaf need to be updated too | ||
| 2337 | * | 2336 | * |
| 2338 | * If the nrblocks are discontiguous, they could cause | 2337 | * If more extents are inserted, they could cause the whole tree split more |
| 2339 | * the whole tree split more than once, but this is really rare. | 2338 | * than once, but this is really rare. |
| 2340 | */ | 2339 | */ |
| 2341 | int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 2340 | int ext4_ext_index_trans_blocks(struct inode *inode, int extents) |
| 2342 | { | 2341 | { |
| 2343 | int index; | 2342 | int index; |
| 2344 | int depth; | 2343 | int depth; |
| @@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 2349 | 2348 | ||
| 2350 | depth = ext_depth(inode); | 2349 | depth = ext_depth(inode); |
| 2351 | 2350 | ||
| 2352 | if (chunk) | 2351 | if (extents <= 1) |
| 2353 | index = depth * 2; | 2352 | index = depth * 2; |
| 2354 | else | 2353 | else |
| 2355 | index = depth * 3; | 2354 | index = depth * 3; |
| @@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 2357 | return index; | 2356 | return index; |
| 2358 | } | 2357 | } |
| 2359 | 2358 | ||
| 2359 | static inline int get_default_free_blocks_flags(struct inode *inode) | ||
| 2360 | { | ||
| 2361 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
| 2362 | return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; | ||
| 2363 | else if (ext4_should_journal_data(inode)) | ||
| 2364 | return EXT4_FREE_BLOCKS_FORGET; | ||
| 2365 | return 0; | ||
| 2366 | } | ||
| 2367 | |||
| 2360 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 2368 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
| 2361 | struct ext4_extent *ex, | 2369 | struct ext4_extent *ex, |
| 2362 | ext4_fsblk_t *partial_cluster, | 2370 | long long *partial_cluster, |
| 2363 | ext4_lblk_t from, ext4_lblk_t to) | 2371 | ext4_lblk_t from, ext4_lblk_t to) |
| 2364 | { | 2372 | { |
| 2365 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2373 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 2366 | unsigned short ee_len = ext4_ext_get_actual_len(ex); | 2374 | unsigned short ee_len = ext4_ext_get_actual_len(ex); |
| 2367 | ext4_fsblk_t pblk; | 2375 | ext4_fsblk_t pblk; |
| 2368 | int flags = 0; | 2376 | int flags = get_default_free_blocks_flags(inode); |
| 2369 | |||
| 2370 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
| 2371 | flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; | ||
| 2372 | else if (ext4_should_journal_data(inode)) | ||
| 2373 | flags |= EXT4_FREE_BLOCKS_FORGET; | ||
| 2374 | 2377 | ||
| 2375 | /* | 2378 | /* |
| 2376 | * For bigalloc file systems, we never free a partial cluster | 2379 | * For bigalloc file systems, we never free a partial cluster |
| @@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
| 2388 | * partial cluster here. | 2391 | * partial cluster here. |
| 2389 | */ | 2392 | */ |
| 2390 | pblk = ext4_ext_pblock(ex) + ee_len - 1; | 2393 | pblk = ext4_ext_pblock(ex) + ee_len - 1; |
| 2391 | if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { | 2394 | if ((*partial_cluster > 0) && |
| 2395 | (EXT4_B2C(sbi, pblk) != *partial_cluster)) { | ||
| 2392 | ext4_free_blocks(handle, inode, NULL, | 2396 | ext4_free_blocks(handle, inode, NULL, |
| 2393 | EXT4_C2B(sbi, *partial_cluster), | 2397 | EXT4_C2B(sbi, *partial_cluster), |
| 2394 | sbi->s_cluster_ratio, flags); | 2398 | sbi->s_cluster_ratio, flags); |
| @@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
| 2414 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { | 2418 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { |
| 2415 | /* tail removal */ | 2419 | /* tail removal */ |
| 2416 | ext4_lblk_t num; | 2420 | ext4_lblk_t num; |
| 2421 | unsigned int unaligned; | ||
| 2417 | 2422 | ||
| 2418 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | 2423 | num = le32_to_cpu(ex->ee_block) + ee_len - from; |
| 2419 | pblk = ext4_ext_pblock(ex) + ee_len - num; | 2424 | pblk = ext4_ext_pblock(ex) + ee_len - num; |
| 2420 | ext_debug("free last %u blocks starting %llu\n", num, pblk); | 2425 | /* |
| 2426 | * Usually we want to free partial cluster at the end of the | ||
| 2427 | * extent, except for the situation when the cluster is still | ||
| 2428 | * used by any other extent (partial_cluster is negative). | ||
| 2429 | */ | ||
| 2430 | if (*partial_cluster < 0 && | ||
| 2431 | -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) | ||
| 2432 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; | ||
| 2433 | |||
| 2434 | ext_debug("free last %u blocks starting %llu partial %lld\n", | ||
| 2435 | num, pblk, *partial_cluster); | ||
| 2421 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); | 2436 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); |
| 2422 | /* | 2437 | /* |
| 2423 | * If the block range to be freed didn't start at the | 2438 | * If the block range to be freed didn't start at the |
| 2424 | * beginning of a cluster, and we removed the entire | 2439 | * beginning of a cluster, and we removed the entire |
| 2425 | * extent, save the partial cluster here, since we | 2440 | * extent and the cluster is not used by any other extent, |
| 2426 | * might need to delete if we determine that the | 2441 | * save the partial cluster here, since we might need to |
| 2427 | * truncate operation has removed all of the blocks in | 2442 | * delete if we determine that the truncate operation has |
| 2428 | * the cluster. | 2443 | * removed all of the blocks in the cluster. |
| 2444 | * | ||
| 2445 | * On the other hand, if we did not manage to free the whole | ||
| 2446 | * extent, we have to mark the cluster as used (store negative | ||
| 2447 | * cluster number in partial_cluster). | ||
| 2429 | */ | 2448 | */ |
| 2430 | if (pblk & (sbi->s_cluster_ratio - 1) && | 2449 | unaligned = pblk & (sbi->s_cluster_ratio - 1); |
| 2431 | (ee_len == num)) | 2450 | if (unaligned && (ee_len == num) && |
| 2451 | (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) | ||
| 2432 | *partial_cluster = EXT4_B2C(sbi, pblk); | 2452 | *partial_cluster = EXT4_B2C(sbi, pblk); |
| 2433 | else | 2453 | else if (unaligned) |
| 2454 | *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); | ||
| 2455 | else if (*partial_cluster > 0) | ||
| 2434 | *partial_cluster = 0; | 2456 | *partial_cluster = 0; |
| 2435 | } else if (from == le32_to_cpu(ex->ee_block) | 2457 | } else |
| 2436 | && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { | 2458 | ext4_error(sbi->s_sb, "strange request: removal(2) " |
| 2437 | /* head removal */ | 2459 | "%u-%u from %u:%u\n", |
| 2438 | ext4_lblk_t num; | 2460 | from, to, le32_to_cpu(ex->ee_block), ee_len); |
| 2439 | ext4_fsblk_t start; | ||
| 2440 | |||
| 2441 | num = to - from; | ||
| 2442 | start = ext4_ext_pblock(ex); | ||
| 2443 | |||
| 2444 | ext_debug("free first %u blocks starting %llu\n", num, start); | ||
| 2445 | ext4_free_blocks(handle, inode, NULL, start, num, flags); | ||
| 2446 | |||
| 2447 | } else { | ||
| 2448 | printk(KERN_INFO "strange request: removal(2) " | ||
| 2449 | "%u-%u from %u:%u\n", | ||
| 2450 | from, to, le32_to_cpu(ex->ee_block), ee_len); | ||
| 2451 | } | ||
| 2452 | return 0; | 2461 | return 0; |
| 2453 | } | 2462 | } |
| 2454 | 2463 | ||
| @@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
| 2461 | * @handle: The journal handle | 2470 | * @handle: The journal handle |
| 2462 | * @inode: The files inode | 2471 | * @inode: The files inode |
| 2463 | * @path: The path to the leaf | 2472 | * @path: The path to the leaf |
| 2473 | * @partial_cluster: The cluster which we'll have to free if all extents | ||
| 2474 | * has been released from it. It gets negative in case | ||
| 2475 | * that the cluster is still used. | ||
| 2464 | * @start: The first block to remove | 2476 | * @start: The first block to remove |
| 2465 | * @end: The last block to remove | 2477 | * @end: The last block to remove |
| 2466 | */ | 2478 | */ |
| 2467 | static int | 2479 | static int |
| 2468 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | 2480 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
| 2469 | struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, | 2481 | struct ext4_ext_path *path, |
| 2482 | long long *partial_cluster, | ||
| 2470 | ext4_lblk_t start, ext4_lblk_t end) | 2483 | ext4_lblk_t start, ext4_lblk_t end) |
| 2471 | { | 2484 | { |
| 2472 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2485 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| @@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2479 | unsigned short ex_ee_len; | 2492 | unsigned short ex_ee_len; |
| 2480 | unsigned uninitialized = 0; | 2493 | unsigned uninitialized = 0; |
| 2481 | struct ext4_extent *ex; | 2494 | struct ext4_extent *ex; |
| 2495 | ext4_fsblk_t pblk; | ||
| 2482 | 2496 | ||
| 2483 | /* the header must be checked already in ext4_ext_remove_space() */ | 2497 | /* the header must be checked already in ext4_ext_remove_space() */ |
| 2484 | ext_debug("truncate since %u in leaf to %u\n", start, end); | 2498 | ext_debug("truncate since %u in leaf to %u\n", start, end); |
| @@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2490 | return -EIO; | 2504 | return -EIO; |
| 2491 | } | 2505 | } |
| 2492 | /* find where to start removing */ | 2506 | /* find where to start removing */ |
| 2493 | ex = EXT_LAST_EXTENT(eh); | 2507 | ex = path[depth].p_ext; |
| 2508 | if (!ex) | ||
| 2509 | ex = EXT_LAST_EXTENT(eh); | ||
| 2494 | 2510 | ||
| 2495 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2511 | ex_ee_block = le32_to_cpu(ex->ee_block); |
| 2496 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2512 | ex_ee_len = ext4_ext_get_actual_len(ex); |
| @@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2517 | 2533 | ||
| 2518 | /* If this extent is beyond the end of the hole, skip it */ | 2534 | /* If this extent is beyond the end of the hole, skip it */ |
| 2519 | if (end < ex_ee_block) { | 2535 | if (end < ex_ee_block) { |
| 2536 | /* | ||
| 2537 | * We're going to skip this extent and move to another, | ||
| 2538 | * so if this extent is not cluster aligned we have | ||
| 2539 | * to mark the current cluster as used to avoid | ||
| 2540 | * accidentally freeing it later on | ||
| 2541 | */ | ||
| 2542 | pblk = ext4_ext_pblock(ex); | ||
| 2543 | if (pblk & (sbi->s_cluster_ratio - 1)) | ||
| 2544 | *partial_cluster = | ||
| 2545 | -((long long)EXT4_B2C(sbi, pblk)); | ||
| 2520 | ex--; | 2546 | ex--; |
| 2521 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2547 | ex_ee_block = le32_to_cpu(ex->ee_block); |
| 2522 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2548 | ex_ee_len = ext4_ext_get_actual_len(ex); |
| @@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2592 | sizeof(struct ext4_extent)); | 2618 | sizeof(struct ext4_extent)); |
| 2593 | } | 2619 | } |
| 2594 | le16_add_cpu(&eh->eh_entries, -1); | 2620 | le16_add_cpu(&eh->eh_entries, -1); |
| 2595 | } else | 2621 | } else if (*partial_cluster > 0) |
| 2596 | *partial_cluster = 0; | 2622 | *partial_cluster = 0; |
| 2597 | 2623 | ||
| 2598 | err = ext4_ext_dirty(handle, inode, path + depth); | 2624 | err = ext4_ext_dirty(handle, inode, path + depth); |
| @@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2610 | err = ext4_ext_correct_indexes(handle, inode, path); | 2636 | err = ext4_ext_correct_indexes(handle, inode, path); |
| 2611 | 2637 | ||
| 2612 | /* | 2638 | /* |
| 2613 | * If there is still a entry in the leaf node, check to see if | 2639 | * Free the partial cluster only if the current extent does not |
| 2614 | * it references the partial cluster. This is the only place | 2640 | * reference it. Otherwise we might free used cluster. |
| 2615 | * where it could; if it doesn't, we can free the cluster. | ||
| 2616 | */ | 2641 | */ |
| 2617 | if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && | 2642 | if (*partial_cluster > 0 && |
| 2618 | (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != | 2643 | (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != |
| 2619 | *partial_cluster)) { | 2644 | *partial_cluster)) { |
| 2620 | int flags = EXT4_FREE_BLOCKS_FORGET; | 2645 | int flags = get_default_free_blocks_flags(inode); |
| 2621 | |||
| 2622 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
| 2623 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
| 2624 | 2646 | ||
| 2625 | ext4_free_blocks(handle, inode, NULL, | 2647 | ext4_free_blocks(handle, inode, NULL, |
| 2626 | EXT4_C2B(sbi, *partial_cluster), | 2648 | EXT4_C2B(sbi, *partial_cluster), |
| @@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
| 2664 | struct super_block *sb = inode->i_sb; | 2686 | struct super_block *sb = inode->i_sb; |
| 2665 | int depth = ext_depth(inode); | 2687 | int depth = ext_depth(inode); |
| 2666 | struct ext4_ext_path *path = NULL; | 2688 | struct ext4_ext_path *path = NULL; |
| 2667 | ext4_fsblk_t partial_cluster = 0; | 2689 | long long partial_cluster = 0; |
| 2668 | handle_t *handle; | 2690 | handle_t *handle; |
| 2669 | int i = 0, err = 0; | 2691 | int i = 0, err = 0; |
| 2670 | 2692 | ||
| @@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
| 2676 | return PTR_ERR(handle); | 2698 | return PTR_ERR(handle); |
| 2677 | 2699 | ||
| 2678 | again: | 2700 | again: |
| 2679 | trace_ext4_ext_remove_space(inode, start, depth); | 2701 | trace_ext4_ext_remove_space(inode, start, end, depth); |
| 2680 | 2702 | ||
| 2681 | /* | 2703 | /* |
| 2682 | * Check if we are removing extents inside the extent tree. If that | 2704 | * Check if we are removing extents inside the extent tree. If that |
| @@ -2844,17 +2866,14 @@ again: | |||
| 2844 | } | 2866 | } |
| 2845 | } | 2867 | } |
| 2846 | 2868 | ||
| 2847 | trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, | 2869 | trace_ext4_ext_remove_space_done(inode, start, end, depth, |
| 2848 | path->p_hdr->eh_entries); | 2870 | partial_cluster, path->p_hdr->eh_entries); |
| 2849 | 2871 | ||
| 2850 | /* If we still have something in the partial cluster and we have removed | 2872 | /* If we still have something in the partial cluster and we have removed |
| 2851 | * even the first extent, then we should free the blocks in the partial | 2873 | * even the first extent, then we should free the blocks in the partial |
| 2852 | * cluster as well. */ | 2874 | * cluster as well. */ |
| 2853 | if (partial_cluster && path->p_hdr->eh_entries == 0) { | 2875 | if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { |
| 2854 | int flags = EXT4_FREE_BLOCKS_FORGET; | 2876 | int flags = get_default_free_blocks_flags(inode); |
| 2855 | |||
| 2856 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
| 2857 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
| 2858 | 2877 | ||
| 2859 | ext4_free_blocks(handle, inode, NULL, | 2878 | ext4_free_blocks(handle, inode, NULL, |
| 2860 | EXT4_C2B(EXT4_SB(sb), partial_cluster), | 2879 | EXT4_C2B(EXT4_SB(sb), partial_cluster), |
| @@ -4363,7 +4382,7 @@ out2: | |||
| 4363 | } | 4382 | } |
| 4364 | 4383 | ||
| 4365 | out3: | 4384 | out3: |
| 4366 | trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); | 4385 | trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); |
| 4367 | 4386 | ||
| 4368 | return err ? err : allocated; | 4387 | return err ? err : allocated; |
| 4369 | } | 4388 | } |
| @@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | |||
| 4446 | return -EOPNOTSUPP; | 4465 | return -EOPNOTSUPP; |
| 4447 | 4466 | ||
| 4448 | if (mode & FALLOC_FL_PUNCH_HOLE) | 4467 | if (mode & FALLOC_FL_PUNCH_HOLE) |
| 4449 | return ext4_punch_hole(file, offset, len); | 4468 | return ext4_punch_hole(inode, offset, len); |
| 4450 | 4469 | ||
| 4451 | ret = ext4_convert_inline_data(inode); | 4470 | ret = ext4_convert_inline_data(inode); |
| 4452 | if (ret) | 4471 | if (ret) |
| @@ -4548,10 +4567,9 @@ retry: | |||
| 4548 | * function, to convert the fallocated extents after IO is completed. | 4567 | * function, to convert the fallocated extents after IO is completed. |
| 4549 | * Returns 0 on success. | 4568 | * Returns 0 on success. |
| 4550 | */ | 4569 | */ |
| 4551 | int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | 4570 | int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, |
| 4552 | ssize_t len) | 4571 | loff_t offset, ssize_t len) |
| 4553 | { | 4572 | { |
| 4554 | handle_t *handle; | ||
| 4555 | unsigned int max_blocks; | 4573 | unsigned int max_blocks; |
| 4556 | int ret = 0; | 4574 | int ret = 0; |
| 4557 | int ret2 = 0; | 4575 | int ret2 = 0; |
| @@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | |||
| 4566 | max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - | 4584 | max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - |
| 4567 | map.m_lblk); | 4585 | map.m_lblk); |
| 4568 | /* | 4586 | /* |
| 4569 | * credits to insert 1 extent into extent tree | 4587 | * This is somewhat ugly but the idea is clear: When transaction is |
| 4588 | * reserved, everything goes into it. Otherwise we rather start several | ||
| 4589 | * smaller transactions for conversion of each extent separately. | ||
| 4570 | */ | 4590 | */ |
| 4571 | credits = ext4_chunk_trans_blocks(inode, max_blocks); | 4591 | if (handle) { |
| 4592 | handle = ext4_journal_start_reserved(handle, | ||
| 4593 | EXT4_HT_EXT_CONVERT); | ||
| 4594 | if (IS_ERR(handle)) | ||
| 4595 | return PTR_ERR(handle); | ||
| 4596 | credits = 0; | ||
| 4597 | } else { | ||
| 4598 | /* | ||
| 4599 | * credits to insert 1 extent into extent tree | ||
| 4600 | */ | ||
| 4601 | credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
| 4602 | } | ||
| 4572 | while (ret >= 0 && ret < max_blocks) { | 4603 | while (ret >= 0 && ret < max_blocks) { |
| 4573 | map.m_lblk += ret; | 4604 | map.m_lblk += ret; |
| 4574 | map.m_len = (max_blocks -= ret); | 4605 | map.m_len = (max_blocks -= ret); |
| 4575 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); | 4606 | if (credits) { |
| 4576 | if (IS_ERR(handle)) { | 4607 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
| 4577 | ret = PTR_ERR(handle); | 4608 | credits); |
| 4578 | break; | 4609 | if (IS_ERR(handle)) { |
| 4610 | ret = PTR_ERR(handle); | ||
| 4611 | break; | ||
| 4612 | } | ||
| 4579 | } | 4613 | } |
| 4580 | ret = ext4_map_blocks(handle, inode, &map, | 4614 | ret = ext4_map_blocks(handle, inode, &map, |
| 4581 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); | 4615 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); |
| @@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | |||
| 4586 | inode->i_ino, map.m_lblk, | 4620 | inode->i_ino, map.m_lblk, |
| 4587 | map.m_len, ret); | 4621 | map.m_len, ret); |
| 4588 | ext4_mark_inode_dirty(handle, inode); | 4622 | ext4_mark_inode_dirty(handle, inode); |
| 4589 | ret2 = ext4_journal_stop(handle); | 4623 | if (credits) |
| 4590 | if (ret <= 0 || ret2 ) | 4624 | ret2 = ext4_journal_stop(handle); |
| 4625 | if (ret <= 0 || ret2) | ||
| 4591 | break; | 4626 | break; |
| 4592 | } | 4627 | } |
| 4628 | if (!credits) | ||
| 4629 | ret2 = ext4_journal_stop(handle); | ||
| 4593 | return ret > 0 ? ret2 : ret; | 4630 | return ret > 0 ? ret2 : ret; |
| 4594 | } | 4631 | } |
| 4595 | 4632 | ||
| @@ -4659,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
| 4659 | error = ext4_get_inode_loc(inode, &iloc); | 4696 | error = ext4_get_inode_loc(inode, &iloc); |
| 4660 | if (error) | 4697 | if (error) |
| 4661 | return error; | 4698 | return error; |
| 4662 | physical = iloc.bh->b_blocknr << blockbits; | 4699 | physical = (__u64)iloc.bh->b_blocknr << blockbits; |
| 4663 | offset = EXT4_GOOD_OLD_INODE_SIZE + | 4700 | offset = EXT4_GOOD_OLD_INODE_SIZE + |
| 4664 | EXT4_I(inode)->i_extra_isize; | 4701 | EXT4_I(inode)->i_extra_isize; |
| 4665 | physical += offset; | 4702 | physical += offset; |
| @@ -4667,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
| 4667 | flags |= FIEMAP_EXTENT_DATA_INLINE; | 4704 | flags |= FIEMAP_EXTENT_DATA_INLINE; |
| 4668 | brelse(iloc.bh); | 4705 | brelse(iloc.bh); |
| 4669 | } else { /* external block */ | 4706 | } else { /* external block */ |
| 4670 | physical = EXT4_I(inode)->i_file_acl << blockbits; | 4707 | physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; |
| 4671 | length = inode->i_sb->s_blocksize; | 4708 | length = inode->i_sb->s_blocksize; |
| 4672 | } | 4709 | } |
| 4673 | 4710 | ||
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e6941e622d31..ee018d5f397e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | * Ext4 extents status tree core functions. | 10 | * Ext4 extents status tree core functions. |
| 11 | */ | 11 | */ |
| 12 | #include <linux/rbtree.h> | 12 | #include <linux/rbtree.h> |
| 13 | #include <linux/list_sort.h> | ||
| 13 | #include "ext4.h" | 14 | #include "ext4.h" |
| 14 | #include "extents_status.h" | 15 | #include "extents_status.h" |
| 15 | #include "ext4_extents.h" | 16 | #include "ext4_extents.h" |
| @@ -291,7 +292,6 @@ out: | |||
| 291 | 292 | ||
| 292 | read_unlock(&EXT4_I(inode)->i_es_lock); | 293 | read_unlock(&EXT4_I(inode)->i_es_lock); |
| 293 | 294 | ||
| 294 | ext4_es_lru_add(inode); | ||
| 295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); | 295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); |
| 296 | } | 296 | } |
| 297 | 297 | ||
| @@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 672 | error: | 672 | error: |
| 673 | write_unlock(&EXT4_I(inode)->i_es_lock); | 673 | write_unlock(&EXT4_I(inode)->i_es_lock); |
| 674 | 674 | ||
| 675 | ext4_es_lru_add(inode); | ||
| 676 | ext4_es_print_tree(inode); | 675 | ext4_es_print_tree(inode); |
| 677 | 676 | ||
| 678 | return err; | 677 | return err; |
| @@ -734,7 +733,6 @@ out: | |||
| 734 | 733 | ||
| 735 | read_unlock(&EXT4_I(inode)->i_es_lock); | 734 | read_unlock(&EXT4_I(inode)->i_es_lock); |
| 736 | 735 | ||
| 737 | ext4_es_lru_add(inode); | ||
| 738 | trace_ext4_es_lookup_extent_exit(inode, es, found); | 736 | trace_ext4_es_lookup_extent_exit(inode, es, found); |
| 739 | return found; | 737 | return found; |
| 740 | } | 738 | } |
| @@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) | |||
| 878 | EXTENT_STATUS_WRITTEN); | 876 | EXTENT_STATUS_WRITTEN); |
| 879 | } | 877 | } |
| 880 | 878 | ||
| 879 | static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, | ||
| 880 | struct list_head *b) | ||
| 881 | { | ||
| 882 | struct ext4_inode_info *eia, *eib; | ||
| 883 | eia = list_entry(a, struct ext4_inode_info, i_es_lru); | ||
| 884 | eib = list_entry(b, struct ext4_inode_info, i_es_lru); | ||
| 885 | |||
| 886 | if (eia->i_touch_when == eib->i_touch_when) | ||
| 887 | return 0; | ||
| 888 | if (time_after(eia->i_touch_when, eib->i_touch_when)) | ||
| 889 | return 1; | ||
| 890 | else | ||
| 891 | return -1; | ||
| 892 | } | ||
| 893 | |||
| 881 | static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | 894 | static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) |
| 882 | { | 895 | { |
| 883 | struct ext4_sb_info *sbi = container_of(shrink, | 896 | struct ext4_sb_info *sbi = container_of(shrink, |
| 884 | struct ext4_sb_info, s_es_shrinker); | 897 | struct ext4_sb_info, s_es_shrinker); |
| 885 | struct ext4_inode_info *ei; | 898 | struct ext4_inode_info *ei; |
| 886 | struct list_head *cur, *tmp, scanned; | 899 | struct list_head *cur, *tmp; |
| 900 | LIST_HEAD(skiped); | ||
| 887 | int nr_to_scan = sc->nr_to_scan; | 901 | int nr_to_scan = sc->nr_to_scan; |
| 888 | int ret, nr_shrunk = 0; | 902 | int ret, nr_shrunk = 0; |
| 889 | 903 | ||
| @@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
| 893 | if (!nr_to_scan) | 907 | if (!nr_to_scan) |
| 894 | return ret; | 908 | return ret; |
| 895 | 909 | ||
| 896 | INIT_LIST_HEAD(&scanned); | ||
| 897 | |||
| 898 | spin_lock(&sbi->s_es_lru_lock); | 910 | spin_lock(&sbi->s_es_lru_lock); |
| 911 | |||
| 912 | /* | ||
| 913 | * If the inode that is at the head of LRU list is newer than | ||
| 914 | * last_sorted time, that means that we need to sort this list. | ||
| 915 | */ | ||
| 916 | ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); | ||
| 917 | if (sbi->s_es_last_sorted < ei->i_touch_when) { | ||
| 918 | list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); | ||
| 919 | sbi->s_es_last_sorted = jiffies; | ||
| 920 | } | ||
| 921 | |||
| 899 | list_for_each_safe(cur, tmp, &sbi->s_es_lru) { | 922 | list_for_each_safe(cur, tmp, &sbi->s_es_lru) { |
| 900 | list_move_tail(cur, &scanned); | 923 | /* |
| 924 | * If we have already reclaimed all extents from extent | ||
| 925 | * status tree, just stop the loop immediately. | ||
| 926 | */ | ||
| 927 | if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) | ||
| 928 | break; | ||
| 901 | 929 | ||
| 902 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); | 930 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); |
| 903 | 931 | ||
| 904 | read_lock(&ei->i_es_lock); | 932 | /* Skip the inode that is newer than the last_sorted time */ |
| 905 | if (ei->i_es_lru_nr == 0) { | 933 | if (sbi->s_es_last_sorted < ei->i_touch_when) { |
| 906 | read_unlock(&ei->i_es_lock); | 934 | list_move_tail(cur, &skiped); |
| 907 | continue; | 935 | continue; |
| 908 | } | 936 | } |
| 909 | read_unlock(&ei->i_es_lock); | 937 | |
| 938 | if (ei->i_es_lru_nr == 0) | ||
| 939 | continue; | ||
| 910 | 940 | ||
| 911 | write_lock(&ei->i_es_lock); | 941 | write_lock(&ei->i_es_lock); |
| 912 | ret = __es_try_to_reclaim_extents(ei, nr_to_scan); | 942 | ret = __es_try_to_reclaim_extents(ei, nr_to_scan); |
| 943 | if (ei->i_es_lru_nr == 0) | ||
| 944 | list_del_init(&ei->i_es_lru); | ||
| 913 | write_unlock(&ei->i_es_lock); | 945 | write_unlock(&ei->i_es_lock); |
| 914 | 946 | ||
| 915 | nr_shrunk += ret; | 947 | nr_shrunk += ret; |
| @@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
| 917 | if (nr_to_scan == 0) | 949 | if (nr_to_scan == 0) |
| 918 | break; | 950 | break; |
| 919 | } | 951 | } |
| 920 | list_splice_tail(&scanned, &sbi->s_es_lru); | 952 | |
| 953 | /* Move the newer inodes into the tail of the LRU list. */ | ||
| 954 | list_splice_tail(&skiped, &sbi->s_es_lru); | ||
| 921 | spin_unlock(&sbi->s_es_lru_lock); | 955 | spin_unlock(&sbi->s_es_lru_lock); |
| 922 | 956 | ||
| 923 | ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); | 957 | ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); |
| @@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
| 925 | return ret; | 959 | return ret; |
| 926 | } | 960 | } |
| 927 | 961 | ||
| 928 | void ext4_es_register_shrinker(struct super_block *sb) | 962 | void ext4_es_register_shrinker(struct ext4_sb_info *sbi) |
| 929 | { | 963 | { |
| 930 | struct ext4_sb_info *sbi; | ||
| 931 | |||
| 932 | sbi = EXT4_SB(sb); | ||
| 933 | INIT_LIST_HEAD(&sbi->s_es_lru); | 964 | INIT_LIST_HEAD(&sbi->s_es_lru); |
| 934 | spin_lock_init(&sbi->s_es_lru_lock); | 965 | spin_lock_init(&sbi->s_es_lru_lock); |
| 966 | sbi->s_es_last_sorted = 0; | ||
| 935 | sbi->s_es_shrinker.shrink = ext4_es_shrink; | 967 | sbi->s_es_shrinker.shrink = ext4_es_shrink; |
| 936 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; | 968 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; |
| 937 | register_shrinker(&sbi->s_es_shrinker); | 969 | register_shrinker(&sbi->s_es_shrinker); |
| 938 | } | 970 | } |
| 939 | 971 | ||
| 940 | void ext4_es_unregister_shrinker(struct super_block *sb) | 972 | void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) |
| 941 | { | 973 | { |
| 942 | unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); | 974 | unregister_shrinker(&sbi->s_es_shrinker); |
| 943 | } | 975 | } |
| 944 | 976 | ||
| 945 | void ext4_es_lru_add(struct inode *inode) | 977 | void ext4_es_lru_add(struct inode *inode) |
| @@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode) | |||
| 947 | struct ext4_inode_info *ei = EXT4_I(inode); | 979 | struct ext4_inode_info *ei = EXT4_I(inode); |
| 948 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 980 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 949 | 981 | ||
| 982 | ei->i_touch_when = jiffies; | ||
| 983 | |||
| 984 | if (!list_empty(&ei->i_es_lru)) | ||
| 985 | return; | ||
| 986 | |||
| 950 | spin_lock(&sbi->s_es_lru_lock); | 987 | spin_lock(&sbi->s_es_lru_lock); |
| 951 | if (list_empty(&ei->i_es_lru)) | 988 | if (list_empty(&ei->i_es_lru)) |
| 952 | list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); | 989 | list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); |
| 953 | else | ||
| 954 | list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); | ||
| 955 | spin_unlock(&sbi->s_es_lru_lock); | 990 | spin_unlock(&sbi->s_es_lru_lock); |
| 956 | } | 991 | } |
| 957 | 992 | ||
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f740eb03b707..e936730cc5b0 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | EXTENT_STATUS_DELAYED | \ | 39 | EXTENT_STATUS_DELAYED | \ |
| 40 | EXTENT_STATUS_HOLE) | 40 | EXTENT_STATUS_HOLE) |
| 41 | 41 | ||
| 42 | struct ext4_sb_info; | ||
| 42 | struct ext4_extent; | 43 | struct ext4_extent; |
| 43 | 44 | ||
| 44 | struct extent_status { | 45 | struct extent_status { |
| @@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es, | |||
| 119 | es->es_pblk = block; | 120 | es->es_pblk = block; |
| 120 | } | 121 | } |
| 121 | 122 | ||
| 122 | extern void ext4_es_register_shrinker(struct super_block *sb); | 123 | extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); |
| 123 | extern void ext4_es_unregister_shrinker(struct super_block *sb); | 124 | extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); |
| 124 | extern void ext4_es_lru_add(struct inode *inode); | 125 | extern void ext4_es_lru_add(struct inode *inode); |
| 125 | extern void ext4_es_lru_del(struct inode *inode); | 126 | extern void ext4_es_lru_del(struct inode *inode); |
| 126 | 127 | ||
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1b4d51b5d86..b19f0a457f32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
| @@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, | |||
| 312 | blkbits = inode->i_sb->s_blocksize_bits; | 312 | blkbits = inode->i_sb->s_blocksize_bits; |
| 313 | startoff = *offset; | 313 | startoff = *offset; |
| 314 | lastoff = startoff; | 314 | lastoff = startoff; |
| 315 | endoff = (map->m_lblk + map->m_len) << blkbits; | 315 | endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; |
| 316 | 316 | ||
| 317 | index = startoff >> PAGE_CACHE_SHIFT; | 317 | index = startoff >> PAGE_CACHE_SHIFT; |
| 318 | end = endoff >> PAGE_CACHE_SHIFT; | 318 | end = endoff >> PAGE_CACHE_SHIFT; |
| @@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
| 457 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 457 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
| 458 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 458 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { |
| 459 | if (last != start) | 459 | if (last != start) |
| 460 | dataoff = last << blkbits; | 460 | dataoff = (loff_t)last << blkbits; |
| 461 | break; | 461 | break; |
| 462 | } | 462 | } |
| 463 | 463 | ||
| @@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
| 468 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | 468 | ext4_es_find_delayed_extent_range(inode, last, last, &es); |
| 469 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | 469 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { |
| 470 | if (last != start) | 470 | if (last != start) |
| 471 | dataoff = last << blkbits; | 471 | dataoff = (loff_t)last << blkbits; |
| 472 | break; | 472 | break; |
| 473 | } | 473 | } |
| 474 | 474 | ||
| @@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
| 486 | } | 486 | } |
| 487 | 487 | ||
| 488 | last++; | 488 | last++; |
| 489 | dataoff = last << blkbits; | 489 | dataoff = (loff_t)last << blkbits; |
| 490 | } while (last <= end); | 490 | } while (last <= end); |
| 491 | 491 | ||
| 492 | mutex_unlock(&inode->i_mutex); | 492 | mutex_unlock(&inode->i_mutex); |
| @@ -540,7 +540,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
| 540 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 540 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
| 541 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 541 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { |
| 542 | last += ret; | 542 | last += ret; |
| 543 | holeoff = last << blkbits; | 543 | holeoff = (loff_t)last << blkbits; |
| 544 | continue; | 544 | continue; |
| 545 | } | 545 | } |
| 546 | 546 | ||
| @@ -551,7 +551,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
| 551 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | 551 | ext4_es_find_delayed_extent_range(inode, last, last, &es); |
| 552 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | 552 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { |
| 553 | last = es.es_lblk + es.es_len; | 553 | last = es.es_lblk + es.es_len; |
| 554 | holeoff = last << blkbits; | 554 | holeoff = (loff_t)last << blkbits; |
| 555 | continue; | 555 | continue; |
| 556 | } | 556 | } |
| 557 | 557 | ||
| @@ -566,7 +566,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
| 566 | &map, &holeoff); | 566 | &map, &holeoff); |
| 567 | if (!unwritten) { | 567 | if (!unwritten) { |
| 568 | last += ret; | 568 | last += ret; |
| 569 | holeoff = last << blkbits; | 569 | holeoff = (loff_t)last << blkbits; |
| 570 | continue; | 570 | continue; |
| 571 | } | 571 | } |
| 572 | } | 572 | } |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e0ba8a408def..a8bc47f75fa0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
| @@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode) | |||
| 73 | return ret; | 73 | return ret; |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | /** | ||
| 77 | * __sync_file - generic_file_fsync without the locking and filemap_write | ||
| 78 | * @inode: inode to sync | ||
| 79 | * @datasync: only sync essential metadata if true | ||
| 80 | * | ||
| 81 | * This is just generic_file_fsync without the locking. This is needed for | ||
| 82 | * nojournal mode to make sure this inodes data/metadata makes it to disk | ||
| 83 | * properly. The i_mutex should be held already. | ||
| 84 | */ | ||
| 85 | static int __sync_inode(struct inode *inode, int datasync) | ||
| 86 | { | ||
| 87 | int err; | ||
| 88 | int ret; | ||
| 89 | |||
| 90 | ret = sync_mapping_buffers(inode->i_mapping); | ||
| 91 | if (!(inode->i_state & I_DIRTY)) | ||
| 92 | return ret; | ||
| 93 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
| 94 | return ret; | ||
| 95 | |||
| 96 | err = sync_inode_metadata(inode, 1); | ||
| 97 | if (ret == 0) | ||
| 98 | ret = err; | ||
| 99 | return ret; | ||
| 100 | } | ||
| 101 | |||
| 102 | /* | 76 | /* |
| 103 | * akpm: A new design for ext4_sync_file(). | 77 | * akpm: A new design for ext4_sync_file(). |
| 104 | * | 78 | * |
| @@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 116 | struct inode *inode = file->f_mapping->host; | 90 | struct inode *inode = file->f_mapping->host; |
| 117 | struct ext4_inode_info *ei = EXT4_I(inode); | 91 | struct ext4_inode_info *ei = EXT4_I(inode); |
| 118 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 92 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
| 119 | int ret, err; | 93 | int ret = 0, err; |
| 120 | tid_t commit_tid; | 94 | tid_t commit_tid; |
| 121 | bool needs_barrier = false; | 95 | bool needs_barrier = false; |
| 122 | 96 | ||
| @@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 124 | 98 | ||
| 125 | trace_ext4_sync_file_enter(file, datasync); | 99 | trace_ext4_sync_file_enter(file, datasync); |
| 126 | 100 | ||
| 127 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 101 | if (inode->i_sb->s_flags & MS_RDONLY) { |
| 128 | if (ret) | 102 | /* Make sure that we read updated s_mount_flags value */ |
| 129 | return ret; | 103 | smp_rmb(); |
| 130 | mutex_lock(&inode->i_mutex); | 104 | if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) |
| 131 | 105 | ret = -EROFS; | |
| 132 | if (inode->i_sb->s_flags & MS_RDONLY) | ||
| 133 | goto out; | ||
| 134 | |||
| 135 | ret = ext4_flush_unwritten_io(inode); | ||
| 136 | if (ret < 0) | ||
| 137 | goto out; | 106 | goto out; |
| 107 | } | ||
| 138 | 108 | ||
| 139 | if (!journal) { | 109 | if (!journal) { |
| 140 | ret = __sync_inode(inode, datasync); | 110 | ret = generic_file_fsync(file, start, end, datasync); |
| 141 | if (!ret && !hlist_empty(&inode->i_dentry)) | 111 | if (!ret && !hlist_empty(&inode->i_dentry)) |
| 142 | ret = ext4_sync_parent(inode); | 112 | ret = ext4_sync_parent(inode); |
| 143 | goto out; | 113 | goto out; |
| 144 | } | 114 | } |
| 145 | 115 | ||
| 116 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | ||
| 117 | if (ret) | ||
| 118 | return ret; | ||
| 146 | /* | 119 | /* |
| 147 | * data=writeback,ordered: | 120 | * data=writeback,ordered: |
| 148 | * The caller's filemap_fdatawrite()/wait will sync the data. | 121 | * The caller's filemap_fdatawrite()/wait will sync the data. |
| @@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 172 | if (!ret) | 145 | if (!ret) |
| 173 | ret = err; | 146 | ret = err; |
| 174 | } | 147 | } |
| 175 | out: | 148 | out: |
| 176 | mutex_unlock(&inode->i_mutex); | ||
| 177 | trace_ext4_sync_file_exit(inode, ret); | 149 | trace_ext4_sync_file_exit(inode, ret); |
| 178 | return ret; | 150 | return ret; |
| 179 | } | 151 | } |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00a818d67b54..f03598c6ffd3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
| @@ -747,7 +747,8 @@ repeat_in_this_group: | |||
| 747 | if (!handle) { | 747 | if (!handle) { |
| 748 | BUG_ON(nblocks <= 0); | 748 | BUG_ON(nblocks <= 0); |
| 749 | handle = __ext4_journal_start_sb(dir->i_sb, line_no, | 749 | handle = __ext4_journal_start_sb(dir->i_sb, line_no, |
| 750 | handle_type, nblocks); | 750 | handle_type, nblocks, |
| 751 | 0); | ||
| 751 | if (IS_ERR(handle)) { | 752 | if (IS_ERR(handle)) { |
| 752 | err = PTR_ERR(handle); | 753 | err = PTR_ERR(handle); |
| 753 | ext4_std_error(sb, err); | 754 | ext4_std_error(sb, err); |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b8d5d351e24f..87b30cd357e7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
| @@ -624,7 +624,7 @@ cleanup: | |||
| 624 | partial--; | 624 | partial--; |
| 625 | } | 625 | } |
| 626 | out: | 626 | out: |
| 627 | trace_ext4_ind_map_blocks_exit(inode, map, err); | 627 | trace_ext4_ind_map_blocks_exit(inode, flags, map, err); |
| 628 | return err; | 628 | return err; |
| 629 | } | 629 | } |
| 630 | 630 | ||
| @@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
| 675 | 675 | ||
| 676 | retry: | 676 | retry: |
| 677 | if (rw == READ && ext4_should_dioread_nolock(inode)) { | 677 | if (rw == READ && ext4_should_dioread_nolock(inode)) { |
| 678 | if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { | ||
| 679 | mutex_lock(&inode->i_mutex); | ||
| 680 | ext4_flush_unwritten_io(inode); | ||
| 681 | mutex_unlock(&inode->i_mutex); | ||
| 682 | } | ||
| 683 | /* | 678 | /* |
| 684 | * Nolock dioread optimization may be dynamically disabled | 679 | * Nolock dioread optimization may be dynamically disabled |
| 685 | * via ext4_inode_block_unlocked_dio(). Check inode's state | 680 | * via ext4_inode_block_unlocked_dio(). Check inode's state |
| @@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) | |||
| 779 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | 774 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; |
| 780 | } | 775 | } |
| 781 | 776 | ||
| 782 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 777 | /* |
| 778 | * Calculate number of indirect blocks touched by mapping @nrblocks logically | ||
| 779 | * contiguous blocks | ||
| 780 | */ | ||
| 781 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) | ||
| 783 | { | 782 | { |
| 784 | int indirects; | ||
| 785 | |||
| 786 | /* if nrblocks are contiguous */ | ||
| 787 | if (chunk) { | ||
| 788 | /* | ||
| 789 | * With N contiguous data blocks, we need at most | ||
| 790 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
| 791 | * 2 dindirect blocks, and 1 tindirect block | ||
| 792 | */ | ||
| 793 | return DIV_ROUND_UP(nrblocks, | ||
| 794 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
| 795 | } | ||
| 796 | /* | 783 | /* |
| 797 | * if nrblocks are not contiguous, worse case, each block touch | 784 | * With N contiguous data blocks, we need at most |
| 798 | * a indirect block, and each indirect block touch a double indirect | 785 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, |
| 799 | * block, plus a triple indirect block | 786 | * 2 dindirect blocks, and 1 tindirect block |
| 800 | */ | 787 | */ |
| 801 | indirects = nrblocks * 2 + 1; | 788 | return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; |
| 802 | return indirects; | ||
| 803 | } | 789 | } |
| 804 | 790 | ||
| 805 | /* | 791 | /* |
| @@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
| 940 | __le32 *last) | 926 | __le32 *last) |
| 941 | { | 927 | { |
| 942 | __le32 *p; | 928 | __le32 *p; |
| 943 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | 929 | int flags = EXT4_FREE_BLOCKS_VALIDATED; |
| 944 | int err; | 930 | int err; |
| 945 | 931 | ||
| 946 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | 932 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) |
| 947 | flags |= EXT4_FREE_BLOCKS_METADATA; | 933 | flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; |
| 934 | else if (ext4_should_journal_data(inode)) | ||
| 935 | flags |= EXT4_FREE_BLOCKS_FORGET; | ||
| 948 | 936 | ||
| 949 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | 937 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, |
| 950 | count)) { | 938 | count)) { |
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3e2bf873e8a8..d9ecbf1113a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
| @@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, | |||
| 72 | entry = (struct ext4_xattr_entry *) | 72 | entry = (struct ext4_xattr_entry *) |
| 73 | ((void *)raw_inode + EXT4_I(inode)->i_inline_off); | 73 | ((void *)raw_inode + EXT4_I(inode)->i_inline_off); |
| 74 | 74 | ||
| 75 | free += le32_to_cpu(entry->e_value_size); | 75 | free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); |
| 76 | goto out; | 76 | goto out; |
| 77 | } | 77 | } |
| 78 | 78 | ||
| @@ -1404,16 +1404,15 @@ out: | |||
| 1404 | * offset as if '.' and '..' really take place. | 1404 | * offset as if '.' and '..' really take place. |
| 1405 | * | 1405 | * |
| 1406 | */ | 1406 | */ |
| 1407 | int ext4_read_inline_dir(struct file *filp, | 1407 | int ext4_read_inline_dir(struct file *file, |
| 1408 | void *dirent, filldir_t filldir, | 1408 | struct dir_context *ctx, |
| 1409 | int *has_inline_data) | 1409 | int *has_inline_data) |
| 1410 | { | 1410 | { |
| 1411 | int error = 0; | ||
| 1412 | unsigned int offset, parent_ino; | 1411 | unsigned int offset, parent_ino; |
| 1413 | int i, stored; | 1412 | int i; |
| 1414 | struct ext4_dir_entry_2 *de; | 1413 | struct ext4_dir_entry_2 *de; |
| 1415 | struct super_block *sb; | 1414 | struct super_block *sb; |
| 1416 | struct inode *inode = file_inode(filp); | 1415 | struct inode *inode = file_inode(file); |
| 1417 | int ret, inline_size = 0; | 1416 | int ret, inline_size = 0; |
| 1418 | struct ext4_iloc iloc; | 1417 | struct ext4_iloc iloc; |
| 1419 | void *dir_buf = NULL; | 1418 | void *dir_buf = NULL; |
| @@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp, | |||
| 1444 | goto out; | 1443 | goto out; |
| 1445 | 1444 | ||
| 1446 | sb = inode->i_sb; | 1445 | sb = inode->i_sb; |
| 1447 | stored = 0; | ||
| 1448 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); | 1446 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); |
| 1449 | offset = filp->f_pos; | 1447 | offset = ctx->pos; |
| 1450 | 1448 | ||
| 1451 | /* | 1449 | /* |
| 1452 | * dotdot_offset and dotdot_size is the real offset and | 1450 | * dotdot_offset and dotdot_size is the real offset and |
| @@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp, | |||
| 1460 | extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; | 1458 | extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; |
| 1461 | extra_size = extra_offset + inline_size; | 1459 | extra_size = extra_offset + inline_size; |
| 1462 | 1460 | ||
| 1463 | while (!error && !stored && filp->f_pos < extra_size) { | 1461 | /* |
| 1464 | revalidate: | 1462 | * If the version has changed since the last call to |
| 1465 | /* | 1463 | * readdir(2), then we might be pointing to an invalid |
| 1466 | * If the version has changed since the last call to | 1464 | * dirent right now. Scan from the start of the inline |
| 1467 | * readdir(2), then we might be pointing to an invalid | 1465 | * dir to make sure. |
| 1468 | * dirent right now. Scan from the start of the inline | 1466 | */ |
| 1469 | * dir to make sure. | 1467 | if (file->f_version != inode->i_version) { |
| 1470 | */ | 1468 | for (i = 0; i < extra_size && i < offset;) { |
| 1471 | if (filp->f_version != inode->i_version) { | 1469 | /* |
| 1472 | for (i = 0; i < extra_size && i < offset;) { | 1470 | * "." is with offset 0 and |
| 1473 | /* | 1471 | * ".." is dotdot_offset. |
| 1474 | * "." is with offset 0 and | 1472 | */ |
| 1475 | * ".." is dotdot_offset. | 1473 | if (!i) { |
| 1476 | */ | 1474 | i = dotdot_offset; |
| 1477 | if (!i) { | 1475 | continue; |
| 1478 | i = dotdot_offset; | 1476 | } else if (i == dotdot_offset) { |
| 1479 | continue; | 1477 | i = dotdot_size; |
| 1480 | } else if (i == dotdot_offset) { | ||
| 1481 | i = dotdot_size; | ||
| 1482 | continue; | ||
| 1483 | } | ||
| 1484 | /* for other entry, the real offset in | ||
| 1485 | * the buf has to be tuned accordingly. | ||
| 1486 | */ | ||
| 1487 | de = (struct ext4_dir_entry_2 *) | ||
| 1488 | (dir_buf + i - extra_offset); | ||
| 1489 | /* It's too expensive to do a full | ||
| 1490 | * dirent test each time round this | ||
| 1491 | * loop, but we do have to test at | ||
| 1492 | * least that it is non-zero. A | ||
| 1493 | * failure will be detected in the | ||
| 1494 | * dirent test below. */ | ||
| 1495 | if (ext4_rec_len_from_disk(de->rec_len, | ||
| 1496 | extra_size) < EXT4_DIR_REC_LEN(1)) | ||
| 1497 | break; | ||
| 1498 | i += ext4_rec_len_from_disk(de->rec_len, | ||
| 1499 | extra_size); | ||
| 1500 | } | ||
| 1501 | offset = i; | ||
| 1502 | filp->f_pos = offset; | ||
| 1503 | filp->f_version = inode->i_version; | ||
| 1504 | } | ||
| 1505 | |||
| 1506 | while (!error && filp->f_pos < extra_size) { | ||
| 1507 | if (filp->f_pos == 0) { | ||
| 1508 | error = filldir(dirent, ".", 1, 0, inode->i_ino, | ||
| 1509 | DT_DIR); | ||
| 1510 | if (error) | ||
| 1511 | break; | ||
| 1512 | stored++; | ||
| 1513 | filp->f_pos = dotdot_offset; | ||
| 1514 | continue; | 1478 | continue; |
| 1515 | } | 1479 | } |
| 1480 | /* for other entry, the real offset in | ||
| 1481 | * the buf has to be tuned accordingly. | ||
| 1482 | */ | ||
| 1483 | de = (struct ext4_dir_entry_2 *) | ||
| 1484 | (dir_buf + i - extra_offset); | ||
| 1485 | /* It's too expensive to do a full | ||
| 1486 | * dirent test each time round this | ||
| 1487 | * loop, but we do have to test at | ||
| 1488 | * least that it is non-zero. A | ||
| 1489 | * failure will be detected in the | ||
| 1490 | * dirent test below. */ | ||
| 1491 | if (ext4_rec_len_from_disk(de->rec_len, extra_size) | ||
| 1492 | < EXT4_DIR_REC_LEN(1)) | ||
| 1493 | break; | ||
| 1494 | i += ext4_rec_len_from_disk(de->rec_len, | ||
| 1495 | extra_size); | ||
| 1496 | } | ||
| 1497 | offset = i; | ||
| 1498 | ctx->pos = offset; | ||
| 1499 | file->f_version = inode->i_version; | ||
| 1500 | } | ||
| 1516 | 1501 | ||
| 1517 | if (filp->f_pos == dotdot_offset) { | 1502 | while (ctx->pos < extra_size) { |
| 1518 | error = filldir(dirent, "..", 2, | 1503 | if (ctx->pos == 0) { |
| 1519 | dotdot_offset, | 1504 | if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) |
| 1520 | parent_ino, DT_DIR); | 1505 | goto out; |
| 1521 | if (error) | 1506 | ctx->pos = dotdot_offset; |
| 1522 | break; | 1507 | continue; |
| 1523 | stored++; | 1508 | } |
| 1524 | 1509 | ||
| 1525 | filp->f_pos = dotdot_size; | 1510 | if (ctx->pos == dotdot_offset) { |
| 1526 | continue; | 1511 | if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) |
| 1527 | } | 1512 | goto out; |
| 1513 | ctx->pos = dotdot_size; | ||
| 1514 | continue; | ||
| 1515 | } | ||
| 1528 | 1516 | ||
| 1529 | de = (struct ext4_dir_entry_2 *) | 1517 | de = (struct ext4_dir_entry_2 *) |
| 1530 | (dir_buf + filp->f_pos - extra_offset); | 1518 | (dir_buf + ctx->pos - extra_offset); |
| 1531 | if (ext4_check_dir_entry(inode, filp, de, | 1519 | if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, |
| 1532 | iloc.bh, dir_buf, | 1520 | extra_size, ctx->pos)) |
| 1533 | extra_size, filp->f_pos)) { | 1521 | goto out; |
| 1534 | ret = stored; | 1522 | if (le32_to_cpu(de->inode)) { |
| 1523 | if (!dir_emit(ctx, de->name, de->name_len, | ||
| 1524 | le32_to_cpu(de->inode), | ||
| 1525 | get_dtype(sb, de->file_type))) | ||
| 1535 | goto out; | 1526 | goto out; |
| 1536 | } | ||
| 1537 | if (le32_to_cpu(de->inode)) { | ||
| 1538 | /* We might block in the next section | ||
| 1539 | * if the data destination is | ||
| 1540 | * currently swapped out. So, use a | ||
| 1541 | * version stamp to detect whether or | ||
| 1542 | * not the directory has been modified | ||
| 1543 | * during the copy operation. | ||
| 1544 | */ | ||
| 1545 | u64 version = filp->f_version; | ||
| 1546 | |||
| 1547 | error = filldir(dirent, de->name, | ||
| 1548 | de->name_len, | ||
| 1549 | filp->f_pos, | ||
| 1550 | le32_to_cpu(de->inode), | ||
| 1551 | get_dtype(sb, de->file_type)); | ||
| 1552 | if (error) | ||
| 1553 | break; | ||
| 1554 | if (version != filp->f_version) | ||
| 1555 | goto revalidate; | ||
| 1556 | stored++; | ||
| 1557 | } | ||
| 1558 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, | ||
| 1559 | extra_size); | ||
| 1560 | } | 1527 | } |
| 1528 | ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); | ||
| 1561 | } | 1529 | } |
| 1562 | out: | 1530 | out: |
| 1563 | kfree(dir_buf); | 1531 | kfree(dir_buf); |
| @@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode, | |||
| 1842 | if (error) | 1810 | if (error) |
| 1843 | goto out; | 1811 | goto out; |
| 1844 | 1812 | ||
| 1845 | physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; | 1813 | physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; |
| 1846 | physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; | 1814 | physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; |
| 1847 | physical += offsetof(struct ext4_inode, i_block); | 1815 | physical += offsetof(struct ext4_inode, i_block); |
| 1848 | length = i_size_read(inode); | 1816 | length = i_size_read(inode); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b89ecbd..0188e65e1f58 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, | |||
| 132 | new_size); | 132 | new_size); |
| 133 | } | 133 | } |
| 134 | 134 | ||
| 135 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | 135 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
| 136 | unsigned int length); | ||
| 136 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | 137 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); |
| 137 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | 138 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); |
| 138 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 139 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
| 139 | struct inode *inode, struct page *page, loff_t from, | 140 | int pextents); |
| 140 | loff_t length, int flags); | ||
| 141 | 141 | ||
| 142 | /* | 142 | /* |
| 143 | * Test whether an inode is a fast symlink. | 143 | * Test whether an inode is a fast symlink. |
| @@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode) | |||
| 215 | filemap_write_and_wait(&inode->i_data); | 215 | filemap_write_and_wait(&inode->i_data); |
| 216 | } | 216 | } |
| 217 | truncate_inode_pages(&inode->i_data, 0); | 217 | truncate_inode_pages(&inode->i_data, 0); |
| 218 | ext4_ioend_shutdown(inode); | 218 | |
| 219 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
| 219 | goto no_delete; | 220 | goto no_delete; |
| 220 | } | 221 | } |
| 221 | 222 | ||
| @@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode) | |||
| 225 | if (ext4_should_order_data(inode)) | 226 | if (ext4_should_order_data(inode)) |
| 226 | ext4_begin_ordered_truncate(inode, 0); | 227 | ext4_begin_ordered_truncate(inode, 0); |
| 227 | truncate_inode_pages(&inode->i_data, 0); | 228 | truncate_inode_pages(&inode->i_data, 0); |
| 228 | ext4_ioend_shutdown(inode); | ||
| 229 | 229 | ||
| 230 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
| 230 | if (is_bad_inode(inode)) | 231 | if (is_bad_inode(inode)) |
| 231 | goto no_delete; | 232 | goto no_delete; |
| 232 | 233 | ||
| @@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func, | |||
| 423 | #define check_block_validity(inode, map) \ | 424 | #define check_block_validity(inode, map) \ |
| 424 | __check_block_validity((inode), __func__, __LINE__, (map)) | 425 | __check_block_validity((inode), __func__, __LINE__, (map)) |
| 425 | 426 | ||
| 426 | /* | ||
| 427 | * Return the number of contiguous dirty pages in a given inode | ||
| 428 | * starting at page frame idx. | ||
| 429 | */ | ||
| 430 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
| 431 | unsigned int max_pages) | ||
| 432 | { | ||
| 433 | struct address_space *mapping = inode->i_mapping; | ||
| 434 | pgoff_t index; | ||
| 435 | struct pagevec pvec; | ||
| 436 | pgoff_t num = 0; | ||
| 437 | int i, nr_pages, done = 0; | ||
| 438 | |||
| 439 | if (max_pages == 0) | ||
| 440 | return 0; | ||
| 441 | pagevec_init(&pvec, 0); | ||
| 442 | while (!done) { | ||
| 443 | index = idx; | ||
| 444 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
| 445 | PAGECACHE_TAG_DIRTY, | ||
| 446 | (pgoff_t)PAGEVEC_SIZE); | ||
| 447 | if (nr_pages == 0) | ||
| 448 | break; | ||
| 449 | for (i = 0; i < nr_pages; i++) { | ||
| 450 | struct page *page = pvec.pages[i]; | ||
| 451 | struct buffer_head *bh, *head; | ||
| 452 | |||
| 453 | lock_page(page); | ||
| 454 | if (unlikely(page->mapping != mapping) || | ||
| 455 | !PageDirty(page) || | ||
| 456 | PageWriteback(page) || | ||
| 457 | page->index != idx) { | ||
| 458 | done = 1; | ||
| 459 | unlock_page(page); | ||
| 460 | break; | ||
| 461 | } | ||
| 462 | if (page_has_buffers(page)) { | ||
| 463 | bh = head = page_buffers(page); | ||
| 464 | do { | ||
| 465 | if (!buffer_delay(bh) && | ||
| 466 | !buffer_unwritten(bh)) | ||
| 467 | done = 1; | ||
| 468 | bh = bh->b_this_page; | ||
| 469 | } while (!done && (bh != head)); | ||
| 470 | } | ||
| 471 | unlock_page(page); | ||
| 472 | if (done) | ||
| 473 | break; | ||
| 474 | idx++; | ||
| 475 | num++; | ||
| 476 | if (num >= max_pages) { | ||
| 477 | done = 1; | ||
| 478 | break; | ||
| 479 | } | ||
| 480 | } | ||
| 481 | pagevec_release(&pvec); | ||
| 482 | } | ||
| 483 | return num; | ||
| 484 | } | ||
| 485 | |||
| 486 | #ifdef ES_AGGRESSIVE_TEST | 427 | #ifdef ES_AGGRESSIVE_TEST |
| 487 | static void ext4_map_blocks_es_recheck(handle_t *handle, | 428 | static void ext4_map_blocks_es_recheck(handle_t *handle, |
| 488 | struct inode *inode, | 429 | struct inode *inode, |
| @@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
| 573 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | 514 | "logical block %lu\n", inode->i_ino, flags, map->m_len, |
| 574 | (unsigned long) map->m_lblk); | 515 | (unsigned long) map->m_lblk); |
| 575 | 516 | ||
| 517 | ext4_es_lru_add(inode); | ||
| 518 | |||
| 576 | /* Lookup extent status tree firstly */ | 519 | /* Lookup extent status tree firstly */ |
| 577 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { | 520 | if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { |
| 578 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { | 521 | if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { |
| @@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file, | |||
| 1118 | } | 1061 | } |
| 1119 | } | 1062 | } |
| 1120 | 1063 | ||
| 1121 | if (ext4_has_inline_data(inode)) | 1064 | if (ext4_has_inline_data(inode)) { |
| 1122 | copied = ext4_write_inline_data_end(inode, pos, len, | 1065 | ret = ext4_write_inline_data_end(inode, pos, len, |
| 1123 | copied, page); | 1066 | copied, page); |
| 1124 | else | 1067 | if (ret < 0) |
| 1068 | goto errout; | ||
| 1069 | copied = ret; | ||
| 1070 | } else | ||
| 1125 | copied = block_write_end(file, mapping, pos, | 1071 | copied = block_write_end(file, mapping, pos, |
| 1126 | len, copied, page, fsdata); | 1072 | len, copied, page, fsdata); |
| 1127 | 1073 | ||
| @@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file, | |||
| 1157 | if (i_size_changed) | 1103 | if (i_size_changed) |
| 1158 | ext4_mark_inode_dirty(handle, inode); | 1104 | ext4_mark_inode_dirty(handle, inode); |
| 1159 | 1105 | ||
| 1160 | if (copied < 0) | ||
| 1161 | ret = copied; | ||
| 1162 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1106 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
| 1163 | /* if we have allocated more blocks and copied | 1107 | /* if we have allocated more blocks and copied |
| 1164 | * less. We will have blocks allocated outside | 1108 | * less. We will have blocks allocated outside |
| @@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
| 1415 | } | 1359 | } |
| 1416 | 1360 | ||
| 1417 | static void ext4_da_page_release_reservation(struct page *page, | 1361 | static void ext4_da_page_release_reservation(struct page *page, |
| 1418 | unsigned long offset) | 1362 | unsigned int offset, |
| 1363 | unsigned int length) | ||
| 1419 | { | 1364 | { |
| 1420 | int to_release = 0; | 1365 | int to_release = 0; |
| 1421 | struct buffer_head *head, *bh; | 1366 | struct buffer_head *head, *bh; |
| 1422 | unsigned int curr_off = 0; | 1367 | unsigned int curr_off = 0; |
| 1423 | struct inode *inode = page->mapping->host; | 1368 | struct inode *inode = page->mapping->host; |
| 1424 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1369 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 1370 | unsigned int stop = offset + length; | ||
| 1425 | int num_clusters; | 1371 | int num_clusters; |
| 1426 | ext4_fsblk_t lblk; | 1372 | ext4_fsblk_t lblk; |
| 1427 | 1373 | ||
| 1374 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
| 1375 | |||
| 1428 | head = page_buffers(page); | 1376 | head = page_buffers(page); |
| 1429 | bh = head; | 1377 | bh = head; |
| 1430 | do { | 1378 | do { |
| 1431 | unsigned int next_off = curr_off + bh->b_size; | 1379 | unsigned int next_off = curr_off + bh->b_size; |
| 1432 | 1380 | ||
| 1381 | if (next_off > stop) | ||
| 1382 | break; | ||
| 1383 | |||
| 1433 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1384 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
| 1434 | to_release++; | 1385 | to_release++; |
| 1435 | clear_buffer_delay(bh); | 1386 | clear_buffer_delay(bh); |
| @@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1460 | * Delayed allocation stuff | 1411 | * Delayed allocation stuff |
| 1461 | */ | 1412 | */ |
| 1462 | 1413 | ||
| 1463 | /* | 1414 | struct mpage_da_data { |
| 1464 | * mpage_da_submit_io - walks through extent of pages and try to write | 1415 | struct inode *inode; |
| 1465 | * them with writepage() call back | 1416 | struct writeback_control *wbc; |
| 1466 | * | ||
| 1467 | * @mpd->inode: inode | ||
| 1468 | * @mpd->first_page: first page of the extent | ||
| 1469 | * @mpd->next_page: page after the last page of the extent | ||
| 1470 | * | ||
| 1471 | * By the time mpage_da_submit_io() is called we expect all blocks | ||
| 1472 | * to be allocated. this may be wrong if allocation failed. | ||
| 1473 | * | ||
| 1474 | * As pages are already locked by write_cache_pages(), we can't use it | ||
| 1475 | */ | ||
| 1476 | static int mpage_da_submit_io(struct mpage_da_data *mpd, | ||
| 1477 | struct ext4_map_blocks *map) | ||
| 1478 | { | ||
| 1479 | struct pagevec pvec; | ||
| 1480 | unsigned long index, end; | ||
| 1481 | int ret = 0, err, nr_pages, i; | ||
| 1482 | struct inode *inode = mpd->inode; | ||
| 1483 | struct address_space *mapping = inode->i_mapping; | ||
| 1484 | loff_t size = i_size_read(inode); | ||
| 1485 | unsigned int len, block_start; | ||
| 1486 | struct buffer_head *bh, *page_bufs = NULL; | ||
| 1487 | sector_t pblock = 0, cur_logical = 0; | ||
| 1488 | struct ext4_io_submit io_submit; | ||
| 1489 | 1417 | ||
| 1490 | BUG_ON(mpd->next_page <= mpd->first_page); | 1418 | pgoff_t first_page; /* The first page to write */ |
| 1491 | memset(&io_submit, 0, sizeof(io_submit)); | 1419 | pgoff_t next_page; /* Current page to examine */ |
| 1420 | pgoff_t last_page; /* Last page to examine */ | ||
| 1492 | /* | 1421 | /* |
| 1493 | * We need to start from the first_page to the next_page - 1 | 1422 | * Extent to map - this can be after first_page because that can be |
| 1494 | * to make sure we also write the mapped dirty buffer_heads. | 1423 | * fully mapped. We somewhat abuse m_flags to store whether the extent |
| 1495 | * If we look at mpd->b_blocknr we would only be looking | 1424 | * is delalloc or unwritten. |
| 1496 | * at the currently mapped buffer_heads. | ||
| 1497 | */ | 1425 | */ |
| 1498 | index = mpd->first_page; | 1426 | struct ext4_map_blocks map; |
| 1499 | end = mpd->next_page - 1; | 1427 | struct ext4_io_submit io_submit; /* IO submission data */ |
| 1500 | 1428 | }; | |
| 1501 | pagevec_init(&pvec, 0); | ||
| 1502 | while (index <= end) { | ||
| 1503 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
| 1504 | if (nr_pages == 0) | ||
| 1505 | break; | ||
| 1506 | for (i = 0; i < nr_pages; i++) { | ||
| 1507 | int skip_page = 0; | ||
| 1508 | struct page *page = pvec.pages[i]; | ||
| 1509 | |||
| 1510 | index = page->index; | ||
| 1511 | if (index > end) | ||
| 1512 | break; | ||
| 1513 | |||
| 1514 | if (index == size >> PAGE_CACHE_SHIFT) | ||
| 1515 | len = size & ~PAGE_CACHE_MASK; | ||
| 1516 | else | ||
| 1517 | len = PAGE_CACHE_SIZE; | ||
| 1518 | if (map) { | ||
| 1519 | cur_logical = index << (PAGE_CACHE_SHIFT - | ||
| 1520 | inode->i_blkbits); | ||
| 1521 | pblock = map->m_pblk + (cur_logical - | ||
| 1522 | map->m_lblk); | ||
| 1523 | } | ||
| 1524 | index++; | ||
| 1525 | |||
| 1526 | BUG_ON(!PageLocked(page)); | ||
| 1527 | BUG_ON(PageWriteback(page)); | ||
| 1528 | |||
| 1529 | bh = page_bufs = page_buffers(page); | ||
| 1530 | block_start = 0; | ||
| 1531 | do { | ||
| 1532 | if (map && (cur_logical >= map->m_lblk) && | ||
| 1533 | (cur_logical <= (map->m_lblk + | ||
| 1534 | (map->m_len - 1)))) { | ||
| 1535 | if (buffer_delay(bh)) { | ||
| 1536 | clear_buffer_delay(bh); | ||
| 1537 | bh->b_blocknr = pblock; | ||
| 1538 | } | ||
| 1539 | if (buffer_unwritten(bh) || | ||
| 1540 | buffer_mapped(bh)) | ||
| 1541 | BUG_ON(bh->b_blocknr != pblock); | ||
| 1542 | if (map->m_flags & EXT4_MAP_UNINIT) | ||
| 1543 | set_buffer_uninit(bh); | ||
| 1544 | clear_buffer_unwritten(bh); | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | /* | ||
| 1548 | * skip page if block allocation undone and | ||
| 1549 | * block is dirty | ||
| 1550 | */ | ||
| 1551 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
| 1552 | skip_page = 1; | ||
| 1553 | bh = bh->b_this_page; | ||
| 1554 | block_start += bh->b_size; | ||
| 1555 | cur_logical++; | ||
| 1556 | pblock++; | ||
| 1557 | } while (bh != page_bufs); | ||
| 1558 | |||
| 1559 | if (skip_page) { | ||
| 1560 | unlock_page(page); | ||
| 1561 | continue; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | clear_page_dirty_for_io(page); | ||
| 1565 | err = ext4_bio_write_page(&io_submit, page, len, | ||
| 1566 | mpd->wbc); | ||
| 1567 | if (!err) | ||
| 1568 | mpd->pages_written++; | ||
| 1569 | /* | ||
| 1570 | * In error case, we have to continue because | ||
| 1571 | * remaining pages are still locked | ||
| 1572 | */ | ||
| 1573 | if (ret == 0) | ||
| 1574 | ret = err; | ||
| 1575 | } | ||
| 1576 | pagevec_release(&pvec); | ||
| 1577 | } | ||
| 1578 | ext4_io_submit(&io_submit); | ||
| 1579 | return ret; | ||
| 1580 | } | ||
| 1581 | 1429 | ||
| 1582 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | 1430 | static void mpage_release_unused_pages(struct mpage_da_data *mpd, |
| 1431 | bool invalidate) | ||
| 1583 | { | 1432 | { |
| 1584 | int nr_pages, i; | 1433 | int nr_pages, i; |
| 1585 | pgoff_t index, end; | 1434 | pgoff_t index, end; |
| 1586 | struct pagevec pvec; | 1435 | struct pagevec pvec; |
| 1587 | struct inode *inode = mpd->inode; | 1436 | struct inode *inode = mpd->inode; |
| 1588 | struct address_space *mapping = inode->i_mapping; | 1437 | struct address_space *mapping = inode->i_mapping; |
| 1589 | ext4_lblk_t start, last; | 1438 | |
| 1439 | /* This is necessary when next_page == 0. */ | ||
| 1440 | if (mpd->first_page >= mpd->next_page) | ||
| 1441 | return; | ||
| 1590 | 1442 | ||
| 1591 | index = mpd->first_page; | 1443 | index = mpd->first_page; |
| 1592 | end = mpd->next_page - 1; | 1444 | end = mpd->next_page - 1; |
| 1593 | 1445 | if (invalidate) { | |
| 1594 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1446 | ext4_lblk_t start, last; |
| 1595 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1447 | start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
| 1596 | ext4_es_remove_extent(inode, start, last - start + 1); | 1448 | last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
| 1449 | ext4_es_remove_extent(inode, start, last - start + 1); | ||
| 1450 | } | ||
| 1597 | 1451 | ||
| 1598 | pagevec_init(&pvec, 0); | 1452 | pagevec_init(&pvec, 0); |
| 1599 | while (index <= end) { | 1453 | while (index <= end) { |
| @@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | |||
| 1606 | break; | 1460 | break; |
| 1607 | BUG_ON(!PageLocked(page)); | 1461 | BUG_ON(!PageLocked(page)); |
| 1608 | BUG_ON(PageWriteback(page)); | 1462 | BUG_ON(PageWriteback(page)); |
| 1609 | block_invalidatepage(page, 0); | 1463 | if (invalidate) { |
| 1610 | ClearPageUptodate(page); | 1464 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 1465 | ClearPageUptodate(page); | ||
| 1466 | } | ||
| 1611 | unlock_page(page); | 1467 | unlock_page(page); |
| 1612 | } | 1468 | } |
| 1613 | index = pvec.pages[nr_pages - 1]->index + 1; | 1469 | index = pvec.pages[nr_pages - 1]->index + 1; |
| 1614 | pagevec_release(&pvec); | 1470 | pagevec_release(&pvec); |
| 1615 | } | 1471 | } |
| 1616 | return; | ||
| 1617 | } | 1472 | } |
| 1618 | 1473 | ||
| 1619 | static void ext4_print_free_blocks(struct inode *inode) | 1474 | static void ext4_print_free_blocks(struct inode *inode) |
| @@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
| 1642 | return; | 1497 | return; |
| 1643 | } | 1498 | } |
| 1644 | 1499 | ||
| 1645 | /* | ||
| 1646 | * mpage_da_map_and_submit - go through given space, map them | ||
| 1647 | * if necessary, and then submit them for I/O | ||
| 1648 | * | ||
| 1649 | * @mpd - bh describing space | ||
| 1650 | * | ||
| 1651 | * The function skips space we know is already mapped to disk blocks. | ||
| 1652 | * | ||
| 1653 | */ | ||
| 1654 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | ||
| 1655 | { | ||
| 1656 | int err, blks, get_blocks_flags; | ||
| 1657 | struct ext4_map_blocks map, *mapp = NULL; | ||
| 1658 | sector_t next = mpd->b_blocknr; | ||
| 1659 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | ||
| 1660 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | ||
| 1661 | handle_t *handle = NULL; | ||
| 1662 | |||
| 1663 | /* | ||
| 1664 | * If the blocks are mapped already, or we couldn't accumulate | ||
| 1665 | * any blocks, then proceed immediately to the submission stage. | ||
| 1666 | */ | ||
| 1667 | if ((mpd->b_size == 0) || | ||
| 1668 | ((mpd->b_state & (1 << BH_Mapped)) && | ||
| 1669 | !(mpd->b_state & (1 << BH_Delay)) && | ||
| 1670 | !(mpd->b_state & (1 << BH_Unwritten)))) | ||
| 1671 | goto submit_io; | ||
| 1672 | |||
| 1673 | handle = ext4_journal_current_handle(); | ||
| 1674 | BUG_ON(!handle); | ||
| 1675 | |||
| 1676 | /* | ||
| 1677 | * Call ext4_map_blocks() to allocate any delayed allocation | ||
| 1678 | * blocks, or to convert an uninitialized extent to be | ||
| 1679 | * initialized (in the case where we have written into | ||
| 1680 | * one or more preallocated blocks). | ||
| 1681 | * | ||
| 1682 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to | ||
| 1683 | * indicate that we are on the delayed allocation path. This | ||
| 1684 | * affects functions in many different parts of the allocation | ||
| 1685 | * call path. This flag exists primarily because we don't | ||
| 1686 | * want to change *many* call functions, so ext4_map_blocks() | ||
| 1687 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the | ||
| 1688 | * inode's allocation semaphore is taken. | ||
| 1689 | * | ||
| 1690 | * If the blocks in questions were delalloc blocks, set | ||
| 1691 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | ||
| 1692 | * variables are updated after the blocks have been allocated. | ||
| 1693 | */ | ||
| 1694 | map.m_lblk = next; | ||
| 1695 | map.m_len = max_blocks; | ||
| 1696 | /* | ||
| 1697 | * We're in delalloc path and it is possible that we're going to | ||
| 1698 | * need more metadata blocks than previously reserved. However | ||
| 1699 | * we must not fail because we're in writeback and there is | ||
| 1700 | * nothing we can do about it so it might result in data loss. | ||
| 1701 | * So use reserved blocks to allocate metadata if possible. | ||
| 1702 | */ | ||
| 1703 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | ||
| 1704 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
| 1705 | if (ext4_should_dioread_nolock(mpd->inode)) | ||
| 1706 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
| 1707 | if (mpd->b_state & (1 << BH_Delay)) | ||
| 1708 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
| 1709 | |||
| 1710 | |||
| 1711 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | ||
| 1712 | if (blks < 0) { | ||
| 1713 | struct super_block *sb = mpd->inode->i_sb; | ||
| 1714 | |||
| 1715 | err = blks; | ||
| 1716 | /* | ||
| 1717 | * If get block returns EAGAIN or ENOSPC and there | ||
| 1718 | * appears to be free blocks we will just let | ||
| 1719 | * mpage_da_submit_io() unlock all of the pages. | ||
| 1720 | */ | ||
| 1721 | if (err == -EAGAIN) | ||
| 1722 | goto submit_io; | ||
| 1723 | |||
| 1724 | if (err == -ENOSPC && ext4_count_free_clusters(sb)) { | ||
| 1725 | mpd->retval = err; | ||
| 1726 | goto submit_io; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | /* | ||
| 1730 | * get block failure will cause us to loop in | ||
| 1731 | * writepages, because a_ops->writepage won't be able | ||
| 1732 | * to make progress. The page will be redirtied by | ||
| 1733 | * writepage and writepages will again try to write | ||
| 1734 | * the same. | ||
| 1735 | */ | ||
| 1736 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { | ||
| 1737 | ext4_msg(sb, KERN_CRIT, | ||
| 1738 | "delayed block allocation failed for inode %lu " | ||
| 1739 | "at logical offset %llu with max blocks %zd " | ||
| 1740 | "with error %d", mpd->inode->i_ino, | ||
| 1741 | (unsigned long long) next, | ||
| 1742 | mpd->b_size >> mpd->inode->i_blkbits, err); | ||
| 1743 | ext4_msg(sb, KERN_CRIT, | ||
| 1744 | "This should not happen!! Data will be lost"); | ||
| 1745 | if (err == -ENOSPC) | ||
| 1746 | ext4_print_free_blocks(mpd->inode); | ||
| 1747 | } | ||
| 1748 | /* invalidate all the pages */ | ||
| 1749 | ext4_da_block_invalidatepages(mpd); | ||
| 1750 | |||
| 1751 | /* Mark this page range as having been completed */ | ||
| 1752 | mpd->io_done = 1; | ||
| 1753 | return; | ||
| 1754 | } | ||
| 1755 | BUG_ON(blks == 0); | ||
| 1756 | |||
| 1757 | mapp = ↦ | ||
| 1758 | if (map.m_flags & EXT4_MAP_NEW) { | ||
| 1759 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; | ||
| 1760 | int i; | ||
| 1761 | |||
| 1762 | for (i = 0; i < map.m_len; i++) | ||
| 1763 | unmap_underlying_metadata(bdev, map.m_pblk + i); | ||
| 1764 | } | ||
| 1765 | |||
| 1766 | /* | ||
| 1767 | * Update on-disk size along with block allocation. | ||
| 1768 | */ | ||
| 1769 | disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; | ||
| 1770 | if (disksize > i_size_read(mpd->inode)) | ||
| 1771 | disksize = i_size_read(mpd->inode); | ||
| 1772 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | ||
| 1773 | ext4_update_i_disksize(mpd->inode, disksize); | ||
| 1774 | err = ext4_mark_inode_dirty(handle, mpd->inode); | ||
| 1775 | if (err) | ||
| 1776 | ext4_error(mpd->inode->i_sb, | ||
| 1777 | "Failed to mark inode %lu dirty", | ||
| 1778 | mpd->inode->i_ino); | ||
| 1779 | } | ||
| 1780 | |||
| 1781 | submit_io: | ||
| 1782 | mpage_da_submit_io(mpd, mapp); | ||
| 1783 | mpd->io_done = 1; | ||
| 1784 | } | ||
| 1785 | |||
| 1786 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | ||
| 1787 | (1 << BH_Delay) | (1 << BH_Unwritten)) | ||
| 1788 | |||
| 1789 | /* | ||
| 1790 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | ||
| 1791 | * | ||
| 1792 | * @mpd->lbh - extent of blocks | ||
| 1793 | * @logical - logical number of the block in the file | ||
| 1794 | * @b_state - b_state of the buffer head added | ||
| 1795 | * | ||
| 1796 | * the function is used to collect contig. blocks in same state | ||
| 1797 | */ | ||
| 1798 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, | ||
| 1799 | unsigned long b_state) | ||
| 1800 | { | ||
| 1801 | sector_t next; | ||
| 1802 | int blkbits = mpd->inode->i_blkbits; | ||
| 1803 | int nrblocks = mpd->b_size >> blkbits; | ||
| 1804 | |||
| 1805 | /* | ||
| 1806 | * XXX Don't go larger than mballoc is willing to allocate | ||
| 1807 | * This is a stopgap solution. We eventually need to fold | ||
| 1808 | * mpage_da_submit_io() into this function and then call | ||
| 1809 | * ext4_map_blocks() multiple times in a loop | ||
| 1810 | */ | ||
| 1811 | if (nrblocks >= (8*1024*1024 >> blkbits)) | ||
| 1812 | goto flush_it; | ||
| 1813 | |||
| 1814 | /* check if the reserved journal credits might overflow */ | ||
| 1815 | if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { | ||
| 1816 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | ||
| 1817 | /* | ||
| 1818 | * With non-extent format we are limited by the journal | ||
| 1819 | * credit available. Total credit needed to insert | ||
| 1820 | * nrblocks contiguous blocks is dependent on the | ||
| 1821 | * nrblocks. So limit nrblocks. | ||
| 1822 | */ | ||
| 1823 | goto flush_it; | ||
| 1824 | } | ||
| 1825 | } | ||
| 1826 | /* | ||
| 1827 | * First block in the extent | ||
| 1828 | */ | ||
| 1829 | if (mpd->b_size == 0) { | ||
| 1830 | mpd->b_blocknr = logical; | ||
| 1831 | mpd->b_size = 1 << blkbits; | ||
| 1832 | mpd->b_state = b_state & BH_FLAGS; | ||
| 1833 | return; | ||
| 1834 | } | ||
| 1835 | |||
| 1836 | next = mpd->b_blocknr + nrblocks; | ||
| 1837 | /* | ||
| 1838 | * Can we merge the block to our big extent? | ||
| 1839 | */ | ||
| 1840 | if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { | ||
| 1841 | mpd->b_size += 1 << blkbits; | ||
| 1842 | return; | ||
| 1843 | } | ||
| 1844 | |||
| 1845 | flush_it: | ||
| 1846 | /* | ||
| 1847 | * We couldn't merge the block to our extent, so we | ||
| 1848 | * need to flush current extent and start new one | ||
| 1849 | */ | ||
| 1850 | mpage_da_map_and_submit(mpd); | ||
| 1851 | return; | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | 1500 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
| 1855 | { | 1501 | { |
| 1856 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); | 1502 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
| @@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | |||
| 1883 | "logical block %lu\n", inode->i_ino, map->m_len, | 1529 | "logical block %lu\n", inode->i_ino, map->m_len, |
| 1884 | (unsigned long) map->m_lblk); | 1530 | (unsigned long) map->m_lblk); |
| 1885 | 1531 | ||
| 1532 | ext4_es_lru_add(inode); | ||
| 1533 | |||
| 1886 | /* Lookup extent status tree firstly */ | 1534 | /* Lookup extent status tree firstly */ |
| 1887 | if (ext4_es_lookup_extent(inode, iblock, &es)) { | 1535 | if (ext4_es_lookup_extent(inode, iblock, &es)) { |
| 1888 | 1536 | ||
| @@ -2156,7 +1804,7 @@ out: | |||
| 2156 | * lock so we have to do some magic. | 1804 | * lock so we have to do some magic. |
| 2157 | * | 1805 | * |
| 2158 | * This function can get called via... | 1806 | * This function can get called via... |
| 2159 | * - ext4_da_writepages after taking page lock (have journal handle) | 1807 | * - ext4_writepages after taking page lock (have journal handle) |
| 2160 | * - journal_submit_inode_data_buffers (no journal handle) | 1808 | * - journal_submit_inode_data_buffers (no journal handle) |
| 2161 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) | 1809 | * - shrink_page_list via the kswapd/direct reclaim (no journal handle) |
| 2162 | * - grab_page_cache when doing write_begin (have journal handle) | 1810 | * - grab_page_cache when doing write_begin (have journal handle) |
| @@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page, | |||
| 2234 | */ | 1882 | */ |
| 2235 | return __ext4_journalled_writepage(page, len); | 1883 | return __ext4_journalled_writepage(page, len); |
| 2236 | 1884 | ||
| 2237 | memset(&io_submit, 0, sizeof(io_submit)); | 1885 | ext4_io_submit_init(&io_submit, wbc); |
| 1886 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
| 1887 | if (!io_submit.io_end) { | ||
| 1888 | redirty_page_for_writepage(wbc, page); | ||
| 1889 | unlock_page(page); | ||
| 1890 | return -ENOMEM; | ||
| 1891 | } | ||
| 2238 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); | 1892 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); |
| 2239 | ext4_io_submit(&io_submit); | 1893 | ext4_io_submit(&io_submit); |
| 1894 | /* Drop io_end reference we got from init */ | ||
| 1895 | ext4_put_io_end_defer(io_submit.io_end); | ||
| 2240 | return ret; | 1896 | return ret; |
| 2241 | } | 1897 | } |
| 2242 | 1898 | ||
| 1899 | #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) | ||
| 1900 | |||
| 2243 | /* | 1901 | /* |
| 2244 | * This is called via ext4_da_writepages() to | 1902 | * mballoc gives us at most this number of blocks... |
| 2245 | * calculate the total number of credits to reserve to fit | 1903 | * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). |
| 2246 | * a single extent allocation into a single transaction, | 1904 | * The rest of mballoc seems to handle chunks upto full group size. |
| 2247 | * ext4_da_writpeages() will loop calling this before | ||
| 2248 | * the block allocation. | ||
| 2249 | */ | 1905 | */ |
| 1906 | #define MAX_WRITEPAGES_EXTENT_LEN 2048 | ||
| 2250 | 1907 | ||
| 2251 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | 1908 | /* |
| 1909 | * mpage_add_bh_to_extent - try to add bh to extent of blocks to map | ||
| 1910 | * | ||
| 1911 | * @mpd - extent of blocks | ||
| 1912 | * @lblk - logical number of the block in the file | ||
| 1913 | * @b_state - b_state of the buffer head added | ||
| 1914 | * | ||
| 1915 | * the function is used to collect contig. blocks in same state | ||
| 1916 | */ | ||
| 1917 | static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, | ||
| 1918 | unsigned long b_state) | ||
| 1919 | { | ||
| 1920 | struct ext4_map_blocks *map = &mpd->map; | ||
| 1921 | |||
| 1922 | /* Don't go larger than mballoc is willing to allocate */ | ||
| 1923 | if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) | ||
| 1924 | return 0; | ||
| 1925 | |||
| 1926 | /* First block in the extent? */ | ||
| 1927 | if (map->m_len == 0) { | ||
| 1928 | map->m_lblk = lblk; | ||
| 1929 | map->m_len = 1; | ||
| 1930 | map->m_flags = b_state & BH_FLAGS; | ||
| 1931 | return 1; | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | /* Can we merge the block to our big extent? */ | ||
| 1935 | if (lblk == map->m_lblk + map->m_len && | ||
| 1936 | (b_state & BH_FLAGS) == map->m_flags) { | ||
| 1937 | map->m_len++; | ||
| 1938 | return 1; | ||
| 1939 | } | ||
| 1940 | return 0; | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, | ||
| 1944 | struct buffer_head *head, | ||
| 1945 | struct buffer_head *bh, | ||
| 1946 | ext4_lblk_t lblk) | ||
| 1947 | { | ||
| 1948 | struct inode *inode = mpd->inode; | ||
| 1949 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
| 1950 | >> inode->i_blkbits; | ||
| 1951 | |||
| 1952 | do { | ||
| 1953 | BUG_ON(buffer_locked(bh)); | ||
| 1954 | |||
| 1955 | if (!buffer_dirty(bh) || !buffer_mapped(bh) || | ||
| 1956 | (!buffer_delay(bh) && !buffer_unwritten(bh)) || | ||
| 1957 | lblk >= blocks) { | ||
| 1958 | /* Found extent to map? */ | ||
| 1959 | if (mpd->map.m_len) | ||
| 1960 | return false; | ||
| 1961 | if (lblk >= blocks) | ||
| 1962 | return true; | ||
| 1963 | continue; | ||
| 1964 | } | ||
| 1965 | if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) | ||
| 1966 | return false; | ||
| 1967 | } while (lblk++, (bh = bh->b_this_page) != head); | ||
| 1968 | return true; | ||
| 1969 | } | ||
| 1970 | |||
| 1971 | static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) | ||
| 2252 | { | 1972 | { |
| 2253 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 1973 | int len; |
| 1974 | loff_t size = i_size_read(mpd->inode); | ||
| 1975 | int err; | ||
| 1976 | |||
| 1977 | BUG_ON(page->index != mpd->first_page); | ||
| 1978 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
| 1979 | len = size & ~PAGE_CACHE_MASK; | ||
| 1980 | else | ||
| 1981 | len = PAGE_CACHE_SIZE; | ||
| 1982 | clear_page_dirty_for_io(page); | ||
| 1983 | err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); | ||
| 1984 | if (!err) | ||
| 1985 | mpd->wbc->nr_to_write--; | ||
| 1986 | mpd->first_page++; | ||
| 2254 | 1987 | ||
| 1988 | return err; | ||
| 1989 | } | ||
| 1990 | |||
| 1991 | /* | ||
| 1992 | * mpage_map_buffers - update buffers corresponding to changed extent and | ||
| 1993 | * submit fully mapped pages for IO | ||
| 1994 | * | ||
| 1995 | * @mpd - description of extent to map, on return next extent to map | ||
| 1996 | * | ||
| 1997 | * Scan buffers corresponding to changed extent (we expect corresponding pages | ||
| 1998 | * to be already locked) and update buffer state according to new extent state. | ||
| 1999 | * We map delalloc buffers to their physical location, clear unwritten bits, | ||
| 2000 | * and mark buffers as uninit when we perform writes to uninitialized extents | ||
| 2001 | * and do extent conversion after IO is finished. If the last page is not fully | ||
| 2002 | * mapped, we update @map to the next extent in the last page that needs | ||
| 2003 | * mapping. Otherwise we submit the page for IO. | ||
| 2004 | */ | ||
| 2005 | static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) | ||
| 2006 | { | ||
| 2007 | struct pagevec pvec; | ||
| 2008 | int nr_pages, i; | ||
| 2009 | struct inode *inode = mpd->inode; | ||
| 2010 | struct buffer_head *head, *bh; | ||
| 2011 | int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; | ||
| 2012 | ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) | ||
| 2013 | >> inode->i_blkbits; | ||
| 2014 | pgoff_t start, end; | ||
| 2015 | ext4_lblk_t lblk; | ||
| 2016 | sector_t pblock; | ||
| 2017 | int err; | ||
| 2018 | |||
| 2019 | start = mpd->map.m_lblk >> bpp_bits; | ||
| 2020 | end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; | ||
| 2021 | lblk = start << bpp_bits; | ||
| 2022 | pblock = mpd->map.m_pblk; | ||
| 2023 | |||
| 2024 | pagevec_init(&pvec, 0); | ||
| 2025 | while (start <= end) { | ||
| 2026 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, | ||
| 2027 | PAGEVEC_SIZE); | ||
| 2028 | if (nr_pages == 0) | ||
| 2029 | break; | ||
| 2030 | for (i = 0; i < nr_pages; i++) { | ||
| 2031 | struct page *page = pvec.pages[i]; | ||
| 2032 | |||
| 2033 | if (page->index > end) | ||
| 2034 | break; | ||
| 2035 | /* Upto 'end' pages must be contiguous */ | ||
| 2036 | BUG_ON(page->index != start); | ||
| 2037 | bh = head = page_buffers(page); | ||
| 2038 | do { | ||
| 2039 | if (lblk < mpd->map.m_lblk) | ||
| 2040 | continue; | ||
| 2041 | if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { | ||
| 2042 | /* | ||
| 2043 | * Buffer after end of mapped extent. | ||
| 2044 | * Find next buffer in the page to map. | ||
| 2045 | */ | ||
| 2046 | mpd->map.m_len = 0; | ||
| 2047 | mpd->map.m_flags = 0; | ||
| 2048 | add_page_bufs_to_extent(mpd, head, bh, | ||
| 2049 | lblk); | ||
| 2050 | pagevec_release(&pvec); | ||
| 2051 | return 0; | ||
| 2052 | } | ||
| 2053 | if (buffer_delay(bh)) { | ||
| 2054 | clear_buffer_delay(bh); | ||
| 2055 | bh->b_blocknr = pblock++; | ||
| 2056 | } | ||
| 2057 | clear_buffer_unwritten(bh); | ||
| 2058 | } while (++lblk < blocks && | ||
| 2059 | (bh = bh->b_this_page) != head); | ||
| 2060 | |||
| 2061 | /* | ||
| 2062 | * FIXME: This is going to break if dioread_nolock | ||
| 2063 | * supports blocksize < pagesize as we will try to | ||
| 2064 | * convert potentially unmapped parts of inode. | ||
| 2065 | */ | ||
| 2066 | mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; | ||
| 2067 | /* Page fully mapped - let IO run! */ | ||
| 2068 | err = mpage_submit_page(mpd, page); | ||
| 2069 | if (err < 0) { | ||
| 2070 | pagevec_release(&pvec); | ||
| 2071 | return err; | ||
| 2072 | } | ||
| 2073 | start++; | ||
| 2074 | } | ||
| 2075 | pagevec_release(&pvec); | ||
| 2076 | } | ||
| 2077 | /* Extent fully mapped and matches with page boundary. We are done. */ | ||
| 2078 | mpd->map.m_len = 0; | ||
| 2079 | mpd->map.m_flags = 0; | ||
| 2080 | return 0; | ||
| 2081 | } | ||
| 2082 | |||
| 2083 | static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) | ||
| 2084 | { | ||
| 2085 | struct inode *inode = mpd->inode; | ||
| 2086 | struct ext4_map_blocks *map = &mpd->map; | ||
| 2087 | int get_blocks_flags; | ||
| 2088 | int err; | ||
| 2089 | |||
| 2090 | trace_ext4_da_write_pages_extent(inode, map); | ||
| 2255 | /* | 2091 | /* |
| 2256 | * With non-extent format the journal credit needed to | 2092 | * Call ext4_map_blocks() to allocate any delayed allocation blocks, or |
| 2257 | * insert nrblocks contiguous block is dependent on | 2093 | * to convert an uninitialized extent to be initialized (in the case |
| 2258 | * number of contiguous block. So we will limit | 2094 | * where we have written into one or more preallocated blocks). It is |
| 2259 | * number of contiguous block to a sane value | 2095 | * possible that we're going to need more metadata blocks than |
| 2096 | * previously reserved. However we must not fail because we're in | ||
| 2097 | * writeback and there is nothing we can do about it so it might result | ||
| 2098 | * in data loss. So use reserved blocks to allocate metadata if | ||
| 2099 | * possible. | ||
| 2100 | * | ||
| 2101 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks | ||
| 2102 | * in question are delalloc blocks. This affects functions in many | ||
| 2103 | * different parts of the allocation call path. This flag exists | ||
| 2104 | * primarily because we don't want to change *many* call functions, so | ||
| 2105 | * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag | ||
| 2106 | * once the inode's allocation semaphore is taken. | ||
| 2260 | */ | 2107 | */ |
| 2261 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && | 2108 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | |
| 2262 | (max_blocks > EXT4_MAX_TRANS_DATA)) | 2109 | EXT4_GET_BLOCKS_METADATA_NOFAIL; |
| 2263 | max_blocks = EXT4_MAX_TRANS_DATA; | 2110 | if (ext4_should_dioread_nolock(inode)) |
| 2111 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
| 2112 | if (map->m_flags & (1 << BH_Delay)) | ||
| 2113 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | ||
| 2264 | 2114 | ||
| 2265 | return ext4_chunk_trans_blocks(inode, max_blocks); | 2115 | err = ext4_map_blocks(handle, inode, map, get_blocks_flags); |
| 2116 | if (err < 0) | ||
| 2117 | return err; | ||
| 2118 | if (map->m_flags & EXT4_MAP_UNINIT) { | ||
| 2119 | if (!mpd->io_submit.io_end->handle && | ||
| 2120 | ext4_handle_valid(handle)) { | ||
| 2121 | mpd->io_submit.io_end->handle = handle->h_rsv_handle; | ||
| 2122 | handle->h_rsv_handle = NULL; | ||
| 2123 | } | ||
| 2124 | ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); | ||
| 2125 | } | ||
| 2126 | |||
| 2127 | BUG_ON(map->m_len == 0); | ||
| 2128 | if (map->m_flags & EXT4_MAP_NEW) { | ||
| 2129 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
| 2130 | int i; | ||
| 2131 | |||
| 2132 | for (i = 0; i < map->m_len; i++) | ||
| 2133 | unmap_underlying_metadata(bdev, map->m_pblk + i); | ||
| 2134 | } | ||
| 2135 | return 0; | ||
| 2266 | } | 2136 | } |
| 2267 | 2137 | ||
| 2268 | /* | 2138 | /* |
| 2269 | * write_cache_pages_da - walk the list of dirty pages of the given | 2139 | * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length |
| 2270 | * address space and accumulate pages that need writing, and call | 2140 | * mpd->len and submit pages underlying it for IO |
| 2271 | * mpage_da_map_and_submit to map a single contiguous memory region | 2141 | * |
| 2272 | * and then write them. | 2142 | * @handle - handle for journal operations |
| 2143 | * @mpd - extent to map | ||
| 2144 | * | ||
| 2145 | * The function maps extent starting at mpd->lblk of length mpd->len. If it is | ||
| 2146 | * delayed, blocks are allocated, if it is unwritten, we may need to convert | ||
| 2147 | * them to initialized or split the described range from larger unwritten | ||
| 2148 | * extent. Note that we need not map all the described range since allocation | ||
| 2149 | * can return less blocks or the range is covered by more unwritten extents. We | ||
| 2150 | * cannot map more because we are limited by reserved transaction credits. On | ||
| 2151 | * the other hand we always make sure that the last touched page is fully | ||
| 2152 | * mapped so that it can be written out (and thus forward progress is | ||
| 2153 | * guaranteed). After mapping we submit all mapped pages for IO. | ||
| 2273 | */ | 2154 | */ |
| 2274 | static int write_cache_pages_da(handle_t *handle, | 2155 | static int mpage_map_and_submit_extent(handle_t *handle, |
| 2275 | struct address_space *mapping, | 2156 | struct mpage_da_data *mpd, |
| 2276 | struct writeback_control *wbc, | 2157 | bool *give_up_on_write) |
| 2277 | struct mpage_da_data *mpd, | ||
| 2278 | pgoff_t *done_index) | ||
| 2279 | { | 2158 | { |
| 2280 | struct buffer_head *bh, *head; | 2159 | struct inode *inode = mpd->inode; |
| 2281 | struct inode *inode = mapping->host; | 2160 | struct ext4_map_blocks *map = &mpd->map; |
| 2282 | struct pagevec pvec; | 2161 | int err; |
| 2283 | unsigned int nr_pages; | 2162 | loff_t disksize; |
| 2284 | sector_t logical; | ||
| 2285 | pgoff_t index, end; | ||
| 2286 | long nr_to_write = wbc->nr_to_write; | ||
| 2287 | int i, tag, ret = 0; | ||
| 2288 | |||
| 2289 | memset(mpd, 0, sizeof(struct mpage_da_data)); | ||
| 2290 | mpd->wbc = wbc; | ||
| 2291 | mpd->inode = inode; | ||
| 2292 | pagevec_init(&pvec, 0); | ||
| 2293 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
| 2294 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
| 2295 | 2163 | ||
| 2296 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2164 | mpd->io_submit.io_end->offset = |
| 2165 | ((loff_t)map->m_lblk) << inode->i_blkbits; | ||
| 2166 | while (map->m_len) { | ||
| 2167 | err = mpage_map_one_extent(handle, mpd); | ||
| 2168 | if (err < 0) { | ||
| 2169 | struct super_block *sb = inode->i_sb; | ||
| 2170 | |||
| 2171 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) | ||
| 2172 | goto invalidate_dirty_pages; | ||
| 2173 | /* | ||
| 2174 | * Let the uper layers retry transient errors. | ||
| 2175 | * In the case of ENOSPC, if ext4_count_free_blocks() | ||
| 2176 | * is non-zero, a commit should free up blocks. | ||
| 2177 | */ | ||
| 2178 | if ((err == -ENOMEM) || | ||
| 2179 | (err == -ENOSPC && ext4_count_free_clusters(sb))) | ||
| 2180 | return err; | ||
| 2181 | ext4_msg(sb, KERN_CRIT, | ||
| 2182 | "Delayed block allocation failed for " | ||
| 2183 | "inode %lu at logical offset %llu with" | ||
| 2184 | " max blocks %u with error %d", | ||
| 2185 | inode->i_ino, | ||
| 2186 | (unsigned long long)map->m_lblk, | ||
| 2187 | (unsigned)map->m_len, -err); | ||
| 2188 | ext4_msg(sb, KERN_CRIT, | ||
| 2189 | "This should not happen!! Data will " | ||
| 2190 | "be lost\n"); | ||
| 2191 | if (err == -ENOSPC) | ||
| 2192 | ext4_print_free_blocks(inode); | ||
| 2193 | invalidate_dirty_pages: | ||
| 2194 | *give_up_on_write = true; | ||
| 2195 | return err; | ||
| 2196 | } | ||
| 2197 | /* | ||
| 2198 | * Update buffer state, submit mapped pages, and get us new | ||
| 2199 | * extent to map | ||
| 2200 | */ | ||
| 2201 | err = mpage_map_and_submit_buffers(mpd); | ||
| 2202 | if (err < 0) | ||
| 2203 | return err; | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | /* Update on-disk size after IO is submitted */ | ||
| 2207 | disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; | ||
| 2208 | if (disksize > i_size_read(inode)) | ||
| 2209 | disksize = i_size_read(inode); | ||
| 2210 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
| 2211 | int err2; | ||
| 2212 | |||
| 2213 | ext4_update_i_disksize(inode, disksize); | ||
| 2214 | err2 = ext4_mark_inode_dirty(handle, inode); | ||
| 2215 | if (err2) | ||
| 2216 | ext4_error(inode->i_sb, | ||
| 2217 | "Failed to mark inode %lu dirty", | ||
| 2218 | inode->i_ino); | ||
| 2219 | if (!err) | ||
| 2220 | err = err2; | ||
| 2221 | } | ||
| 2222 | return err; | ||
| 2223 | } | ||
| 2224 | |||
| 2225 | /* | ||
| 2226 | * Calculate the total number of credits to reserve for one writepages | ||
| 2227 | * iteration. This is called from ext4_writepages(). We map an extent of | ||
| 2228 | * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping | ||
| 2229 | * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + | ||
| 2230 | * bpp - 1 blocks in bpp different extents. | ||
| 2231 | */ | ||
| 2232 | static int ext4_da_writepages_trans_blocks(struct inode *inode) | ||
| 2233 | { | ||
| 2234 | int bpp = ext4_journal_blocks_per_page(inode); | ||
| 2235 | |||
| 2236 | return ext4_meta_trans_blocks(inode, | ||
| 2237 | MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); | ||
| 2238 | } | ||
| 2239 | |||
| 2240 | /* | ||
| 2241 | * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages | ||
| 2242 | * and underlying extent to map | ||
| 2243 | * | ||
| 2244 | * @mpd - where to look for pages | ||
| 2245 | * | ||
| 2246 | * Walk dirty pages in the mapping. If they are fully mapped, submit them for | ||
| 2247 | * IO immediately. When we find a page which isn't mapped we start accumulating | ||
| 2248 | * extent of buffers underlying these pages that needs mapping (formed by | ||
| 2249 | * either delayed or unwritten buffers). We also lock the pages containing | ||
| 2250 | * these buffers. The extent found is returned in @mpd structure (starting at | ||
| 2251 | * mpd->lblk with length mpd->len blocks). | ||
| 2252 | * | ||
| 2253 | * Note that this function can attach bios to one io_end structure which are | ||
| 2254 | * neither logically nor physically contiguous. Although it may seem as an | ||
| 2255 | * unnecessary complication, it is actually inevitable in blocksize < pagesize | ||
| 2256 | * case as we need to track IO to all buffers underlying a page in one io_end. | ||
| 2257 | */ | ||
| 2258 | static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) | ||
| 2259 | { | ||
| 2260 | struct address_space *mapping = mpd->inode->i_mapping; | ||
| 2261 | struct pagevec pvec; | ||
| 2262 | unsigned int nr_pages; | ||
| 2263 | pgoff_t index = mpd->first_page; | ||
| 2264 | pgoff_t end = mpd->last_page; | ||
| 2265 | int tag; | ||
| 2266 | int i, err = 0; | ||
| 2267 | int blkbits = mpd->inode->i_blkbits; | ||
| 2268 | ext4_lblk_t lblk; | ||
| 2269 | struct buffer_head *head; | ||
| 2270 | |||
| 2271 | if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) | ||
| 2297 | tag = PAGECACHE_TAG_TOWRITE; | 2272 | tag = PAGECACHE_TAG_TOWRITE; |
| 2298 | else | 2273 | else |
| 2299 | tag = PAGECACHE_TAG_DIRTY; | 2274 | tag = PAGECACHE_TAG_DIRTY; |
| 2300 | 2275 | ||
| 2301 | *done_index = index; | 2276 | pagevec_init(&pvec, 0); |
| 2277 | mpd->map.m_len = 0; | ||
| 2278 | mpd->next_page = index; | ||
| 2302 | while (index <= end) { | 2279 | while (index <= end) { |
| 2303 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 2280 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, |
| 2304 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | 2281 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); |
| 2305 | if (nr_pages == 0) | 2282 | if (nr_pages == 0) |
| 2306 | return 0; | 2283 | goto out; |
| 2307 | 2284 | ||
| 2308 | for (i = 0; i < nr_pages; i++) { | 2285 | for (i = 0; i < nr_pages; i++) { |
| 2309 | struct page *page = pvec.pages[i]; | 2286 | struct page *page = pvec.pages[i]; |
| @@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle, | |||
| 2318 | if (page->index > end) | 2295 | if (page->index > end) |
| 2319 | goto out; | 2296 | goto out; |
| 2320 | 2297 | ||
| 2321 | *done_index = page->index + 1; | 2298 | /* If we can't merge this page, we are done. */ |
| 2322 | 2299 | if (mpd->map.m_len > 0 && mpd->next_page != page->index) | |
| 2323 | /* | 2300 | goto out; |
| 2324 | * If we can't merge this page, and we have | ||
| 2325 | * accumulated an contiguous region, write it | ||
| 2326 | */ | ||
| 2327 | if ((mpd->next_page != page->index) && | ||
| 2328 | (mpd->next_page != mpd->first_page)) { | ||
| 2329 | mpage_da_map_and_submit(mpd); | ||
| 2330 | goto ret_extent_tail; | ||
| 2331 | } | ||
| 2332 | 2301 | ||
| 2333 | lock_page(page); | 2302 | lock_page(page); |
| 2334 | |||
| 2335 | /* | 2303 | /* |
| 2336 | * If the page is no longer dirty, or its | 2304 | * If the page is no longer dirty, or its mapping no |
| 2337 | * mapping no longer corresponds to inode we | 2305 | * longer corresponds to inode we are writing (which |
| 2338 | * are writing (which means it has been | 2306 | * means it has been truncated or invalidated), or the |
| 2339 | * truncated or invalidated), or the page is | 2307 | * page is already under writeback and we are not doing |
| 2340 | * already under writeback and we are not | 2308 | * a data integrity writeback, skip the page |
| 2341 | * doing a data integrity writeback, skip the page | ||
| 2342 | */ | 2309 | */ |
| 2343 | if (!PageDirty(page) || | 2310 | if (!PageDirty(page) || |
| 2344 | (PageWriteback(page) && | 2311 | (PageWriteback(page) && |
| 2345 | (wbc->sync_mode == WB_SYNC_NONE)) || | 2312 | (mpd->wbc->sync_mode == WB_SYNC_NONE)) || |
| 2346 | unlikely(page->mapping != mapping)) { | 2313 | unlikely(page->mapping != mapping)) { |
| 2347 | unlock_page(page); | 2314 | unlock_page(page); |
| 2348 | continue; | 2315 | continue; |
| @@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle, | |||
| 2351 | wait_on_page_writeback(page); | 2318 | wait_on_page_writeback(page); |
| 2352 | BUG_ON(PageWriteback(page)); | 2319 | BUG_ON(PageWriteback(page)); |
| 2353 | 2320 | ||
| 2354 | /* | 2321 | if (mpd->map.m_len == 0) |
| 2355 | * If we have inline data and arrive here, it means that | ||
| 2356 | * we will soon create the block for the 1st page, so | ||
| 2357 | * we'd better clear the inline data here. | ||
| 2358 | */ | ||
| 2359 | if (ext4_has_inline_data(inode)) { | ||
| 2360 | BUG_ON(ext4_test_inode_state(inode, | ||
| 2361 | EXT4_STATE_MAY_INLINE_DATA)); | ||
| 2362 | ext4_destroy_inline_data(handle, inode); | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | if (mpd->next_page != page->index) | ||
| 2366 | mpd->first_page = page->index; | 2322 | mpd->first_page = page->index; |
| 2367 | mpd->next_page = page->index + 1; | 2323 | mpd->next_page = page->index + 1; |
| 2368 | logical = (sector_t) page->index << | ||
| 2369 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
| 2370 | |||
| 2371 | /* Add all dirty buffers to mpd */ | 2324 | /* Add all dirty buffers to mpd */ |
| 2325 | lblk = ((ext4_lblk_t)page->index) << | ||
| 2326 | (PAGE_CACHE_SHIFT - blkbits); | ||
| 2372 | head = page_buffers(page); | 2327 | head = page_buffers(page); |
| 2373 | bh = head; | 2328 | if (!add_page_bufs_to_extent(mpd, head, head, lblk)) |
| 2374 | do { | 2329 | goto out; |
| 2375 | BUG_ON(buffer_locked(bh)); | 2330 | /* So far everything mapped? Submit the page for IO. */ |
| 2376 | /* | 2331 | if (mpd->map.m_len == 0) { |
| 2377 | * We need to try to allocate unmapped blocks | 2332 | err = mpage_submit_page(mpd, page); |
| 2378 | * in the same page. Otherwise we won't make | 2333 | if (err < 0) |
| 2379 | * progress with the page in ext4_writepage | ||
| 2380 | */ | ||
| 2381 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | ||
| 2382 | mpage_add_bh_to_extent(mpd, logical, | ||
| 2383 | bh->b_state); | ||
| 2384 | if (mpd->io_done) | ||
| 2385 | goto ret_extent_tail; | ||
| 2386 | } else if (buffer_dirty(bh) && | ||
| 2387 | buffer_mapped(bh)) { | ||
| 2388 | /* | ||
| 2389 | * mapped dirty buffer. We need to | ||
| 2390 | * update the b_state because we look | ||
| 2391 | * at b_state in mpage_da_map_blocks. | ||
| 2392 | * We don't update b_size because if we | ||
| 2393 | * find an unmapped buffer_head later | ||
| 2394 | * we need to use the b_state flag of | ||
| 2395 | * that buffer_head. | ||
| 2396 | */ | ||
| 2397 | if (mpd->b_size == 0) | ||
| 2398 | mpd->b_state = | ||
| 2399 | bh->b_state & BH_FLAGS; | ||
| 2400 | } | ||
| 2401 | logical++; | ||
| 2402 | } while ((bh = bh->b_this_page) != head); | ||
| 2403 | |||
| 2404 | if (nr_to_write > 0) { | ||
| 2405 | nr_to_write--; | ||
| 2406 | if (nr_to_write == 0 && | ||
| 2407 | wbc->sync_mode == WB_SYNC_NONE) | ||
| 2408 | /* | ||
| 2409 | * We stop writing back only if we are | ||
| 2410 | * not doing integrity sync. In case of | ||
| 2411 | * integrity sync we have to keep going | ||
| 2412 | * because someone may be concurrently | ||
| 2413 | * dirtying pages, and we might have | ||
| 2414 | * synced a lot of newly appeared dirty | ||
| 2415 | * pages, but have not synced all of the | ||
| 2416 | * old dirty pages. | ||
| 2417 | */ | ||
| 2418 | goto out; | 2334 | goto out; |
| 2419 | } | 2335 | } |
| 2336 | |||
| 2337 | /* | ||
| 2338 | * Accumulated enough dirty pages? This doesn't apply | ||
| 2339 | * to WB_SYNC_ALL mode. For integrity sync we have to | ||
| 2340 | * keep going because someone may be concurrently | ||
| 2341 | * dirtying pages, and we might have synced a lot of | ||
| 2342 | * newly appeared dirty pages, but have not synced all | ||
| 2343 | * of the old dirty pages. | ||
| 2344 | */ | ||
| 2345 | if (mpd->wbc->sync_mode == WB_SYNC_NONE && | ||
| 2346 | mpd->next_page - mpd->first_page >= | ||
| 2347 | mpd->wbc->nr_to_write) | ||
| 2348 | goto out; | ||
| 2420 | } | 2349 | } |
| 2421 | pagevec_release(&pvec); | 2350 | pagevec_release(&pvec); |
| 2422 | cond_resched(); | 2351 | cond_resched(); |
| 2423 | } | 2352 | } |
| 2424 | return 0; | 2353 | return 0; |
| 2425 | ret_extent_tail: | ||
| 2426 | ret = MPAGE_DA_EXTENT_TAIL; | ||
| 2427 | out: | 2354 | out: |
| 2428 | pagevec_release(&pvec); | 2355 | pagevec_release(&pvec); |
| 2429 | cond_resched(); | 2356 | return err; |
| 2430 | return ret; | ||
| 2431 | } | 2357 | } |
| 2432 | 2358 | ||
| 2359 | static int __writepage(struct page *page, struct writeback_control *wbc, | ||
| 2360 | void *data) | ||
| 2361 | { | ||
| 2362 | struct address_space *mapping = data; | ||
| 2363 | int ret = ext4_writepage(page, wbc); | ||
| 2364 | mapping_set_error(mapping, ret); | ||
| 2365 | return ret; | ||
| 2366 | } | ||
| 2433 | 2367 | ||
| 2434 | static int ext4_da_writepages(struct address_space *mapping, | 2368 | static int ext4_writepages(struct address_space *mapping, |
| 2435 | struct writeback_control *wbc) | 2369 | struct writeback_control *wbc) |
| 2436 | { | 2370 | { |
| 2437 | pgoff_t index; | 2371 | pgoff_t writeback_index = 0; |
| 2372 | long nr_to_write = wbc->nr_to_write; | ||
| 2438 | int range_whole = 0; | 2373 | int range_whole = 0; |
| 2374 | int cycled = 1; | ||
| 2439 | handle_t *handle = NULL; | 2375 | handle_t *handle = NULL; |
| 2440 | struct mpage_da_data mpd; | 2376 | struct mpage_da_data mpd; |
| 2441 | struct inode *inode = mapping->host; | 2377 | struct inode *inode = mapping->host; |
| 2442 | int pages_written = 0; | 2378 | int needed_blocks, rsv_blocks = 0, ret = 0; |
| 2443 | unsigned int max_pages; | ||
| 2444 | int range_cyclic, cycled = 1, io_done = 0; | ||
| 2445 | int needed_blocks, ret = 0; | ||
| 2446 | long desired_nr_to_write, nr_to_writebump = 0; | ||
| 2447 | loff_t range_start = wbc->range_start; | ||
| 2448 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2379 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
| 2449 | pgoff_t done_index = 0; | 2380 | bool done; |
| 2450 | pgoff_t end; | ||
| 2451 | struct blk_plug plug; | 2381 | struct blk_plug plug; |
| 2382 | bool give_up_on_write = false; | ||
| 2452 | 2383 | ||
| 2453 | trace_ext4_da_writepages(inode, wbc); | 2384 | trace_ext4_writepages(inode, wbc); |
| 2454 | 2385 | ||
| 2455 | /* | 2386 | /* |
| 2456 | * No pages to write? This is mainly a kludge to avoid starting | 2387 | * No pages to write? This is mainly a kludge to avoid starting |
| @@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
| 2460 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2391 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
| 2461 | return 0; | 2392 | return 0; |
| 2462 | 2393 | ||
| 2394 | if (ext4_should_journal_data(inode)) { | ||
| 2395 | struct blk_plug plug; | ||
| 2396 | int ret; | ||
| 2397 | |||
| 2398 | blk_start_plug(&plug); | ||
| 2399 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | ||
| 2400 | blk_finish_plug(&plug); | ||
| 2401 | return ret; | ||
| 2402 | } | ||
| 2403 | |||
| 2463 | /* | 2404 | /* |
| 2464 | * If the filesystem has aborted, it is read-only, so return | 2405 | * If the filesystem has aborted, it is read-only, so return |
| 2465 | * right away instead of dumping stack traces later on that | 2406 | * right away instead of dumping stack traces later on that |
| 2466 | * will obscure the real source of the problem. We test | 2407 | * will obscure the real source of the problem. We test |
| 2467 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because | 2408 | * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because |
| 2468 | * the latter could be true if the filesystem is mounted | 2409 | * the latter could be true if the filesystem is mounted |
| 2469 | * read-only, and in that case, ext4_da_writepages should | 2410 | * read-only, and in that case, ext4_writepages should |
| 2470 | * *never* be called, so if that ever happens, we would want | 2411 | * *never* be called, so if that ever happens, we would want |
| 2471 | * the stack trace. | 2412 | * the stack trace. |
| 2472 | */ | 2413 | */ |
| 2473 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2414 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
| 2474 | return -EROFS; | 2415 | return -EROFS; |
| 2475 | 2416 | ||
| 2417 | if (ext4_should_dioread_nolock(inode)) { | ||
| 2418 | /* | ||
| 2419 | * We may need to convert upto one extent per block in | ||
| 2420 | * the page and we may dirty the inode. | ||
| 2421 | */ | ||
| 2422 | rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); | ||
| 2423 | } | ||
| 2424 | |||
| 2425 | /* | ||
| 2426 | * If we have inline data and arrive here, it means that | ||
| 2427 | * we will soon create the block for the 1st page, so | ||
| 2428 | * we'd better clear the inline data here. | ||
| 2429 | */ | ||
| 2430 | if (ext4_has_inline_data(inode)) { | ||
| 2431 | /* Just inode will be modified... */ | ||
| 2432 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); | ||
| 2433 | if (IS_ERR(handle)) { | ||
| 2434 | ret = PTR_ERR(handle); | ||
| 2435 | goto out_writepages; | ||
| 2436 | } | ||
| 2437 | BUG_ON(ext4_test_inode_state(inode, | ||
| 2438 | EXT4_STATE_MAY_INLINE_DATA)); | ||
| 2439 | ext4_destroy_inline_data(handle, inode); | ||
| 2440 | ext4_journal_stop(handle); | ||
| 2441 | } | ||
| 2442 | |||
| 2476 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2443 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
| 2477 | range_whole = 1; | 2444 | range_whole = 1; |
| 2478 | 2445 | ||
| 2479 | range_cyclic = wbc->range_cyclic; | ||
| 2480 | if (wbc->range_cyclic) { | 2446 | if (wbc->range_cyclic) { |
| 2481 | index = mapping->writeback_index; | 2447 | writeback_index = mapping->writeback_index; |
| 2482 | if (index) | 2448 | if (writeback_index) |
| 2483 | cycled = 0; | 2449 | cycled = 0; |
| 2484 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2450 | mpd.first_page = writeback_index; |
| 2485 | wbc->range_end = LLONG_MAX; | 2451 | mpd.last_page = -1; |
| 2486 | wbc->range_cyclic = 0; | ||
| 2487 | end = -1; | ||
| 2488 | } else { | 2452 | } else { |
| 2489 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2453 | mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; |
| 2490 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 2454 | mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; |
| 2491 | } | ||
| 2492 | |||
| 2493 | /* | ||
| 2494 | * This works around two forms of stupidity. The first is in | ||
| 2495 | * the writeback code, which caps the maximum number of pages | ||
| 2496 | * written to be 1024 pages. This is wrong on multiple | ||
| 2497 | * levels; different architectues have a different page size, | ||
| 2498 | * which changes the maximum amount of data which gets | ||
| 2499 | * written. Secondly, 4 megabytes is way too small. XFS | ||
| 2500 | * forces this value to be 16 megabytes by multiplying | ||
| 2501 | * nr_to_write parameter by four, and then relies on its | ||
| 2502 | * allocator to allocate larger extents to make them | ||
| 2503 | * contiguous. Unfortunately this brings us to the second | ||
| 2504 | * stupidity, which is that ext4's mballoc code only allocates | ||
| 2505 | * at most 2048 blocks. So we force contiguous writes up to | ||
| 2506 | * the number of dirty blocks in the inode, or | ||
| 2507 | * sbi->max_writeback_mb_bump whichever is smaller. | ||
| 2508 | */ | ||
| 2509 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
| 2510 | if (!range_cyclic && range_whole) { | ||
| 2511 | if (wbc->nr_to_write == LONG_MAX) | ||
| 2512 | desired_nr_to_write = wbc->nr_to_write; | ||
| 2513 | else | ||
| 2514 | desired_nr_to_write = wbc->nr_to_write * 8; | ||
| 2515 | } else | ||
| 2516 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
| 2517 | max_pages); | ||
| 2518 | if (desired_nr_to_write > max_pages) | ||
| 2519 | desired_nr_to_write = max_pages; | ||
| 2520 | |||
| 2521 | if (wbc->nr_to_write < desired_nr_to_write) { | ||
| 2522 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
| 2523 | wbc->nr_to_write = desired_nr_to_write; | ||
| 2524 | } | 2455 | } |
| 2525 | 2456 | ||
| 2457 | mpd.inode = inode; | ||
| 2458 | mpd.wbc = wbc; | ||
| 2459 | ext4_io_submit_init(&mpd.io_submit, wbc); | ||
| 2526 | retry: | 2460 | retry: |
| 2527 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2461 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
| 2528 | tag_pages_for_writeback(mapping, index, end); | 2462 | tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); |
| 2529 | 2463 | done = false; | |
| 2530 | blk_start_plug(&plug); | 2464 | blk_start_plug(&plug); |
| 2531 | while (!ret && wbc->nr_to_write > 0) { | 2465 | while (!done && mpd.first_page <= mpd.last_page) { |
| 2466 | /* For each extent of pages we use new io_end */ | ||
| 2467 | mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); | ||
| 2468 | if (!mpd.io_submit.io_end) { | ||
| 2469 | ret = -ENOMEM; | ||
| 2470 | break; | ||
| 2471 | } | ||
| 2532 | 2472 | ||
| 2533 | /* | 2473 | /* |
| 2534 | * we insert one extent at a time. So we need | 2474 | * We have two constraints: We find one extent to map and we |
| 2535 | * credit needed for single extent allocation. | 2475 | * must always write out whole page (makes a difference when |
| 2536 | * journalled mode is currently not supported | 2476 | * blocksize < pagesize) so that we don't block on IO when we |
| 2537 | * by delalloc | 2477 | * try to write out the rest of the page. Journalled mode is |
| 2478 | * not supported by delalloc. | ||
| 2538 | */ | 2479 | */ |
| 2539 | BUG_ON(ext4_should_journal_data(inode)); | 2480 | BUG_ON(ext4_should_journal_data(inode)); |
| 2540 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | 2481 | needed_blocks = ext4_da_writepages_trans_blocks(inode); |
| 2541 | 2482 | ||
| 2542 | /* start a new transaction*/ | 2483 | /* start a new transaction */ |
| 2543 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, | 2484 | handle = ext4_journal_start_with_reserve(inode, |
| 2544 | needed_blocks); | 2485 | EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); |
| 2545 | if (IS_ERR(handle)) { | 2486 | if (IS_ERR(handle)) { |
| 2546 | ret = PTR_ERR(handle); | 2487 | ret = PTR_ERR(handle); |
| 2547 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2488 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
| 2548 | "%ld pages, ino %lu; err %d", __func__, | 2489 | "%ld pages, ino %lu; err %d", __func__, |
| 2549 | wbc->nr_to_write, inode->i_ino, ret); | 2490 | wbc->nr_to_write, inode->i_ino, ret); |
| 2550 | blk_finish_plug(&plug); | 2491 | /* Release allocated io_end */ |
| 2551 | goto out_writepages; | 2492 | ext4_put_io_end(mpd.io_submit.io_end); |
| 2493 | break; | ||
| 2552 | } | 2494 | } |
| 2553 | 2495 | ||
| 2554 | /* | 2496 | trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); |
| 2555 | * Now call write_cache_pages_da() to find the next | 2497 | ret = mpage_prepare_extent_to_map(&mpd); |
| 2556 | * contiguous region of logical blocks that need | 2498 | if (!ret) { |
| 2557 | * blocks to be allocated by ext4 and submit them. | 2499 | if (mpd.map.m_len) |
| 2558 | */ | 2500 | ret = mpage_map_and_submit_extent(handle, &mpd, |
| 2559 | ret = write_cache_pages_da(handle, mapping, | 2501 | &give_up_on_write); |
| 2560 | wbc, &mpd, &done_index); | 2502 | else { |
| 2561 | /* | 2503 | /* |
| 2562 | * If we have a contiguous extent of pages and we | 2504 | * We scanned the whole range (or exhausted |
| 2563 | * haven't done the I/O yet, map the blocks and submit | 2505 | * nr_to_write), submitted what was mapped and |
| 2564 | * them for I/O. | 2506 | * didn't find anything needing mapping. We are |
| 2565 | */ | 2507 | * done. |
| 2566 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { | 2508 | */ |
| 2567 | mpage_da_map_and_submit(&mpd); | 2509 | done = true; |
| 2568 | ret = MPAGE_DA_EXTENT_TAIL; | 2510 | } |
| 2569 | } | 2511 | } |
| 2570 | trace_ext4_da_write_pages(inode, &mpd); | ||
| 2571 | wbc->nr_to_write -= mpd.pages_written; | ||
| 2572 | |||
| 2573 | ext4_journal_stop(handle); | 2512 | ext4_journal_stop(handle); |
| 2574 | 2513 | /* Submit prepared bio */ | |
| 2575 | if ((mpd.retval == -ENOSPC) && sbi->s_journal) { | 2514 | ext4_io_submit(&mpd.io_submit); |
| 2576 | /* commit the transaction which would | 2515 | /* Unlock pages we didn't use */ |
| 2516 | mpage_release_unused_pages(&mpd, give_up_on_write); | ||
| 2517 | /* Drop our io_end reference we got from init */ | ||
| 2518 | ext4_put_io_end(mpd.io_submit.io_end); | ||
| 2519 | |||
| 2520 | if (ret == -ENOSPC && sbi->s_journal) { | ||
| 2521 | /* | ||
| 2522 | * Commit the transaction which would | ||
| 2577 | * free blocks released in the transaction | 2523 | * free blocks released in the transaction |
| 2578 | * and try again | 2524 | * and try again |
| 2579 | */ | 2525 | */ |
| 2580 | jbd2_journal_force_commit_nested(sbi->s_journal); | 2526 | jbd2_journal_force_commit_nested(sbi->s_journal); |
| 2581 | ret = 0; | 2527 | ret = 0; |
| 2582 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 2528 | continue; |
| 2583 | /* | 2529 | } |
| 2584 | * Got one extent now try with rest of the pages. | 2530 | /* Fatal error - ENOMEM, EIO... */ |
| 2585 | * If mpd.retval is set -EIO, journal is aborted. | 2531 | if (ret) |
| 2586 | * So we don't need to write any more. | ||
| 2587 | */ | ||
| 2588 | pages_written += mpd.pages_written; | ||
| 2589 | ret = mpd.retval; | ||
| 2590 | io_done = 1; | ||
| 2591 | } else if (wbc->nr_to_write) | ||
| 2592 | /* | ||
| 2593 | * There is no more writeout needed | ||
| 2594 | * or we requested for a noblocking writeout | ||
| 2595 | * and we found the device congested | ||
| 2596 | */ | ||
| 2597 | break; | 2532 | break; |
| 2598 | } | 2533 | } |
| 2599 | blk_finish_plug(&plug); | 2534 | blk_finish_plug(&plug); |
| 2600 | if (!io_done && !cycled) { | 2535 | if (!ret && !cycled) { |
| 2601 | cycled = 1; | 2536 | cycled = 1; |
| 2602 | index = 0; | 2537 | mpd.last_page = writeback_index - 1; |
| 2603 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2538 | mpd.first_page = 0; |
| 2604 | wbc->range_end = mapping->writeback_index - 1; | ||
| 2605 | goto retry; | 2539 | goto retry; |
| 2606 | } | 2540 | } |
| 2607 | 2541 | ||
| 2608 | /* Update index */ | 2542 | /* Update index */ |
| 2609 | wbc->range_cyclic = range_cyclic; | ||
| 2610 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 2543 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
| 2611 | /* | 2544 | /* |
| 2612 | * set the writeback_index so that range_cyclic | 2545 | * Set the writeback_index so that range_cyclic |
| 2613 | * mode will write it back later | 2546 | * mode will write it back later |
| 2614 | */ | 2547 | */ |
| 2615 | mapping->writeback_index = done_index; | 2548 | mapping->writeback_index = mpd.first_page; |
| 2616 | 2549 | ||
| 2617 | out_writepages: | 2550 | out_writepages: |
| 2618 | wbc->nr_to_write -= nr_to_writebump; | 2551 | trace_ext4_writepages_result(inode, wbc, ret, |
| 2619 | wbc->range_start = range_start; | 2552 | nr_to_write - wbc->nr_to_write); |
| 2620 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | ||
| 2621 | return ret; | 2553 | return ret; |
| 2622 | } | 2554 | } |
| 2623 | 2555 | ||
| @@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file, | |||
| 2829 | return ret ? ret : copied; | 2761 | return ret ? ret : copied; |
| 2830 | } | 2762 | } |
| 2831 | 2763 | ||
| 2832 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | 2764 | static void ext4_da_invalidatepage(struct page *page, unsigned int offset, |
| 2765 | unsigned int length) | ||
| 2833 | { | 2766 | { |
| 2834 | /* | 2767 | /* |
| 2835 | * Drop reserved blocks | 2768 | * Drop reserved blocks |
| @@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | |||
| 2838 | if (!page_has_buffers(page)) | 2771 | if (!page_has_buffers(page)) |
| 2839 | goto out; | 2772 | goto out; |
| 2840 | 2773 | ||
| 2841 | ext4_da_page_release_reservation(page, offset); | 2774 | ext4_da_page_release_reservation(page, offset, length); |
| 2842 | 2775 | ||
| 2843 | out: | 2776 | out: |
| 2844 | ext4_invalidatepage(page, offset); | 2777 | ext4_invalidatepage(page, offset, length); |
| 2845 | 2778 | ||
| 2846 | return; | 2779 | return; |
| 2847 | } | 2780 | } |
| @@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode) | |||
| 2864 | * laptop_mode, not even desirable). However, to do otherwise | 2797 | * laptop_mode, not even desirable). However, to do otherwise |
| 2865 | * would require replicating code paths in: | 2798 | * would require replicating code paths in: |
| 2866 | * | 2799 | * |
| 2867 | * ext4_da_writepages() -> | 2800 | * ext4_writepages() -> |
| 2868 | * write_cache_pages() ---> (via passed in callback function) | 2801 | * write_cache_pages() ---> (via passed in callback function) |
| 2869 | * __mpage_da_writepage() --> | 2802 | * __mpage_da_writepage() --> |
| 2870 | * mpage_add_bh_to_extent() | 2803 | * mpage_add_bh_to_extent() |
| @@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, | |||
| 2989 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 2922 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
| 2990 | } | 2923 | } |
| 2991 | 2924 | ||
| 2992 | static void ext4_invalidatepage(struct page *page, unsigned long offset) | 2925 | static void ext4_invalidatepage(struct page *page, unsigned int offset, |
| 2926 | unsigned int length) | ||
| 2993 | { | 2927 | { |
| 2994 | trace_ext4_invalidatepage(page, offset); | 2928 | trace_ext4_invalidatepage(page, offset, length); |
| 2995 | 2929 | ||
| 2996 | /* No journalling happens on data buffers when this function is used */ | 2930 | /* No journalling happens on data buffers when this function is used */ |
| 2997 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); | 2931 | WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); |
| 2998 | 2932 | ||
| 2999 | block_invalidatepage(page, offset); | 2933 | block_invalidatepage(page, offset, length); |
| 3000 | } | 2934 | } |
| 3001 | 2935 | ||
| 3002 | static int __ext4_journalled_invalidatepage(struct page *page, | 2936 | static int __ext4_journalled_invalidatepage(struct page *page, |
| 3003 | unsigned long offset) | 2937 | unsigned int offset, |
| 2938 | unsigned int length) | ||
| 3004 | { | 2939 | { |
| 3005 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 2940 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
| 3006 | 2941 | ||
| 3007 | trace_ext4_journalled_invalidatepage(page, offset); | 2942 | trace_ext4_journalled_invalidatepage(page, offset, length); |
| 3008 | 2943 | ||
| 3009 | /* | 2944 | /* |
| 3010 | * If it's a full truncate we just forget about the pending dirtying | 2945 | * If it's a full truncate we just forget about the pending dirtying |
| 3011 | */ | 2946 | */ |
| 3012 | if (offset == 0) | 2947 | if (offset == 0 && length == PAGE_CACHE_SIZE) |
| 3013 | ClearPageChecked(page); | 2948 | ClearPageChecked(page); |
| 3014 | 2949 | ||
| 3015 | return jbd2_journal_invalidatepage(journal, page, offset); | 2950 | return jbd2_journal_invalidatepage(journal, page, offset, length); |
| 3016 | } | 2951 | } |
| 3017 | 2952 | ||
| 3018 | /* Wrapper for aops... */ | 2953 | /* Wrapper for aops... */ |
| 3019 | static void ext4_journalled_invalidatepage(struct page *page, | 2954 | static void ext4_journalled_invalidatepage(struct page *page, |
| 3020 | unsigned long offset) | 2955 | unsigned int offset, |
| 2956 | unsigned int length) | ||
| 3021 | { | 2957 | { |
| 3022 | WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); | 2958 | WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); |
| 3023 | } | 2959 | } |
| 3024 | 2960 | ||
| 3025 | static int ext4_releasepage(struct page *page, gfp_t wait) | 2961 | static int ext4_releasepage(struct page *page, gfp_t wait) |
| @@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3067 | struct inode *inode = file_inode(iocb->ki_filp); | 3003 | struct inode *inode = file_inode(iocb->ki_filp); |
| 3068 | ext4_io_end_t *io_end = iocb->private; | 3004 | ext4_io_end_t *io_end = iocb->private; |
| 3069 | 3005 | ||
| 3070 | /* if not async direct IO or dio with 0 bytes write, just return */ | 3006 | /* if not async direct IO just return */ |
| 3071 | if (!io_end || !size) | 3007 | if (!io_end) { |
| 3072 | goto out; | 3008 | inode_dio_done(inode); |
| 3009 | if (is_async) | ||
| 3010 | aio_complete(iocb, ret, 0); | ||
| 3011 | return; | ||
| 3012 | } | ||
| 3073 | 3013 | ||
| 3074 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3014 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
| 3075 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3015 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
| @@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3077 | size); | 3017 | size); |
| 3078 | 3018 | ||
| 3079 | iocb->private = NULL; | 3019 | iocb->private = NULL; |
| 3080 | |||
| 3081 | /* if not aio dio with unwritten extents, just free io and return */ | ||
| 3082 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
| 3083 | ext4_free_io_end(io_end); | ||
| 3084 | out: | ||
| 3085 | inode_dio_done(inode); | ||
| 3086 | if (is_async) | ||
| 3087 | aio_complete(iocb, ret, 0); | ||
| 3088 | return; | ||
| 3089 | } | ||
| 3090 | |||
| 3091 | io_end->offset = offset; | 3020 | io_end->offset = offset; |
| 3092 | io_end->size = size; | 3021 | io_end->size = size; |
| 3093 | if (is_async) { | 3022 | if (is_async) { |
| 3094 | io_end->iocb = iocb; | 3023 | io_end->iocb = iocb; |
| 3095 | io_end->result = ret; | 3024 | io_end->result = ret; |
| 3096 | } | 3025 | } |
| 3097 | 3026 | ext4_put_io_end_defer(io_end); | |
| 3098 | ext4_add_complete_io(io_end); | ||
| 3099 | } | 3027 | } |
| 3100 | 3028 | ||
| 3101 | /* | 3029 | /* |
| @@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3129 | get_block_t *get_block_func = NULL; | 3057 | get_block_t *get_block_func = NULL; |
| 3130 | int dio_flags = 0; | 3058 | int dio_flags = 0; |
| 3131 | loff_t final_size = offset + count; | 3059 | loff_t final_size = offset + count; |
| 3060 | ext4_io_end_t *io_end = NULL; | ||
| 3132 | 3061 | ||
| 3133 | /* Use the old path for reads and writes beyond i_size. */ | 3062 | /* Use the old path for reads and writes beyond i_size. */ |
| 3134 | if (rw != WRITE || final_size > inode->i_size) | 3063 | if (rw != WRITE || final_size > inode->i_size) |
| @@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3136 | 3065 | ||
| 3137 | BUG_ON(iocb->private == NULL); | 3066 | BUG_ON(iocb->private == NULL); |
| 3138 | 3067 | ||
| 3068 | /* | ||
| 3069 | * Make all waiters for direct IO properly wait also for extent | ||
| 3070 | * conversion. This also disallows race between truncate() and | ||
| 3071 | * overwrite DIO as i_dio_count needs to be incremented under i_mutex. | ||
| 3072 | */ | ||
| 3073 | if (rw == WRITE) | ||
| 3074 | atomic_inc(&inode->i_dio_count); | ||
| 3075 | |||
| 3139 | /* If we do a overwrite dio, i_mutex locking can be released */ | 3076 | /* If we do a overwrite dio, i_mutex locking can be released */ |
| 3140 | overwrite = *((int *)iocb->private); | 3077 | overwrite = *((int *)iocb->private); |
| 3141 | 3078 | ||
| 3142 | if (overwrite) { | 3079 | if (overwrite) { |
| 3143 | atomic_inc(&inode->i_dio_count); | ||
| 3144 | down_read(&EXT4_I(inode)->i_data_sem); | 3080 | down_read(&EXT4_I(inode)->i_data_sem); |
| 3145 | mutex_unlock(&inode->i_mutex); | 3081 | mutex_unlock(&inode->i_mutex); |
| 3146 | } | 3082 | } |
| @@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3167 | iocb->private = NULL; | 3103 | iocb->private = NULL; |
| 3168 | ext4_inode_aio_set(inode, NULL); | 3104 | ext4_inode_aio_set(inode, NULL); |
| 3169 | if (!is_sync_kiocb(iocb)) { | 3105 | if (!is_sync_kiocb(iocb)) { |
| 3170 | ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); | 3106 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
| 3171 | if (!io_end) { | 3107 | if (!io_end) { |
| 3172 | ret = -ENOMEM; | 3108 | ret = -ENOMEM; |
| 3173 | goto retake_lock; | 3109 | goto retake_lock; |
| 3174 | } | 3110 | } |
| 3175 | io_end->flag |= EXT4_IO_END_DIRECT; | 3111 | io_end->flag |= EXT4_IO_END_DIRECT; |
| 3176 | iocb->private = io_end; | 3112 | /* |
| 3113 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | ||
| 3114 | */ | ||
| 3115 | iocb->private = ext4_get_io_end(io_end); | ||
| 3177 | /* | 3116 | /* |
| 3178 | * we save the io structure for current async direct | 3117 | * we save the io structure for current async direct |
| 3179 | * IO, so that later ext4_map_blocks() could flag the | 3118 | * IO, so that later ext4_map_blocks() could flag the |
| @@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3197 | NULL, | 3136 | NULL, |
| 3198 | dio_flags); | 3137 | dio_flags); |
| 3199 | 3138 | ||
| 3200 | if (iocb->private) | ||
| 3201 | ext4_inode_aio_set(inode, NULL); | ||
| 3202 | /* | 3139 | /* |
| 3203 | * The io_end structure takes a reference to the inode, that | 3140 | * Put our reference to io_end. This can free the io_end structure e.g. |
| 3204 | * structure needs to be destroyed and the reference to the | 3141 | * in sync IO case or in case of error. It can even perform extent |
| 3205 | * inode need to be dropped, when IO is complete, even with 0 | 3142 | * conversion if all bios we submitted finished before we got here. |
| 3206 | * byte write, or failed. | 3143 | * Note that in that case iocb->private can be already set to NULL |
| 3207 | * | 3144 | * here. |
| 3208 | * In the successful AIO DIO case, the io_end structure will | ||
| 3209 | * be destroyed and the reference to the inode will be dropped | ||
| 3210 | * after the end_io call back function is called. | ||
| 3211 | * | ||
| 3212 | * In the case there is 0 byte write, or error case, since VFS | ||
| 3213 | * direct IO won't invoke the end_io call back function, we | ||
| 3214 | * need to free the end_io structure here. | ||
| 3215 | */ | 3145 | */ |
| 3216 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3146 | if (io_end) { |
| 3217 | ext4_free_io_end(iocb->private); | 3147 | ext4_inode_aio_set(inode, NULL); |
| 3218 | iocb->private = NULL; | 3148 | ext4_put_io_end(io_end); |
| 3219 | } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3149 | /* |
| 3150 | * When no IO was submitted ext4_end_io_dio() was not | ||
| 3151 | * called so we have to put iocb's reference. | ||
| 3152 | */ | ||
| 3153 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { | ||
| 3154 | WARN_ON(iocb->private != io_end); | ||
| 3155 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
| 3156 | WARN_ON(io_end->iocb); | ||
| 3157 | /* | ||
| 3158 | * Generic code already did inode_dio_done() so we | ||
| 3159 | * have to clear EXT4_IO_END_DIRECT to not do it for | ||
| 3160 | * the second time. | ||
| 3161 | */ | ||
| 3162 | io_end->flag = 0; | ||
| 3163 | ext4_put_io_end(io_end); | ||
| 3164 | iocb->private = NULL; | ||
| 3165 | } | ||
| 3166 | } | ||
| 3167 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||
| 3220 | EXT4_STATE_DIO_UNWRITTEN)) { | 3168 | EXT4_STATE_DIO_UNWRITTEN)) { |
| 3221 | int err; | 3169 | int err; |
| 3222 | /* | 3170 | /* |
| 3223 | * for non AIO case, since the IO is already | 3171 | * for non AIO case, since the IO is already |
| 3224 | * completed, we could do the conversion right here | 3172 | * completed, we could do the conversion right here |
| 3225 | */ | 3173 | */ |
| 3226 | err = ext4_convert_unwritten_extents(inode, | 3174 | err = ext4_convert_unwritten_extents(NULL, inode, |
| 3227 | offset, ret); | 3175 | offset, ret); |
| 3228 | if (err < 0) | 3176 | if (err < 0) |
| 3229 | ret = err; | 3177 | ret = err; |
| @@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3231 | } | 3179 | } |
| 3232 | 3180 | ||
| 3233 | retake_lock: | 3181 | retake_lock: |
| 3182 | if (rw == WRITE) | ||
| 3183 | inode_dio_done(inode); | ||
| 3234 | /* take i_mutex locking again if we do a ovewrite dio */ | 3184 | /* take i_mutex locking again if we do a ovewrite dio */ |
| 3235 | if (overwrite) { | 3185 | if (overwrite) { |
| 3236 | inode_dio_done(inode); | ||
| 3237 | up_read(&EXT4_I(inode)->i_data_sem); | 3186 | up_read(&EXT4_I(inode)->i_data_sem); |
| 3238 | mutex_lock(&inode->i_mutex); | 3187 | mutex_lock(&inode->i_mutex); |
| 3239 | } | 3188 | } |
| @@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = { | |||
| 3292 | .readpage = ext4_readpage, | 3241 | .readpage = ext4_readpage, |
| 3293 | .readpages = ext4_readpages, | 3242 | .readpages = ext4_readpages, |
| 3294 | .writepage = ext4_writepage, | 3243 | .writepage = ext4_writepage, |
| 3244 | .writepages = ext4_writepages, | ||
| 3295 | .write_begin = ext4_write_begin, | 3245 | .write_begin = ext4_write_begin, |
| 3296 | .write_end = ext4_write_end, | 3246 | .write_end = ext4_write_end, |
| 3297 | .bmap = ext4_bmap, | 3247 | .bmap = ext4_bmap, |
| @@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
| 3307 | .readpage = ext4_readpage, | 3257 | .readpage = ext4_readpage, |
| 3308 | .readpages = ext4_readpages, | 3258 | .readpages = ext4_readpages, |
| 3309 | .writepage = ext4_writepage, | 3259 | .writepage = ext4_writepage, |
| 3260 | .writepages = ext4_writepages, | ||
| 3310 | .write_begin = ext4_write_begin, | 3261 | .write_begin = ext4_write_begin, |
| 3311 | .write_end = ext4_journalled_write_end, | 3262 | .write_end = ext4_journalled_write_end, |
| 3312 | .set_page_dirty = ext4_journalled_set_page_dirty, | 3263 | .set_page_dirty = ext4_journalled_set_page_dirty, |
| @@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = { | |||
| 3322 | .readpage = ext4_readpage, | 3273 | .readpage = ext4_readpage, |
| 3323 | .readpages = ext4_readpages, | 3274 | .readpages = ext4_readpages, |
| 3324 | .writepage = ext4_writepage, | 3275 | .writepage = ext4_writepage, |
| 3325 | .writepages = ext4_da_writepages, | 3276 | .writepages = ext4_writepages, |
| 3326 | .write_begin = ext4_da_write_begin, | 3277 | .write_begin = ext4_da_write_begin, |
| 3327 | .write_end = ext4_da_write_end, | 3278 | .write_end = ext4_da_write_end, |
| 3328 | .bmap = ext4_bmap, | 3279 | .bmap = ext4_bmap, |
| @@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode) | |||
| 3355 | inode->i_mapping->a_ops = &ext4_aops; | 3306 | inode->i_mapping->a_ops = &ext4_aops; |
| 3356 | } | 3307 | } |
| 3357 | 3308 | ||
| 3358 | |||
| 3359 | /* | 3309 | /* |
| 3360 | * ext4_discard_partial_page_buffers() | 3310 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' |
| 3361 | * Wrapper function for ext4_discard_partial_page_buffers_no_lock. | 3311 | * up to the end of the block which corresponds to `from'. |
| 3362 | * This function finds and locks the page containing the offset | 3312 | * This required during truncate. We need to physically zero the tail end |
| 3363 | * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. | 3313 | * of that block so it doesn't yield old data if the file is later grown. |
| 3364 | * Calling functions that already have the page locked should call | ||
| 3365 | * ext4_discard_partial_page_buffers_no_lock directly. | ||
| 3366 | */ | 3314 | */ |
| 3367 | int ext4_discard_partial_page_buffers(handle_t *handle, | 3315 | int ext4_block_truncate_page(handle_t *handle, |
| 3368 | struct address_space *mapping, loff_t from, | 3316 | struct address_space *mapping, loff_t from) |
| 3369 | loff_t length, int flags) | ||
| 3370 | { | 3317 | { |
| 3318 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
| 3319 | unsigned length; | ||
| 3320 | unsigned blocksize; | ||
| 3371 | struct inode *inode = mapping->host; | 3321 | struct inode *inode = mapping->host; |
| 3372 | struct page *page; | ||
| 3373 | int err = 0; | ||
| 3374 | 3322 | ||
| 3375 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | 3323 | blocksize = inode->i_sb->s_blocksize; |
| 3376 | mapping_gfp_mask(mapping) & ~__GFP_FS); | 3324 | length = blocksize - (offset & (blocksize - 1)); |
| 3377 | if (!page) | ||
| 3378 | return -ENOMEM; | ||
| 3379 | |||
| 3380 | err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, | ||
| 3381 | from, length, flags); | ||
| 3382 | 3325 | ||
| 3383 | unlock_page(page); | 3326 | return ext4_block_zero_page_range(handle, mapping, from, length); |
| 3384 | page_cache_release(page); | ||
| 3385 | return err; | ||
| 3386 | } | 3327 | } |
| 3387 | 3328 | ||
| 3388 | /* | 3329 | /* |
| 3389 | * ext4_discard_partial_page_buffers_no_lock() | 3330 | * ext4_block_zero_page_range() zeros out a mapping of length 'length' |
| 3390 | * Zeros a page range of length 'length' starting from offset 'from'. | 3331 | * starting from file offset 'from'. The range to be zero'd must |
| 3391 | * Buffer heads that correspond to the block aligned regions of the | 3332 | * be contained with in one block. If the specified range exceeds |
| 3392 | * zeroed range will be unmapped. Unblock aligned regions | 3333 | * the end of the block it will be shortened to end of the block |
| 3393 | * will have the corresponding buffer head mapped if needed so that | 3334 | * that cooresponds to 'from' |
| 3394 | * that region of the page can be updated with the partial zero out. | ||
| 3395 | * | ||
| 3396 | * This function assumes that the page has already been locked. The | ||
| 3397 | * The range to be discarded must be contained with in the given page. | ||
| 3398 | * If the specified range exceeds the end of the page it will be shortened | ||
| 3399 | * to the end of the page that corresponds to 'from'. This function is | ||
| 3400 | * appropriate for updating a page and it buffer heads to be unmapped and | ||
| 3401 | * zeroed for blocks that have been either released, or are going to be | ||
| 3402 | * released. | ||
| 3403 | * | ||
| 3404 | * handle: The journal handle | ||
| 3405 | * inode: The files inode | ||
| 3406 | * page: A locked page that contains the offset "from" | ||
| 3407 | * from: The starting byte offset (from the beginning of the file) | ||
| 3408 | * to begin discarding | ||
| 3409 | * len: The length of bytes to discard | ||
| 3410 | * flags: Optional flags that may be used: | ||
| 3411 | * | ||
| 3412 | * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED | ||
| 3413 | * Only zero the regions of the page whose buffer heads | ||
| 3414 | * have already been unmapped. This flag is appropriate | ||
| 3415 | * for updating the contents of a page whose blocks may | ||
| 3416 | * have already been released, and we only want to zero | ||
| 3417 | * out the regions that correspond to those released blocks. | ||
| 3418 | * | ||
| 3419 | * Returns zero on success or negative on failure. | ||
| 3420 | */ | 3335 | */ |
| 3421 | static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | 3336 | int ext4_block_zero_page_range(handle_t *handle, |
| 3422 | struct inode *inode, struct page *page, loff_t from, | 3337 | struct address_space *mapping, loff_t from, loff_t length) |
| 3423 | loff_t length, int flags) | ||
| 3424 | { | 3338 | { |
| 3425 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 3339 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
| 3426 | unsigned int offset = from & (PAGE_CACHE_SIZE-1); | 3340 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
| 3427 | unsigned int blocksize, max, pos; | 3341 | unsigned blocksize, max, pos; |
| 3428 | ext4_lblk_t iblock; | 3342 | ext4_lblk_t iblock; |
| 3343 | struct inode *inode = mapping->host; | ||
| 3429 | struct buffer_head *bh; | 3344 | struct buffer_head *bh; |
| 3345 | struct page *page; | ||
| 3430 | int err = 0; | 3346 | int err = 0; |
| 3431 | 3347 | ||
| 3432 | blocksize = inode->i_sb->s_blocksize; | 3348 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
| 3433 | max = PAGE_CACHE_SIZE - offset; | 3349 | mapping_gfp_mask(mapping) & ~__GFP_FS); |
| 3350 | if (!page) | ||
| 3351 | return -ENOMEM; | ||
| 3434 | 3352 | ||
| 3435 | if (index != page->index) | 3353 | blocksize = inode->i_sb->s_blocksize; |
| 3436 | return -EINVAL; | 3354 | max = blocksize - (offset & (blocksize - 1)); |
| 3437 | 3355 | ||
| 3438 | /* | 3356 | /* |
| 3439 | * correct length if it does not fall between | 3357 | * correct length if it does not fall between |
| 3440 | * 'from' and the end of the page | 3358 | * 'from' and the end of the block |
| 3441 | */ | 3359 | */ |
| 3442 | if (length > max || length < 0) | 3360 | if (length > max || length < 0) |
| 3443 | length = max; | 3361 | length = max; |
| @@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | |||
| 3455 | iblock++; | 3373 | iblock++; |
| 3456 | pos += blocksize; | 3374 | pos += blocksize; |
| 3457 | } | 3375 | } |
| 3458 | 3376 | if (buffer_freed(bh)) { | |
| 3459 | pos = offset; | 3377 | BUFFER_TRACE(bh, "freed: skip"); |
| 3460 | while (pos < offset + length) { | 3378 | goto unlock; |
| 3461 | unsigned int end_of_block, range_to_discard; | 3379 | } |
| 3462 | 3380 | if (!buffer_mapped(bh)) { | |
| 3463 | err = 0; | 3381 | BUFFER_TRACE(bh, "unmapped"); |
| 3464 | 3382 | ext4_get_block(inode, iblock, bh, 0); | |
| 3465 | /* The length of space left to zero and unmap */ | 3383 | /* unmapped? It's a hole - nothing to do */ |
| 3466 | range_to_discard = offset + length - pos; | ||
| 3467 | |||
| 3468 | /* The length of space until the end of the block */ | ||
| 3469 | end_of_block = blocksize - (pos & (blocksize-1)); | ||
| 3470 | |||
| 3471 | /* | ||
| 3472 | * Do not unmap or zero past end of block | ||
| 3473 | * for this buffer head | ||
| 3474 | */ | ||
| 3475 | if (range_to_discard > end_of_block) | ||
| 3476 | range_to_discard = end_of_block; | ||
| 3477 | |||
| 3478 | |||
| 3479 | /* | ||
| 3480 | * Skip this buffer head if we are only zeroing unampped | ||
| 3481 | * regions of the page | ||
| 3482 | */ | ||
| 3483 | if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && | ||
| 3484 | buffer_mapped(bh)) | ||
| 3485 | goto next; | ||
| 3486 | |||
| 3487 | /* If the range is block aligned, unmap */ | ||
| 3488 | if (range_to_discard == blocksize) { | ||
| 3489 | clear_buffer_dirty(bh); | ||
| 3490 | bh->b_bdev = NULL; | ||
| 3491 | clear_buffer_mapped(bh); | ||
| 3492 | clear_buffer_req(bh); | ||
| 3493 | clear_buffer_new(bh); | ||
| 3494 | clear_buffer_delay(bh); | ||
| 3495 | clear_buffer_unwritten(bh); | ||
| 3496 | clear_buffer_uptodate(bh); | ||
| 3497 | zero_user(page, pos, range_to_discard); | ||
| 3498 | BUFFER_TRACE(bh, "Buffer discarded"); | ||
| 3499 | goto next; | ||
| 3500 | } | ||
| 3501 | |||
| 3502 | /* | ||
| 3503 | * If this block is not completely contained in the range | ||
| 3504 | * to be discarded, then it is not going to be released. Because | ||
| 3505 | * we need to keep this block, we need to make sure this part | ||
| 3506 | * of the page is uptodate before we modify it by writeing | ||
| 3507 | * partial zeros on it. | ||
| 3508 | */ | ||
| 3509 | if (!buffer_mapped(bh)) { | 3384 | if (!buffer_mapped(bh)) { |
| 3510 | /* | 3385 | BUFFER_TRACE(bh, "still unmapped"); |
| 3511 | * Buffer head must be mapped before we can read | 3386 | goto unlock; |
| 3512 | * from the block | ||
| 3513 | */ | ||
| 3514 | BUFFER_TRACE(bh, "unmapped"); | ||
| 3515 | ext4_get_block(inode, iblock, bh, 0); | ||
| 3516 | /* unmapped? It's a hole - nothing to do */ | ||
| 3517 | if (!buffer_mapped(bh)) { | ||
| 3518 | BUFFER_TRACE(bh, "still unmapped"); | ||
| 3519 | goto next; | ||
| 3520 | } | ||
| 3521 | } | 3387 | } |
| 3388 | } | ||
| 3522 | 3389 | ||
| 3523 | /* Ok, it's mapped. Make sure it's up-to-date */ | 3390 | /* Ok, it's mapped. Make sure it's up-to-date */ |
| 3524 | if (PageUptodate(page)) | 3391 | if (PageUptodate(page)) |
| 3525 | set_buffer_uptodate(bh); | 3392 | set_buffer_uptodate(bh); |
| 3526 | 3393 | ||
| 3527 | if (!buffer_uptodate(bh)) { | 3394 | if (!buffer_uptodate(bh)) { |
| 3528 | err = -EIO; | 3395 | err = -EIO; |
| 3529 | ll_rw_block(READ, 1, &bh); | 3396 | ll_rw_block(READ, 1, &bh); |
| 3530 | wait_on_buffer(bh); | 3397 | wait_on_buffer(bh); |
| 3531 | /* Uhhuh. Read error. Complain and punt.*/ | 3398 | /* Uhhuh. Read error. Complain and punt. */ |
| 3532 | if (!buffer_uptodate(bh)) | 3399 | if (!buffer_uptodate(bh)) |
| 3533 | goto next; | 3400 | goto unlock; |
| 3534 | } | 3401 | } |
| 3402 | if (ext4_should_journal_data(inode)) { | ||
| 3403 | BUFFER_TRACE(bh, "get write access"); | ||
| 3404 | err = ext4_journal_get_write_access(handle, bh); | ||
| 3405 | if (err) | ||
| 3406 | goto unlock; | ||
| 3407 | } | ||
| 3408 | zero_user(page, offset, length); | ||
| 3409 | BUFFER_TRACE(bh, "zeroed end of block"); | ||
| 3535 | 3410 | ||
| 3536 | if (ext4_should_journal_data(inode)) { | 3411 | if (ext4_should_journal_data(inode)) { |
| 3537 | BUFFER_TRACE(bh, "get write access"); | 3412 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
| 3538 | err = ext4_journal_get_write_access(handle, bh); | 3413 | } else { |
| 3539 | if (err) | 3414 | err = 0; |
| 3540 | goto next; | 3415 | mark_buffer_dirty(bh); |
| 3541 | } | 3416 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) |
| 3417 | err = ext4_jbd2_file_inode(handle, inode); | ||
| 3418 | } | ||
| 3419 | |||
| 3420 | unlock: | ||
| 3421 | unlock_page(page); | ||
| 3422 | page_cache_release(page); | ||
| 3423 | return err; | ||
| 3424 | } | ||
| 3542 | 3425 | ||
| 3543 | zero_user(page, pos, range_to_discard); | 3426 | int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
| 3427 | loff_t lstart, loff_t length) | ||
| 3428 | { | ||
| 3429 | struct super_block *sb = inode->i_sb; | ||
| 3430 | struct address_space *mapping = inode->i_mapping; | ||
| 3431 | unsigned partial_start, partial_end; | ||
| 3432 | ext4_fsblk_t start, end; | ||
| 3433 | loff_t byte_end = (lstart + length - 1); | ||
| 3434 | int err = 0; | ||
| 3544 | 3435 | ||
| 3545 | err = 0; | 3436 | partial_start = lstart & (sb->s_blocksize - 1); |
| 3546 | if (ext4_should_journal_data(inode)) { | 3437 | partial_end = byte_end & (sb->s_blocksize - 1); |
| 3547 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
| 3548 | } else | ||
| 3549 | mark_buffer_dirty(bh); | ||
| 3550 | 3438 | ||
| 3551 | BUFFER_TRACE(bh, "Partial buffer zeroed"); | 3439 | start = lstart >> sb->s_blocksize_bits; |
| 3552 | next: | 3440 | end = byte_end >> sb->s_blocksize_bits; |
| 3553 | bh = bh->b_this_page; | ||
| 3554 | iblock++; | ||
| 3555 | pos += range_to_discard; | ||
| 3556 | } | ||
| 3557 | 3441 | ||
| 3442 | /* Handle partial zero within the single block */ | ||
| 3443 | if (start == end && | ||
| 3444 | (partial_start || (partial_end != sb->s_blocksize - 1))) { | ||
| 3445 | err = ext4_block_zero_page_range(handle, mapping, | ||
| 3446 | lstart, length); | ||
| 3447 | return err; | ||
| 3448 | } | ||
| 3449 | /* Handle partial zero out on the start of the range */ | ||
| 3450 | if (partial_start) { | ||
| 3451 | err = ext4_block_zero_page_range(handle, mapping, | ||
| 3452 | lstart, sb->s_blocksize); | ||
| 3453 | if (err) | ||
| 3454 | return err; | ||
| 3455 | } | ||
| 3456 | /* Handle partial zero out on the end of the range */ | ||
| 3457 | if (partial_end != sb->s_blocksize - 1) | ||
| 3458 | err = ext4_block_zero_page_range(handle, mapping, | ||
| 3459 | byte_end - partial_end, | ||
| 3460 | partial_end + 1); | ||
| 3558 | return err; | 3461 | return err; |
| 3559 | } | 3462 | } |
| 3560 | 3463 | ||
| @@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode) | |||
| 3580 | * Returns: 0 on success or negative on failure | 3483 | * Returns: 0 on success or negative on failure |
| 3581 | */ | 3484 | */ |
| 3582 | 3485 | ||
| 3583 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | 3486 | int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) |
| 3584 | { | 3487 | { |
| 3585 | struct inode *inode = file_inode(file); | ||
| 3586 | struct super_block *sb = inode->i_sb; | 3488 | struct super_block *sb = inode->i_sb; |
| 3587 | ext4_lblk_t first_block, stop_block; | 3489 | ext4_lblk_t first_block, stop_block; |
| 3588 | struct address_space *mapping = inode->i_mapping; | 3490 | struct address_space *mapping = inode->i_mapping; |
| 3589 | loff_t first_page, last_page, page_len; | 3491 | loff_t first_block_offset, last_block_offset; |
| 3590 | loff_t first_page_offset, last_page_offset; | ||
| 3591 | handle_t *handle; | 3492 | handle_t *handle; |
| 3592 | unsigned int credits; | 3493 | unsigned int credits; |
| 3593 | int ret = 0; | 3494 | int ret = 0; |
| @@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
| 3638 | offset; | 3539 | offset; |
| 3639 | } | 3540 | } |
| 3640 | 3541 | ||
| 3641 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 3542 | first_block_offset = round_up(offset, sb->s_blocksize); |
| 3642 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | 3543 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; |
| 3643 | 3544 | ||
| 3644 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | 3545 | /* Now release the pages and zero block aligned part of pages*/ |
| 3645 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | 3546 | if (last_block_offset > first_block_offset) |
| 3646 | 3547 | truncate_pagecache_range(inode, first_block_offset, | |
| 3647 | /* Now release the pages */ | 3548 | last_block_offset); |
| 3648 | if (last_page_offset > first_page_offset) { | ||
| 3649 | truncate_pagecache_range(inode, first_page_offset, | ||
| 3650 | last_page_offset - 1); | ||
| 3651 | } | ||
| 3652 | 3549 | ||
| 3653 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | 3550 | /* Wait all existing dio workers, newcomers will block on i_mutex */ |
| 3654 | ext4_inode_block_unlocked_dio(inode); | 3551 | ext4_inode_block_unlocked_dio(inode); |
| 3655 | ret = ext4_flush_unwritten_io(inode); | ||
| 3656 | if (ret) | ||
| 3657 | goto out_dio; | ||
| 3658 | inode_dio_wait(inode); | 3552 | inode_dio_wait(inode); |
| 3659 | 3553 | ||
| 3660 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3554 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
| @@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
| 3668 | goto out_dio; | 3562 | goto out_dio; |
| 3669 | } | 3563 | } |
| 3670 | 3564 | ||
| 3671 | /* | 3565 | ret = ext4_zero_partial_blocks(handle, inode, offset, |
| 3672 | * Now we need to zero out the non-page-aligned data in the | 3566 | length); |
| 3673 | * pages at the start and tail of the hole, and unmap the | 3567 | if (ret) |
| 3674 | * buffer heads for the block aligned regions of the page that | 3568 | goto out_stop; |
| 3675 | * were completely zeroed. | ||
| 3676 | */ | ||
| 3677 | if (first_page > last_page) { | ||
| 3678 | /* | ||
| 3679 | * If the file space being truncated is contained | ||
| 3680 | * within a page just zero out and unmap the middle of | ||
| 3681 | * that page | ||
| 3682 | */ | ||
| 3683 | ret = ext4_discard_partial_page_buffers(handle, | ||
| 3684 | mapping, offset, length, 0); | ||
| 3685 | |||
| 3686 | if (ret) | ||
| 3687 | goto out_stop; | ||
| 3688 | } else { | ||
| 3689 | /* | ||
| 3690 | * zero out and unmap the partial page that contains | ||
| 3691 | * the start of the hole | ||
| 3692 | */ | ||
| 3693 | page_len = first_page_offset - offset; | ||
| 3694 | if (page_len > 0) { | ||
| 3695 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
| 3696 | offset, page_len, 0); | ||
| 3697 | if (ret) | ||
| 3698 | goto out_stop; | ||
| 3699 | } | ||
| 3700 | |||
| 3701 | /* | ||
| 3702 | * zero out and unmap the partial page that contains | ||
| 3703 | * the end of the hole | ||
| 3704 | */ | ||
| 3705 | page_len = offset + length - last_page_offset; | ||
| 3706 | if (page_len > 0) { | ||
| 3707 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
| 3708 | last_page_offset, page_len, 0); | ||
| 3709 | if (ret) | ||
| 3710 | goto out_stop; | ||
| 3711 | } | ||
| 3712 | } | ||
| 3713 | |||
| 3714 | /* | ||
| 3715 | * If i_size is contained in the last page, we need to | ||
| 3716 | * unmap and zero the partial page after i_size | ||
| 3717 | */ | ||
| 3718 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
| 3719 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
| 3720 | page_len = PAGE_CACHE_SIZE - | ||
| 3721 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
| 3722 | |||
| 3723 | if (page_len > 0) { | ||
| 3724 | ret = ext4_discard_partial_page_buffers(handle, | ||
| 3725 | mapping, inode->i_size, page_len, 0); | ||
| 3726 | |||
| 3727 | if (ret) | ||
| 3728 | goto out_stop; | ||
| 3729 | } | ||
| 3730 | } | ||
| 3731 | 3569 | ||
| 3732 | first_block = (offset + sb->s_blocksize - 1) >> | 3570 | first_block = (offset + sb->s_blocksize - 1) >> |
| 3733 | EXT4_BLOCK_SIZE_BITS(sb); | 3571 | EXT4_BLOCK_SIZE_BITS(sb); |
| @@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode) | |||
| 3803 | unsigned int credits; | 3641 | unsigned int credits; |
| 3804 | handle_t *handle; | 3642 | handle_t *handle; |
| 3805 | struct address_space *mapping = inode->i_mapping; | 3643 | struct address_space *mapping = inode->i_mapping; |
| 3806 | loff_t page_len; | ||
| 3807 | 3644 | ||
| 3808 | /* | 3645 | /* |
| 3809 | * There is a possibility that we're either freeing the inode | 3646 | * There is a possibility that we're either freeing the inode |
| @@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode) | |||
| 3830 | return; | 3667 | return; |
| 3831 | } | 3668 | } |
| 3832 | 3669 | ||
| 3833 | /* | ||
| 3834 | * finish any pending end_io work so we won't run the risk of | ||
| 3835 | * converting any truncated blocks to initialized later | ||
| 3836 | */ | ||
| 3837 | ext4_flush_unwritten_io(inode); | ||
| 3838 | |||
| 3839 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3670 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
| 3840 | credits = ext4_writepage_trans_blocks(inode); | 3671 | credits = ext4_writepage_trans_blocks(inode); |
| 3841 | else | 3672 | else |
| @@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode) | |||
| 3847 | return; | 3678 | return; |
| 3848 | } | 3679 | } |
| 3849 | 3680 | ||
| 3850 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | 3681 | if (inode->i_size & (inode->i_sb->s_blocksize - 1)) |
| 3851 | page_len = PAGE_CACHE_SIZE - | 3682 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
| 3852 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
| 3853 | |||
| 3854 | if (ext4_discard_partial_page_buffers(handle, | ||
| 3855 | mapping, inode->i_size, page_len, 0)) | ||
| 3856 | goto out_stop; | ||
| 3857 | } | ||
| 3858 | 3683 | ||
| 3859 | /* | 3684 | /* |
| 3860 | * We add the inode to the orphan list, so that if this | 3685 | * We add the inode to the orphan list, so that if this |
| @@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) | |||
| 4623 | inode->i_size >> PAGE_CACHE_SHIFT); | 4448 | inode->i_size >> PAGE_CACHE_SHIFT); |
| 4624 | if (!page) | 4449 | if (!page) |
| 4625 | return; | 4450 | return; |
| 4626 | ret = __ext4_journalled_invalidatepage(page, offset); | 4451 | ret = __ext4_journalled_invalidatepage(page, offset, |
| 4452 | PAGE_CACHE_SIZE - offset); | ||
| 4627 | unlock_page(page); | 4453 | unlock_page(page); |
| 4628 | page_cache_release(page); | 4454 | page_cache_release(page); |
| 4629 | if (ret != -EBUSY) | 4455 | if (ret != -EBUSY) |
| @@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
| 4805 | struct kstat *stat) | 4631 | struct kstat *stat) |
| 4806 | { | 4632 | { |
| 4807 | struct inode *inode; | 4633 | struct inode *inode; |
| 4808 | unsigned long delalloc_blocks; | 4634 | unsigned long long delalloc_blocks; |
| 4809 | 4635 | ||
| 4810 | inode = dentry->d_inode; | 4636 | inode = dentry->d_inode; |
| 4811 | generic_fillattr(inode, stat); | 4637 | generic_fillattr(inode, stat); |
| @@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
| 4823 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), | 4649 | delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), |
| 4824 | EXT4_I(inode)->i_reserved_data_blocks); | 4650 | EXT4_I(inode)->i_reserved_data_blocks); |
| 4825 | 4651 | ||
| 4826 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | 4652 | stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); |
| 4827 | return 0; | 4653 | return 0; |
| 4828 | } | 4654 | } |
| 4829 | 4655 | ||
| 4830 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4656 | static int ext4_index_trans_blocks(struct inode *inode, int lblocks, |
| 4657 | int pextents) | ||
| 4831 | { | 4658 | { |
| 4832 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 4659 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
| 4833 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); | 4660 | return ext4_ind_trans_blocks(inode, lblocks); |
| 4834 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 4661 | return ext4_ext_index_trans_blocks(inode, pextents); |
| 4835 | } | 4662 | } |
| 4836 | 4663 | ||
| 4837 | /* | 4664 | /* |
| @@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 4845 | * | 4672 | * |
| 4846 | * Also account for superblock, inode, quota and xattr blocks | 4673 | * Also account for superblock, inode, quota and xattr blocks |
| 4847 | */ | 4674 | */ |
| 4848 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 4675 | static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, |
| 4676 | int pextents) | ||
| 4849 | { | 4677 | { |
| 4850 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 4678 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
| 4851 | int gdpblocks; | 4679 | int gdpblocks; |
| @@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 4853 | int ret = 0; | 4681 | int ret = 0; |
| 4854 | 4682 | ||
| 4855 | /* | 4683 | /* |
| 4856 | * How many index blocks need to touch to modify nrblocks? | 4684 | * How many index blocks need to touch to map @lblocks logical blocks |
| 4857 | * The "Chunk" flag indicating whether the nrblocks is | 4685 | * to @pextents physical extents? |
| 4858 | * physically contiguous on disk | ||
| 4859 | * | ||
| 4860 | * For Direct IO and fallocate, they calls get_block to allocate | ||
| 4861 | * one single extent at a time, so they could set the "Chunk" flag | ||
| 4862 | */ | 4686 | */ |
| 4863 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | 4687 | idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); |
| 4864 | 4688 | ||
| 4865 | ret = idxblocks; | 4689 | ret = idxblocks; |
| 4866 | 4690 | ||
| @@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
| 4868 | * Now let's see how many group bitmaps and group descriptors need | 4692 | * Now let's see how many group bitmaps and group descriptors need |
| 4869 | * to account | 4693 | * to account |
| 4870 | */ | 4694 | */ |
| 4871 | groups = idxblocks; | 4695 | groups = idxblocks + pextents; |
| 4872 | if (chunk) | ||
| 4873 | groups += 1; | ||
| 4874 | else | ||
| 4875 | groups += nrblocks; | ||
| 4876 | |||
| 4877 | gdpblocks = groups; | 4696 | gdpblocks = groups; |
| 4878 | if (groups > ngroups) | 4697 | if (groups > ngroups) |
| 4879 | groups = ngroups; | 4698 | groups = ngroups; |
| @@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) | |||
| 4904 | int bpp = ext4_journal_blocks_per_page(inode); | 4723 | int bpp = ext4_journal_blocks_per_page(inode); |
| 4905 | int ret; | 4724 | int ret; |
| 4906 | 4725 | ||
| 4907 | ret = ext4_meta_trans_blocks(inode, bpp, 0); | 4726 | ret = ext4_meta_trans_blocks(inode, bpp, bpp); |
| 4908 | 4727 | ||
| 4909 | /* Account for data blocks for journalled mode */ | 4728 | /* Account for data blocks for journalled mode */ |
| 4910 | if (ext4_should_journal_data(inode)) | 4729 | if (ext4_should_journal_data(inode)) |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index def84082a9a9..a9ff5e5137ca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -2105,6 +2105,7 @@ repeat: | |||
| 2105 | group = ac->ac_g_ex.fe_group; | 2105 | group = ac->ac_g_ex.fe_group; |
| 2106 | 2106 | ||
| 2107 | for (i = 0; i < ngroups; group++, i++) { | 2107 | for (i = 0; i < ngroups; group++, i++) { |
| 2108 | cond_resched(); | ||
| 2108 | /* | 2109 | /* |
| 2109 | * Artificially restricted ngroups for non-extent | 2110 | * Artificially restricted ngroups for non-extent |
| 2110 | * files makes group > ngroups possible on first loop. | 2111 | * files makes group > ngroups possible on first loop. |
| @@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
| 4405 | repeat: | 4406 | repeat: |
| 4406 | /* allocate space in core */ | 4407 | /* allocate space in core */ |
| 4407 | *errp = ext4_mb_regular_allocator(ac); | 4408 | *errp = ext4_mb_regular_allocator(ac); |
| 4408 | if (*errp) { | 4409 | if (*errp) |
| 4409 | ext4_discard_allocated_blocks(ac); | 4410 | goto discard_and_exit; |
| 4410 | goto errout; | ||
| 4411 | } | ||
| 4412 | 4411 | ||
| 4413 | /* as we've just preallocated more space than | 4412 | /* as we've just preallocated more space than |
| 4414 | * user requested orinally, we store allocated | 4413 | * user requested originally, we store allocated |
| 4415 | * space in a special descriptor */ | 4414 | * space in a special descriptor */ |
| 4416 | if (ac->ac_status == AC_STATUS_FOUND && | 4415 | if (ac->ac_status == AC_STATUS_FOUND && |
| 4417 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) | 4416 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) |
| 4418 | ext4_mb_new_preallocation(ac); | 4417 | *errp = ext4_mb_new_preallocation(ac); |
| 4418 | if (*errp) { | ||
| 4419 | discard_and_exit: | ||
| 4420 | ext4_discard_allocated_blocks(ac); | ||
| 4421 | goto errout; | ||
| 4422 | } | ||
| 4419 | } | 4423 | } |
| 4420 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { | 4424 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { |
| 4421 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); | 4425 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); |
| @@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
| 4612 | BUG_ON(bh && (count > 1)); | 4616 | BUG_ON(bh && (count > 1)); |
| 4613 | 4617 | ||
| 4614 | for (i = 0; i < count; i++) { | 4618 | for (i = 0; i < count; i++) { |
| 4619 | cond_resched(); | ||
| 4615 | if (!bh) | 4620 | if (!bh) |
| 4616 | tbh = sb_find_get_block(inode->i_sb, | 4621 | tbh = sb_find_get_block(inode->i_sb, |
| 4617 | block + i); | 4622 | block + i); |
| 4618 | if (unlikely(!tbh)) | 4623 | if (!tbh) |
| 4619 | continue; | 4624 | continue; |
| 4620 | ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, | 4625 | ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, |
| 4621 | inode, tbh, block + i); | 4626 | inode, tbh, block + i); |
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3dcbf364022f..e86dddbd8296 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
| @@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
| 912 | struct page *pagep[2] = {NULL, NULL}; | 912 | struct page *pagep[2] = {NULL, NULL}; |
| 913 | handle_t *handle; | 913 | handle_t *handle; |
| 914 | ext4_lblk_t orig_blk_offset; | 914 | ext4_lblk_t orig_blk_offset; |
| 915 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; | ||
| 916 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | 915 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
| 917 | unsigned int w_flags = 0; | 916 | unsigned int w_flags = 0; |
| 918 | unsigned int tmp_data_size, data_size, replaced_size; | 917 | unsigned int tmp_data_size, data_size, replaced_size; |
| @@ -940,8 +939,6 @@ again: | |||
| 940 | orig_blk_offset = orig_page_offset * blocks_per_page + | 939 | orig_blk_offset = orig_page_offset * blocks_per_page + |
| 941 | data_offset_in_page; | 940 | data_offset_in_page; |
| 942 | 941 | ||
| 943 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; | ||
| 944 | |||
| 945 | /* Calculate data_size */ | 942 | /* Calculate data_size */ |
| 946 | if ((orig_blk_offset + block_len_in_page - 1) == | 943 | if ((orig_blk_offset + block_len_in_page - 1) == |
| 947 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { | 944 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6653fc35ecb7..ab2f6dc44b3a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, | |||
| 918 | bh->b_data, bh->b_size, | 918 | bh->b_data, bh->b_size, |
| 919 | (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) | 919 | (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) |
| 920 | + ((char *)de - bh->b_data))) { | 920 | + ((char *)de - bh->b_data))) { |
| 921 | /* On error, skip the f_pos to the next block. */ | 921 | /* silently ignore the rest of the block */ |
| 922 | dir_file->f_pos = (dir_file->f_pos | | 922 | break; |
| 923 | (dir->i_sb->s_blocksize - 1)) + 1; | ||
| 924 | brelse(bh); | ||
| 925 | return count; | ||
| 926 | } | 923 | } |
| 927 | ext4fs_dirhash(de->name, de->name_len, hinfo); | 924 | ext4fs_dirhash(de->name, de->name_len, hinfo); |
| 928 | if ((hinfo->hash < start_hash) || | 925 | if ((hinfo->hash < start_hash) || |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4acf1f78881b..48786cdb5e6c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
| @@ -46,46 +46,121 @@ void ext4_exit_pageio(void) | |||
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | /* | 48 | /* |
| 49 | * This function is called by ext4_evict_inode() to make sure there is | 49 | * Print an buffer I/O error compatible with the fs/buffer.c. This |
| 50 | * no more pending I/O completion work left to do. | 50 | * provides compatibility with dmesg scrapers that look for a specific |
| 51 | * buffer I/O error message. We really need a unified error reporting | ||
| 52 | * structure to userspace ala Digital Unix's uerf system, but it's | ||
| 53 | * probably not going to happen in my lifetime, due to LKML politics... | ||
| 51 | */ | 54 | */ |
| 52 | void ext4_ioend_shutdown(struct inode *inode) | 55 | static void buffer_io_error(struct buffer_head *bh) |
| 53 | { | 56 | { |
| 54 | wait_queue_head_t *wq = ext4_ioend_wq(inode); | 57 | char b[BDEVNAME_SIZE]; |
| 58 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", | ||
| 59 | bdevname(bh->b_bdev, b), | ||
| 60 | (unsigned long long)bh->b_blocknr); | ||
| 61 | } | ||
| 55 | 62 | ||
| 56 | wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); | 63 | static void ext4_finish_bio(struct bio *bio) |
| 57 | /* | 64 | { |
| 58 | * We need to make sure the work structure is finished being | 65 | int i; |
| 59 | * used before we let the inode get destroyed. | 66 | int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 60 | */ | 67 | |
| 61 | if (work_pending(&EXT4_I(inode)->i_unwritten_work)) | 68 | for (i = 0; i < bio->bi_vcnt; i++) { |
| 62 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); | 69 | struct bio_vec *bvec = &bio->bi_io_vec[i]; |
| 70 | struct page *page = bvec->bv_page; | ||
| 71 | struct buffer_head *bh, *head; | ||
| 72 | unsigned bio_start = bvec->bv_offset; | ||
| 73 | unsigned bio_end = bio_start + bvec->bv_len; | ||
| 74 | unsigned under_io = 0; | ||
| 75 | unsigned long flags; | ||
| 76 | |||
| 77 | if (!page) | ||
| 78 | continue; | ||
| 79 | |||
| 80 | if (error) { | ||
| 81 | SetPageError(page); | ||
| 82 | set_bit(AS_EIO, &page->mapping->flags); | ||
| 83 | } | ||
| 84 | bh = head = page_buffers(page); | ||
| 85 | /* | ||
| 86 | * We check all buffers in the page under BH_Uptodate_Lock | ||
| 87 | * to avoid races with other end io clearing async_write flags | ||
| 88 | */ | ||
| 89 | local_irq_save(flags); | ||
| 90 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | ||
| 91 | do { | ||
| 92 | if (bh_offset(bh) < bio_start || | ||
| 93 | bh_offset(bh) + bh->b_size > bio_end) { | ||
| 94 | if (buffer_async_write(bh)) | ||
| 95 | under_io++; | ||
| 96 | continue; | ||
| 97 | } | ||
| 98 | clear_buffer_async_write(bh); | ||
| 99 | if (error) | ||
| 100 | buffer_io_error(bh); | ||
| 101 | } while ((bh = bh->b_this_page) != head); | ||
| 102 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
| 103 | local_irq_restore(flags); | ||
| 104 | if (!under_io) | ||
| 105 | end_page_writeback(page); | ||
| 106 | } | ||
| 63 | } | 107 | } |
| 64 | 108 | ||
| 65 | void ext4_free_io_end(ext4_io_end_t *io) | 109 | static void ext4_release_io_end(ext4_io_end_t *io_end) |
| 66 | { | 110 | { |
| 67 | BUG_ON(!io); | 111 | struct bio *bio, *next_bio; |
| 68 | BUG_ON(!list_empty(&io->list)); | 112 | |
| 69 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); | 113 | BUG_ON(!list_empty(&io_end->list)); |
| 114 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
| 115 | WARN_ON(io_end->handle); | ||
| 70 | 116 | ||
| 71 | if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) | 117 | if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) |
| 72 | wake_up_all(ext4_ioend_wq(io->inode)); | 118 | wake_up_all(ext4_ioend_wq(io_end->inode)); |
| 73 | kmem_cache_free(io_end_cachep, io); | 119 | |
| 120 | for (bio = io_end->bio; bio; bio = next_bio) { | ||
| 121 | next_bio = bio->bi_private; | ||
| 122 | ext4_finish_bio(bio); | ||
| 123 | bio_put(bio); | ||
| 124 | } | ||
| 125 | if (io_end->flag & EXT4_IO_END_DIRECT) | ||
| 126 | inode_dio_done(io_end->inode); | ||
| 127 | if (io_end->iocb) | ||
| 128 | aio_complete(io_end->iocb, io_end->result, 0); | ||
| 129 | kmem_cache_free(io_end_cachep, io_end); | ||
| 74 | } | 130 | } |
| 75 | 131 | ||
| 76 | /* check a range of space and convert unwritten extents to written. */ | 132 | static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) |
| 133 | { | ||
| 134 | struct inode *inode = io_end->inode; | ||
| 135 | |||
| 136 | io_end->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
| 137 | /* Wake up anyone waiting on unwritten extent conversion */ | ||
| 138 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | ||
| 139 | wake_up_all(ext4_ioend_wq(inode)); | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * Check a range of space and convert unwritten extents to written. Note that | ||
| 144 | * we are protected from truncate touching same part of extent tree by the | ||
| 145 | * fact that truncate code waits for all DIO to finish (thus exclusion from | ||
| 146 | * direct IO is achieved) and also waits for PageWriteback bits. Thus we | ||
| 147 | * cannot get to ext4_ext_truncate() before all IOs overlapping that range are | ||
| 148 | * completed (happens from ext4_free_ioend()). | ||
| 149 | */ | ||
| 77 | static int ext4_end_io(ext4_io_end_t *io) | 150 | static int ext4_end_io(ext4_io_end_t *io) |
| 78 | { | 151 | { |
| 79 | struct inode *inode = io->inode; | 152 | struct inode *inode = io->inode; |
| 80 | loff_t offset = io->offset; | 153 | loff_t offset = io->offset; |
| 81 | ssize_t size = io->size; | 154 | ssize_t size = io->size; |
| 155 | handle_t *handle = io->handle; | ||
| 82 | int ret = 0; | 156 | int ret = 0; |
| 83 | 157 | ||
| 84 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," | 158 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," |
| 85 | "list->prev 0x%p\n", | 159 | "list->prev 0x%p\n", |
| 86 | io, inode->i_ino, io->list.next, io->list.prev); | 160 | io, inode->i_ino, io->list.next, io->list.prev); |
| 87 | 161 | ||
| 88 | ret = ext4_convert_unwritten_extents(inode, offset, size); | 162 | io->handle = NULL; /* Following call will use up the handle */ |
| 163 | ret = ext4_convert_unwritten_extents(handle, inode, offset, size); | ||
| 89 | if (ret < 0) { | 164 | if (ret < 0) { |
| 90 | ext4_msg(inode->i_sb, KERN_EMERG, | 165 | ext4_msg(inode->i_sb, KERN_EMERG, |
| 91 | "failed to convert unwritten extents to written " | 166 | "failed to convert unwritten extents to written " |
| @@ -93,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io) | |||
| 93 | "(inode %lu, offset %llu, size %zd, error %d)", | 168 | "(inode %lu, offset %llu, size %zd, error %d)", |
| 94 | inode->i_ino, offset, size, ret); | 169 | inode->i_ino, offset, size, ret); |
| 95 | } | 170 | } |
| 96 | /* Wake up anyone waiting on unwritten extent conversion */ | 171 | ext4_clear_io_unwritten_flag(io); |
| 97 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | 172 | ext4_release_io_end(io); |
| 98 | wake_up_all(ext4_ioend_wq(inode)); | ||
| 99 | if (io->flag & EXT4_IO_END_DIRECT) | ||
| 100 | inode_dio_done(inode); | ||
| 101 | if (io->iocb) | ||
| 102 | aio_complete(io->iocb, io->result, 0); | ||
| 103 | return ret; | 173 | return ret; |
| 104 | } | 174 | } |
| 105 | 175 | ||
| 106 | static void dump_completed_IO(struct inode *inode) | 176 | static void dump_completed_IO(struct inode *inode, struct list_head *head) |
| 107 | { | 177 | { |
| 108 | #ifdef EXT4FS_DEBUG | 178 | #ifdef EXT4FS_DEBUG |
| 109 | struct list_head *cur, *before, *after; | 179 | struct list_head *cur, *before, *after; |
| 110 | ext4_io_end_t *io, *io0, *io1; | 180 | ext4_io_end_t *io, *io0, *io1; |
| 111 | 181 | ||
| 112 | if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { | 182 | if (list_empty(head)) |
| 113 | ext4_debug("inode %lu completed_io list is empty\n", | ||
| 114 | inode->i_ino); | ||
| 115 | return; | 183 | return; |
| 116 | } | ||
| 117 | 184 | ||
| 118 | ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); | 185 | ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); |
| 119 | list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { | 186 | list_for_each_entry(io, head, list) { |
| 120 | cur = &io->list; | 187 | cur = &io->list; |
| 121 | before = cur->prev; | 188 | before = cur->prev; |
| 122 | io0 = container_of(before, ext4_io_end_t, list); | 189 | io0 = container_of(before, ext4_io_end_t, list); |
| @@ -130,23 +197,30 @@ static void dump_completed_IO(struct inode *inode) | |||
| 130 | } | 197 | } |
| 131 | 198 | ||
| 132 | /* Add the io_end to per-inode completed end_io list. */ | 199 | /* Add the io_end to per-inode completed end_io list. */ |
| 133 | void ext4_add_complete_io(ext4_io_end_t *io_end) | 200 | static void ext4_add_complete_io(ext4_io_end_t *io_end) |
| 134 | { | 201 | { |
| 135 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); | 202 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); |
| 136 | struct workqueue_struct *wq; | 203 | struct workqueue_struct *wq; |
| 137 | unsigned long flags; | 204 | unsigned long flags; |
| 138 | 205 | ||
| 139 | BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); | 206 | BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); |
| 140 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
| 141 | |||
| 142 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 207 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
| 143 | if (list_empty(&ei->i_completed_io_list)) | 208 | if (io_end->handle) { |
| 144 | queue_work(wq, &ei->i_unwritten_work); | 209 | wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; |
| 145 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | 210 | if (list_empty(&ei->i_rsv_conversion_list)) |
| 211 | queue_work(wq, &ei->i_rsv_conversion_work); | ||
| 212 | list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); | ||
| 213 | } else { | ||
| 214 | wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; | ||
| 215 | if (list_empty(&ei->i_unrsv_conversion_list)) | ||
| 216 | queue_work(wq, &ei->i_unrsv_conversion_work); | ||
| 217 | list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); | ||
| 218 | } | ||
| 146 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 219 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| 147 | } | 220 | } |
| 148 | 221 | ||
| 149 | static int ext4_do_flush_completed_IO(struct inode *inode) | 222 | static int ext4_do_flush_completed_IO(struct inode *inode, |
| 223 | struct list_head *head) | ||
| 150 | { | 224 | { |
| 151 | ext4_io_end_t *io; | 225 | ext4_io_end_t *io; |
| 152 | struct list_head unwritten; | 226 | struct list_head unwritten; |
| @@ -155,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
| 155 | int err, ret = 0; | 229 | int err, ret = 0; |
| 156 | 230 | ||
| 157 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 231 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
| 158 | dump_completed_IO(inode); | 232 | dump_completed_IO(inode, head); |
| 159 | list_replace_init(&ei->i_completed_io_list, &unwritten); | 233 | list_replace_init(head, &unwritten); |
| 160 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 234 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| 161 | 235 | ||
| 162 | while (!list_empty(&unwritten)) { | 236 | while (!list_empty(&unwritten)) { |
| @@ -167,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
| 167 | err = ext4_end_io(io); | 241 | err = ext4_end_io(io); |
| 168 | if (unlikely(!ret && err)) | 242 | if (unlikely(!ret && err)) |
| 169 | ret = err; | 243 | ret = err; |
| 170 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
| 171 | ext4_free_io_end(io); | ||
| 172 | } | 244 | } |
| 173 | return ret; | 245 | return ret; |
| 174 | } | 246 | } |
| 175 | 247 | ||
| 176 | /* | 248 | /* |
| 177 | * work on completed aio dio IO, to convert unwritten extents to extents | 249 | * work on completed IO, to convert unwritten extents to extents |
| 178 | */ | 250 | */ |
| 179 | void ext4_end_io_work(struct work_struct *work) | 251 | void ext4_end_io_rsv_work(struct work_struct *work) |
| 180 | { | 252 | { |
| 181 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, | 253 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, |
| 182 | i_unwritten_work); | 254 | i_rsv_conversion_work); |
| 183 | ext4_do_flush_completed_IO(&ei->vfs_inode); | 255 | ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); |
| 184 | } | 256 | } |
| 185 | 257 | ||
| 186 | int ext4_flush_unwritten_io(struct inode *inode) | 258 | void ext4_end_io_unrsv_work(struct work_struct *work) |
| 187 | { | 259 | { |
| 188 | int ret; | 260 | struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, |
| 189 | WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && | 261 | i_unrsv_conversion_work); |
| 190 | !(inode->i_state & I_FREEING)); | 262 | ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); |
| 191 | ret = ext4_do_flush_completed_IO(inode); | ||
| 192 | ext4_unwritten_wait(inode); | ||
| 193 | return ret; | ||
| 194 | } | 263 | } |
| 195 | 264 | ||
| 196 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | 265 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) |
| @@ -200,83 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | |||
| 200 | atomic_inc(&EXT4_I(inode)->i_ioend_count); | 269 | atomic_inc(&EXT4_I(inode)->i_ioend_count); |
| 201 | io->inode = inode; | 270 | io->inode = inode; |
| 202 | INIT_LIST_HEAD(&io->list); | 271 | INIT_LIST_HEAD(&io->list); |
| 272 | atomic_set(&io->count, 1); | ||
| 203 | } | 273 | } |
| 204 | return io; | 274 | return io; |
| 205 | } | 275 | } |
| 206 | 276 | ||
| 207 | /* | 277 | void ext4_put_io_end_defer(ext4_io_end_t *io_end) |
| 208 | * Print an buffer I/O error compatible with the fs/buffer.c. This | ||
| 209 | * provides compatibility with dmesg scrapers that look for a specific | ||
| 210 | * buffer I/O error message. We really need a unified error reporting | ||
| 211 | * structure to userspace ala Digital Unix's uerf system, but it's | ||
| 212 | * probably not going to happen in my lifetime, due to LKML politics... | ||
| 213 | */ | ||
| 214 | static void buffer_io_error(struct buffer_head *bh) | ||
| 215 | { | 278 | { |
| 216 | char b[BDEVNAME_SIZE]; | 279 | if (atomic_dec_and_test(&io_end->count)) { |
| 217 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", | 280 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { |
| 218 | bdevname(bh->b_bdev, b), | 281 | ext4_release_io_end(io_end); |
| 219 | (unsigned long long)bh->b_blocknr); | 282 | return; |
| 283 | } | ||
| 284 | ext4_add_complete_io(io_end); | ||
| 285 | } | ||
| 286 | } | ||
| 287 | |||
| 288 | int ext4_put_io_end(ext4_io_end_t *io_end) | ||
| 289 | { | ||
| 290 | int err = 0; | ||
| 291 | |||
| 292 | if (atomic_dec_and_test(&io_end->count)) { | ||
| 293 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { | ||
| 294 | err = ext4_convert_unwritten_extents(io_end->handle, | ||
| 295 | io_end->inode, io_end->offset, | ||
| 296 | io_end->size); | ||
| 297 | io_end->handle = NULL; | ||
| 298 | ext4_clear_io_unwritten_flag(io_end); | ||
| 299 | } | ||
| 300 | ext4_release_io_end(io_end); | ||
| 301 | } | ||
| 302 | return err; | ||
| 303 | } | ||
| 304 | |||
| 305 | ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) | ||
| 306 | { | ||
| 307 | atomic_inc(&io_end->count); | ||
| 308 | return io_end; | ||
| 220 | } | 309 | } |
| 221 | 310 | ||
| 222 | static void ext4_end_bio(struct bio *bio, int error) | 311 | static void ext4_end_bio(struct bio *bio, int error) |
| 223 | { | 312 | { |
| 224 | ext4_io_end_t *io_end = bio->bi_private; | 313 | ext4_io_end_t *io_end = bio->bi_private; |
| 225 | struct inode *inode; | ||
| 226 | int i; | ||
| 227 | int blocksize; | ||
| 228 | sector_t bi_sector = bio->bi_sector; | 314 | sector_t bi_sector = bio->bi_sector; |
| 229 | 315 | ||
| 230 | BUG_ON(!io_end); | 316 | BUG_ON(!io_end); |
| 231 | inode = io_end->inode; | ||
| 232 | blocksize = 1 << inode->i_blkbits; | ||
| 233 | bio->bi_private = NULL; | ||
| 234 | bio->bi_end_io = NULL; | 317 | bio->bi_end_io = NULL; |
| 235 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 318 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
| 236 | error = 0; | 319 | error = 0; |
| 237 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
| 238 | struct bio_vec *bvec = &bio->bi_io_vec[i]; | ||
| 239 | struct page *page = bvec->bv_page; | ||
| 240 | struct buffer_head *bh, *head; | ||
| 241 | unsigned bio_start = bvec->bv_offset; | ||
| 242 | unsigned bio_end = bio_start + bvec->bv_len; | ||
| 243 | unsigned under_io = 0; | ||
| 244 | unsigned long flags; | ||
| 245 | 320 | ||
| 246 | if (!page) | 321 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { |
| 247 | continue; | ||
| 248 | |||
| 249 | if (error) { | ||
| 250 | SetPageError(page); | ||
| 251 | set_bit(AS_EIO, &page->mapping->flags); | ||
| 252 | } | ||
| 253 | bh = head = page_buffers(page); | ||
| 254 | /* | 322 | /* |
| 255 | * We check all buffers in the page under BH_Uptodate_Lock | 323 | * Link bio into list hanging from io_end. We have to do it |
| 256 | * to avoid races with other end io clearing async_write flags | 324 | * atomically as bio completions can be racing against each |
| 325 | * other. | ||
| 257 | */ | 326 | */ |
| 258 | local_irq_save(flags); | 327 | bio->bi_private = xchg(&io_end->bio, bio); |
| 259 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | 328 | } else { |
| 260 | do { | 329 | ext4_finish_bio(bio); |
| 261 | if (bh_offset(bh) < bio_start || | 330 | bio_put(bio); |
| 262 | bh_offset(bh) + blocksize > bio_end) { | ||
| 263 | if (buffer_async_write(bh)) | ||
| 264 | under_io++; | ||
| 265 | continue; | ||
| 266 | } | ||
| 267 | clear_buffer_async_write(bh); | ||
| 268 | if (error) | ||
| 269 | buffer_io_error(bh); | ||
| 270 | } while ((bh = bh->b_this_page) != head); | ||
| 271 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
| 272 | local_irq_restore(flags); | ||
| 273 | if (!under_io) | ||
| 274 | end_page_writeback(page); | ||
| 275 | } | 331 | } |
| 276 | bio_put(bio); | ||
| 277 | 332 | ||
| 278 | if (error) { | 333 | if (error) { |
| 279 | io_end->flag |= EXT4_IO_END_ERROR; | 334 | struct inode *inode = io_end->inode; |
| 335 | |||
| 280 | ext4_warning(inode->i_sb, "I/O error writing to inode %lu " | 336 | ext4_warning(inode->i_sb, "I/O error writing to inode %lu " |
| 281 | "(offset %llu size %ld starting block %llu)", | 337 | "(offset %llu size %ld starting block %llu)", |
| 282 | inode->i_ino, | 338 | inode->i_ino, |
| @@ -285,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
| 285 | (unsigned long long) | 341 | (unsigned long long) |
| 286 | bi_sector >> (inode->i_blkbits - 9)); | 342 | bi_sector >> (inode->i_blkbits - 9)); |
| 287 | } | 343 | } |
| 288 | 344 | ext4_put_io_end_defer(io_end); | |
| 289 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
| 290 | ext4_free_io_end(io_end); | ||
| 291 | return; | ||
| 292 | } | ||
| 293 | |||
| 294 | ext4_add_complete_io(io_end); | ||
| 295 | } | 345 | } |
| 296 | 346 | ||
| 297 | void ext4_io_submit(struct ext4_io_submit *io) | 347 | void ext4_io_submit(struct ext4_io_submit *io) |
| @@ -305,43 +355,38 @@ void ext4_io_submit(struct ext4_io_submit *io) | |||
| 305 | bio_put(io->io_bio); | 355 | bio_put(io->io_bio); |
| 306 | } | 356 | } |
| 307 | io->io_bio = NULL; | 357 | io->io_bio = NULL; |
| 308 | io->io_op = 0; | 358 | } |
| 359 | |||
| 360 | void ext4_io_submit_init(struct ext4_io_submit *io, | ||
| 361 | struct writeback_control *wbc) | ||
| 362 | { | ||
| 363 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
| 364 | io->io_bio = NULL; | ||
| 309 | io->io_end = NULL; | 365 | io->io_end = NULL; |
| 310 | } | 366 | } |
| 311 | 367 | ||
| 312 | static int io_submit_init(struct ext4_io_submit *io, | 368 | static int io_submit_init_bio(struct ext4_io_submit *io, |
| 313 | struct inode *inode, | 369 | struct buffer_head *bh) |
| 314 | struct writeback_control *wbc, | ||
| 315 | struct buffer_head *bh) | ||
| 316 | { | 370 | { |
| 317 | ext4_io_end_t *io_end; | ||
| 318 | struct page *page = bh->b_page; | ||
| 319 | int nvecs = bio_get_nr_vecs(bh->b_bdev); | 371 | int nvecs = bio_get_nr_vecs(bh->b_bdev); |
| 320 | struct bio *bio; | 372 | struct bio *bio; |
| 321 | 373 | ||
| 322 | io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
| 323 | if (!io_end) | ||
| 324 | return -ENOMEM; | ||
| 325 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); | 374 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
| 375 | if (!bio) | ||
| 376 | return -ENOMEM; | ||
| 326 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 377 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
| 327 | bio->bi_bdev = bh->b_bdev; | 378 | bio->bi_bdev = bh->b_bdev; |
| 328 | bio->bi_private = io->io_end = io_end; | ||
| 329 | bio->bi_end_io = ext4_end_bio; | 379 | bio->bi_end_io = ext4_end_bio; |
| 330 | 380 | bio->bi_private = ext4_get_io_end(io->io_end); | |
| 331 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); | ||
| 332 | |||
| 333 | io->io_bio = bio; | 381 | io->io_bio = bio; |
| 334 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
| 335 | io->io_next_block = bh->b_blocknr; | 382 | io->io_next_block = bh->b_blocknr; |
| 336 | return 0; | 383 | return 0; |
| 337 | } | 384 | } |
| 338 | 385 | ||
| 339 | static int io_submit_add_bh(struct ext4_io_submit *io, | 386 | static int io_submit_add_bh(struct ext4_io_submit *io, |
| 340 | struct inode *inode, | 387 | struct inode *inode, |
| 341 | struct writeback_control *wbc, | ||
| 342 | struct buffer_head *bh) | 388 | struct buffer_head *bh) |
| 343 | { | 389 | { |
| 344 | ext4_io_end_t *io_end; | ||
| 345 | int ret; | 390 | int ret; |
| 346 | 391 | ||
| 347 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { | 392 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { |
| @@ -349,18 +394,14 @@ submit_and_retry: | |||
| 349 | ext4_io_submit(io); | 394 | ext4_io_submit(io); |
| 350 | } | 395 | } |
| 351 | if (io->io_bio == NULL) { | 396 | if (io->io_bio == NULL) { |
| 352 | ret = io_submit_init(io, inode, wbc, bh); | 397 | ret = io_submit_init_bio(io, bh); |
| 353 | if (ret) | 398 | if (ret) |
| 354 | return ret; | 399 | return ret; |
| 355 | } | 400 | } |
| 356 | io_end = io->io_end; | ||
| 357 | if (test_clear_buffer_uninit(bh)) | ||
| 358 | ext4_set_io_unwritten_flag(inode, io_end); | ||
| 359 | io->io_end->size += bh->b_size; | ||
| 360 | io->io_next_block++; | ||
| 361 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 401 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
| 362 | if (ret != bh->b_size) | 402 | if (ret != bh->b_size) |
| 363 | goto submit_and_retry; | 403 | goto submit_and_retry; |
| 404 | io->io_next_block++; | ||
| 364 | return 0; | 405 | return 0; |
| 365 | } | 406 | } |
| 366 | 407 | ||
| @@ -432,7 +473,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
| 432 | do { | 473 | do { |
| 433 | if (!buffer_async_write(bh)) | 474 | if (!buffer_async_write(bh)) |
| 434 | continue; | 475 | continue; |
| 435 | ret = io_submit_add_bh(io, inode, wbc, bh); | 476 | ret = io_submit_add_bh(io, inode, bh); |
| 436 | if (ret) { | 477 | if (ret) { |
| 437 | /* | 478 | /* |
| 438 | * We only get here on ENOMEM. Not much else | 479 | * We only get here on ENOMEM. Not much else |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b27c96d01965..c5adbb318a90 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb, | |||
| 79 | ext4_fsblk_t end = start + input->blocks_count; | 79 | ext4_fsblk_t end = start + input->blocks_count; |
| 80 | ext4_group_t group = input->group; | 80 | ext4_group_t group = input->group; |
| 81 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; | 81 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; |
| 82 | unsigned overhead = ext4_group_overhead_blocks(sb, group); | 82 | unsigned overhead; |
| 83 | ext4_fsblk_t metaend = start + overhead; | 83 | ext4_fsblk_t metaend; |
| 84 | struct buffer_head *bh = NULL; | 84 | struct buffer_head *bh = NULL; |
| 85 | ext4_grpblk_t free_blocks_count, offset; | 85 | ext4_grpblk_t free_blocks_count, offset; |
| 86 | int err = -EINVAL; | 86 | int err = -EINVAL; |
| 87 | 87 | ||
| 88 | if (group != sbi->s_groups_count) { | ||
| 89 | ext4_warning(sb, "Cannot add at group %u (only %u groups)", | ||
| 90 | input->group, sbi->s_groups_count); | ||
| 91 | return -EINVAL; | ||
| 92 | } | ||
| 93 | |||
| 94 | overhead = ext4_group_overhead_blocks(sb, group); | ||
| 95 | metaend = start + overhead; | ||
| 88 | input->free_blocks_count = free_blocks_count = | 96 | input->free_blocks_count = free_blocks_count = |
| 89 | input->blocks_count - 2 - overhead - sbi->s_itb_per_group; | 97 | input->blocks_count - 2 - overhead - sbi->s_itb_per_group; |
| 90 | 98 | ||
| @@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb, | |||
| 96 | free_blocks_count, input->reserved_blocks); | 104 | free_blocks_count, input->reserved_blocks); |
| 97 | 105 | ||
| 98 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); | 106 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); |
| 99 | if (group != sbi->s_groups_count) | 107 | if (offset != 0) |
| 100 | ext4_warning(sb, "Cannot add at group %u (only %u groups)", | ||
| 101 | input->group, sbi->s_groups_count); | ||
| 102 | else if (offset != 0) | ||
| 103 | ext4_warning(sb, "Last group not full"); | 108 | ext4_warning(sb, "Last group not full"); |
| 104 | else if (input->reserved_blocks > input->blocks_count / 5) | 109 | else if (input->reserved_blocks > input->blocks_count / 5) |
| 105 | ext4_warning(sb, "Reserved blocks too high (%u)", | 110 | ext4_warning(sb, "Reserved blocks too high (%u)", |
| @@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 1551 | int reserved_gdb = ext4_bg_has_super(sb, input->group) ? | 1556 | int reserved_gdb = ext4_bg_has_super(sb, input->group) ? |
| 1552 | le16_to_cpu(es->s_reserved_gdt_blocks) : 0; | 1557 | le16_to_cpu(es->s_reserved_gdt_blocks) : 0; |
| 1553 | struct inode *inode = NULL; | 1558 | struct inode *inode = NULL; |
| 1554 | int gdb_off, gdb_num; | 1559 | int gdb_off; |
| 1555 | int err; | 1560 | int err; |
| 1556 | __u16 bg_flags = 0; | 1561 | __u16 bg_flags = 0; |
| 1557 | 1562 | ||
| 1558 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | ||
| 1559 | gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); | 1563 | gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); |
| 1560 | 1564 | ||
| 1561 | if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1565 | if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, |
| @@ -1656,12 +1660,10 @@ errout: | |||
| 1656 | err = err2; | 1660 | err = err2; |
| 1657 | 1661 | ||
| 1658 | if (!err) { | 1662 | if (!err) { |
| 1659 | ext4_fsblk_t first_block; | ||
| 1660 | first_block = ext4_group_first_block_no(sb, 0); | ||
| 1661 | if (test_opt(sb, DEBUG)) | 1663 | if (test_opt(sb, DEBUG)) |
| 1662 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu " | 1664 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu " |
| 1663 | "blocks\n", ext4_blocks_count(es)); | 1665 | "blocks\n", ext4_blocks_count(es)); |
| 1664 | update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, | 1666 | update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, |
| 1665 | (char *)es, sizeof(struct ext4_super_block), 0); | 1667 | (char *)es, sizeof(struct ext4_super_block), 0); |
| 1666 | } | 1668 | } |
| 1667 | return err; | 1669 | return err; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94cc84db7c9a..85b3dd60169b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, | |||
| 69 | static void ext4_clear_journal_err(struct super_block *sb, | 69 | static void ext4_clear_journal_err(struct super_block *sb, |
| 70 | struct ext4_super_block *es); | 70 | struct ext4_super_block *es); |
| 71 | static int ext4_sync_fs(struct super_block *sb, int wait); | 71 | static int ext4_sync_fs(struct super_block *sb, int wait); |
| 72 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); | ||
| 72 | static int ext4_remount(struct super_block *sb, int *flags, char *data); | 73 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
| 73 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | 74 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
| 74 | static int ext4_unfreeze(struct super_block *sb); | 75 | static int ext4_unfreeze(struct super_block *sb); |
| @@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb) | |||
| 398 | } | 399 | } |
| 399 | if (test_opt(sb, ERRORS_RO)) { | 400 | if (test_opt(sb, ERRORS_RO)) { |
| 400 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 401 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
| 402 | /* | ||
| 403 | * Make sure updated value of ->s_mount_flags will be visible | ||
| 404 | * before ->s_flags update | ||
| 405 | */ | ||
| 406 | smp_wmb(); | ||
| 401 | sb->s_flags |= MS_RDONLY; | 407 | sb->s_flags |= MS_RDONLY; |
| 402 | } | 408 | } |
| 403 | if (test_opt(sb, ERRORS_PANIC)) | 409 | if (test_opt(sb, ERRORS_PANIC)) |
| @@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function, | |||
| 422 | ext4_handle_error(sb); | 428 | ext4_handle_error(sb); |
| 423 | } | 429 | } |
| 424 | 430 | ||
| 425 | void ext4_error_inode(struct inode *inode, const char *function, | 431 | void __ext4_error_inode(struct inode *inode, const char *function, |
| 426 | unsigned int line, ext4_fsblk_t block, | 432 | unsigned int line, ext4_fsblk_t block, |
| 427 | const char *fmt, ...) | 433 | const char *fmt, ...) |
| 428 | { | 434 | { |
| 429 | va_list args; | 435 | va_list args; |
| 430 | struct va_format vaf; | 436 | struct va_format vaf; |
| @@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function, | |||
| 451 | ext4_handle_error(inode->i_sb); | 457 | ext4_handle_error(inode->i_sb); |
| 452 | } | 458 | } |
| 453 | 459 | ||
| 454 | void ext4_error_file(struct file *file, const char *function, | 460 | void __ext4_error_file(struct file *file, const char *function, |
| 455 | unsigned int line, ext4_fsblk_t block, | 461 | unsigned int line, ext4_fsblk_t block, |
| 456 | const char *fmt, ...) | 462 | const char *fmt, ...) |
| 457 | { | 463 | { |
| 458 | va_list args; | 464 | va_list args; |
| 459 | struct va_format vaf; | 465 | struct va_format vaf; |
| @@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function, | |||
| 570 | 576 | ||
| 571 | if ((sb->s_flags & MS_RDONLY) == 0) { | 577 | if ((sb->s_flags & MS_RDONLY) == 0) { |
| 572 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); | 578 | ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); |
| 573 | sb->s_flags |= MS_RDONLY; | ||
| 574 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; | 579 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; |
| 580 | /* | ||
| 581 | * Make sure updated value of ->s_mount_flags will be visible | ||
| 582 | * before ->s_flags update | ||
| 583 | */ | ||
| 584 | smp_wmb(); | ||
| 585 | sb->s_flags |= MS_RDONLY; | ||
| 575 | if (EXT4_SB(sb)->s_journal) | 586 | if (EXT4_SB(sb)->s_journal) |
| 576 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | 587 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); |
| 577 | save_error_info(sb, function, line); | 588 | save_error_info(sb, function, line); |
| @@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function, | |||
| 580 | panic("EXT4-fs panic from previous error\n"); | 591 | panic("EXT4-fs panic from previous error\n"); |
| 581 | } | 592 | } |
| 582 | 593 | ||
| 583 | void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) | 594 | void __ext4_msg(struct super_block *sb, |
| 595 | const char *prefix, const char *fmt, ...) | ||
| 584 | { | 596 | { |
| 585 | struct va_format vaf; | 597 | struct va_format vaf; |
| 586 | va_list args; | 598 | va_list args; |
| @@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb) | |||
| 750 | ext4_unregister_li_request(sb); | 762 | ext4_unregister_li_request(sb); |
| 751 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); | 763 | dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); |
| 752 | 764 | ||
| 753 | flush_workqueue(sbi->dio_unwritten_wq); | 765 | flush_workqueue(sbi->unrsv_conversion_wq); |
| 754 | destroy_workqueue(sbi->dio_unwritten_wq); | 766 | flush_workqueue(sbi->rsv_conversion_wq); |
| 767 | destroy_workqueue(sbi->unrsv_conversion_wq); | ||
| 768 | destroy_workqueue(sbi->rsv_conversion_wq); | ||
| 755 | 769 | ||
| 756 | if (sbi->s_journal) { | 770 | if (sbi->s_journal) { |
| 757 | err = jbd2_journal_destroy(sbi->s_journal); | 771 | err = jbd2_journal_destroy(sbi->s_journal); |
| @@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb) | |||
| 760 | ext4_abort(sb, "Couldn't clean up the journal"); | 774 | ext4_abort(sb, "Couldn't clean up the journal"); |
| 761 | } | 775 | } |
| 762 | 776 | ||
| 763 | ext4_es_unregister_shrinker(sb); | 777 | ext4_es_unregister_shrinker(sbi); |
| 764 | del_timer(&sbi->s_err_report); | 778 | del_timer(&sbi->s_err_report); |
| 765 | ext4_release_system_zone(sb); | 779 | ext4_release_system_zone(sb); |
| 766 | ext4_mb_release(sb); | 780 | ext4_mb_release(sb); |
| @@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 849 | rwlock_init(&ei->i_es_lock); | 863 | rwlock_init(&ei->i_es_lock); |
| 850 | INIT_LIST_HEAD(&ei->i_es_lru); | 864 | INIT_LIST_HEAD(&ei->i_es_lru); |
| 851 | ei->i_es_lru_nr = 0; | 865 | ei->i_es_lru_nr = 0; |
| 866 | ei->i_touch_when = 0; | ||
| 852 | ei->i_reserved_data_blocks = 0; | 867 | ei->i_reserved_data_blocks = 0; |
| 853 | ei->i_reserved_meta_blocks = 0; | 868 | ei->i_reserved_meta_blocks = 0; |
| 854 | ei->i_allocated_meta_blocks = 0; | 869 | ei->i_allocated_meta_blocks = 0; |
| @@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 859 | ei->i_reserved_quota = 0; | 874 | ei->i_reserved_quota = 0; |
| 860 | #endif | 875 | #endif |
| 861 | ei->jinode = NULL; | 876 | ei->jinode = NULL; |
| 862 | INIT_LIST_HEAD(&ei->i_completed_io_list); | 877 | INIT_LIST_HEAD(&ei->i_rsv_conversion_list); |
| 878 | INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); | ||
| 863 | spin_lock_init(&ei->i_completed_io_lock); | 879 | spin_lock_init(&ei->i_completed_io_lock); |
| 864 | ei->i_sync_tid = 0; | 880 | ei->i_sync_tid = 0; |
| 865 | ei->i_datasync_tid = 0; | 881 | ei->i_datasync_tid = 0; |
| 866 | atomic_set(&ei->i_ioend_count, 0); | 882 | atomic_set(&ei->i_ioend_count, 0); |
| 867 | atomic_set(&ei->i_unwritten, 0); | 883 | atomic_set(&ei->i_unwritten, 0); |
| 868 | INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); | 884 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); |
| 885 | INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); | ||
| 869 | 886 | ||
| 870 | return &ei->vfs_inode; | 887 | return &ei->vfs_inode; |
| 871 | } | 888 | } |
| @@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = { | |||
| 1093 | .dirty_inode = ext4_dirty_inode, | 1110 | .dirty_inode = ext4_dirty_inode, |
| 1094 | .drop_inode = ext4_drop_inode, | 1111 | .drop_inode = ext4_drop_inode, |
| 1095 | .evict_inode = ext4_evict_inode, | 1112 | .evict_inode = ext4_evict_inode, |
| 1113 | .sync_fs = ext4_sync_fs_nojournal, | ||
| 1096 | .put_super = ext4_put_super, | 1114 | .put_super = ext4_put_super, |
| 1097 | .statfs = ext4_statfs, | 1115 | .statfs = ext4_statfs, |
| 1098 | .remount_fs = ext4_remount, | 1116 | .remount_fs = ext4_remount, |
| @@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
| 1908 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1926 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1909 | struct ext4_group_desc *gdp = NULL; | 1927 | struct ext4_group_desc *gdp = NULL; |
| 1910 | ext4_group_t flex_group; | 1928 | ext4_group_t flex_group; |
| 1911 | unsigned int groups_per_flex = 0; | ||
| 1912 | int i, err; | 1929 | int i, err; |
| 1913 | 1930 | ||
| 1914 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | 1931 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
| @@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
| 1916 | sbi->s_log_groups_per_flex = 0; | 1933 | sbi->s_log_groups_per_flex = 0; |
| 1917 | return 1; | 1934 | return 1; |
| 1918 | } | 1935 | } |
| 1919 | groups_per_flex = 1U << sbi->s_log_groups_per_flex; | ||
| 1920 | 1936 | ||
| 1921 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); | 1937 | err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); |
| 1922 | if (err) | 1938 | if (err) |
| @@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 2164 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); | 2180 | list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); |
| 2165 | dquot_initialize(inode); | 2181 | dquot_initialize(inode); |
| 2166 | if (inode->i_nlink) { | 2182 | if (inode->i_nlink) { |
| 2167 | ext4_msg(sb, KERN_DEBUG, | 2183 | if (test_opt(sb, DEBUG)) |
| 2168 | "%s: truncating inode %lu to %lld bytes", | 2184 | ext4_msg(sb, KERN_DEBUG, |
| 2169 | __func__, inode->i_ino, inode->i_size); | 2185 | "%s: truncating inode %lu to %lld bytes", |
| 2186 | __func__, inode->i_ino, inode->i_size); | ||
| 2170 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", | 2187 | jbd_debug(2, "truncating inode %lu to %lld bytes\n", |
| 2171 | inode->i_ino, inode->i_size); | 2188 | inode->i_ino, inode->i_size); |
| 2172 | mutex_lock(&inode->i_mutex); | 2189 | mutex_lock(&inode->i_mutex); |
| 2190 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
| 2173 | ext4_truncate(inode); | 2191 | ext4_truncate(inode); |
| 2174 | mutex_unlock(&inode->i_mutex); | 2192 | mutex_unlock(&inode->i_mutex); |
| 2175 | nr_truncates++; | 2193 | nr_truncates++; |
| 2176 | } else { | 2194 | } else { |
| 2177 | ext4_msg(sb, KERN_DEBUG, | 2195 | if (test_opt(sb, DEBUG)) |
| 2178 | "%s: deleting unreferenced inode %lu", | 2196 | ext4_msg(sb, KERN_DEBUG, |
| 2179 | __func__, inode->i_ino); | 2197 | "%s: deleting unreferenced inode %lu", |
| 2198 | __func__, inode->i_ino); | ||
| 2180 | jbd_debug(2, "deleting unreferenced inode %lu\n", | 2199 | jbd_debug(2, "deleting unreferenced inode %lu\n", |
| 2181 | inode->i_ino); | 2200 | inode->i_ino); |
| 2182 | nr_orphans++; | 2201 | nr_orphans++; |
| @@ -2377,7 +2396,10 @@ struct ext4_attr { | |||
| 2377 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); | 2396 | ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); |
| 2378 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, | 2397 | ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, |
| 2379 | const char *, size_t); | 2398 | const char *, size_t); |
| 2380 | int offset; | 2399 | union { |
| 2400 | int offset; | ||
| 2401 | int deprecated_val; | ||
| 2402 | } u; | ||
| 2381 | }; | 2403 | }; |
| 2382 | 2404 | ||
| 2383 | static int parse_strtoull(const char *buf, | 2405 | static int parse_strtoull(const char *buf, |
| @@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
| 2446 | static ssize_t sbi_ui_show(struct ext4_attr *a, | 2468 | static ssize_t sbi_ui_show(struct ext4_attr *a, |
| 2447 | struct ext4_sb_info *sbi, char *buf) | 2469 | struct ext4_sb_info *sbi, char *buf) |
| 2448 | { | 2470 | { |
| 2449 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2471 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
| 2450 | 2472 | ||
| 2451 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); | 2473 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); |
| 2452 | } | 2474 | } |
| @@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, | |||
| 2455 | struct ext4_sb_info *sbi, | 2477 | struct ext4_sb_info *sbi, |
| 2456 | const char *buf, size_t count) | 2478 | const char *buf, size_t count) |
| 2457 | { | 2479 | { |
| 2458 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2480 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); |
| 2459 | unsigned long t; | 2481 | unsigned long t; |
| 2460 | int ret; | 2482 | int ret; |
| 2461 | 2483 | ||
| @@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a, | |||
| 2504 | return count; | 2526 | return count; |
| 2505 | } | 2527 | } |
| 2506 | 2528 | ||
| 2529 | static ssize_t sbi_deprecated_show(struct ext4_attr *a, | ||
| 2530 | struct ext4_sb_info *sbi, char *buf) | ||
| 2531 | { | ||
| 2532 | return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); | ||
| 2533 | } | ||
| 2534 | |||
| 2507 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ | 2535 | #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ |
| 2508 | static struct ext4_attr ext4_attr_##_name = { \ | 2536 | static struct ext4_attr ext4_attr_##_name = { \ |
| 2509 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | 2537 | .attr = {.name = __stringify(_name), .mode = _mode }, \ |
| 2510 | .show = _show, \ | 2538 | .show = _show, \ |
| 2511 | .store = _store, \ | 2539 | .store = _store, \ |
| 2512 | .offset = offsetof(struct ext4_sb_info, _elname), \ | 2540 | .u = { \ |
| 2541 | .offset = offsetof(struct ext4_sb_info, _elname),\ | ||
| 2542 | }, \ | ||
| 2513 | } | 2543 | } |
| 2514 | #define EXT4_ATTR(name, mode, show, store) \ | 2544 | #define EXT4_ATTR(name, mode, show, store) \ |
| 2515 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | 2545 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) |
| @@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | |||
| 2520 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ | 2550 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ |
| 2521 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) | 2551 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) |
| 2522 | #define ATTR_LIST(name) &ext4_attr_##name.attr | 2552 | #define ATTR_LIST(name) &ext4_attr_##name.attr |
| 2553 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ | ||
| 2554 | static struct ext4_attr ext4_attr_##_name = { \ | ||
| 2555 | .attr = {.name = __stringify(_name), .mode = 0444 }, \ | ||
| 2556 | .show = sbi_deprecated_show, \ | ||
| 2557 | .u = { \ | ||
| 2558 | .deprecated_val = _val, \ | ||
| 2559 | }, \ | ||
| 2560 | } | ||
| 2523 | 2561 | ||
| 2524 | EXT4_RO_ATTR(delayed_allocation_blocks); | 2562 | EXT4_RO_ATTR(delayed_allocation_blocks); |
| 2525 | EXT4_RO_ATTR(session_write_kbytes); | 2563 | EXT4_RO_ATTR(session_write_kbytes); |
| @@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); | |||
| 2534 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); | 2572 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); |
| 2535 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); | 2573 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); |
| 2536 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); | 2574 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); |
| 2537 | EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); | 2575 | EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); |
| 2538 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); | 2576 | EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); |
| 2539 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); | 2577 | EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); |
| 2540 | 2578 | ||
| @@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3763 | sbi->s_err_report.data = (unsigned long) sb; | 3801 | sbi->s_err_report.data = (unsigned long) sb; |
| 3764 | 3802 | ||
| 3765 | /* Register extent status tree shrinker */ | 3803 | /* Register extent status tree shrinker */ |
| 3766 | ext4_es_register_shrinker(sb); | 3804 | ext4_es_register_shrinker(sbi); |
| 3767 | 3805 | ||
| 3768 | err = percpu_counter_init(&sbi->s_freeclusters_counter, | 3806 | err = percpu_counter_init(&sbi->s_freeclusters_counter, |
| 3769 | ext4_count_free_clusters(sb)); | 3807 | ext4_count_free_clusters(sb)); |
| @@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3787 | } | 3825 | } |
| 3788 | 3826 | ||
| 3789 | sbi->s_stripe = ext4_get_stripe_size(sbi); | 3827 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
| 3790 | sbi->s_max_writeback_mb_bump = 128; | ||
| 3791 | sbi->s_extent_max_zeroout_kb = 32; | 3828 | sbi->s_extent_max_zeroout_kb = 32; |
| 3792 | 3829 | ||
| 3793 | /* | 3830 | /* |
| @@ -3915,12 +3952,20 @@ no_journal: | |||
| 3915 | * The maximum number of concurrent works can be high and | 3952 | * The maximum number of concurrent works can be high and |
| 3916 | * concurrency isn't really necessary. Limit it to 1. | 3953 | * concurrency isn't really necessary. Limit it to 1. |
| 3917 | */ | 3954 | */ |
| 3918 | EXT4_SB(sb)->dio_unwritten_wq = | 3955 | EXT4_SB(sb)->rsv_conversion_wq = |
| 3919 | alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | 3956 | alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); |
| 3920 | if (!EXT4_SB(sb)->dio_unwritten_wq) { | 3957 | if (!EXT4_SB(sb)->rsv_conversion_wq) { |
| 3921 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | 3958 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); |
| 3922 | ret = -ENOMEM; | 3959 | ret = -ENOMEM; |
| 3923 | goto failed_mount_wq; | 3960 | goto failed_mount4; |
| 3961 | } | ||
| 3962 | |||
| 3963 | EXT4_SB(sb)->unrsv_conversion_wq = | ||
| 3964 | alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | ||
| 3965 | if (!EXT4_SB(sb)->unrsv_conversion_wq) { | ||
| 3966 | printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); | ||
| 3967 | ret = -ENOMEM; | ||
| 3968 | goto failed_mount4; | ||
| 3924 | } | 3969 | } |
| 3925 | 3970 | ||
| 3926 | /* | 3971 | /* |
| @@ -4074,14 +4119,17 @@ failed_mount4a: | |||
| 4074 | sb->s_root = NULL; | 4119 | sb->s_root = NULL; |
| 4075 | failed_mount4: | 4120 | failed_mount4: |
| 4076 | ext4_msg(sb, KERN_ERR, "mount failed"); | 4121 | ext4_msg(sb, KERN_ERR, "mount failed"); |
| 4077 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | 4122 | if (EXT4_SB(sb)->rsv_conversion_wq) |
| 4123 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | ||
| 4124 | if (EXT4_SB(sb)->unrsv_conversion_wq) | ||
| 4125 | destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); | ||
| 4078 | failed_mount_wq: | 4126 | failed_mount_wq: |
| 4079 | if (sbi->s_journal) { | 4127 | if (sbi->s_journal) { |
| 4080 | jbd2_journal_destroy(sbi->s_journal); | 4128 | jbd2_journal_destroy(sbi->s_journal); |
| 4081 | sbi->s_journal = NULL; | 4129 | sbi->s_journal = NULL; |
| 4082 | } | 4130 | } |
| 4083 | failed_mount3: | 4131 | failed_mount3: |
| 4084 | ext4_es_unregister_shrinker(sb); | 4132 | ext4_es_unregister_shrinker(sbi); |
| 4085 | del_timer(&sbi->s_err_report); | 4133 | del_timer(&sbi->s_err_report); |
| 4086 | if (sbi->s_flex_groups) | 4134 | if (sbi->s_flex_groups) |
| 4087 | ext4_kvfree(sbi->s_flex_groups); | 4135 | ext4_kvfree(sbi->s_flex_groups); |
| @@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
| 4517 | { | 4565 | { |
| 4518 | int ret = 0; | 4566 | int ret = 0; |
| 4519 | tid_t target; | 4567 | tid_t target; |
| 4568 | bool needs_barrier = false; | ||
| 4520 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4569 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 4521 | 4570 | ||
| 4522 | trace_ext4_sync_fs(sb, wait); | 4571 | trace_ext4_sync_fs(sb, wait); |
| 4523 | flush_workqueue(sbi->dio_unwritten_wq); | 4572 | flush_workqueue(sbi->rsv_conversion_wq); |
| 4573 | flush_workqueue(sbi->unrsv_conversion_wq); | ||
| 4524 | /* | 4574 | /* |
| 4525 | * Writeback quota in non-journalled quota case - journalled quota has | 4575 | * Writeback quota in non-journalled quota case - journalled quota has |
| 4526 | * no dirty dquots | 4576 | * no dirty dquots |
| 4527 | */ | 4577 | */ |
| 4528 | dquot_writeback_dquots(sb, -1); | 4578 | dquot_writeback_dquots(sb, -1); |
| 4579 | /* | ||
| 4580 | * Data writeback is possible w/o journal transaction, so barrier must | ||
| 4581 | * being sent at the end of the function. But we can skip it if | ||
| 4582 | * transaction_commit will do it for us. | ||
| 4583 | */ | ||
| 4584 | target = jbd2_get_latest_transaction(sbi->s_journal); | ||
| 4585 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && | ||
| 4586 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) | ||
| 4587 | needs_barrier = true; | ||
| 4588 | |||
| 4529 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | 4589 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { |
| 4530 | if (wait) | 4590 | if (wait) |
| 4531 | jbd2_log_wait_commit(sbi->s_journal, target); | 4591 | ret = jbd2_log_wait_commit(sbi->s_journal, target); |
| 4592 | } | ||
| 4593 | if (needs_barrier) { | ||
| 4594 | int err; | ||
| 4595 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | ||
| 4596 | if (!ret) | ||
| 4597 | ret = err; | ||
| 4532 | } | 4598 | } |
| 4599 | |||
| 4600 | return ret; | ||
| 4601 | } | ||
| 4602 | |||
| 4603 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) | ||
| 4604 | { | ||
| 4605 | int ret = 0; | ||
| 4606 | |||
| 4607 | trace_ext4_sync_fs(sb, wait); | ||
| 4608 | flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | ||
| 4609 | flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); | ||
| 4610 | dquot_writeback_dquots(sb, -1); | ||
| 4611 | if (wait && test_opt(sb, BARRIER)) | ||
| 4612 | ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | ||
| 4613 | |||
| 4533 | return ret; | 4614 | return ret; |
| 4534 | } | 4615 | } |
| 4535 | 4616 | ||
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index fd27e7e6326e..e06e0995e00f 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig | |||
| @@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL | |||
| 51 | Linux website <http://acl.bestbits.at/>. | 51 | Linux website <http://acl.bestbits.at/>. |
| 52 | 52 | ||
| 53 | If you don't know what Access Control Lists are, say N | 53 | If you don't know what Access Control Lists are, say N |
| 54 | |||
| 55 | config F2FS_FS_SECURITY | ||
| 56 | bool "F2FS Security Labels" | ||
| 57 | depends on F2FS_FS_XATTR | ||
| 58 | help | ||
| 59 | Security labels provide an access control facility to support Linux | ||
| 60 | Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO | ||
| 61 | Linux. This option enables an extended attribute handler for file | ||
| 62 | security labels in the f2fs filesystem, so that it requires enabling | ||
| 63 | the extended attribute support in advance. | ||
| 64 | |||
| 65 | If you are not using a security module, say N. | ||
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 44abc2f286e0..b7826ec1b470 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c | |||
| @@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
| 250 | } | 250 | } |
| 251 | } | 251 | } |
| 252 | 252 | ||
| 253 | error = f2fs_setxattr(inode, name_index, "", value, size); | 253 | error = f2fs_setxattr(inode, name_index, "", value, size, NULL); |
| 254 | 254 | ||
| 255 | kfree(value); | 255 | kfree(value); |
| 256 | if (!error) | 256 | if (!error) |
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1de01da1a40..66a6b85a51d8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c | |||
| @@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, | |||
| 357 | unsigned long blk_size = sbi->blocksize; | 357 | unsigned long blk_size = sbi->blocksize; |
| 358 | struct f2fs_checkpoint *cp_block; | 358 | struct f2fs_checkpoint *cp_block; |
| 359 | unsigned long long cur_version = 0, pre_version = 0; | 359 | unsigned long long cur_version = 0, pre_version = 0; |
| 360 | unsigned int crc = 0; | ||
| 361 | size_t crc_offset; | 360 | size_t crc_offset; |
| 361 | __u32 crc = 0; | ||
| 362 | 362 | ||
| 363 | /* Read the 1st cp block in this CP pack */ | 363 | /* Read the 1st cp block in this CP pack */ |
| 364 | cp_page_1 = get_meta_page(sbi, cp_addr); | 364 | cp_page_1 = get_meta_page(sbi, cp_addr); |
| @@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, | |||
| 369 | if (crc_offset >= blk_size) | 369 | if (crc_offset >= blk_size) |
| 370 | goto invalid_cp1; | 370 | goto invalid_cp1; |
| 371 | 371 | ||
| 372 | crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); | 372 | crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); |
| 373 | if (!f2fs_crc_valid(crc, cp_block, crc_offset)) | 373 | if (!f2fs_crc_valid(crc, cp_block, crc_offset)) |
| 374 | goto invalid_cp1; | 374 | goto invalid_cp1; |
| 375 | 375 | ||
| @@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, | |||
| 384 | if (crc_offset >= blk_size) | 384 | if (crc_offset >= blk_size) |
| 385 | goto invalid_cp2; | 385 | goto invalid_cp2; |
| 386 | 386 | ||
| 387 | crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); | 387 | crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); |
| 388 | if (!f2fs_crc_valid(crc, cp_block, crc_offset)) | 388 | if (!f2fs_crc_valid(crc, cp_block, crc_offset)) |
| 389 | goto invalid_cp2; | 389 | goto invalid_cp2; |
| 390 | 390 | ||
| @@ -450,13 +450,30 @@ fail_no_cp: | |||
| 450 | return -EINVAL; | 450 | return -EINVAL; |
| 451 | } | 451 | } |
| 452 | 452 | ||
| 453 | void set_dirty_dir_page(struct inode *inode, struct page *page) | 453 | static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) |
| 454 | { | 454 | { |
| 455 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 455 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 456 | struct list_head *head = &sbi->dir_inode_list; | 456 | struct list_head *head = &sbi->dir_inode_list; |
| 457 | struct dir_inode_entry *new; | ||
| 458 | struct list_head *this; | 457 | struct list_head *this; |
| 459 | 458 | ||
| 459 | list_for_each(this, head) { | ||
| 460 | struct dir_inode_entry *entry; | ||
| 461 | entry = list_entry(this, struct dir_inode_entry, list); | ||
| 462 | if (entry->inode == inode) | ||
| 463 | return -EEXIST; | ||
| 464 | } | ||
| 465 | list_add_tail(&new->list, head); | ||
| 466 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 467 | sbi->n_dirty_dirs++; | ||
| 468 | #endif | ||
| 469 | return 0; | ||
| 470 | } | ||
| 471 | |||
| 472 | void set_dirty_dir_page(struct inode *inode, struct page *page) | ||
| 473 | { | ||
| 474 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | ||
| 475 | struct dir_inode_entry *new; | ||
| 476 | |||
| 460 | if (!S_ISDIR(inode->i_mode)) | 477 | if (!S_ISDIR(inode->i_mode)) |
| 461 | return; | 478 | return; |
| 462 | retry: | 479 | retry: |
| @@ -469,23 +486,31 @@ retry: | |||
| 469 | INIT_LIST_HEAD(&new->list); | 486 | INIT_LIST_HEAD(&new->list); |
| 470 | 487 | ||
| 471 | spin_lock(&sbi->dir_inode_lock); | 488 | spin_lock(&sbi->dir_inode_lock); |
| 472 | list_for_each(this, head) { | 489 | if (__add_dirty_inode(inode, new)) |
| 473 | struct dir_inode_entry *entry; | 490 | kmem_cache_free(inode_entry_slab, new); |
| 474 | entry = list_entry(this, struct dir_inode_entry, list); | ||
| 475 | if (entry->inode == inode) { | ||
| 476 | kmem_cache_free(inode_entry_slab, new); | ||
| 477 | goto out; | ||
| 478 | } | ||
| 479 | } | ||
| 480 | list_add_tail(&new->list, head); | ||
| 481 | sbi->n_dirty_dirs++; | ||
| 482 | 491 | ||
| 483 | BUG_ON(!S_ISDIR(inode->i_mode)); | ||
| 484 | out: | ||
| 485 | inc_page_count(sbi, F2FS_DIRTY_DENTS); | 492 | inc_page_count(sbi, F2FS_DIRTY_DENTS); |
| 486 | inode_inc_dirty_dents(inode); | 493 | inode_inc_dirty_dents(inode); |
| 487 | SetPagePrivate(page); | 494 | SetPagePrivate(page); |
| 495 | spin_unlock(&sbi->dir_inode_lock); | ||
| 496 | } | ||
| 488 | 497 | ||
| 498 | void add_dirty_dir_inode(struct inode *inode) | ||
| 499 | { | ||
| 500 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | ||
| 501 | struct dir_inode_entry *new; | ||
| 502 | retry: | ||
| 503 | new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); | ||
| 504 | if (!new) { | ||
| 505 | cond_resched(); | ||
| 506 | goto retry; | ||
| 507 | } | ||
| 508 | new->inode = inode; | ||
| 509 | INIT_LIST_HEAD(&new->list); | ||
| 510 | |||
| 511 | spin_lock(&sbi->dir_inode_lock); | ||
| 512 | if (__add_dirty_inode(inode, new)) | ||
| 513 | kmem_cache_free(inode_entry_slab, new); | ||
| 489 | spin_unlock(&sbi->dir_inode_lock); | 514 | spin_unlock(&sbi->dir_inode_lock); |
| 490 | } | 515 | } |
| 491 | 516 | ||
| @@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode) | |||
| 499 | return; | 524 | return; |
| 500 | 525 | ||
| 501 | spin_lock(&sbi->dir_inode_lock); | 526 | spin_lock(&sbi->dir_inode_lock); |
| 502 | if (atomic_read(&F2FS_I(inode)->dirty_dents)) | 527 | if (atomic_read(&F2FS_I(inode)->dirty_dents)) { |
| 503 | goto out; | 528 | spin_unlock(&sbi->dir_inode_lock); |
| 529 | return; | ||
| 530 | } | ||
| 504 | 531 | ||
| 505 | list_for_each(this, head) { | 532 | list_for_each(this, head) { |
| 506 | struct dir_inode_entry *entry; | 533 | struct dir_inode_entry *entry; |
| @@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode) | |||
| 508 | if (entry->inode == inode) { | 535 | if (entry->inode == inode) { |
| 509 | list_del(&entry->list); | 536 | list_del(&entry->list); |
| 510 | kmem_cache_free(inode_entry_slab, entry); | 537 | kmem_cache_free(inode_entry_slab, entry); |
| 538 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 511 | sbi->n_dirty_dirs--; | 539 | sbi->n_dirty_dirs--; |
| 540 | #endif | ||
| 541 | break; | ||
| 542 | } | ||
| 543 | } | ||
| 544 | spin_unlock(&sbi->dir_inode_lock); | ||
| 545 | |||
| 546 | /* Only from the recovery routine */ | ||
| 547 | if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { | ||
| 548 | clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); | ||
| 549 | iput(inode); | ||
| 550 | } | ||
| 551 | } | ||
| 552 | |||
| 553 | struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) | ||
| 554 | { | ||
| 555 | struct list_head *head = &sbi->dir_inode_list; | ||
| 556 | struct list_head *this; | ||
| 557 | struct inode *inode = NULL; | ||
| 558 | |||
| 559 | spin_lock(&sbi->dir_inode_lock); | ||
| 560 | list_for_each(this, head) { | ||
| 561 | struct dir_inode_entry *entry; | ||
| 562 | entry = list_entry(this, struct dir_inode_entry, list); | ||
| 563 | if (entry->inode->i_ino == ino) { | ||
| 564 | inode = entry->inode; | ||
| 512 | break; | 565 | break; |
| 513 | } | 566 | } |
| 514 | } | 567 | } |
| 515 | out: | ||
| 516 | spin_unlock(&sbi->dir_inode_lock); | 568 | spin_unlock(&sbi->dir_inode_lock); |
| 569 | return inode; | ||
| 517 | } | 570 | } |
| 518 | 571 | ||
| 519 | void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) | 572 | void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) |
| @@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 595 | block_t start_blk; | 648 | block_t start_blk; |
| 596 | struct page *cp_page; | 649 | struct page *cp_page; |
| 597 | unsigned int data_sum_blocks, orphan_blocks; | 650 | unsigned int data_sum_blocks, orphan_blocks; |
| 598 | unsigned int crc32 = 0; | 651 | __u32 crc32 = 0; |
| 599 | void *kaddr; | 652 | void *kaddr; |
| 600 | int i; | 653 | int i; |
| 601 | 654 | ||
| @@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 664 | get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); | 717 | get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); |
| 665 | 718 | ||
| 666 | crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); | 719 | crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); |
| 667 | *(__le32 *)((unsigned char *)ckpt + | 720 | *((__le32 *)((unsigned char *)ckpt + |
| 668 | le32_to_cpu(ckpt->checksum_offset)) | 721 | le32_to_cpu(ckpt->checksum_offset))) |
| 669 | = cpu_to_le32(crc32); | 722 | = cpu_to_le32(crc32); |
| 670 | 723 | ||
| 671 | start_blk = __start_cp_addr(sbi); | 724 | start_blk = __start_cp_addr(sbi); |
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91ff93b0b0f4..035f9a345cdf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
| @@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, | |||
| 68 | struct buffer_head *bh_result) | 68 | struct buffer_head *bh_result) |
| 69 | { | 69 | { |
| 70 | struct f2fs_inode_info *fi = F2FS_I(inode); | 70 | struct f2fs_inode_info *fi = F2FS_I(inode); |
| 71 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 71 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 72 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 73 | #endif | ||
| 72 | pgoff_t start_fofs, end_fofs; | 74 | pgoff_t start_fofs, end_fofs; |
| 73 | block_t start_blkaddr; | 75 | block_t start_blkaddr; |
| 74 | 76 | ||
| @@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, | |||
| 78 | return 0; | 80 | return 0; |
| 79 | } | 81 | } |
| 80 | 82 | ||
| 83 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 81 | sbi->total_hit_ext++; | 84 | sbi->total_hit_ext++; |
| 85 | #endif | ||
| 82 | start_fofs = fi->ext.fofs; | 86 | start_fofs = fi->ext.fofs; |
| 83 | end_fofs = fi->ext.fofs + fi->ext.len - 1; | 87 | end_fofs = fi->ext.fofs + fi->ext.len - 1; |
| 84 | start_blkaddr = fi->ext.blk_addr; | 88 | start_blkaddr = fi->ext.blk_addr; |
| @@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, | |||
| 96 | else | 100 | else |
| 97 | bh_result->b_size = UINT_MAX; | 101 | bh_result->b_size = UINT_MAX; |
| 98 | 102 | ||
| 103 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 99 | sbi->read_hit_ext++; | 104 | sbi->read_hit_ext++; |
| 105 | #endif | ||
| 100 | read_unlock(&fi->ext.ext_lock); | 106 | read_unlock(&fi->ext.ext_lock); |
| 101 | return 1; | 107 | return 1; |
| 102 | } | 108 | } |
| @@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) | |||
| 199 | if (dn.data_blkaddr == NEW_ADDR) | 205 | if (dn.data_blkaddr == NEW_ADDR) |
| 200 | return ERR_PTR(-EINVAL); | 206 | return ERR_PTR(-EINVAL); |
| 201 | 207 | ||
| 202 | page = grab_cache_page(mapping, index); | 208 | page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); |
| 203 | if (!page) | 209 | if (!page) |
| 204 | return ERR_PTR(-ENOMEM); | 210 | return ERR_PTR(-ENOMEM); |
| 205 | 211 | ||
| @@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) | |||
| 233 | struct page *page; | 239 | struct page *page; |
| 234 | int err; | 240 | int err; |
| 235 | 241 | ||
| 242 | repeat: | ||
| 243 | page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); | ||
| 244 | if (!page) | ||
| 245 | return ERR_PTR(-ENOMEM); | ||
| 246 | |||
| 236 | set_new_dnode(&dn, inode, NULL, NULL, 0); | 247 | set_new_dnode(&dn, inode, NULL, NULL, 0); |
| 237 | err = get_dnode_of_data(&dn, index, LOOKUP_NODE); | 248 | err = get_dnode_of_data(&dn, index, LOOKUP_NODE); |
| 238 | if (err) | 249 | if (err) { |
| 250 | f2fs_put_page(page, 1); | ||
| 239 | return ERR_PTR(err); | 251 | return ERR_PTR(err); |
| 252 | } | ||
| 240 | f2fs_put_dnode(&dn); | 253 | f2fs_put_dnode(&dn); |
| 241 | 254 | ||
| 242 | if (dn.data_blkaddr == NULL_ADDR) | 255 | if (dn.data_blkaddr == NULL_ADDR) { |
| 256 | f2fs_put_page(page, 1); | ||
| 243 | return ERR_PTR(-ENOENT); | 257 | return ERR_PTR(-ENOENT); |
| 244 | repeat: | 258 | } |
| 245 | page = grab_cache_page(mapping, index); | ||
| 246 | if (!page) | ||
| 247 | return ERR_PTR(-ENOMEM); | ||
| 248 | 259 | ||
| 249 | if (PageUptodate(page)) | 260 | if (PageUptodate(page)) |
| 250 | return page; | 261 | return page; |
| @@ -274,9 +285,10 @@ repeat: | |||
| 274 | * | 285 | * |
| 275 | * Also, caller should grab and release a mutex by calling mutex_lock_op() and | 286 | * Also, caller should grab and release a mutex by calling mutex_lock_op() and |
| 276 | * mutex_unlock_op(). | 287 | * mutex_unlock_op(). |
| 288 | * Note that, npage is set only by make_empty_dir. | ||
| 277 | */ | 289 | */ |
| 278 | struct page *get_new_data_page(struct inode *inode, pgoff_t index, | 290 | struct page *get_new_data_page(struct inode *inode, |
| 279 | bool new_i_size) | 291 | struct page *npage, pgoff_t index, bool new_i_size) |
| 280 | { | 292 | { |
| 281 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 293 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 282 | struct address_space *mapping = inode->i_mapping; | 294 | struct address_space *mapping = inode->i_mapping; |
| @@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, | |||
| 284 | struct dnode_of_data dn; | 296 | struct dnode_of_data dn; |
| 285 | int err; | 297 | int err; |
| 286 | 298 | ||
| 287 | set_new_dnode(&dn, inode, NULL, NULL, 0); | 299 | set_new_dnode(&dn, inode, npage, npage, 0); |
| 288 | err = get_dnode_of_data(&dn, index, ALLOC_NODE); | 300 | err = get_dnode_of_data(&dn, index, ALLOC_NODE); |
| 289 | if (err) | 301 | if (err) |
| 290 | return ERR_PTR(err); | 302 | return ERR_PTR(err); |
| 291 | 303 | ||
| 292 | if (dn.data_blkaddr == NULL_ADDR) { | 304 | if (dn.data_blkaddr == NULL_ADDR) { |
| 293 | if (reserve_new_block(&dn)) { | 305 | if (reserve_new_block(&dn)) { |
| 294 | f2fs_put_dnode(&dn); | 306 | if (!npage) |
| 307 | f2fs_put_dnode(&dn); | ||
| 295 | return ERR_PTR(-ENOSPC); | 308 | return ERR_PTR(-ENOSPC); |
| 296 | } | 309 | } |
| 297 | } | 310 | } |
| 298 | f2fs_put_dnode(&dn); | 311 | if (!npage) |
| 312 | f2fs_put_dnode(&dn); | ||
| 299 | repeat: | 313 | repeat: |
| 300 | page = grab_cache_page(mapping, index); | 314 | page = grab_cache_page(mapping, index); |
| 301 | if (!page) | 315 | if (!page) |
| @@ -325,6 +339,8 @@ repeat: | |||
| 325 | if (new_i_size && | 339 | if (new_i_size && |
| 326 | i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { | 340 | i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { |
| 327 | i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); | 341 | i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); |
| 342 | /* Only the directory inode sets new_i_size */ | ||
| 343 | set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); | ||
| 328 | mark_inode_dirty_sync(inode); | 344 | mark_inode_dirty_sync(inode); |
| 329 | } | 345 | } |
| 330 | return page; | 346 | return page; |
| @@ -481,8 +497,9 @@ int do_write_data_page(struct page *page) | |||
| 481 | * If current allocation needs SSR, | 497 | * If current allocation needs SSR, |
| 482 | * it had better in-place writes for updated data. | 498 | * it had better in-place writes for updated data. |
| 483 | */ | 499 | */ |
| 484 | if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && | 500 | if (unlikely(old_blk_addr != NEW_ADDR && |
| 485 | need_inplace_update(inode)) { | 501 | !is_cold_data(page) && |
| 502 | need_inplace_update(inode))) { | ||
| 486 | rewrite_data_page(F2FS_SB(inode->i_sb), page, | 503 | rewrite_data_page(F2FS_SB(inode->i_sb), page, |
| 487 | old_blk_addr); | 504 | old_blk_addr); |
| 488 | } else { | 505 | } else { |
| @@ -684,6 +701,27 @@ err: | |||
| 684 | return err; | 701 | return err; |
| 685 | } | 702 | } |
| 686 | 703 | ||
| 704 | static int f2fs_write_end(struct file *file, | ||
| 705 | struct address_space *mapping, | ||
| 706 | loff_t pos, unsigned len, unsigned copied, | ||
| 707 | struct page *page, void *fsdata) | ||
| 708 | { | ||
| 709 | struct inode *inode = page->mapping->host; | ||
| 710 | |||
| 711 | SetPageUptodate(page); | ||
| 712 | set_page_dirty(page); | ||
| 713 | |||
| 714 | if (pos + copied > i_size_read(inode)) { | ||
| 715 | i_size_write(inode, pos + copied); | ||
| 716 | mark_inode_dirty(inode); | ||
| 717 | update_inode_page(inode); | ||
| 718 | } | ||
| 719 | |||
| 720 | unlock_page(page); | ||
| 721 | page_cache_release(page); | ||
| 722 | return copied; | ||
| 723 | } | ||
| 724 | |||
| 687 | static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, | 725 | static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, |
| 688 | const struct iovec *iov, loff_t offset, unsigned long nr_segs) | 726 | const struct iovec *iov, loff_t offset, unsigned long nr_segs) |
| 689 | { | 727 | { |
| @@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, | |||
| 698 | get_data_block_ro); | 736 | get_data_block_ro); |
| 699 | } | 737 | } |
| 700 | 738 | ||
| 701 | static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) | 739 | static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, |
| 740 | unsigned int length) | ||
| 702 | { | 741 | { |
| 703 | struct inode *inode = page->mapping->host; | 742 | struct inode *inode = page->mapping->host; |
| 704 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 743 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| @@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = { | |||
| 740 | .writepage = f2fs_write_data_page, | 779 | .writepage = f2fs_write_data_page, |
| 741 | .writepages = f2fs_write_data_pages, | 780 | .writepages = f2fs_write_data_pages, |
| 742 | .write_begin = f2fs_write_begin, | 781 | .write_begin = f2fs_write_begin, |
| 743 | .write_end = nobh_write_end, | 782 | .write_end = f2fs_write_end, |
| 744 | .set_page_dirty = f2fs_set_data_page_dirty, | 783 | .set_page_dirty = f2fs_set_data_page_dirty, |
| 745 | .invalidatepage = f2fs_invalidate_data_page, | 784 | .invalidatepage = f2fs_invalidate_data_page, |
| 746 | .releasepage = f2fs_release_data_page, | 785 | .releasepage = f2fs_release_data_page, |
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8d9943786c31..0d6c6aafb235 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c | |||
| @@ -175,12 +175,12 @@ get_cache: | |||
| 175 | 175 | ||
| 176 | static int stat_show(struct seq_file *s, void *v) | 176 | static int stat_show(struct seq_file *s, void *v) |
| 177 | { | 177 | { |
| 178 | struct f2fs_stat_info *si, *next; | 178 | struct f2fs_stat_info *si; |
| 179 | int i = 0; | 179 | int i = 0; |
| 180 | int j; | 180 | int j; |
| 181 | 181 | ||
| 182 | mutex_lock(&f2fs_stat_mutex); | 182 | mutex_lock(&f2fs_stat_mutex); |
| 183 | list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { | 183 | list_for_each_entry(si, &f2fs_stat_list, stat_list) { |
| 184 | char devname[BDEVNAME_SIZE]; | 184 | char devname[BDEVNAME_SIZE]; |
| 185 | 185 | ||
| 186 | update_general_status(si->sbi); | 186 | update_general_status(si->sbi); |
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1ac6b93036b7..9d1cd423450d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include "f2fs.h" | 13 | #include "f2fs.h" |
| 14 | #include "node.h" | 14 | #include "node.h" |
| 15 | #include "acl.h" | 15 | #include "acl.h" |
| 16 | #include "xattr.h" | ||
| 16 | 17 | ||
| 17 | static unsigned long dir_blocks(struct inode *inode) | 18 | static unsigned long dir_blocks(struct inode *inode) |
| 18 | { | 19 | { |
| @@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, | |||
| 215 | 216 | ||
| 216 | struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) | 217 | struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) |
| 217 | { | 218 | { |
| 218 | struct page *page = NULL; | 219 | struct page *page; |
| 219 | struct f2fs_dir_entry *de = NULL; | 220 | struct f2fs_dir_entry *de; |
| 220 | struct f2fs_dentry_block *dentry_blk = NULL; | 221 | struct f2fs_dentry_block *dentry_blk; |
| 221 | 222 | ||
| 222 | page = get_lock_data_page(dir, 0); | 223 | page = get_lock_data_page(dir, 0); |
| 223 | if (IS_ERR(page)) | 224 | if (IS_ERR(page)) |
| @@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, | |||
| 264 | f2fs_put_page(page, 1); | 265 | f2fs_put_page(page, 1); |
| 265 | } | 266 | } |
| 266 | 267 | ||
| 267 | void init_dent_inode(const struct qstr *name, struct page *ipage) | 268 | static void init_dent_inode(const struct qstr *name, struct page *ipage) |
| 268 | { | 269 | { |
| 269 | struct f2fs_node *rn; | 270 | struct f2fs_node *rn; |
| 270 | 271 | ||
| 271 | if (IS_ERR(ipage)) | ||
| 272 | return; | ||
| 273 | |||
| 274 | wait_on_page_writeback(ipage); | ||
| 275 | |||
| 276 | /* copy name info. to this inode page */ | 272 | /* copy name info. to this inode page */ |
| 277 | rn = (struct f2fs_node *)page_address(ipage); | 273 | rn = (struct f2fs_node *)page_address(ipage); |
| 278 | rn->i.i_namelen = cpu_to_le32(name->len); | 274 | rn->i.i_namelen = cpu_to_le32(name->len); |
| @@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage) | |||
| 280 | set_page_dirty(ipage); | 276 | set_page_dirty(ipage); |
| 281 | } | 277 | } |
| 282 | 278 | ||
| 283 | static int make_empty_dir(struct inode *inode, struct inode *parent) | 279 | static int make_empty_dir(struct inode *inode, |
| 280 | struct inode *parent, struct page *page) | ||
| 284 | { | 281 | { |
| 285 | struct page *dentry_page; | 282 | struct page *dentry_page; |
| 286 | struct f2fs_dentry_block *dentry_blk; | 283 | struct f2fs_dentry_block *dentry_blk; |
| 287 | struct f2fs_dir_entry *de; | 284 | struct f2fs_dir_entry *de; |
| 288 | void *kaddr; | 285 | void *kaddr; |
| 289 | 286 | ||
| 290 | dentry_page = get_new_data_page(inode, 0, true); | 287 | dentry_page = get_new_data_page(inode, page, 0, true); |
| 291 | if (IS_ERR(dentry_page)) | 288 | if (IS_ERR(dentry_page)) |
| 292 | return PTR_ERR(dentry_page); | 289 | return PTR_ERR(dentry_page); |
| 293 | 290 | ||
| @@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent) | |||
| 317 | return 0; | 314 | return 0; |
| 318 | } | 315 | } |
| 319 | 316 | ||
| 320 | static int init_inode_metadata(struct inode *inode, | 317 | static struct page *init_inode_metadata(struct inode *inode, |
| 321 | struct inode *dir, const struct qstr *name) | 318 | struct inode *dir, const struct qstr *name) |
| 322 | { | 319 | { |
| 320 | struct page *page; | ||
| 321 | int err; | ||
| 322 | |||
| 323 | if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { | 323 | if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { |
| 324 | int err; | 324 | page = new_inode_page(inode, name); |
| 325 | err = new_inode_page(inode, name); | 325 | if (IS_ERR(page)) |
| 326 | if (err) | 326 | return page; |
| 327 | return err; | ||
| 328 | 327 | ||
| 329 | if (S_ISDIR(inode->i_mode)) { | 328 | if (S_ISDIR(inode->i_mode)) { |
| 330 | err = make_empty_dir(inode, dir); | 329 | err = make_empty_dir(inode, dir, page); |
| 331 | if (err) { | 330 | if (err) |
| 332 | remove_inode_page(inode); | 331 | goto error; |
| 333 | return err; | ||
| 334 | } | ||
| 335 | } | 332 | } |
| 336 | 333 | ||
| 337 | err = f2fs_init_acl(inode, dir); | 334 | err = f2fs_init_acl(inode, dir); |
| 338 | if (err) { | 335 | if (err) |
| 339 | remove_inode_page(inode); | 336 | goto error; |
| 340 | return err; | 337 | |
| 341 | } | 338 | err = f2fs_init_security(inode, dir, name, page); |
| 339 | if (err) | ||
| 340 | goto error; | ||
| 341 | |||
| 342 | wait_on_page_writeback(page); | ||
| 342 | } else { | 343 | } else { |
| 343 | struct page *ipage; | 344 | page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); |
| 344 | ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); | 345 | if (IS_ERR(page)) |
| 345 | if (IS_ERR(ipage)) | 346 | return page; |
| 346 | return PTR_ERR(ipage); | 347 | |
| 347 | set_cold_node(inode, ipage); | 348 | wait_on_page_writeback(page); |
| 348 | init_dent_inode(name, ipage); | 349 | set_cold_node(inode, page); |
| 349 | f2fs_put_page(ipage, 1); | ||
| 350 | } | 350 | } |
| 351 | |||
| 352 | init_dent_inode(name, page); | ||
| 353 | |||
| 354 | /* | ||
| 355 | * This file should be checkpointed during fsync. | ||
| 356 | * We lost i_pino from now on. | ||
| 357 | */ | ||
| 351 | if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { | 358 | if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { |
| 359 | file_lost_pino(inode); | ||
| 352 | inc_nlink(inode); | 360 | inc_nlink(inode); |
| 353 | update_inode_page(inode); | ||
| 354 | } | 361 | } |
| 355 | return 0; | 362 | return page; |
| 363 | |||
| 364 | error: | ||
| 365 | f2fs_put_page(page, 1); | ||
| 366 | remove_inode_page(inode); | ||
| 367 | return ERR_PTR(err); | ||
| 356 | } | 368 | } |
| 357 | 369 | ||
| 358 | static void update_parent_metadata(struct inode *dir, struct inode *inode, | 370 | static void update_parent_metadata(struct inode *dir, struct inode *inode, |
| 359 | unsigned int current_depth) | 371 | unsigned int current_depth) |
| 360 | { | 372 | { |
| 361 | bool need_dir_update = false; | ||
| 362 | |||
| 363 | if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { | 373 | if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { |
| 364 | if (S_ISDIR(inode->i_mode)) { | 374 | if (S_ISDIR(inode->i_mode)) { |
| 365 | inc_nlink(dir); | 375 | inc_nlink(dir); |
| 366 | need_dir_update = true; | 376 | set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); |
| 367 | } | 377 | } |
| 368 | clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); | 378 | clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); |
| 369 | } | 379 | } |
| 370 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; | 380 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; |
| 371 | if (F2FS_I(dir)->i_current_depth != current_depth) { | 381 | if (F2FS_I(dir)->i_current_depth != current_depth) { |
| 372 | F2FS_I(dir)->i_current_depth = current_depth; | 382 | F2FS_I(dir)->i_current_depth = current_depth; |
| 373 | need_dir_update = true; | 383 | set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); |
| 374 | } | 384 | } |
| 375 | 385 | ||
| 376 | if (need_dir_update) | 386 | if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) |
| 377 | update_inode_page(dir); | 387 | update_inode_page(dir); |
| 378 | else | 388 | else |
| 379 | mark_inode_dirty(dir); | 389 | mark_inode_dirty(dir); |
| @@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in | |||
| 423 | struct page *dentry_page = NULL; | 433 | struct page *dentry_page = NULL; |
| 424 | struct f2fs_dentry_block *dentry_blk = NULL; | 434 | struct f2fs_dentry_block *dentry_blk = NULL; |
| 425 | int slots = GET_DENTRY_SLOTS(namelen); | 435 | int slots = GET_DENTRY_SLOTS(namelen); |
| 436 | struct page *page; | ||
| 426 | int err = 0; | 437 | int err = 0; |
| 427 | int i; | 438 | int i; |
| 428 | 439 | ||
| @@ -448,7 +459,7 @@ start: | |||
| 448 | bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); | 459 | bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); |
| 449 | 460 | ||
| 450 | for (block = bidx; block <= (bidx + nblock - 1); block++) { | 461 | for (block = bidx; block <= (bidx + nblock - 1); block++) { |
| 451 | dentry_page = get_new_data_page(dir, block, true); | 462 | dentry_page = get_new_data_page(dir, NULL, block, true); |
| 452 | if (IS_ERR(dentry_page)) | 463 | if (IS_ERR(dentry_page)) |
| 453 | return PTR_ERR(dentry_page); | 464 | return PTR_ERR(dentry_page); |
| 454 | 465 | ||
| @@ -465,12 +476,13 @@ start: | |||
| 465 | ++level; | 476 | ++level; |
| 466 | goto start; | 477 | goto start; |
| 467 | add_dentry: | 478 | add_dentry: |
| 468 | err = init_inode_metadata(inode, dir, name); | ||
| 469 | if (err) | ||
| 470 | goto fail; | ||
| 471 | |||
| 472 | wait_on_page_writeback(dentry_page); | 479 | wait_on_page_writeback(dentry_page); |
| 473 | 480 | ||
| 481 | page = init_inode_metadata(inode, dir, name); | ||
| 482 | if (IS_ERR(page)) { | ||
| 483 | err = PTR_ERR(page); | ||
| 484 | goto fail; | ||
| 485 | } | ||
| 474 | de = &dentry_blk->dentry[bit_pos]; | 486 | de = &dentry_blk->dentry[bit_pos]; |
| 475 | de->hash_code = dentry_hash; | 487 | de->hash_code = dentry_hash; |
| 476 | de->name_len = cpu_to_le16(namelen); | 488 | de->name_len = cpu_to_le16(namelen); |
| @@ -481,11 +493,14 @@ add_dentry: | |||
| 481 | test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); | 493 | test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); |
| 482 | set_page_dirty(dentry_page); | 494 | set_page_dirty(dentry_page); |
| 483 | 495 | ||
| 484 | update_parent_metadata(dir, inode, current_depth); | 496 | /* we don't need to mark_inode_dirty now */ |
| 485 | |||
| 486 | /* update parent inode number before releasing dentry page */ | ||
| 487 | F2FS_I(inode)->i_pino = dir->i_ino; | 497 | F2FS_I(inode)->i_pino = dir->i_ino; |
| 498 | update_inode(inode, page); | ||
| 499 | f2fs_put_page(page, 1); | ||
| 500 | |||
| 501 | update_parent_metadata(dir, inode, current_depth); | ||
| 488 | fail: | 502 | fail: |
| 503 | clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); | ||
| 489 | kunmap(dentry_page); | 504 | kunmap(dentry_page); |
| 490 | f2fs_put_page(dentry_page, 1); | 505 | f2fs_put_page(dentry_page, 1); |
| 491 | return err; | 506 | return err; |
| @@ -591,24 +606,19 @@ bool f2fs_empty_dir(struct inode *dir) | |||
| 591 | return true; | 606 | return true; |
| 592 | } | 607 | } |
| 593 | 608 | ||
| 594 | static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) | 609 | static int f2fs_readdir(struct file *file, struct dir_context *ctx) |
| 595 | { | 610 | { |
| 596 | unsigned long pos = file->f_pos; | ||
| 597 | struct inode *inode = file_inode(file); | 611 | struct inode *inode = file_inode(file); |
| 598 | unsigned long npages = dir_blocks(inode); | 612 | unsigned long npages = dir_blocks(inode); |
| 599 | unsigned char *types = NULL; | ||
| 600 | unsigned int bit_pos = 0, start_bit_pos = 0; | 613 | unsigned int bit_pos = 0, start_bit_pos = 0; |
| 601 | int over = 0; | ||
| 602 | struct f2fs_dentry_block *dentry_blk = NULL; | 614 | struct f2fs_dentry_block *dentry_blk = NULL; |
| 603 | struct f2fs_dir_entry *de = NULL; | 615 | struct f2fs_dir_entry *de = NULL; |
| 604 | struct page *dentry_page = NULL; | 616 | struct page *dentry_page = NULL; |
| 605 | unsigned int n = 0; | 617 | unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); |
| 606 | unsigned char d_type = DT_UNKNOWN; | 618 | unsigned char d_type = DT_UNKNOWN; |
| 607 | int slots; | 619 | int slots; |
| 608 | 620 | ||
| 609 | types = f2fs_filetype_table; | 621 | bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); |
| 610 | bit_pos = (pos % NR_DENTRY_IN_BLOCK); | ||
| 611 | n = (pos / NR_DENTRY_IN_BLOCK); | ||
| 612 | 622 | ||
| 613 | for ( ; n < npages; n++) { | 623 | for ( ; n < npages; n++) { |
| 614 | dentry_page = get_lock_data_page(inode, n); | 624 | dentry_page = get_lock_data_page(inode, n); |
| @@ -618,31 +628,28 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 618 | start_bit_pos = bit_pos; | 628 | start_bit_pos = bit_pos; |
| 619 | dentry_blk = kmap(dentry_page); | 629 | dentry_blk = kmap(dentry_page); |
| 620 | while (bit_pos < NR_DENTRY_IN_BLOCK) { | 630 | while (bit_pos < NR_DENTRY_IN_BLOCK) { |
| 621 | d_type = DT_UNKNOWN; | ||
| 622 | bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, | 631 | bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, |
| 623 | NR_DENTRY_IN_BLOCK, | 632 | NR_DENTRY_IN_BLOCK, |
| 624 | bit_pos); | 633 | bit_pos); |
| 625 | if (bit_pos >= NR_DENTRY_IN_BLOCK) | 634 | if (bit_pos >= NR_DENTRY_IN_BLOCK) |
| 626 | break; | 635 | break; |
| 627 | 636 | ||
| 637 | ctx->pos += bit_pos - start_bit_pos; | ||
| 628 | de = &dentry_blk->dentry[bit_pos]; | 638 | de = &dentry_blk->dentry[bit_pos]; |
| 629 | if (types && de->file_type < F2FS_FT_MAX) | 639 | if (de->file_type < F2FS_FT_MAX) |
| 630 | d_type = types[de->file_type]; | 640 | d_type = f2fs_filetype_table[de->file_type]; |
| 631 | 641 | else | |
| 632 | over = filldir(dirent, | 642 | d_type = DT_UNKNOWN; |
| 633 | dentry_blk->filename[bit_pos], | 643 | if (!dir_emit(ctx, |
| 634 | le16_to_cpu(de->name_len), | 644 | dentry_blk->filename[bit_pos], |
| 635 | (n * NR_DENTRY_IN_BLOCK) + bit_pos, | 645 | le16_to_cpu(de->name_len), |
| 636 | le32_to_cpu(de->ino), d_type); | 646 | le32_to_cpu(de->ino), d_type)) |
| 637 | if (over) { | ||
| 638 | file->f_pos += bit_pos - start_bit_pos; | ||
| 639 | goto success; | 647 | goto success; |
| 640 | } | ||
| 641 | slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); | 648 | slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); |
| 642 | bit_pos += slots; | 649 | bit_pos += slots; |
| 643 | } | 650 | } |
| 644 | bit_pos = 0; | 651 | bit_pos = 0; |
| 645 | file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; | 652 | ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; |
| 646 | kunmap(dentry_page); | 653 | kunmap(dentry_page); |
| 647 | f2fs_put_page(dentry_page, 1); | 654 | f2fs_put_page(dentry_page, 1); |
| 648 | dentry_page = NULL; | 655 | dentry_page = NULL; |
| @@ -659,7 +666,7 @@ success: | |||
| 659 | const struct file_operations f2fs_dir_operations = { | 666 | const struct file_operations f2fs_dir_operations = { |
| 660 | .llseek = generic_file_llseek, | 667 | .llseek = generic_file_llseek, |
| 661 | .read = generic_read_dir, | 668 | .read = generic_read_dir, |
| 662 | .readdir = f2fs_readdir, | 669 | .iterate = f2fs_readdir, |
| 663 | .fsync = f2fs_sync_file, | 670 | .fsync = f2fs_sync_file, |
| 664 | .unlocked_ioctl = f2fs_ioctl, | 671 | .unlocked_ioctl = f2fs_ioctl, |
| 665 | }; | 672 | }; |
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20aab02f2a42..467d42d65c48 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h | |||
| @@ -37,21 +37,35 @@ | |||
| 37 | typecheck(unsigned long long, b) && \ | 37 | typecheck(unsigned long long, b) && \ |
| 38 | ((long long)((a) - (b)) > 0)) | 38 | ((long long)((a) - (b)) > 0)) |
| 39 | 39 | ||
| 40 | typedef u64 block_t; | 40 | typedef u32 block_t; /* |
| 41 | * should not change u32, since it is the on-disk block | ||
| 42 | * address format, __le32. | ||
| 43 | */ | ||
| 41 | typedef u32 nid_t; | 44 | typedef u32 nid_t; |
| 42 | 45 | ||
| 43 | struct f2fs_mount_info { | 46 | struct f2fs_mount_info { |
| 44 | unsigned int opt; | 47 | unsigned int opt; |
| 45 | }; | 48 | }; |
| 46 | 49 | ||
| 47 | static inline __u32 f2fs_crc32(void *buff, size_t len) | 50 | #define CRCPOLY_LE 0xedb88320 |
| 51 | |||
| 52 | static inline __u32 f2fs_crc32(void *buf, size_t len) | ||
| 48 | { | 53 | { |
| 49 | return crc32_le(F2FS_SUPER_MAGIC, buff, len); | 54 | unsigned char *p = (unsigned char *)buf; |
| 55 | __u32 crc = F2FS_SUPER_MAGIC; | ||
| 56 | int i; | ||
| 57 | |||
| 58 | while (len--) { | ||
| 59 | crc ^= *p++; | ||
| 60 | for (i = 0; i < 8; i++) | ||
| 61 | crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); | ||
| 62 | } | ||
| 63 | return crc; | ||
| 50 | } | 64 | } |
| 51 | 65 | ||
| 52 | static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) | 66 | static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) |
| 53 | { | 67 | { |
| 54 | return f2fs_crc32(buff, buff_size) == blk_crc; | 68 | return f2fs_crc32(buf, buf_size) == blk_crc; |
| 55 | } | 69 | } |
| 56 | 70 | ||
| 57 | /* | 71 | /* |
| @@ -148,7 +162,7 @@ struct extent_info { | |||
| 148 | * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. | 162 | * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. |
| 149 | */ | 163 | */ |
| 150 | #define FADVISE_COLD_BIT 0x01 | 164 | #define FADVISE_COLD_BIT 0x01 |
| 151 | #define FADVISE_CP_BIT 0x02 | 165 | #define FADVISE_LOST_PINO_BIT 0x02 |
| 152 | 166 | ||
| 153 | struct f2fs_inode_info { | 167 | struct f2fs_inode_info { |
| 154 | struct inode vfs_inode; /* serve a vfs inode */ | 168 | struct inode vfs_inode; /* serve a vfs inode */ |
| @@ -369,7 +383,6 @@ struct f2fs_sb_info { | |||
| 369 | /* for directory inode management */ | 383 | /* for directory inode management */ |
| 370 | struct list_head dir_inode_list; /* dir inode list */ | 384 | struct list_head dir_inode_list; /* dir inode list */ |
| 371 | spinlock_t dir_inode_lock; /* for dir inode list lock */ | 385 | spinlock_t dir_inode_lock; /* for dir inode list lock */ |
| 372 | unsigned int n_dirty_dirs; /* # of dir inodes */ | ||
| 373 | 386 | ||
| 374 | /* basic file system units */ | 387 | /* basic file system units */ |
| 375 | unsigned int log_sectors_per_block; /* log2 sectors per block */ | 388 | unsigned int log_sectors_per_block; /* log2 sectors per block */ |
| @@ -406,12 +419,15 @@ struct f2fs_sb_info { | |||
| 406 | * for stat information. | 419 | * for stat information. |
| 407 | * one is for the LFS mode, and the other is for the SSR mode. | 420 | * one is for the LFS mode, and the other is for the SSR mode. |
| 408 | */ | 421 | */ |
| 422 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 409 | struct f2fs_stat_info *stat_info; /* FS status information */ | 423 | struct f2fs_stat_info *stat_info; /* FS status information */ |
| 410 | unsigned int segment_count[2]; /* # of allocated segments */ | 424 | unsigned int segment_count[2]; /* # of allocated segments */ |
| 411 | unsigned int block_count[2]; /* # of allocated blocks */ | 425 | unsigned int block_count[2]; /* # of allocated blocks */ |
| 412 | unsigned int last_victim[2]; /* last victim segment # */ | ||
| 413 | int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ | 426 | int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ |
| 414 | int bg_gc; /* background gc calls */ | 427 | int bg_gc; /* background gc calls */ |
| 428 | unsigned int n_dirty_dirs; /* # of dir inodes */ | ||
| 429 | #endif | ||
| 430 | unsigned int last_victim[2]; /* last victim segment # */ | ||
| 415 | spinlock_t stat_lock; /* lock for stat operations */ | 431 | spinlock_t stat_lock; /* lock for stat operations */ |
| 416 | }; | 432 | }; |
| 417 | 433 | ||
| @@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) | |||
| 495 | 511 | ||
| 496 | static inline void mutex_lock_all(struct f2fs_sb_info *sbi) | 512 | static inline void mutex_lock_all(struct f2fs_sb_info *sbi) |
| 497 | { | 513 | { |
| 498 | int i = 0; | 514 | int i; |
| 499 | for (; i < NR_GLOBAL_LOCKS; i++) | 515 | |
| 500 | mutex_lock(&sbi->fs_lock[i]); | 516 | for (i = 0; i < NR_GLOBAL_LOCKS; i++) { |
| 517 | /* | ||
| 518 | * This is the only time we take multiple fs_lock[] | ||
| 519 | * instances; the order is immaterial since we | ||
| 520 | * always hold cp_mutex, which serializes multiple | ||
| 521 | * such operations. | ||
| 522 | */ | ||
| 523 | mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); | ||
| 524 | } | ||
| 501 | } | 525 | } |
| 502 | 526 | ||
| 503 | static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) | 527 | static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) |
| @@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) | |||
| 843 | /* used for f2fs_inode_info->flags */ | 867 | /* used for f2fs_inode_info->flags */ |
| 844 | enum { | 868 | enum { |
| 845 | FI_NEW_INODE, /* indicate newly allocated inode */ | 869 | FI_NEW_INODE, /* indicate newly allocated inode */ |
| 870 | FI_DIRTY_INODE, /* indicate inode is dirty or not */ | ||
| 846 | FI_INC_LINK, /* need to increment i_nlink */ | 871 | FI_INC_LINK, /* need to increment i_nlink */ |
| 847 | FI_ACL_MODE, /* indicate acl mode */ | 872 | FI_ACL_MODE, /* indicate acl mode */ |
| 848 | FI_NO_ALLOC, /* should not allocate any blocks */ | 873 | FI_NO_ALLOC, /* should not allocate any blocks */ |
| 874 | FI_UPDATE_DIR, /* should update inode block for consistency */ | ||
| 875 | FI_DELAY_IPUT, /* used for the recovery */ | ||
| 849 | }; | 876 | }; |
| 850 | 877 | ||
| 851 | static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) | 878 | static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) |
| @@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) | |||
| 878 | return 0; | 905 | return 0; |
| 879 | } | 906 | } |
| 880 | 907 | ||
| 908 | static inline int f2fs_readonly(struct super_block *sb) | ||
| 909 | { | ||
| 910 | return sb->s_flags & MS_RDONLY; | ||
| 911 | } | ||
| 912 | |||
| 881 | /* | 913 | /* |
| 882 | * file.c | 914 | * file.c |
| 883 | */ | 915 | */ |
| 884 | int f2fs_sync_file(struct file *, loff_t, loff_t, int); | 916 | int f2fs_sync_file(struct file *, loff_t, loff_t, int); |
| 885 | void truncate_data_blocks(struct dnode_of_data *); | 917 | void truncate_data_blocks(struct dnode_of_data *); |
| 886 | void f2fs_truncate(struct inode *); | 918 | void f2fs_truncate(struct inode *); |
| 919 | int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); | ||
| 887 | int f2fs_setattr(struct dentry *, struct iattr *); | 920 | int f2fs_setattr(struct dentry *, struct iattr *); |
| 888 | int truncate_hole(struct inode *, pgoff_t, pgoff_t); | 921 | int truncate_hole(struct inode *, pgoff_t, pgoff_t); |
| 922 | int truncate_data_blocks_range(struct dnode_of_data *, int); | ||
| 889 | long f2fs_ioctl(struct file *, unsigned int, unsigned long); | 923 | long f2fs_ioctl(struct file *, unsigned int, unsigned long); |
| 890 | long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); | 924 | long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); |
| 891 | 925 | ||
| @@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); | |||
| 913 | ino_t f2fs_inode_by_name(struct inode *, struct qstr *); | 947 | ino_t f2fs_inode_by_name(struct inode *, struct qstr *); |
| 914 | void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, | 948 | void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, |
| 915 | struct page *, struct inode *); | 949 | struct page *, struct inode *); |
| 916 | void init_dent_inode(const struct qstr *, struct page *); | ||
| 917 | int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); | 950 | int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); |
| 918 | void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); | 951 | void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); |
| 919 | int f2fs_make_empty(struct inode *, struct inode *); | 952 | int f2fs_make_empty(struct inode *, struct inode *); |
| @@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); | |||
| 948 | int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); | 981 | int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); |
| 949 | int truncate_inode_blocks(struct inode *, pgoff_t); | 982 | int truncate_inode_blocks(struct inode *, pgoff_t); |
| 950 | int remove_inode_page(struct inode *); | 983 | int remove_inode_page(struct inode *); |
| 951 | int new_inode_page(struct inode *, const struct qstr *); | 984 | struct page *new_inode_page(struct inode *, const struct qstr *); |
| 952 | struct page *new_node_page(struct dnode_of_data *, unsigned int); | 985 | struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); |
| 953 | void ra_node_page(struct f2fs_sb_info *, nid_t); | 986 | void ra_node_page(struct f2fs_sb_info *, nid_t); |
| 954 | struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); | 987 | struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); |
| 955 | struct page *get_node_page_ra(struct page *, int); | 988 | struct page *get_node_page_ra(struct page *, int); |
| @@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void); | |||
| 974 | */ | 1007 | */ |
| 975 | void f2fs_balance_fs(struct f2fs_sb_info *); | 1008 | void f2fs_balance_fs(struct f2fs_sb_info *); |
| 976 | void invalidate_blocks(struct f2fs_sb_info *, block_t); | 1009 | void invalidate_blocks(struct f2fs_sb_info *, block_t); |
| 977 | void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); | ||
| 978 | void clear_prefree_segments(struct f2fs_sb_info *); | 1010 | void clear_prefree_segments(struct f2fs_sb_info *); |
| 979 | int npages_for_summary_flush(struct f2fs_sb_info *); | 1011 | int npages_for_summary_flush(struct f2fs_sb_info *); |
| 980 | void allocate_new_segments(struct f2fs_sb_info *); | 1012 | void allocate_new_segments(struct f2fs_sb_info *); |
| @@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t); | |||
| 1011 | int recover_orphan_inodes(struct f2fs_sb_info *); | 1043 | int recover_orphan_inodes(struct f2fs_sb_info *); |
| 1012 | int get_valid_checkpoint(struct f2fs_sb_info *); | 1044 | int get_valid_checkpoint(struct f2fs_sb_info *); |
| 1013 | void set_dirty_dir_page(struct inode *, struct page *); | 1045 | void set_dirty_dir_page(struct inode *, struct page *); |
| 1046 | void add_dirty_dir_inode(struct inode *); | ||
| 1014 | void remove_dirty_dir_inode(struct inode *); | 1047 | void remove_dirty_dir_inode(struct inode *); |
| 1048 | struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); | ||
| 1015 | void sync_dirty_dir_inodes(struct f2fs_sb_info *); | 1049 | void sync_dirty_dir_inodes(struct f2fs_sb_info *); |
| 1016 | void write_checkpoint(struct f2fs_sb_info *, bool); | 1050 | void write_checkpoint(struct f2fs_sb_info *, bool); |
| 1017 | void init_orphan_info(struct f2fs_sb_info *); | 1051 | void init_orphan_info(struct f2fs_sb_info *); |
| @@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *); | |||
| 1025 | void update_extent_cache(block_t, struct dnode_of_data *); | 1059 | void update_extent_cache(block_t, struct dnode_of_data *); |
| 1026 | struct page *find_data_page(struct inode *, pgoff_t, bool); | 1060 | struct page *find_data_page(struct inode *, pgoff_t, bool); |
| 1027 | struct page *get_lock_data_page(struct inode *, pgoff_t); | 1061 | struct page *get_lock_data_page(struct inode *, pgoff_t); |
| 1028 | struct page *get_new_data_page(struct inode *, pgoff_t, bool); | 1062 | struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); |
| 1029 | int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); | 1063 | int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); |
| 1030 | int do_write_data_page(struct page *); | 1064 | int do_write_data_page(struct page *); |
| 1031 | 1065 | ||
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1cae864f8dfc..d2d2b7dbdcc1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c | |||
| @@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, | |||
| 63 | f2fs_put_dnode(&dn); | 63 | f2fs_put_dnode(&dn); |
| 64 | mutex_unlock_op(sbi, ilock); | 64 | mutex_unlock_op(sbi, ilock); |
| 65 | 65 | ||
| 66 | file_update_time(vma->vm_file); | ||
| 66 | lock_page(page); | 67 | lock_page(page); |
| 67 | if (page->mapping != inode->i_mapping || | 68 | if (page->mapping != inode->i_mapping || |
| 68 | page_offset(page) >= i_size_read(inode) || | 69 | page_offset(page) > i_size_read(inode) || |
| 69 | !PageUptodate(page)) { | 70 | !PageUptodate(page)) { |
| 70 | unlock_page(page); | 71 | unlock_page(page); |
| 71 | err = -EFAULT; | 72 | err = -EFAULT; |
| @@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, | |||
| 76 | * check to see if the page is mapped already (no holes) | 77 | * check to see if the page is mapped already (no holes) |
| 77 | */ | 78 | */ |
| 78 | if (PageMappedToDisk(page)) | 79 | if (PageMappedToDisk(page)) |
| 79 | goto out; | 80 | goto mapped; |
| 80 | |||
| 81 | /* fill the page */ | ||
| 82 | wait_on_page_writeback(page); | ||
| 83 | 81 | ||
| 84 | /* page is wholly or partially inside EOF */ | 82 | /* page is wholly or partially inside EOF */ |
| 85 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { | 83 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { |
| @@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, | |||
| 90 | set_page_dirty(page); | 88 | set_page_dirty(page); |
| 91 | SetPageUptodate(page); | 89 | SetPageUptodate(page); |
| 92 | 90 | ||
| 93 | file_update_time(vma->vm_file); | 91 | mapped: |
| 92 | /* fill the page */ | ||
| 93 | wait_on_page_writeback(page); | ||
| 94 | out: | 94 | out: |
| 95 | sb_end_pagefault(inode->i_sb); | 95 | sb_end_pagefault(inode->i_sb); |
| 96 | return block_page_mkwrite_return(err); | 96 | return block_page_mkwrite_return(err); |
| @@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { | |||
| 102 | .remap_pages = generic_file_remap_pages, | 102 | .remap_pages = generic_file_remap_pages, |
| 103 | }; | 103 | }; |
| 104 | 104 | ||
| 105 | static int get_parent_ino(struct inode *inode, nid_t *pino) | ||
| 106 | { | ||
| 107 | struct dentry *dentry; | ||
| 108 | |||
| 109 | inode = igrab(inode); | ||
| 110 | dentry = d_find_any_alias(inode); | ||
| 111 | iput(inode); | ||
| 112 | if (!dentry) | ||
| 113 | return 0; | ||
| 114 | |||
| 115 | inode = igrab(dentry->d_parent->d_inode); | ||
| 116 | dput(dentry); | ||
| 117 | |||
| 118 | *pino = inode->i_ino; | ||
| 119 | iput(inode); | ||
| 120 | return 1; | ||
| 121 | } | ||
| 122 | |||
| 105 | int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | 123 | int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) |
| 106 | { | 124 | { |
| 107 | struct inode *inode = file->f_mapping->host; | 125 | struct inode *inode = file->f_mapping->host; |
| @@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 114 | .for_reclaim = 0, | 132 | .for_reclaim = 0, |
| 115 | }; | 133 | }; |
| 116 | 134 | ||
| 117 | if (inode->i_sb->s_flags & MS_RDONLY) | 135 | if (f2fs_readonly(inode->i_sb)) |
| 118 | return 0; | 136 | return 0; |
| 119 | 137 | ||
| 120 | trace_f2fs_sync_file_enter(inode); | 138 | trace_f2fs_sync_file_enter(inode); |
| @@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 134 | 152 | ||
| 135 | if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) | 153 | if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) |
| 136 | need_cp = true; | 154 | need_cp = true; |
| 137 | else if (is_cp_file(inode)) | 155 | else if (file_wrong_pino(inode)) |
| 138 | need_cp = true; | 156 | need_cp = true; |
| 139 | else if (!space_for_roll_forward(sbi)) | 157 | else if (!space_for_roll_forward(sbi)) |
| 140 | need_cp = true; | 158 | need_cp = true; |
| @@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 142 | need_cp = true; | 160 | need_cp = true; |
| 143 | 161 | ||
| 144 | if (need_cp) { | 162 | if (need_cp) { |
| 163 | nid_t pino; | ||
| 164 | |||
| 145 | /* all the dirty node pages should be flushed for POR */ | 165 | /* all the dirty node pages should be flushed for POR */ |
| 146 | ret = f2fs_sync_fs(inode->i_sb, 1); | 166 | ret = f2fs_sync_fs(inode->i_sb, 1); |
| 167 | if (file_wrong_pino(inode) && inode->i_nlink == 1 && | ||
| 168 | get_parent_ino(inode, &pino)) { | ||
| 169 | F2FS_I(inode)->i_pino = pino; | ||
| 170 | file_got_pino(inode); | ||
| 171 | mark_inode_dirty_sync(inode); | ||
| 172 | ret = f2fs_write_inode(inode, NULL); | ||
| 173 | if (ret) | ||
| 174 | goto out; | ||
| 175 | } | ||
| 147 | } else { | 176 | } else { |
| 148 | /* if there is no written node page, write its inode page */ | 177 | /* if there is no written node page, write its inode page */ |
| 149 | while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { | 178 | while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { |
| 179 | mark_inode_dirty_sync(inode); | ||
| 150 | ret = f2fs_write_inode(inode, NULL); | 180 | ret = f2fs_write_inode(inode, NULL); |
| 151 | if (ret) | 181 | if (ret) |
| 152 | goto out; | 182 | goto out; |
| @@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 168 | return 0; | 198 | return 0; |
| 169 | } | 199 | } |
| 170 | 200 | ||
| 171 | static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) | 201 | int truncate_data_blocks_range(struct dnode_of_data *dn, int count) |
| 172 | { | 202 | { |
| 173 | int nr_free = 0, ofs = dn->ofs_in_node; | 203 | int nr_free = 0, ofs = dn->ofs_in_node; |
| 174 | struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); | 204 | struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); |
| @@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) | |||
| 185 | 215 | ||
| 186 | update_extent_cache(NULL_ADDR, dn); | 216 | update_extent_cache(NULL_ADDR, dn); |
| 187 | invalidate_blocks(sbi, blkaddr); | 217 | invalidate_blocks(sbi, blkaddr); |
| 188 | dec_valid_block_count(sbi, dn->inode, 1); | ||
| 189 | nr_free++; | 218 | nr_free++; |
| 190 | } | 219 | } |
| 191 | if (nr_free) { | 220 | if (nr_free) { |
| 221 | dec_valid_block_count(sbi, dn->inode, nr_free); | ||
| 192 | set_page_dirty(dn->node_page); | 222 | set_page_dirty(dn->node_page); |
| 193 | sync_inode_page(dn); | 223 | sync_inode_page(dn); |
| 194 | } | 224 | } |
| @@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode) | |||
| 291 | } | 321 | } |
| 292 | } | 322 | } |
| 293 | 323 | ||
| 294 | static int f2fs_getattr(struct vfsmount *mnt, | 324 | int f2fs_getattr(struct vfsmount *mnt, |
| 295 | struct dentry *dentry, struct kstat *stat) | 325 | struct dentry *dentry, struct kstat *stat) |
| 296 | { | 326 | { |
| 297 | struct inode *inode = dentry->d_inode; | 327 | struct inode *inode = dentry->d_inode; |
| @@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, | |||
| 387 | f2fs_balance_fs(sbi); | 417 | f2fs_balance_fs(sbi); |
| 388 | 418 | ||
| 389 | ilock = mutex_lock_op(sbi); | 419 | ilock = mutex_lock_op(sbi); |
| 390 | page = get_new_data_page(inode, index, false); | 420 | page = get_new_data_page(inode, NULL, index, false); |
| 391 | mutex_unlock_op(sbi, ilock); | 421 | mutex_unlock_op(sbi, ilock); |
| 392 | 422 | ||
| 393 | if (!IS_ERR(page)) { | 423 | if (!IS_ERR(page)) { |
| @@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
| 575 | int ret; | 605 | int ret; |
| 576 | 606 | ||
| 577 | switch (cmd) { | 607 | switch (cmd) { |
| 578 | case FS_IOC_GETFLAGS: | 608 | case F2FS_IOC_GETFLAGS: |
| 579 | flags = fi->i_flags & FS_FL_USER_VISIBLE; | 609 | flags = fi->i_flags & FS_FL_USER_VISIBLE; |
| 580 | return put_user(flags, (int __user *) arg); | 610 | return put_user(flags, (int __user *) arg); |
| 581 | case FS_IOC_SETFLAGS: | 611 | case F2FS_IOC_SETFLAGS: |
| 582 | { | 612 | { |
| 583 | unsigned int oldflags; | 613 | unsigned int oldflags; |
| 584 | 614 | ||
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 14961593e93c..35f9b1a196aa 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c | |||
| @@ -76,7 +76,9 @@ static int gc_thread_func(void *data) | |||
| 76 | else | 76 | else |
| 77 | wait_ms = increase_sleep_time(wait_ms); | 77 | wait_ms = increase_sleep_time(wait_ms); |
| 78 | 78 | ||
| 79 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 79 | sbi->bg_gc++; | 80 | sbi->bg_gc++; |
| 81 | #endif | ||
| 80 | 82 | ||
| 81 | /* if return value is not zero, no victim was selected */ | 83 | /* if return value is not zero, no victim was selected */ |
| 82 | if (f2fs_gc(sbi)) | 84 | if (f2fs_gc(sbi)) |
| @@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi) | |||
| 89 | { | 91 | { |
| 90 | struct f2fs_gc_kthread *gc_th; | 92 | struct f2fs_gc_kthread *gc_th; |
| 91 | dev_t dev = sbi->sb->s_bdev->bd_dev; | 93 | dev_t dev = sbi->sb->s_bdev->bd_dev; |
| 94 | int err = 0; | ||
| 92 | 95 | ||
| 93 | if (!test_opt(sbi, BG_GC)) | 96 | if (!test_opt(sbi, BG_GC)) |
| 94 | return 0; | 97 | goto out; |
| 95 | gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); | 98 | gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); |
| 96 | if (!gc_th) | 99 | if (!gc_th) { |
| 97 | return -ENOMEM; | 100 | err = -ENOMEM; |
| 101 | goto out; | ||
| 102 | } | ||
| 98 | 103 | ||
| 99 | sbi->gc_thread = gc_th; | 104 | sbi->gc_thread = gc_th; |
| 100 | init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); | 105 | init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); |
| 101 | sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, | 106 | sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, |
| 102 | "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); | 107 | "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); |
| 103 | if (IS_ERR(gc_th->f2fs_gc_task)) { | 108 | if (IS_ERR(gc_th->f2fs_gc_task)) { |
| 109 | err = PTR_ERR(gc_th->f2fs_gc_task); | ||
| 104 | kfree(gc_th); | 110 | kfree(gc_th); |
| 105 | sbi->gc_thread = NULL; | 111 | sbi->gc_thread = NULL; |
| 106 | return -ENOMEM; | ||
| 107 | } | 112 | } |
| 108 | return 0; | 113 | |
| 114 | out: | ||
| 115 | return err; | ||
| 109 | } | 116 | } |
| 110 | 117 | ||
| 111 | void stop_gc_thread(struct f2fs_sb_info *sbi) | 118 | void stop_gc_thread(struct f2fs_sb_info *sbi) |
| @@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, | |||
| 234 | { | 241 | { |
| 235 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | 242 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); |
| 236 | struct victim_sel_policy p; | 243 | struct victim_sel_policy p; |
| 237 | unsigned int secno; | 244 | unsigned int secno, max_cost; |
| 238 | int nsearched = 0; | 245 | int nsearched = 0; |
| 239 | 246 | ||
| 240 | p.alloc_mode = alloc_mode; | 247 | p.alloc_mode = alloc_mode; |
| 241 | select_policy(sbi, gc_type, type, &p); | 248 | select_policy(sbi, gc_type, type, &p); |
| 242 | 249 | ||
| 243 | p.min_segno = NULL_SEGNO; | 250 | p.min_segno = NULL_SEGNO; |
| 244 | p.min_cost = get_max_cost(sbi, &p); | 251 | p.min_cost = max_cost = get_max_cost(sbi, &p); |
| 245 | 252 | ||
| 246 | mutex_lock(&dirty_i->seglist_lock); | 253 | mutex_lock(&dirty_i->seglist_lock); |
| 247 | 254 | ||
| @@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, | |||
| 280 | p.min_cost = cost; | 287 | p.min_cost = cost; |
| 281 | } | 288 | } |
| 282 | 289 | ||
| 283 | if (cost == get_max_cost(sbi, &p)) | 290 | if (cost == max_cost) |
| 284 | continue; | 291 | continue; |
| 285 | 292 | ||
| 286 | if (nsearched++ >= MAX_VICTIM_SEARCH) { | 293 | if (nsearched++ >= MAX_VICTIM_SEARCH) { |
| @@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, | |||
| 288 | break; | 295 | break; |
| 289 | } | 296 | } |
| 290 | } | 297 | } |
| 291 | got_it: | ||
| 292 | if (p.min_segno != NULL_SEGNO) { | 298 | if (p.min_segno != NULL_SEGNO) { |
| 299 | got_it: | ||
| 293 | if (p.alloc_mode == LFS) { | 300 | if (p.alloc_mode == LFS) { |
| 294 | secno = GET_SECNO(sbi, p.min_segno); | 301 | secno = GET_SECNO(sbi, p.min_segno); |
| 295 | if (gc_type == FG_GC) | 302 | if (gc_type == FG_GC) |
| @@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = { | |||
| 314 | 321 | ||
| 315 | static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) | 322 | static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) |
| 316 | { | 323 | { |
| 317 | struct list_head *this; | ||
| 318 | struct inode_entry *ie; | 324 | struct inode_entry *ie; |
| 319 | 325 | ||
| 320 | list_for_each(this, ilist) { | 326 | list_for_each_entry(ie, ilist, list) |
| 321 | ie = list_entry(this, struct inode_entry, list); | ||
| 322 | if (ie->inode->i_ino == ino) | 327 | if (ie->inode->i_ino == ino) |
| 323 | return ie->inode; | 328 | return ie->inode; |
| 324 | } | ||
| 325 | return NULL; | 329 | return NULL; |
| 326 | } | 330 | } |
| 327 | 331 | ||
| 328 | static void add_gc_inode(struct inode *inode, struct list_head *ilist) | 332 | static void add_gc_inode(struct inode *inode, struct list_head *ilist) |
| 329 | { | 333 | { |
| 330 | struct list_head *this; | 334 | struct inode_entry *new_ie; |
| 331 | struct inode_entry *new_ie, *ie; | ||
| 332 | 335 | ||
| 333 | list_for_each(this, ilist) { | 336 | if (inode == find_gc_inode(inode->i_ino, ilist)) { |
| 334 | ie = list_entry(this, struct inode_entry, list); | 337 | iput(inode); |
| 335 | if (ie->inode == inode) { | 338 | return; |
| 336 | iput(inode); | ||
| 337 | return; | ||
| 338 | } | ||
| 339 | } | 339 | } |
| 340 | repeat: | 340 | repeat: |
| 341 | new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); | 341 | new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); |
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 91ac7f9d88ee..2b2d45d19e3e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c | |||
| @@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) | |||
| 109 | ret = do_read_inode(inode); | 109 | ret = do_read_inode(inode); |
| 110 | if (ret) | 110 | if (ret) |
| 111 | goto bad_inode; | 111 | goto bad_inode; |
| 112 | |||
| 113 | if (!sbi->por_doing && inode->i_nlink == 0) { | ||
| 114 | ret = -ENOENT; | ||
| 115 | goto bad_inode; | ||
| 116 | } | ||
| 117 | |||
| 118 | make_now: | 112 | make_now: |
| 119 | if (ino == F2FS_NODE_INO(sbi)) { | 113 | if (ino == F2FS_NODE_INO(sbi)) { |
| 120 | inode->i_mapping->a_ops = &f2fs_node_aops; | 114 | inode->i_mapping->a_ops = &f2fs_node_aops; |
| @@ -130,8 +124,7 @@ make_now: | |||
| 130 | inode->i_op = &f2fs_dir_inode_operations; | 124 | inode->i_op = &f2fs_dir_inode_operations; |
| 131 | inode->i_fop = &f2fs_dir_operations; | 125 | inode->i_fop = &f2fs_dir_operations; |
| 132 | inode->i_mapping->a_ops = &f2fs_dblock_aops; | 126 | inode->i_mapping->a_ops = &f2fs_dblock_aops; |
| 133 | mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | | 127 | mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); |
| 134 | __GFP_ZERO); | ||
| 135 | } else if (S_ISLNK(inode->i_mode)) { | 128 | } else if (S_ISLNK(inode->i_mode)) { |
| 136 | inode->i_op = &f2fs_symlink_inode_operations; | 129 | inode->i_op = &f2fs_symlink_inode_operations; |
| 137 | inode->i_mapping->a_ops = &f2fs_dblock_aops; | 130 | inode->i_mapping->a_ops = &f2fs_dblock_aops; |
| @@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page) | |||
| 199 | 192 | ||
| 200 | set_cold_node(inode, node_page); | 193 | set_cold_node(inode, node_page); |
| 201 | set_page_dirty(node_page); | 194 | set_page_dirty(node_page); |
| 195 | clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); | ||
| 202 | } | 196 | } |
| 203 | 197 | ||
| 204 | int update_inode_page(struct inode *inode) | 198 | int update_inode_page(struct inode *inode) |
| @@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 224 | inode->i_ino == F2FS_META_INO(sbi)) | 218 | inode->i_ino == F2FS_META_INO(sbi)) |
| 225 | return 0; | 219 | return 0; |
| 226 | 220 | ||
| 221 | if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) | ||
| 222 | return 0; | ||
| 223 | |||
| 227 | if (wbc) | 224 | if (wbc) |
| 228 | f2fs_balance_fs(sbi); | 225 | f2fs_balance_fs(sbi); |
| 229 | 226 | ||
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 47abc9722b17..64c07169df05 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c | |||
| @@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 112 | int count = le32_to_cpu(sbi->raw_super->extension_count); | 112 | int count = le32_to_cpu(sbi->raw_super->extension_count); |
| 113 | for (i = 0; i < count; i++) { | 113 | for (i = 0; i < count; i++) { |
| 114 | if (is_multimedia_file(name, extlist[i])) { | 114 | if (is_multimedia_file(name, extlist[i])) { |
| 115 | set_cold_file(inode); | 115 | file_set_cold(inode); |
| 116 | break; | 116 | break; |
| 117 | } | 117 | } |
| 118 | } | 118 | } |
| @@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 149 | 149 | ||
| 150 | alloc_nid_done(sbi, ino); | 150 | alloc_nid_done(sbi, ino); |
| 151 | 151 | ||
| 152 | if (!sbi->por_doing) | 152 | d_instantiate(dentry, inode); |
| 153 | d_instantiate(dentry, inode); | ||
| 154 | unlock_new_inode(inode); | 153 | unlock_new_inode(inode); |
| 155 | return 0; | 154 | return 0; |
| 156 | out: | 155 | out: |
| @@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, | |||
| 173 | f2fs_balance_fs(sbi); | 172 | f2fs_balance_fs(sbi); |
| 174 | 173 | ||
| 175 | inode->i_ctime = CURRENT_TIME; | 174 | inode->i_ctime = CURRENT_TIME; |
| 176 | atomic_inc(&inode->i_count); | 175 | ihold(inode); |
| 177 | 176 | ||
| 178 | set_inode_flag(F2FS_I(inode), FI_INC_LINK); | 177 | set_inode_flag(F2FS_I(inode), FI_INC_LINK); |
| 179 | ilock = mutex_lock_op(sbi); | 178 | ilock = mutex_lock_op(sbi); |
| @@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, | |||
| 182 | if (err) | 181 | if (err) |
| 183 | goto out; | 182 | goto out; |
| 184 | 183 | ||
| 185 | /* | ||
| 186 | * This file should be checkpointed during fsync. | ||
| 187 | * We lost i_pino from now on. | ||
| 188 | */ | ||
| 189 | set_cp_file(inode); | ||
| 190 | |||
| 191 | d_instantiate(dentry, inode); | 184 | d_instantiate(dentry, inode); |
| 192 | return 0; | 185 | return 0; |
| 193 | out: | 186 | out: |
| 194 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); | 187 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); |
| 195 | make_bad_inode(inode); | ||
| 196 | iput(inode); | 188 | iput(inode); |
| 197 | return err; | 189 | return err; |
| 198 | } | 190 | } |
| @@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = { | |||
| 498 | .rmdir = f2fs_rmdir, | 490 | .rmdir = f2fs_rmdir, |
| 499 | .mknod = f2fs_mknod, | 491 | .mknod = f2fs_mknod, |
| 500 | .rename = f2fs_rename, | 492 | .rename = f2fs_rename, |
| 493 | .getattr = f2fs_getattr, | ||
| 501 | .setattr = f2fs_setattr, | 494 | .setattr = f2fs_setattr, |
| 502 | .get_acl = f2fs_get_acl, | 495 | .get_acl = f2fs_get_acl, |
| 503 | #ifdef CONFIG_F2FS_FS_XATTR | 496 | #ifdef CONFIG_F2FS_FS_XATTR |
| @@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { | |||
| 512 | .readlink = generic_readlink, | 505 | .readlink = generic_readlink, |
| 513 | .follow_link = page_follow_link_light, | 506 | .follow_link = page_follow_link_light, |
| 514 | .put_link = page_put_link, | 507 | .put_link = page_put_link, |
| 508 | .getattr = f2fs_getattr, | ||
| 515 | .setattr = f2fs_setattr, | 509 | .setattr = f2fs_setattr, |
| 516 | #ifdef CONFIG_F2FS_FS_XATTR | 510 | #ifdef CONFIG_F2FS_FS_XATTR |
| 517 | .setxattr = generic_setxattr, | 511 | .setxattr = generic_setxattr, |
| @@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { | |||
| 522 | }; | 516 | }; |
| 523 | 517 | ||
| 524 | const struct inode_operations f2fs_special_inode_operations = { | 518 | const struct inode_operations f2fs_special_inode_operations = { |
| 519 | .getattr = f2fs_getattr, | ||
| 525 | .setattr = f2fs_setattr, | 520 | .setattr = f2fs_setattr, |
| 526 | .get_acl = f2fs_get_acl, | 521 | .get_acl = f2fs_get_acl, |
| 527 | #ifdef CONFIG_F2FS_FS_XATTR | 522 | #ifdef CONFIG_F2FS_FS_XATTR |
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3df43b4efd89..b418aee09573 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c | |||
| @@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) | |||
| 408 | level = get_node_path(index, offset, noffset); | 408 | level = get_node_path(index, offset, noffset); |
| 409 | 409 | ||
| 410 | nids[0] = dn->inode->i_ino; | 410 | nids[0] = dn->inode->i_ino; |
| 411 | npage[0] = get_node_page(sbi, nids[0]); | 411 | npage[0] = dn->inode_page; |
| 412 | if (IS_ERR(npage[0])) | ||
| 413 | return PTR_ERR(npage[0]); | ||
| 414 | 412 | ||
| 413 | if (!npage[0]) { | ||
| 414 | npage[0] = get_node_page(sbi, nids[0]); | ||
| 415 | if (IS_ERR(npage[0])) | ||
| 416 | return PTR_ERR(npage[0]); | ||
| 417 | } | ||
| 415 | parent = npage[0]; | 418 | parent = npage[0]; |
| 416 | if (level != 0) | 419 | if (level != 0) |
| 417 | nids[1] = get_nid(parent, offset[0], true); | 420 | nids[1] = get_nid(parent, offset[0], true); |
| @@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) | |||
| 430 | } | 433 | } |
| 431 | 434 | ||
| 432 | dn->nid = nids[i]; | 435 | dn->nid = nids[i]; |
| 433 | npage[i] = new_node_page(dn, noffset[i]); | 436 | npage[i] = new_node_page(dn, noffset[i], NULL); |
| 434 | if (IS_ERR(npage[i])) { | 437 | if (IS_ERR(npage[i])) { |
| 435 | alloc_nid_failed(sbi, nids[i]); | 438 | alloc_nid_failed(sbi, nids[i]); |
| 436 | err = PTR_ERR(npage[i]); | 439 | err = PTR_ERR(npage[i]); |
| @@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode) | |||
| 803 | return 0; | 806 | return 0; |
| 804 | } | 807 | } |
| 805 | 808 | ||
| 806 | int new_inode_page(struct inode *inode, const struct qstr *name) | 809 | struct page *new_inode_page(struct inode *inode, const struct qstr *name) |
| 807 | { | 810 | { |
| 808 | struct page *page; | ||
| 809 | struct dnode_of_data dn; | 811 | struct dnode_of_data dn; |
| 810 | 812 | ||
| 811 | /* allocate inode page for new inode */ | 813 | /* allocate inode page for new inode */ |
| 812 | set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); | 814 | set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); |
| 813 | page = new_node_page(&dn, 0); | 815 | |
| 814 | init_dent_inode(name, page); | 816 | /* caller should f2fs_put_page(page, 1); */ |
| 815 | if (IS_ERR(page)) | 817 | return new_node_page(&dn, 0, NULL); |
| 816 | return PTR_ERR(page); | ||
| 817 | f2fs_put_page(page, 1); | ||
| 818 | return 0; | ||
| 819 | } | 818 | } |
| 820 | 819 | ||
| 821 | struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) | 820 | struct page *new_node_page(struct dnode_of_data *dn, |
| 821 | unsigned int ofs, struct page *ipage) | ||
| 822 | { | 822 | { |
| 823 | struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); | 823 | struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); |
| 824 | struct address_space *mapping = sbi->node_inode->i_mapping; | 824 | struct address_space *mapping = sbi->node_inode->i_mapping; |
| @@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) | |||
| 851 | set_cold_node(dn->inode, page); | 851 | set_cold_node(dn->inode, page); |
| 852 | 852 | ||
| 853 | dn->node_page = page; | 853 | dn->node_page = page; |
| 854 | sync_inode_page(dn); | 854 | if (ipage) |
| 855 | update_inode(dn->inode, ipage); | ||
| 856 | else | ||
| 857 | sync_inode_page(dn); | ||
| 855 | set_page_dirty(page); | 858 | set_page_dirty(page); |
| 856 | if (ofs == 0) | 859 | if (ofs == 0) |
| 857 | inc_valid_inode_count(sbi); | 860 | inc_valid_inode_count(sbi); |
| @@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page) | |||
| 1205 | return 0; | 1208 | return 0; |
| 1206 | } | 1209 | } |
| 1207 | 1210 | ||
| 1208 | static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) | 1211 | static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, |
| 1212 | unsigned int length) | ||
| 1209 | { | 1213 | { |
| 1210 | struct inode *inode = page->mapping->host; | 1214 | struct inode *inode = page->mapping->host; |
| 1211 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 1215 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| @@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | |||
| 1492 | new_ni = old_ni; | 1496 | new_ni = old_ni; |
| 1493 | new_ni.ino = ino; | 1497 | new_ni.ino = ino; |
| 1494 | 1498 | ||
| 1499 | if (!inc_valid_node_count(sbi, NULL, 1)) | ||
| 1500 | WARN_ON(1); | ||
| 1495 | set_node_addr(sbi, &new_ni, NEW_ADDR); | 1501 | set_node_addr(sbi, &new_ni, NEW_ADDR); |
| 1496 | inc_valid_inode_count(sbi); | 1502 | inc_valid_inode_count(sbi); |
| 1497 | |||
| 1498 | f2fs_put_page(ipage, 1); | 1503 | f2fs_put_page(ipage, 1); |
| 1499 | return 0; | 1504 | return 0; |
| 1500 | } | 1505 | } |
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0a2d72f0024d..c65fb4f4230f 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h | |||
| @@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i) | |||
| 275 | * - Mark cold node blocks in their node footer | 275 | * - Mark cold node blocks in their node footer |
| 276 | * - Mark cold data pages in page cache | 276 | * - Mark cold data pages in page cache |
| 277 | */ | 277 | */ |
| 278 | static inline int is_cold_file(struct inode *inode) | 278 | static inline int is_file(struct inode *inode, int type) |
| 279 | { | 279 | { |
| 280 | return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; | 280 | return F2FS_I(inode)->i_advise & type; |
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | static inline void set_cold_file(struct inode *inode) | 283 | static inline void set_file(struct inode *inode, int type) |
| 284 | { | 284 | { |
| 285 | F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; | 285 | F2FS_I(inode)->i_advise |= type; |
| 286 | } | 286 | } |
| 287 | 287 | ||
| 288 | static inline int is_cp_file(struct inode *inode) | 288 | static inline void clear_file(struct inode *inode, int type) |
| 289 | { | 289 | { |
| 290 | return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; | 290 | F2FS_I(inode)->i_advise &= ~type; |
| 291 | } | 291 | } |
| 292 | 292 | ||
| 293 | static inline void set_cp_file(struct inode *inode) | 293 | #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) |
| 294 | { | 294 | #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) |
| 295 | F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; | 295 | #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) |
| 296 | } | 296 | #define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) |
| 297 | #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) | ||
| 298 | #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) | ||
| 297 | 299 | ||
| 298 | static inline int is_cold_data(struct page *page) | 300 | static inline int is_cold_data(struct page *page) |
| 299 | { | 301 | { |
| @@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page) | |||
| 310 | ClearPageChecked(page); | 312 | ClearPageChecked(page); |
| 311 | } | 313 | } |
| 312 | 314 | ||
| 313 | static inline int is_cold_node(struct page *page) | 315 | static inline int is_node(struct page *page, int type) |
| 314 | { | 316 | { |
| 315 | void *kaddr = page_address(page); | 317 | void *kaddr = page_address(page); |
| 316 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; | 318 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; |
| 317 | unsigned int flag = le32_to_cpu(rn->footer.flag); | 319 | return le32_to_cpu(rn->footer.flag) & (1 << type); |
| 318 | return flag & (0x1 << COLD_BIT_SHIFT); | ||
| 319 | } | 320 | } |
| 320 | 321 | ||
| 321 | static inline unsigned char is_fsync_dnode(struct page *page) | 322 | #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) |
| 322 | { | 323 | #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) |
| 323 | void *kaddr = page_address(page); | 324 | #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) |
| 324 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; | ||
| 325 | unsigned int flag = le32_to_cpu(rn->footer.flag); | ||
| 326 | return flag & (0x1 << FSYNC_BIT_SHIFT); | ||
| 327 | } | ||
| 328 | |||
| 329 | static inline unsigned char is_dent_dnode(struct page *page) | ||
| 330 | { | ||
| 331 | void *kaddr = page_address(page); | ||
| 332 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; | ||
| 333 | unsigned int flag = le32_to_cpu(rn->footer.flag); | ||
| 334 | return flag & (0x1 << DENT_BIT_SHIFT); | ||
| 335 | } | ||
| 336 | 325 | ||
| 337 | static inline void set_cold_node(struct inode *inode, struct page *page) | 326 | static inline void set_cold_node(struct inode *inode, struct page *page) |
| 338 | { | 327 | { |
| @@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page) | |||
| 346 | rn->footer.flag = cpu_to_le32(flag); | 335 | rn->footer.flag = cpu_to_le32(flag); |
| 347 | } | 336 | } |
| 348 | 337 | ||
| 349 | static inline void set_fsync_mark(struct page *page, int mark) | 338 | static inline void set_mark(struct page *page, int mark, int type) |
| 350 | { | 339 | { |
| 351 | void *kaddr = page_address(page); | 340 | struct f2fs_node *rn = (struct f2fs_node *)page_address(page); |
| 352 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; | ||
| 353 | unsigned int flag = le32_to_cpu(rn->footer.flag); | ||
| 354 | if (mark) | ||
| 355 | flag |= (0x1 << FSYNC_BIT_SHIFT); | ||
| 356 | else | ||
| 357 | flag &= ~(0x1 << FSYNC_BIT_SHIFT); | ||
| 358 | rn->footer.flag = cpu_to_le32(flag); | ||
| 359 | } | ||
| 360 | |||
| 361 | static inline void set_dentry_mark(struct page *page, int mark) | ||
| 362 | { | ||
| 363 | void *kaddr = page_address(page); | ||
| 364 | struct f2fs_node *rn = (struct f2fs_node *)kaddr; | ||
| 365 | unsigned int flag = le32_to_cpu(rn->footer.flag); | 341 | unsigned int flag = le32_to_cpu(rn->footer.flag); |
| 366 | if (mark) | 342 | if (mark) |
| 367 | flag |= (0x1 << DENT_BIT_SHIFT); | 343 | flag |= (0x1 << type); |
| 368 | else | 344 | else |
| 369 | flag &= ~(0x1 << DENT_BIT_SHIFT); | 345 | flag &= ~(0x1 << type); |
| 370 | rn->footer.flag = cpu_to_le32(flag); | 346 | rn->footer.flag = cpu_to_le32(flag); |
| 371 | } | 347 | } |
| 348 | #define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) | ||
| 349 | #define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) | ||
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 60c8a5097058..d56d951c2253 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c | |||
| @@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, | |||
| 40 | 40 | ||
| 41 | static int recover_dentry(struct page *ipage, struct inode *inode) | 41 | static int recover_dentry(struct page *ipage, struct inode *inode) |
| 42 | { | 42 | { |
| 43 | struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); | 43 | void *kaddr = page_address(ipage); |
| 44 | struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; | ||
| 44 | struct f2fs_inode *raw_inode = &(raw_node->i); | 45 | struct f2fs_inode *raw_inode = &(raw_node->i); |
| 45 | struct qstr name; | 46 | nid_t pino = le32_to_cpu(raw_inode->i_pino); |
| 46 | struct f2fs_dir_entry *de; | 47 | struct f2fs_dir_entry *de; |
| 48 | struct qstr name; | ||
| 47 | struct page *page; | 49 | struct page *page; |
| 48 | struct inode *dir; | 50 | struct inode *dir, *einode; |
| 49 | int err = 0; | 51 | int err = 0; |
| 50 | 52 | ||
| 51 | if (!is_dent_dnode(ipage)) | 53 | dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); |
| 52 | goto out; | 54 | if (!dir) { |
| 53 | 55 | dir = f2fs_iget(inode->i_sb, pino); | |
| 54 | dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); | 56 | if (IS_ERR(dir)) { |
| 55 | if (IS_ERR(dir)) { | 57 | err = PTR_ERR(dir); |
| 56 | err = PTR_ERR(dir); | 58 | goto out; |
| 57 | goto out; | 59 | } |
| 60 | set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); | ||
| 61 | add_dirty_dir_inode(dir); | ||
| 58 | } | 62 | } |
| 59 | 63 | ||
| 60 | name.len = le32_to_cpu(raw_inode->i_namelen); | 64 | name.len = le32_to_cpu(raw_inode->i_namelen); |
| 61 | name.name = raw_inode->i_name; | 65 | name.name = raw_inode->i_name; |
| 62 | 66 | retry: | |
| 63 | de = f2fs_find_entry(dir, &name, &page); | 67 | de = f2fs_find_entry(dir, &name, &page); |
| 64 | if (de) { | 68 | if (de && inode->i_ino == le32_to_cpu(de->ino)) { |
| 65 | kunmap(page); | 69 | kunmap(page); |
| 66 | f2fs_put_page(page, 0); | 70 | f2fs_put_page(page, 0); |
| 67 | } else { | 71 | goto out; |
| 68 | err = __f2fs_add_link(dir, &name, inode); | 72 | } |
| 73 | if (de) { | ||
| 74 | einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); | ||
| 75 | if (IS_ERR(einode)) { | ||
| 76 | WARN_ON(1); | ||
| 77 | if (PTR_ERR(einode) == -ENOENT) | ||
| 78 | err = -EEXIST; | ||
| 79 | goto out; | ||
| 80 | } | ||
| 81 | f2fs_delete_entry(de, page, einode); | ||
| 82 | iput(einode); | ||
| 83 | goto retry; | ||
| 69 | } | 84 | } |
| 70 | iput(dir); | 85 | err = __f2fs_add_link(dir, &name, inode); |
| 71 | out: | 86 | out: |
| 72 | kunmap(ipage); | 87 | f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " |
| 88 | "ino = %x, name = %s, dir = %lx, err = %d", | ||
| 89 | ino_of_node(ipage), raw_inode->i_name, | ||
| 90 | IS_ERR(dir) ? 0 : dir->i_ino, err); | ||
| 73 | return err; | 91 | return err; |
| 74 | } | 92 | } |
| 75 | 93 | ||
| @@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page) | |||
| 79 | struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; | 97 | struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; |
| 80 | struct f2fs_inode *raw_inode = &(raw_node->i); | 98 | struct f2fs_inode *raw_inode = &(raw_node->i); |
| 81 | 99 | ||
| 100 | if (!IS_INODE(node_page)) | ||
| 101 | return 0; | ||
| 102 | |||
| 82 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 103 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
| 83 | i_size_write(inode, le64_to_cpu(raw_inode->i_size)); | 104 | i_size_write(inode, le64_to_cpu(raw_inode->i_size)); |
| 84 | inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); | 105 | inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); |
| @@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page) | |||
| 88 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); | 109 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); |
| 89 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); | 110 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); |
| 90 | 111 | ||
| 91 | return recover_dentry(node_page, inode); | 112 | if (is_dent_dnode(node_page)) |
| 113 | return recover_dentry(node_page, inode); | ||
| 114 | |||
| 115 | f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", | ||
| 116 | ino_of_node(node_page), raw_inode->i_name); | ||
| 117 | return 0; | ||
| 92 | } | 118 | } |
| 93 | 119 | ||
| 94 | static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) | 120 | static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) |
| @@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) | |||
| 119 | lock_page(page); | 145 | lock_page(page); |
| 120 | 146 | ||
| 121 | if (cp_ver != cpver_of_node(page)) | 147 | if (cp_ver != cpver_of_node(page)) |
| 122 | goto unlock_out; | 148 | break; |
| 123 | 149 | ||
| 124 | if (!is_fsync_dnode(page)) | 150 | if (!is_fsync_dnode(page)) |
| 125 | goto next; | 151 | goto next; |
| 126 | 152 | ||
| 127 | entry = get_fsync_inode(head, ino_of_node(page)); | 153 | entry = get_fsync_inode(head, ino_of_node(page)); |
| 128 | if (entry) { | 154 | if (entry) { |
| 129 | entry->blkaddr = blkaddr; | ||
| 130 | if (IS_INODE(page) && is_dent_dnode(page)) | 155 | if (IS_INODE(page) && is_dent_dnode(page)) |
| 131 | set_inode_flag(F2FS_I(entry->inode), | 156 | set_inode_flag(F2FS_I(entry->inode), |
| 132 | FI_INC_LINK); | 157 | FI_INC_LINK); |
| @@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) | |||
| 134 | if (IS_INODE(page) && is_dent_dnode(page)) { | 159 | if (IS_INODE(page) && is_dent_dnode(page)) { |
| 135 | err = recover_inode_page(sbi, page); | 160 | err = recover_inode_page(sbi, page); |
| 136 | if (err) | 161 | if (err) |
| 137 | goto unlock_out; | 162 | break; |
| 138 | } | 163 | } |
| 139 | 164 | ||
| 140 | /* add this fsync inode to the list */ | 165 | /* add this fsync inode to the list */ |
| 141 | entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); | 166 | entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); |
| 142 | if (!entry) { | 167 | if (!entry) { |
| 143 | err = -ENOMEM; | 168 | err = -ENOMEM; |
| 144 | goto unlock_out; | 169 | break; |
| 145 | } | 170 | } |
| 146 | 171 | ||
| 147 | entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); | 172 | entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); |
| 148 | if (IS_ERR(entry->inode)) { | 173 | if (IS_ERR(entry->inode)) { |
| 149 | err = PTR_ERR(entry->inode); | 174 | err = PTR_ERR(entry->inode); |
| 150 | kmem_cache_free(fsync_entry_slab, entry); | 175 | kmem_cache_free(fsync_entry_slab, entry); |
| 151 | goto unlock_out; | 176 | break; |
| 152 | } | 177 | } |
| 153 | |||
| 154 | list_add_tail(&entry->list, head); | 178 | list_add_tail(&entry->list, head); |
| 155 | entry->blkaddr = blkaddr; | ||
| 156 | } | ||
| 157 | if (IS_INODE(page)) { | ||
| 158 | err = recover_inode(entry->inode, page); | ||
| 159 | if (err == -ENOENT) { | ||
| 160 | goto next; | ||
| 161 | } else if (err) { | ||
| 162 | err = -EINVAL; | ||
| 163 | goto unlock_out; | ||
| 164 | } | ||
| 165 | } | 179 | } |
| 180 | entry->blkaddr = blkaddr; | ||
| 181 | |||
| 182 | err = recover_inode(entry->inode, page); | ||
| 183 | if (err && err != -ENOENT) | ||
| 184 | break; | ||
| 166 | next: | 185 | next: |
| 167 | /* check next segment */ | 186 | /* check next segment */ |
| 168 | blkaddr = next_blkaddr_of_node(page); | 187 | blkaddr = next_blkaddr_of_node(page); |
| 169 | } | 188 | } |
| 170 | unlock_out: | ||
| 171 | unlock_page(page); | 189 | unlock_page(page); |
| 172 | out: | 190 | out: |
| 173 | __free_pages(page, 0); | 191 | __free_pages(page, 0); |
| 174 | return err; | 192 | return err; |
| 175 | } | 193 | } |
| 176 | 194 | ||
| 177 | static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, | 195 | static void destroy_fsync_dnodes(struct list_head *head) |
| 178 | struct list_head *head) | ||
| 179 | { | 196 | { |
| 180 | struct fsync_inode_entry *entry, *tmp; | 197 | struct fsync_inode_entry *entry, *tmp; |
| 181 | 198 | ||
| @@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, | |||
| 186 | } | 203 | } |
| 187 | } | 204 | } |
| 188 | 205 | ||
| 189 | static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, | 206 | static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, |
| 190 | block_t blkaddr) | 207 | block_t blkaddr, struct dnode_of_data *dn) |
| 191 | { | 208 | { |
| 192 | struct seg_entry *sentry; | 209 | struct seg_entry *sentry; |
| 193 | unsigned int segno = GET_SEGNO(sbi, blkaddr); | 210 | unsigned int segno = GET_SEGNO(sbi, blkaddr); |
| 194 | unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & | 211 | unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & |
| 195 | (sbi->blocks_per_seg - 1); | 212 | (sbi->blocks_per_seg - 1); |
| 196 | struct f2fs_summary sum; | 213 | struct f2fs_summary sum; |
| 197 | nid_t ino; | 214 | nid_t ino, nid; |
| 198 | void *kaddr; | 215 | void *kaddr; |
| 199 | struct inode *inode; | 216 | struct inode *inode; |
| 200 | struct page *node_page; | 217 | struct page *node_page; |
| @@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, | |||
| 203 | 220 | ||
| 204 | sentry = get_seg_entry(sbi, segno); | 221 | sentry = get_seg_entry(sbi, segno); |
| 205 | if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) | 222 | if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) |
| 206 | return; | 223 | return 0; |
| 207 | 224 | ||
| 208 | /* Get the previous summary */ | 225 | /* Get the previous summary */ |
| 209 | for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { | 226 | for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { |
| @@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, | |||
| 222 | f2fs_put_page(sum_page, 1); | 239 | f2fs_put_page(sum_page, 1); |
| 223 | } | 240 | } |
| 224 | 241 | ||
| 242 | /* Use the locked dnode page and inode */ | ||
| 243 | nid = le32_to_cpu(sum.nid); | ||
| 244 | if (dn->inode->i_ino == nid) { | ||
| 245 | struct dnode_of_data tdn = *dn; | ||
| 246 | tdn.nid = nid; | ||
| 247 | tdn.node_page = dn->inode_page; | ||
| 248 | tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); | ||
| 249 | truncate_data_blocks_range(&tdn, 1); | ||
| 250 | return 0; | ||
| 251 | } else if (dn->nid == nid) { | ||
| 252 | struct dnode_of_data tdn = *dn; | ||
| 253 | tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); | ||
| 254 | truncate_data_blocks_range(&tdn, 1); | ||
| 255 | return 0; | ||
| 256 | } | ||
| 257 | |||
| 225 | /* Get the node page */ | 258 | /* Get the node page */ |
| 226 | node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); | 259 | node_page = get_node_page(sbi, nid); |
| 260 | if (IS_ERR(node_page)) | ||
| 261 | return PTR_ERR(node_page); | ||
| 227 | bidx = start_bidx_of_node(ofs_of_node(node_page)) + | 262 | bidx = start_bidx_of_node(ofs_of_node(node_page)) + |
| 228 | le16_to_cpu(sum.ofs_in_node); | 263 | le16_to_cpu(sum.ofs_in_node); |
| 229 | ino = ino_of_node(node_page); | 264 | ino = ino_of_node(node_page); |
| 230 | f2fs_put_page(node_page, 1); | 265 | f2fs_put_page(node_page, 1); |
| 231 | 266 | ||
| 232 | /* Deallocate previous index in the node page */ | 267 | /* Deallocate previous index in the node page */ |
| 233 | inode = f2fs_iget(sbi->sb, ino); | 268 | inode = f2fs_iget(sbi->sb, ino); |
| 234 | if (IS_ERR(inode)) | 269 | if (IS_ERR(inode)) |
| 235 | return; | 270 | return PTR_ERR(inode); |
| 236 | 271 | ||
| 237 | truncate_hole(inode, bidx, bidx + 1); | 272 | truncate_hole(inode, bidx, bidx + 1); |
| 238 | iput(inode); | 273 | iput(inode); |
| 274 | return 0; | ||
| 239 | } | 275 | } |
| 240 | 276 | ||
| 241 | static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | 277 | static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, |
| @@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 245 | struct dnode_of_data dn; | 281 | struct dnode_of_data dn; |
| 246 | struct f2fs_summary sum; | 282 | struct f2fs_summary sum; |
| 247 | struct node_info ni; | 283 | struct node_info ni; |
| 248 | int err = 0; | 284 | int err = 0, recovered = 0; |
| 249 | int ilock; | 285 | int ilock; |
| 250 | 286 | ||
| 251 | start = start_bidx_of_node(ofs_of_node(page)); | 287 | start = start_bidx_of_node(ofs_of_node(page)); |
| @@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 283 | } | 319 | } |
| 284 | 320 | ||
| 285 | /* Check the previous node page having this index */ | 321 | /* Check the previous node page having this index */ |
| 286 | check_index_in_prev_nodes(sbi, dest); | 322 | err = check_index_in_prev_nodes(sbi, dest, &dn); |
| 323 | if (err) | ||
| 324 | goto err; | ||
| 287 | 325 | ||
| 288 | set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); | 326 | set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); |
| 289 | 327 | ||
| 290 | /* write dummy data page */ | 328 | /* write dummy data page */ |
| 291 | recover_data_page(sbi, NULL, &sum, src, dest); | 329 | recover_data_page(sbi, NULL, &sum, src, dest); |
| 292 | update_extent_cache(dest, &dn); | 330 | update_extent_cache(dest, &dn); |
| 331 | recovered++; | ||
| 293 | } | 332 | } |
| 294 | dn.ofs_in_node++; | 333 | dn.ofs_in_node++; |
| 295 | } | 334 | } |
| @@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 305 | set_page_dirty(dn.node_page); | 344 | set_page_dirty(dn.node_page); |
| 306 | 345 | ||
| 307 | recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); | 346 | recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); |
| 347 | err: | ||
| 308 | f2fs_put_dnode(&dn); | 348 | f2fs_put_dnode(&dn); |
| 309 | mutex_unlock_op(sbi, ilock); | 349 | mutex_unlock_op(sbi, ilock); |
| 310 | return 0; | 350 | |
| 351 | f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " | ||
| 352 | "recovered_data = %d blocks, err = %d", | ||
| 353 | inode->i_ino, recovered, err); | ||
| 354 | return err; | ||
| 311 | } | 355 | } |
| 312 | 356 | ||
| 313 | static int recover_data(struct f2fs_sb_info *sbi, | 357 | static int recover_data(struct f2fs_sb_info *sbi, |
| @@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi, | |||
| 340 | lock_page(page); | 384 | lock_page(page); |
| 341 | 385 | ||
| 342 | if (cp_ver != cpver_of_node(page)) | 386 | if (cp_ver != cpver_of_node(page)) |
| 343 | goto unlock_out; | 387 | break; |
| 344 | 388 | ||
| 345 | entry = get_fsync_inode(head, ino_of_node(page)); | 389 | entry = get_fsync_inode(head, ino_of_node(page)); |
| 346 | if (!entry) | 390 | if (!entry) |
| @@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi, | |||
| 348 | 392 | ||
| 349 | err = do_recover_data(sbi, entry->inode, page, blkaddr); | 393 | err = do_recover_data(sbi, entry->inode, page, blkaddr); |
| 350 | if (err) | 394 | if (err) |
| 351 | goto out; | 395 | break; |
| 352 | 396 | ||
| 353 | if (entry->blkaddr == blkaddr) { | 397 | if (entry->blkaddr == blkaddr) { |
| 354 | iput(entry->inode); | 398 | iput(entry->inode); |
| @@ -359,7 +403,6 @@ next: | |||
| 359 | /* check next segment */ | 403 | /* check next segment */ |
| 360 | blkaddr = next_blkaddr_of_node(page); | 404 | blkaddr = next_blkaddr_of_node(page); |
| 361 | } | 405 | } |
| 362 | unlock_out: | ||
| 363 | unlock_page(page); | 406 | unlock_page(page); |
| 364 | out: | 407 | out: |
| 365 | __free_pages(page, 0); | 408 | __free_pages(page, 0); |
| @@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) | |||
| 382 | INIT_LIST_HEAD(&inode_list); | 425 | INIT_LIST_HEAD(&inode_list); |
| 383 | 426 | ||
| 384 | /* step #1: find fsynced inode numbers */ | 427 | /* step #1: find fsynced inode numbers */ |
| 428 | sbi->por_doing = 1; | ||
| 385 | err = find_fsync_dnodes(sbi, &inode_list); | 429 | err = find_fsync_dnodes(sbi, &inode_list); |
| 386 | if (err) | 430 | if (err) |
| 387 | goto out; | 431 | goto out; |
| @@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) | |||
| 390 | goto out; | 434 | goto out; |
| 391 | 435 | ||
| 392 | /* step #2: recover data */ | 436 | /* step #2: recover data */ |
| 393 | sbi->por_doing = 1; | ||
| 394 | err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); | 437 | err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); |
| 395 | sbi->por_doing = 0; | ||
| 396 | BUG_ON(!list_empty(&inode_list)); | 438 | BUG_ON(!list_empty(&inode_list)); |
| 397 | out: | 439 | out: |
| 398 | destroy_fsync_dnodes(sbi, &inode_list); | 440 | destroy_fsync_dnodes(&inode_list); |
| 399 | kmem_cache_destroy(fsync_entry_slab); | 441 | kmem_cache_destroy(fsync_entry_slab); |
| 400 | write_checkpoint(sbi, false); | 442 | sbi->por_doing = 0; |
| 443 | if (!err) | ||
| 444 | write_checkpoint(sbi, false); | ||
| 401 | return err; | 445 | return err; |
| 402 | } | 446 | } |
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d8e84e49a5c3..a86d125a9885 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c | |||
| @@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, | |||
| 94 | * Adding dirty entry into seglist is not critical operation. | 94 | * Adding dirty entry into seglist is not critical operation. |
| 95 | * If a given segment is one of current working segments, it won't be added. | 95 | * If a given segment is one of current working segments, it won't be added. |
| 96 | */ | 96 | */ |
| 97 | void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) | 97 | static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) |
| 98 | { | 98 | { |
| 99 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | 99 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); |
| 100 | unsigned short valid_blocks; | 100 | unsigned short valid_blocks; |
| @@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) | |||
| 126 | static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) | 126 | static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) |
| 127 | { | 127 | { |
| 128 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | 128 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); |
| 129 | unsigned int segno, offset = 0; | 129 | unsigned int segno = -1; |
| 130 | unsigned int total_segs = TOTAL_SEGS(sbi); | 130 | unsigned int total_segs = TOTAL_SEGS(sbi); |
| 131 | 131 | ||
| 132 | mutex_lock(&dirty_i->seglist_lock); | 132 | mutex_lock(&dirty_i->seglist_lock); |
| 133 | while (1) { | 133 | while (1) { |
| 134 | segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, | 134 | segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, |
| 135 | offset); | 135 | segno + 1); |
| 136 | if (segno >= total_segs) | 136 | if (segno >= total_segs) |
| 137 | break; | 137 | break; |
| 138 | __set_test_and_free(sbi, segno); | 138 | __set_test_and_free(sbi, segno); |
| 139 | offset = segno + 1; | ||
| 140 | } | 139 | } |
| 141 | mutex_unlock(&dirty_i->seglist_lock); | 140 | mutex_unlock(&dirty_i->seglist_lock); |
| 142 | } | 141 | } |
| @@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) | |||
| 144 | void clear_prefree_segments(struct f2fs_sb_info *sbi) | 143 | void clear_prefree_segments(struct f2fs_sb_info *sbi) |
| 145 | { | 144 | { |
| 146 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | 145 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); |
| 147 | unsigned int segno, offset = 0; | 146 | unsigned int segno = -1; |
| 148 | unsigned int total_segs = TOTAL_SEGS(sbi); | 147 | unsigned int total_segs = TOTAL_SEGS(sbi); |
| 149 | 148 | ||
| 150 | mutex_lock(&dirty_i->seglist_lock); | 149 | mutex_lock(&dirty_i->seglist_lock); |
| 151 | while (1) { | 150 | while (1) { |
| 152 | segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, | 151 | segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, |
| 153 | offset); | 152 | segno + 1); |
| 154 | if (segno >= total_segs) | 153 | if (segno >= total_segs) |
| 155 | break; | 154 | break; |
| 156 | 155 | ||
| 157 | offset = segno + 1; | ||
| 158 | if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) | 156 | if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) |
| 159 | dirty_i->nr_dirty[PRE]--; | 157 | dirty_i->nr_dirty[PRE]--; |
| 160 | 158 | ||
| @@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) | |||
| 257 | * This function should be resided under the curseg_mutex lock | 255 | * This function should be resided under the curseg_mutex lock |
| 258 | */ | 256 | */ |
| 259 | static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, | 257 | static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, |
| 260 | struct f2fs_summary *sum, unsigned short offset) | 258 | struct f2fs_summary *sum) |
| 261 | { | 259 | { |
| 262 | struct curseg_info *curseg = CURSEG_I(sbi, type); | 260 | struct curseg_info *curseg = CURSEG_I(sbi, type); |
| 263 | void *addr = curseg->sum_blk; | 261 | void *addr = curseg->sum_blk; |
| 264 | addr += offset * sizeof(struct f2fs_summary); | 262 | addr += curseg->next_blkoff * sizeof(struct f2fs_summary); |
| 265 | memcpy(addr, sum, sizeof(struct f2fs_summary)); | 263 | memcpy(addr, sum, sizeof(struct f2fs_summary)); |
| 266 | return; | 264 | return; |
| 267 | } | 265 | } |
| @@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi, | |||
| 311 | f2fs_put_page(page, 1); | 309 | f2fs_put_page(page, 1); |
| 312 | } | 310 | } |
| 313 | 311 | ||
| 314 | static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type) | ||
| 315 | { | ||
| 316 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | ||
| 317 | unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; | ||
| 318 | unsigned int segno; | ||
| 319 | unsigned int ofs = 0; | ||
| 320 | |||
| 321 | /* | ||
| 322 | * If there is not enough reserved sections, | ||
| 323 | * we should not reuse prefree segments. | ||
| 324 | */ | ||
| 325 | if (has_not_enough_free_secs(sbi, 0)) | ||
| 326 | return NULL_SEGNO; | ||
| 327 | |||
| 328 | /* | ||
| 329 | * NODE page should not reuse prefree segment, | ||
| 330 | * since those information is used for SPOR. | ||
| 331 | */ | ||
| 332 | if (IS_NODESEG(type)) | ||
| 333 | return NULL_SEGNO; | ||
| 334 | next: | ||
| 335 | segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs); | ||
| 336 | ofs += sbi->segs_per_sec; | ||
| 337 | |||
| 338 | if (segno < TOTAL_SEGS(sbi)) { | ||
| 339 | int i; | ||
| 340 | |||
| 341 | /* skip intermediate segments in a section */ | ||
| 342 | if (segno % sbi->segs_per_sec) | ||
| 343 | goto next; | ||
| 344 | |||
| 345 | /* skip if the section is currently used */ | ||
| 346 | if (sec_usage_check(sbi, GET_SECNO(sbi, segno))) | ||
| 347 | goto next; | ||
| 348 | |||
| 349 | /* skip if whole section is not prefree */ | ||
| 350 | for (i = 1; i < sbi->segs_per_sec; i++) | ||
| 351 | if (!test_bit(segno + i, prefree_segmap)) | ||
| 352 | goto next; | ||
| 353 | |||
| 354 | /* skip if whole section was not free at the last checkpoint */ | ||
| 355 | for (i = 0; i < sbi->segs_per_sec; i++) | ||
| 356 | if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks) | ||
| 357 | goto next; | ||
| 358 | |||
| 359 | return segno; | ||
| 360 | } | ||
| 361 | return NULL_SEGNO; | ||
| 362 | } | ||
| 363 | |||
| 364 | static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) | 312 | static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) |
| 365 | { | 313 | { |
| 366 | struct curseg_info *curseg = CURSEG_I(sbi, type); | 314 | struct curseg_info *curseg = CURSEG_I(sbi, type); |
| 367 | unsigned int segno = curseg->segno; | 315 | unsigned int segno = curseg->segno + 1; |
| 368 | struct free_segmap_info *free_i = FREE_I(sbi); | 316 | struct free_segmap_info *free_i = FREE_I(sbi); |
| 369 | 317 | ||
| 370 | if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) | 318 | if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) |
| 371 | return !test_bit(segno + 1, free_i->free_segmap); | 319 | return !test_bit(segno, free_i->free_segmap); |
| 372 | return 0; | 320 | return 0; |
| 373 | } | 321 | } |
| 374 | 322 | ||
| @@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) | |||
| 495 | int dir = ALLOC_LEFT; | 443 | int dir = ALLOC_LEFT; |
| 496 | 444 | ||
| 497 | write_sum_page(sbi, curseg->sum_blk, | 445 | write_sum_page(sbi, curseg->sum_blk, |
| 498 | GET_SUM_BLOCK(sbi, curseg->segno)); | 446 | GET_SUM_BLOCK(sbi, segno)); |
| 499 | if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) | 447 | if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) |
| 500 | dir = ALLOC_RIGHT; | 448 | dir = ALLOC_RIGHT; |
| 501 | 449 | ||
| @@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, | |||
| 599 | goto out; | 547 | goto out; |
| 600 | } | 548 | } |
| 601 | 549 | ||
| 602 | curseg->next_segno = check_prefree_segments(sbi, type); | 550 | if (type == CURSEG_WARM_NODE) |
| 603 | |||
| 604 | if (curseg->next_segno != NULL_SEGNO) | ||
| 605 | change_curseg(sbi, type, false); | ||
| 606 | else if (type == CURSEG_WARM_NODE) | ||
| 607 | new_curseg(sbi, type, false); | 551 | new_curseg(sbi, type, false); |
| 608 | else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) | 552 | else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) |
| 609 | new_curseg(sbi, type, false); | 553 | new_curseg(sbi, type, false); |
| @@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, | |||
| 612 | else | 556 | else |
| 613 | new_curseg(sbi, type, false); | 557 | new_curseg(sbi, type, false); |
| 614 | out: | 558 | out: |
| 559 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 615 | sbi->segment_count[curseg->alloc_type]++; | 560 | sbi->segment_count[curseg->alloc_type]++; |
| 561 | #endif | ||
| 562 | return; | ||
| 616 | } | 563 | } |
| 617 | 564 | ||
| 618 | void allocate_new_segments(struct f2fs_sb_info *sbi) | 565 | void allocate_new_segments(struct f2fs_sb_info *sbi) |
| @@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) | |||
| 795 | 742 | ||
| 796 | if (S_ISDIR(inode->i_mode)) | 743 | if (S_ISDIR(inode->i_mode)) |
| 797 | return CURSEG_HOT_DATA; | 744 | return CURSEG_HOT_DATA; |
| 798 | else if (is_cold_data(page) || is_cold_file(inode)) | 745 | else if (is_cold_data(page) || file_is_cold(inode)) |
| 799 | return CURSEG_COLD_DATA; | 746 | return CURSEG_COLD_DATA; |
| 800 | else | 747 | else |
| 801 | return CURSEG_WARM_DATA; | 748 | return CURSEG_WARM_DATA; |
| @@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, | |||
| 844 | * because, this function updates a summary entry in the | 791 | * because, this function updates a summary entry in the |
| 845 | * current summary block. | 792 | * current summary block. |
| 846 | */ | 793 | */ |
| 847 | __add_sum_entry(sbi, type, sum, curseg->next_blkoff); | 794 | __add_sum_entry(sbi, type, sum); |
| 848 | 795 | ||
| 849 | mutex_lock(&sit_i->sentry_lock); | 796 | mutex_lock(&sit_i->sentry_lock); |
| 850 | __refresh_next_blkoff(sbi, curseg); | 797 | __refresh_next_blkoff(sbi, curseg); |
| 798 | #ifdef CONFIG_F2FS_STAT_FS | ||
| 851 | sbi->block_count[curseg->alloc_type]++; | 799 | sbi->block_count[curseg->alloc_type]++; |
| 800 | #endif | ||
| 852 | 801 | ||
| 853 | /* | 802 | /* |
| 854 | * SIT information should be updated before segment allocation, | 803 | * SIT information should be updated before segment allocation, |
| @@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, | |||
| 943 | 892 | ||
| 944 | curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & | 893 | curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & |
| 945 | (sbi->blocks_per_seg - 1); | 894 | (sbi->blocks_per_seg - 1); |
| 946 | __add_sum_entry(sbi, type, sum, curseg->next_blkoff); | 895 | __add_sum_entry(sbi, type, sum); |
| 947 | 896 | ||
| 948 | refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); | 897 | refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); |
| 949 | 898 | ||
| @@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, | |||
| 980 | } | 929 | } |
| 981 | curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & | 930 | curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & |
| 982 | (sbi->blocks_per_seg - 1); | 931 | (sbi->blocks_per_seg - 1); |
| 983 | __add_sum_entry(sbi, type, sum, curseg->next_blkoff); | 932 | __add_sum_entry(sbi, type, sum); |
| 984 | 933 | ||
| 985 | /* change the current log to the next block addr in advance */ | 934 | /* change the current log to the next block addr in advance */ |
| 986 | if (next_segno != segno) { | 935 | if (next_segno != segno) { |
| @@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) | |||
| 1579 | { | 1528 | { |
| 1580 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); | 1529 | struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); |
| 1581 | struct free_segmap_info *free_i = FREE_I(sbi); | 1530 | struct free_segmap_info *free_i = FREE_I(sbi); |
| 1582 | unsigned int segno = 0, offset = 0; | 1531 | unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); |
| 1583 | unsigned short valid_blocks; | 1532 | unsigned short valid_blocks; |
| 1584 | 1533 | ||
| 1585 | while (segno < TOTAL_SEGS(sbi)) { | 1534 | while (1) { |
| 1586 | /* find dirty segment based on free segmap */ | 1535 | /* find dirty segment based on free segmap */ |
| 1587 | segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); | 1536 | segno = find_next_inuse(free_i, total_segs, offset); |
| 1588 | if (segno >= TOTAL_SEGS(sbi)) | 1537 | if (segno >= total_segs) |
| 1589 | break; | 1538 | break; |
| 1590 | offset = segno + 1; | 1539 | offset = segno + 1; |
| 1591 | valid_blocks = get_valid_blocks(sbi, segno, 0); | 1540 | valid_blocks = get_valid_blocks(sbi, segno, 0); |
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8555f7df82c7..75c7dc363e92 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | static struct kmem_cache *f2fs_inode_cachep; | 34 | static struct kmem_cache *f2fs_inode_cachep; |
| 35 | 35 | ||
| 36 | enum { | 36 | enum { |
| 37 | Opt_gc_background_off, | 37 | Opt_gc_background, |
| 38 | Opt_disable_roll_forward, | 38 | Opt_disable_roll_forward, |
| 39 | Opt_discard, | 39 | Opt_discard, |
| 40 | Opt_noheap, | 40 | Opt_noheap, |
| @@ -46,7 +46,7 @@ enum { | |||
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | static match_table_t f2fs_tokens = { | 48 | static match_table_t f2fs_tokens = { |
| 49 | {Opt_gc_background_off, "background_gc_off"}, | 49 | {Opt_gc_background, "background_gc=%s"}, |
| 50 | {Opt_disable_roll_forward, "disable_roll_forward"}, | 50 | {Opt_disable_roll_forward, "disable_roll_forward"}, |
| 51 | {Opt_discard, "discard"}, | 51 | {Opt_discard, "discard"}, |
| 52 | {Opt_noheap, "no_heap"}, | 52 | {Opt_noheap, "no_heap"}, |
| @@ -76,6 +76,91 @@ static void init_once(void *foo) | |||
| 76 | inode_init_once(&fi->vfs_inode); | 76 | inode_init_once(&fi->vfs_inode); |
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | static int parse_options(struct super_block *sb, char *options) | ||
| 80 | { | ||
| 81 | struct f2fs_sb_info *sbi = F2FS_SB(sb); | ||
| 82 | substring_t args[MAX_OPT_ARGS]; | ||
| 83 | char *p, *name; | ||
| 84 | int arg = 0; | ||
| 85 | |||
| 86 | if (!options) | ||
| 87 | return 0; | ||
| 88 | |||
| 89 | while ((p = strsep(&options, ",")) != NULL) { | ||
| 90 | int token; | ||
| 91 | if (!*p) | ||
| 92 | continue; | ||
| 93 | /* | ||
| 94 | * Initialize args struct so we know whether arg was | ||
| 95 | * found; some options take optional arguments. | ||
| 96 | */ | ||
| 97 | args[0].to = args[0].from = NULL; | ||
| 98 | token = match_token(p, f2fs_tokens, args); | ||
| 99 | |||
| 100 | switch (token) { | ||
| 101 | case Opt_gc_background: | ||
| 102 | name = match_strdup(&args[0]); | ||
| 103 | |||
| 104 | if (!name) | ||
| 105 | return -ENOMEM; | ||
| 106 | if (!strncmp(name, "on", 2)) | ||
| 107 | set_opt(sbi, BG_GC); | ||
| 108 | else if (!strncmp(name, "off", 3)) | ||
| 109 | clear_opt(sbi, BG_GC); | ||
| 110 | else { | ||
| 111 | kfree(name); | ||
| 112 | return -EINVAL; | ||
| 113 | } | ||
| 114 | kfree(name); | ||
| 115 | break; | ||
| 116 | case Opt_disable_roll_forward: | ||
| 117 | set_opt(sbi, DISABLE_ROLL_FORWARD); | ||
| 118 | break; | ||
| 119 | case Opt_discard: | ||
| 120 | set_opt(sbi, DISCARD); | ||
| 121 | break; | ||
| 122 | case Opt_noheap: | ||
| 123 | set_opt(sbi, NOHEAP); | ||
| 124 | break; | ||
| 125 | #ifdef CONFIG_F2FS_FS_XATTR | ||
| 126 | case Opt_nouser_xattr: | ||
| 127 | clear_opt(sbi, XATTR_USER); | ||
| 128 | break; | ||
| 129 | #else | ||
| 130 | case Opt_nouser_xattr: | ||
| 131 | f2fs_msg(sb, KERN_INFO, | ||
| 132 | "nouser_xattr options not supported"); | ||
| 133 | break; | ||
| 134 | #endif | ||
| 135 | #ifdef CONFIG_F2FS_FS_POSIX_ACL | ||
| 136 | case Opt_noacl: | ||
| 137 | clear_opt(sbi, POSIX_ACL); | ||
| 138 | break; | ||
| 139 | #else | ||
| 140 | case Opt_noacl: | ||
| 141 | f2fs_msg(sb, KERN_INFO, "noacl options not supported"); | ||
| 142 | break; | ||
| 143 | #endif | ||
| 144 | case Opt_active_logs: | ||
| 145 | if (args->from && match_int(args, &arg)) | ||
| 146 | return -EINVAL; | ||
| 147 | if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) | ||
| 148 | return -EINVAL; | ||
| 149 | sbi->active_logs = arg; | ||
| 150 | break; | ||
| 151 | case Opt_disable_ext_identify: | ||
| 152 | set_opt(sbi, DISABLE_EXT_IDENTIFY); | ||
| 153 | break; | ||
| 154 | default: | ||
| 155 | f2fs_msg(sb, KERN_ERR, | ||
| 156 | "Unrecognized mount option \"%s\" or missing value", | ||
| 157 | p); | ||
| 158 | return -EINVAL; | ||
| 159 | } | ||
| 160 | } | ||
| 161 | return 0; | ||
| 162 | } | ||
| 163 | |||
| 79 | static struct inode *f2fs_alloc_inode(struct super_block *sb) | 164 | static struct inode *f2fs_alloc_inode(struct super_block *sb) |
| 80 | { | 165 | { |
| 81 | struct f2fs_inode_info *fi; | 166 | struct f2fs_inode_info *fi; |
| @@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode) | |||
| 112 | return generic_drop_inode(inode); | 197 | return generic_drop_inode(inode); |
| 113 | } | 198 | } |
| 114 | 199 | ||
| 200 | /* | ||
| 201 | * f2fs_dirty_inode() is called from __mark_inode_dirty() | ||
| 202 | * | ||
| 203 | * We should call set_dirty_inode to write the dirty inode through write_inode. | ||
| 204 | */ | ||
| 205 | static void f2fs_dirty_inode(struct inode *inode, int flags) | ||
| 206 | { | ||
| 207 | set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); | ||
| 208 | return; | ||
| 209 | } | ||
| 210 | |||
| 115 | static void f2fs_i_callback(struct rcu_head *head) | 211 | static void f2fs_i_callback(struct rcu_head *head) |
| 116 | { | 212 | { |
| 117 | struct inode *inode = container_of(head, struct inode, i_rcu); | 213 | struct inode *inode = container_of(head, struct inode, i_rcu); |
| @@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb) | |||
| 170 | { | 266 | { |
| 171 | int err; | 267 | int err; |
| 172 | 268 | ||
| 173 | if (sb->s_flags & MS_RDONLY) | 269 | if (f2fs_readonly(sb)) |
| 174 | return 0; | 270 | return 0; |
| 175 | 271 | ||
| 176 | err = f2fs_sync_fs(sb, 1); | 272 | err = f2fs_sync_fs(sb, 1); |
| @@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) | |||
| 214 | { | 310 | { |
| 215 | struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); | 311 | struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); |
| 216 | 312 | ||
| 217 | if (test_opt(sbi, BG_GC)) | 313 | if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) |
| 218 | seq_puts(seq, ",background_gc_on"); | 314 | seq_printf(seq, ",background_gc=%s", "on"); |
| 219 | else | 315 | else |
| 220 | seq_puts(seq, ",background_gc_off"); | 316 | seq_printf(seq, ",background_gc=%s", "off"); |
| 221 | if (test_opt(sbi, DISABLE_ROLL_FORWARD)) | 317 | if (test_opt(sbi, DISABLE_ROLL_FORWARD)) |
| 222 | seq_puts(seq, ",disable_roll_forward"); | 318 | seq_puts(seq, ",disable_roll_forward"); |
| 223 | if (test_opt(sbi, DISCARD)) | 319 | if (test_opt(sbi, DISCARD)) |
| @@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) | |||
| 244 | return 0; | 340 | return 0; |
| 245 | } | 341 | } |
| 246 | 342 | ||
| 343 | static int f2fs_remount(struct super_block *sb, int *flags, char *data) | ||
| 344 | { | ||
| 345 | struct f2fs_sb_info *sbi = F2FS_SB(sb); | ||
| 346 | struct f2fs_mount_info org_mount_opt; | ||
| 347 | int err, active_logs; | ||
| 348 | |||
| 349 | /* | ||
| 350 | * Save the old mount options in case we | ||
| 351 | * need to restore them. | ||
| 352 | */ | ||
| 353 | org_mount_opt = sbi->mount_opt; | ||
| 354 | active_logs = sbi->active_logs; | ||
| 355 | |||
| 356 | /* parse mount options */ | ||
| 357 | err = parse_options(sb, data); | ||
| 358 | if (err) | ||
| 359 | goto restore_opts; | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Previous and new state of filesystem is RO, | ||
| 363 | * so no point in checking GC conditions. | ||
| 364 | */ | ||
| 365 | if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) | ||
| 366 | goto skip; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * We stop the GC thread if FS is mounted as RO | ||
| 370 | * or if background_gc = off is passed in mount | ||
| 371 | * option. Also sync the filesystem. | ||
| 372 | */ | ||
| 373 | if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { | ||
| 374 | if (sbi->gc_thread) { | ||
| 375 | stop_gc_thread(sbi); | ||
| 376 | f2fs_sync_fs(sb, 1); | ||
| 377 | } | ||
| 378 | } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { | ||
| 379 | err = start_gc_thread(sbi); | ||
| 380 | if (err) | ||
| 381 | goto restore_opts; | ||
| 382 | } | ||
| 383 | skip: | ||
| 384 | /* Update the POSIXACL Flag */ | ||
| 385 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | ||
| 386 | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); | ||
| 387 | return 0; | ||
| 388 | |||
| 389 | restore_opts: | ||
| 390 | sbi->mount_opt = org_mount_opt; | ||
| 391 | sbi->active_logs = active_logs; | ||
| 392 | return err; | ||
| 393 | } | ||
| 394 | |||
| 247 | static struct super_operations f2fs_sops = { | 395 | static struct super_operations f2fs_sops = { |
| 248 | .alloc_inode = f2fs_alloc_inode, | 396 | .alloc_inode = f2fs_alloc_inode, |
| 249 | .drop_inode = f2fs_drop_inode, | 397 | .drop_inode = f2fs_drop_inode, |
| 250 | .destroy_inode = f2fs_destroy_inode, | 398 | .destroy_inode = f2fs_destroy_inode, |
| 251 | .write_inode = f2fs_write_inode, | 399 | .write_inode = f2fs_write_inode, |
| 400 | .dirty_inode = f2fs_dirty_inode, | ||
| 252 | .show_options = f2fs_show_options, | 401 | .show_options = f2fs_show_options, |
| 253 | .evict_inode = f2fs_evict_inode, | 402 | .evict_inode = f2fs_evict_inode, |
| 254 | .put_super = f2fs_put_super, | 403 | .put_super = f2fs_put_super, |
| @@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = { | |||
| 256 | .freeze_fs = f2fs_freeze, | 405 | .freeze_fs = f2fs_freeze, |
| 257 | .unfreeze_fs = f2fs_unfreeze, | 406 | .unfreeze_fs = f2fs_unfreeze, |
| 258 | .statfs = f2fs_statfs, | 407 | .statfs = f2fs_statfs, |
| 408 | .remount_fs = f2fs_remount, | ||
| 259 | }; | 409 | }; |
| 260 | 410 | ||
| 261 | static struct inode *f2fs_nfs_get_inode(struct super_block *sb, | 411 | static struct inode *f2fs_nfs_get_inode(struct super_block *sb, |
| @@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = { | |||
| 303 | .get_parent = f2fs_get_parent, | 453 | .get_parent = f2fs_get_parent, |
| 304 | }; | 454 | }; |
| 305 | 455 | ||
| 306 | static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, | ||
| 307 | char *options) | ||
| 308 | { | ||
| 309 | substring_t args[MAX_OPT_ARGS]; | ||
| 310 | char *p; | ||
| 311 | int arg = 0; | ||
| 312 | |||
| 313 | if (!options) | ||
| 314 | return 0; | ||
| 315 | |||
| 316 | while ((p = strsep(&options, ",")) != NULL) { | ||
| 317 | int token; | ||
| 318 | if (!*p) | ||
| 319 | continue; | ||
| 320 | /* | ||
| 321 | * Initialize args struct so we know whether arg was | ||
| 322 | * found; some options take optional arguments. | ||
| 323 | */ | ||
| 324 | args[0].to = args[0].from = NULL; | ||
| 325 | token = match_token(p, f2fs_tokens, args); | ||
| 326 | |||
| 327 | switch (token) { | ||
| 328 | case Opt_gc_background_off: | ||
| 329 | clear_opt(sbi, BG_GC); | ||
| 330 | break; | ||
| 331 | case Opt_disable_roll_forward: | ||
| 332 | set_opt(sbi, DISABLE_ROLL_FORWARD); | ||
| 333 | break; | ||
| 334 | case Opt_discard: | ||
| 335 | set_opt(sbi, DISCARD); | ||
| 336 | break; | ||
| 337 | case Opt_noheap: | ||
| 338 | set_opt(sbi, NOHEAP); | ||
| 339 | break; | ||
| 340 | #ifdef CONFIG_F2FS_FS_XATTR | ||
| 341 | case Opt_nouser_xattr: | ||
| 342 | clear_opt(sbi, XATTR_USER); | ||
| 343 | break; | ||
| 344 | #else | ||
| 345 | case Opt_nouser_xattr: | ||
| 346 | f2fs_msg(sb, KERN_INFO, | ||
| 347 | "nouser_xattr options not supported"); | ||
| 348 | break; | ||
| 349 | #endif | ||
| 350 | #ifdef CONFIG_F2FS_FS_POSIX_ACL | ||
| 351 | case Opt_noacl: | ||
| 352 | clear_opt(sbi, POSIX_ACL); | ||
| 353 | break; | ||
| 354 | #else | ||
| 355 | case Opt_noacl: | ||
| 356 | f2fs_msg(sb, KERN_INFO, "noacl options not supported"); | ||
| 357 | break; | ||
| 358 | #endif | ||
| 359 | case Opt_active_logs: | ||
| 360 | if (args->from && match_int(args, &arg)) | ||
| 361 | return -EINVAL; | ||
| 362 | if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) | ||
| 363 | return -EINVAL; | ||
| 364 | sbi->active_logs = arg; | ||
| 365 | break; | ||
| 366 | case Opt_disable_ext_identify: | ||
| 367 | set_opt(sbi, DISABLE_EXT_IDENTIFY); | ||
| 368 | break; | ||
| 369 | default: | ||
| 370 | f2fs_msg(sb, KERN_ERR, | ||
| 371 | "Unrecognized mount option \"%s\" or missing value", | ||
| 372 | p); | ||
| 373 | return -EINVAL; | ||
| 374 | } | ||
| 375 | } | ||
| 376 | return 0; | ||
| 377 | } | ||
| 378 | |||
| 379 | static loff_t max_file_size(unsigned bits) | 456 | static loff_t max_file_size(unsigned bits) |
| 380 | { | 457 | { |
| 381 | loff_t result = ADDRS_PER_INODE; | 458 | loff_t result = ADDRS_PER_INODE; |
| @@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 541 | if (err) | 618 | if (err) |
| 542 | goto free_sb_buf; | 619 | goto free_sb_buf; |
| 543 | } | 620 | } |
| 621 | sb->s_fs_info = sbi; | ||
| 544 | /* init some FS parameters */ | 622 | /* init some FS parameters */ |
| 545 | sbi->active_logs = NR_CURSEG_TYPE; | 623 | sbi->active_logs = NR_CURSEG_TYPE; |
| 546 | 624 | ||
| @@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 553 | set_opt(sbi, POSIX_ACL); | 631 | set_opt(sbi, POSIX_ACL); |
| 554 | #endif | 632 | #endif |
| 555 | /* parse mount options */ | 633 | /* parse mount options */ |
| 556 | err = parse_options(sb, sbi, (char *)data); | 634 | err = parse_options(sb, (char *)data); |
| 557 | if (err) | 635 | if (err) |
| 558 | goto free_sb_buf; | 636 | goto free_sb_buf; |
| 559 | 637 | ||
| @@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 565 | sb->s_xattr = f2fs_xattr_handlers; | 643 | sb->s_xattr = f2fs_xattr_handlers; |
| 566 | sb->s_export_op = &f2fs_export_ops; | 644 | sb->s_export_op = &f2fs_export_ops; |
| 567 | sb->s_magic = F2FS_SUPER_MAGIC; | 645 | sb->s_magic = F2FS_SUPER_MAGIC; |
| 568 | sb->s_fs_info = sbi; | ||
| 569 | sb->s_time_gran = 1; | 646 | sb->s_time_gran = 1; |
| 570 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 647 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
| 571 | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); | 648 | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); |
| @@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 674 | "Cannot recover all fsync data errno=%ld", err); | 751 | "Cannot recover all fsync data errno=%ld", err); |
| 675 | } | 752 | } |
| 676 | 753 | ||
| 677 | /* After POR, we can run background GC thread */ | 754 | /* |
| 678 | err = start_gc_thread(sbi); | 755 | * If filesystem is not mounted as read-only then |
| 679 | if (err) | 756 | * do start the gc_thread. |
| 680 | goto fail; | 757 | */ |
| 758 | if (!(sb->s_flags & MS_RDONLY)) { | ||
| 759 | /* After POR, we can run background GC thread.*/ | ||
| 760 | err = start_gc_thread(sbi); | ||
| 761 | if (err) | ||
| 762 | goto fail; | ||
| 763 | } | ||
| 681 | 764 | ||
| 682 | err = f2fs_build_stats(sbi); | 765 | err = f2fs_build_stats(sbi); |
| 683 | if (err) | 766 | if (err) |
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 0b02dce31356..3ab07ecd86ca 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | */ | 20 | */ |
| 21 | #include <linux/rwsem.h> | 21 | #include <linux/rwsem.h> |
| 22 | #include <linux/f2fs_fs.h> | 22 | #include <linux/f2fs_fs.h> |
| 23 | #include <linux/security.h> | ||
| 23 | #include "f2fs.h" | 24 | #include "f2fs.h" |
| 24 | #include "xattr.h" | 25 | #include "xattr.h" |
| 25 | 26 | ||
| @@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, | |||
| 43 | prefix = XATTR_TRUSTED_PREFIX; | 44 | prefix = XATTR_TRUSTED_PREFIX; |
| 44 | prefix_len = XATTR_TRUSTED_PREFIX_LEN; | 45 | prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
| 45 | break; | 46 | break; |
| 47 | case F2FS_XATTR_INDEX_SECURITY: | ||
| 48 | prefix = XATTR_SECURITY_PREFIX; | ||
| 49 | prefix_len = XATTR_SECURITY_PREFIX_LEN; | ||
| 50 | break; | ||
| 46 | default: | 51 | default: |
| 47 | return -EINVAL; | 52 | return -EINVAL; |
| 48 | } | 53 | } |
| @@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, | |||
| 50 | total_len = prefix_len + name_len + 1; | 55 | total_len = prefix_len + name_len + 1; |
| 51 | if (list && total_len <= list_size) { | 56 | if (list && total_len <= list_size) { |
| 52 | memcpy(list, prefix, prefix_len); | 57 | memcpy(list, prefix, prefix_len); |
| 53 | memcpy(list+prefix_len, name, name_len); | 58 | memcpy(list + prefix_len, name, name_len); |
| 54 | list[prefix_len + name_len] = '\0'; | 59 | list[prefix_len + name_len] = '\0'; |
| 55 | } | 60 | } |
| 56 | return total_len; | 61 | return total_len; |
| @@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, | |||
| 70 | if (!capable(CAP_SYS_ADMIN)) | 75 | if (!capable(CAP_SYS_ADMIN)) |
| 71 | return -EPERM; | 76 | return -EPERM; |
| 72 | break; | 77 | break; |
| 78 | case F2FS_XATTR_INDEX_SECURITY: | ||
| 79 | break; | ||
| 73 | default: | 80 | default: |
| 74 | return -EINVAL; | 81 | return -EINVAL; |
| 75 | } | 82 | } |
| 76 | if (strcmp(name, "") == 0) | 83 | if (strcmp(name, "") == 0) |
| 77 | return -EINVAL; | 84 | return -EINVAL; |
| 78 | return f2fs_getxattr(dentry->d_inode, type, name, | 85 | return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); |
| 79 | buffer, size); | ||
| 80 | } | 86 | } |
| 81 | 87 | ||
| 82 | static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, | 88 | static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, |
| @@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, | |||
| 93 | if (!capable(CAP_SYS_ADMIN)) | 99 | if (!capable(CAP_SYS_ADMIN)) |
| 94 | return -EPERM; | 100 | return -EPERM; |
| 95 | break; | 101 | break; |
| 102 | case F2FS_XATTR_INDEX_SECURITY: | ||
| 103 | break; | ||
| 96 | default: | 104 | default: |
| 97 | return -EINVAL; | 105 | return -EINVAL; |
| 98 | } | 106 | } |
| 99 | if (strcmp(name, "") == 0) | 107 | if (strcmp(name, "") == 0) |
| 100 | return -EINVAL; | 108 | return -EINVAL; |
| 101 | 109 | ||
| 102 | return f2fs_setxattr(dentry->d_inode, type, name, value, size); | 110 | return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); |
| 103 | } | 111 | } |
| 104 | 112 | ||
| 105 | static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, | 113 | static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, |
| @@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, | |||
| 145 | return 0; | 153 | return 0; |
| 146 | } | 154 | } |
| 147 | 155 | ||
| 156 | #ifdef CONFIG_F2FS_FS_SECURITY | ||
| 157 | static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, | ||
| 158 | void *page) | ||
| 159 | { | ||
| 160 | const struct xattr *xattr; | ||
| 161 | int err = 0; | ||
| 162 | |||
| 163 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
| 164 | err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, | ||
| 165 | xattr->name, xattr->value, | ||
| 166 | xattr->value_len, (struct page *)page); | ||
| 167 | if (err < 0) | ||
| 168 | break; | ||
| 169 | } | ||
| 170 | return err; | ||
| 171 | } | ||
| 172 | |||
| 173 | int f2fs_init_security(struct inode *inode, struct inode *dir, | ||
| 174 | const struct qstr *qstr, struct page *ipage) | ||
| 175 | { | ||
| 176 | return security_inode_init_security(inode, dir, qstr, | ||
| 177 | &f2fs_initxattrs, ipage); | ||
| 178 | } | ||
| 179 | #endif | ||
| 180 | |||
| 148 | const struct xattr_handler f2fs_xattr_user_handler = { | 181 | const struct xattr_handler f2fs_xattr_user_handler = { |
| 149 | .prefix = XATTR_USER_PREFIX, | 182 | .prefix = XATTR_USER_PREFIX, |
| 150 | .flags = F2FS_XATTR_INDEX_USER, | 183 | .flags = F2FS_XATTR_INDEX_USER, |
| @@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = { | |||
| 169 | .set = f2fs_xattr_advise_set, | 202 | .set = f2fs_xattr_advise_set, |
| 170 | }; | 203 | }; |
| 171 | 204 | ||
| 205 | const struct xattr_handler f2fs_xattr_security_handler = { | ||
| 206 | .prefix = XATTR_SECURITY_PREFIX, | ||
| 207 | .flags = F2FS_XATTR_INDEX_SECURITY, | ||
| 208 | .list = f2fs_xattr_generic_list, | ||
| 209 | .get = f2fs_xattr_generic_get, | ||
| 210 | .set = f2fs_xattr_generic_set, | ||
| 211 | }; | ||
| 212 | |||
| 172 | static const struct xattr_handler *f2fs_xattr_handler_map[] = { | 213 | static const struct xattr_handler *f2fs_xattr_handler_map[] = { |
| 173 | [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, | 214 | [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, |
| 174 | #ifdef CONFIG_F2FS_FS_POSIX_ACL | 215 | #ifdef CONFIG_F2FS_FS_POSIX_ACL |
| @@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { | |||
| 176 | [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, | 217 | [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, |
| 177 | #endif | 218 | #endif |
| 178 | [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, | 219 | [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, |
| 220 | #ifdef CONFIG_F2FS_FS_SECURITY | ||
| 221 | [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, | ||
| 222 | #endif | ||
| 179 | [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, | 223 | [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, |
| 180 | }; | 224 | }; |
| 181 | 225 | ||
| @@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { | |||
| 186 | &f2fs_xattr_acl_default_handler, | 230 | &f2fs_xattr_acl_default_handler, |
| 187 | #endif | 231 | #endif |
| 188 | &f2fs_xattr_trusted_handler, | 232 | &f2fs_xattr_trusted_handler, |
| 233 | #ifdef CONFIG_F2FS_FS_SECURITY | ||
| 234 | &f2fs_xattr_security_handler, | ||
| 235 | #endif | ||
| 189 | &f2fs_xattr_advise_handler, | 236 | &f2fs_xattr_advise_handler, |
| 190 | NULL, | 237 | NULL, |
| 191 | }; | 238 | }; |
| @@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, | |||
| 218 | return -ENODATA; | 265 | return -ENODATA; |
| 219 | 266 | ||
| 220 | page = get_node_page(sbi, fi->i_xattr_nid); | 267 | page = get_node_page(sbi, fi->i_xattr_nid); |
| 268 | if (IS_ERR(page)) | ||
| 269 | return PTR_ERR(page); | ||
| 221 | base_addr = page_address(page); | 270 | base_addr = page_address(page); |
| 222 | 271 | ||
| 223 | list_for_each_xattr(entry, base_addr) { | 272 | list_for_each_xattr(entry, base_addr) { |
| @@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) | |||
| 268 | return 0; | 317 | return 0; |
| 269 | 318 | ||
| 270 | page = get_node_page(sbi, fi->i_xattr_nid); | 319 | page = get_node_page(sbi, fi->i_xattr_nid); |
| 320 | if (IS_ERR(page)) | ||
| 321 | return PTR_ERR(page); | ||
| 271 | base_addr = page_address(page); | 322 | base_addr = page_address(page); |
| 272 | 323 | ||
| 273 | list_for_each_xattr(entry, base_addr) { | 324 | list_for_each_xattr(entry, base_addr) { |
| @@ -296,7 +347,7 @@ cleanup: | |||
| 296 | } | 347 | } |
| 297 | 348 | ||
| 298 | int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | 349 | int f2fs_setxattr(struct inode *inode, int name_index, const char *name, |
| 299 | const void *value, size_t value_len) | 350 | const void *value, size_t value_len, struct page *ipage) |
| 300 | { | 351 | { |
| 301 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 352 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 302 | struct f2fs_inode_info *fi = F2FS_I(inode); | 353 | struct f2fs_inode_info *fi = F2FS_I(inode); |
| @@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | |||
| 335 | set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); | 386 | set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); |
| 336 | mark_inode_dirty(inode); | 387 | mark_inode_dirty(inode); |
| 337 | 388 | ||
| 338 | page = new_node_page(&dn, XATTR_NODE_OFFSET); | 389 | page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); |
| 339 | if (IS_ERR(page)) { | 390 | if (IS_ERR(page)) { |
| 340 | alloc_nid_failed(sbi, fi->i_xattr_nid); | 391 | alloc_nid_failed(sbi, fi->i_xattr_nid); |
| 341 | fi->i_xattr_nid = 0; | 392 | fi->i_xattr_nid = 0; |
| @@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | |||
| 435 | inode->i_ctime = CURRENT_TIME; | 486 | inode->i_ctime = CURRENT_TIME; |
| 436 | clear_inode_flag(fi, FI_ACL_MODE); | 487 | clear_inode_flag(fi, FI_ACL_MODE); |
| 437 | } | 488 | } |
| 438 | update_inode_page(inode); | 489 | if (ipage) |
| 490 | update_inode(inode, ipage); | ||
| 491 | else | ||
| 492 | update_inode_page(inode); | ||
| 439 | mutex_unlock_op(sbi, ilock); | 493 | mutex_unlock_op(sbi, ilock); |
| 440 | 494 | ||
| 441 | return 0; | 495 | return 0; |
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 49c9558305e3..3c0817bef25d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h | |||
| @@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler; | |||
| 112 | extern const struct xattr_handler f2fs_xattr_acl_access_handler; | 112 | extern const struct xattr_handler f2fs_xattr_acl_access_handler; |
| 113 | extern const struct xattr_handler f2fs_xattr_acl_default_handler; | 113 | extern const struct xattr_handler f2fs_xattr_acl_default_handler; |
| 114 | extern const struct xattr_handler f2fs_xattr_advise_handler; | 114 | extern const struct xattr_handler f2fs_xattr_advise_handler; |
| 115 | extern const struct xattr_handler f2fs_xattr_security_handler; | ||
| 115 | 116 | ||
| 116 | extern const struct xattr_handler *f2fs_xattr_handlers[]; | 117 | extern const struct xattr_handler *f2fs_xattr_handlers[]; |
| 117 | 118 | ||
| 118 | extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | 119 | extern int f2fs_setxattr(struct inode *, int, const char *, |
| 119 | const void *value, size_t value_len); | 120 | const void *, size_t, struct page *); |
| 120 | extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, | 121 | extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); |
| 121 | void *buffer, size_t buffer_size); | 122 | extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); |
| 122 | extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, | ||
| 123 | size_t buffer_size); | ||
| 124 | |||
| 125 | #else | 123 | #else |
| 126 | 124 | ||
| 127 | #define f2fs_xattr_handlers NULL | 125 | #define f2fs_xattr_handlers NULL |
| 128 | static inline int f2fs_setxattr(struct inode *inode, int name_index, | 126 | static inline int f2fs_setxattr(struct inode *inode, int name_index, |
| 129 | const char *name, const void *value, size_t value_len) | 127 | const char *name, const void *value, size_t value_len) |
| 130 | { | 128 | { |
| 131 | return -EOPNOTSUPP; | 129 | return -EOPNOTSUPP; |
| 132 | } | 130 | } |
| @@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, | |||
| 142 | } | 140 | } |
| 143 | #endif | 141 | #endif |
| 144 | 142 | ||
| 143 | #ifdef CONFIG_F2FS_FS_SECURITY | ||
| 144 | extern int f2fs_init_security(struct inode *, struct inode *, | ||
| 145 | const struct qstr *, struct page *); | ||
| 146 | #else | ||
| 147 | static inline int f2fs_init_security(struct inode *inode, struct inode *dir, | ||
| 148 | const struct qstr *qstr, struct page *ipage) | ||
| 149 | { | ||
| 150 | return 0; | ||
| 151 | } | ||
| 152 | #endif | ||
| 145 | #endif /* __F2FS_XATTR_H__ */ | 153 | #endif /* __F2FS_XATTR_H__ */ |
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7a6f02caf286..3963ede84eb0 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
| @@ -543,6 +543,7 @@ end_of_dir: | |||
| 543 | EXPORT_SYMBOL_GPL(fat_search_long); | 543 | EXPORT_SYMBOL_GPL(fat_search_long); |
| 544 | 544 | ||
| 545 | struct fat_ioctl_filldir_callback { | 545 | struct fat_ioctl_filldir_callback { |
| 546 | struct dir_context ctx; | ||
| 546 | void __user *dirent; | 547 | void __user *dirent; |
| 547 | int result; | 548 | int result; |
| 548 | /* for dir ioctl */ | 549 | /* for dir ioctl */ |
| @@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback { | |||
| 552 | int short_len; | 553 | int short_len; |
| 553 | }; | 554 | }; |
| 554 | 555 | ||
| 555 | static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, | 556 | static int __fat_readdir(struct inode *inode, struct file *file, |
| 556 | filldir_t filldir, int short_only, int both) | 557 | struct dir_context *ctx, int short_only, |
| 558 | struct fat_ioctl_filldir_callback *both) | ||
| 557 | { | 559 | { |
| 558 | struct super_block *sb = inode->i_sb; | 560 | struct super_block *sb = inode->i_sb; |
| 559 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | 561 | struct msdos_sb_info *sbi = MSDOS_SB(sb); |
| @@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, | |||
| 564 | unsigned char bufname[FAT_MAX_SHORT_SIZE]; | 566 | unsigned char bufname[FAT_MAX_SHORT_SIZE]; |
| 565 | int isvfat = sbi->options.isvfat; | 567 | int isvfat = sbi->options.isvfat; |
| 566 | const char *fill_name = NULL; | 568 | const char *fill_name = NULL; |
| 567 | unsigned long inum; | 569 | int fake_offset = 0; |
| 568 | unsigned long lpos, dummy, *furrfu = &lpos; | ||
| 569 | loff_t cpos; | 570 | loff_t cpos; |
| 570 | int short_len = 0, fill_len = 0; | 571 | int short_len = 0, fill_len = 0; |
| 571 | int ret = 0; | 572 | int ret = 0; |
| 572 | 573 | ||
| 573 | mutex_lock(&sbi->s_lock); | 574 | mutex_lock(&sbi->s_lock); |
| 574 | 575 | ||
| 575 | cpos = filp->f_pos; | 576 | cpos = ctx->pos; |
| 576 | /* Fake . and .. for the root directory. */ | 577 | /* Fake . and .. for the root directory. */ |
| 577 | if (inode->i_ino == MSDOS_ROOT_INO) { | 578 | if (inode->i_ino == MSDOS_ROOT_INO) { |
| 578 | while (cpos < 2) { | 579 | if (!dir_emit_dots(file, ctx)) |
| 579 | if (filldir(dirent, "..", cpos+1, cpos, | 580 | goto out; |
| 580 | MSDOS_ROOT_INO, DT_DIR) < 0) | 581 | if (ctx->pos == 2) { |
| 581 | goto out; | 582 | fake_offset = 1; |
| 582 | cpos++; | ||
| 583 | filp->f_pos++; | ||
| 584 | } | ||
| 585 | if (cpos == 2) { | ||
| 586 | dummy = 2; | ||
| 587 | furrfu = &dummy; | ||
| 588 | cpos = 0; | 583 | cpos = 0; |
| 589 | } | 584 | } |
| 590 | } | 585 | } |
| @@ -619,7 +614,7 @@ parse_record: | |||
| 619 | int status = fat_parse_long(inode, &cpos, &bh, &de, | 614 | int status = fat_parse_long(inode, &cpos, &bh, &de, |
| 620 | &unicode, &nr_slots); | 615 | &unicode, &nr_slots); |
| 621 | if (status < 0) { | 616 | if (status < 0) { |
| 622 | filp->f_pos = cpos; | 617 | ctx->pos = cpos; |
| 623 | ret = status; | 618 | ret = status; |
| 624 | goto out; | 619 | goto out; |
| 625 | } else if (status == PARSE_INVALID) | 620 | } else if (status == PARSE_INVALID) |
| @@ -639,6 +634,19 @@ parse_record: | |||
| 639 | /* !both && !short_only, so we don't need shortname. */ | 634 | /* !both && !short_only, so we don't need shortname. */ |
| 640 | if (!both) | 635 | if (!both) |
| 641 | goto start_filldir; | 636 | goto start_filldir; |
| 637 | |||
| 638 | short_len = fat_parse_short(sb, de, bufname, | ||
| 639 | sbi->options.dotsOK); | ||
| 640 | if (short_len == 0) | ||
| 641 | goto record_end; | ||
| 642 | /* hack for fat_ioctl_filldir() */ | ||
| 643 | both->longname = fill_name; | ||
| 644 | both->long_len = fill_len; | ||
| 645 | both->shortname = bufname; | ||
| 646 | both->short_len = short_len; | ||
| 647 | fill_name = NULL; | ||
| 648 | fill_len = 0; | ||
| 649 | goto start_filldir; | ||
| 642 | } | 650 | } |
| 643 | } | 651 | } |
| 644 | 652 | ||
| @@ -646,28 +654,21 @@ parse_record: | |||
| 646 | if (short_len == 0) | 654 | if (short_len == 0) |
| 647 | goto record_end; | 655 | goto record_end; |
| 648 | 656 | ||
| 649 | if (nr_slots) { | 657 | fill_name = bufname; |
| 650 | /* hack for fat_ioctl_filldir() */ | 658 | fill_len = short_len; |
| 651 | struct fat_ioctl_filldir_callback *p = dirent; | ||
| 652 | |||
| 653 | p->longname = fill_name; | ||
| 654 | p->long_len = fill_len; | ||
| 655 | p->shortname = bufname; | ||
| 656 | p->short_len = short_len; | ||
| 657 | fill_name = NULL; | ||
| 658 | fill_len = 0; | ||
| 659 | } else { | ||
| 660 | fill_name = bufname; | ||
| 661 | fill_len = short_len; | ||
| 662 | } | ||
| 663 | 659 | ||
| 664 | start_filldir: | 660 | start_filldir: |
| 665 | lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); | 661 | if (!fake_offset) |
| 666 | if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) | 662 | ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); |
| 667 | inum = inode->i_ino; | 663 | |
| 668 | else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { | 664 | if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { |
| 669 | inum = parent_ino(filp->f_path.dentry); | 665 | if (!dir_emit_dot(file, ctx)) |
| 666 | goto fill_failed; | ||
| 667 | } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { | ||
| 668 | if (!dir_emit_dotdot(file, ctx)) | ||
| 669 | goto fill_failed; | ||
| 670 | } else { | 670 | } else { |
| 671 | unsigned long inum; | ||
| 671 | loff_t i_pos = fat_make_i_pos(sb, bh, de); | 672 | loff_t i_pos = fat_make_i_pos(sb, bh, de); |
| 672 | struct inode *tmp = fat_iget(sb, i_pos); | 673 | struct inode *tmp = fat_iget(sb, i_pos); |
| 673 | if (tmp) { | 674 | if (tmp) { |
| @@ -675,18 +676,17 @@ start_filldir: | |||
| 675 | iput(tmp); | 676 | iput(tmp); |
| 676 | } else | 677 | } else |
| 677 | inum = iunique(sb, MSDOS_ROOT_INO); | 678 | inum = iunique(sb, MSDOS_ROOT_INO); |
| 679 | if (!dir_emit(ctx, fill_name, fill_len, inum, | ||
| 680 | (de->attr & ATTR_DIR) ? DT_DIR : DT_REG)) | ||
| 681 | goto fill_failed; | ||
| 678 | } | 682 | } |
| 679 | 683 | ||
| 680 | if (filldir(dirent, fill_name, fill_len, *furrfu, inum, | ||
| 681 | (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) | ||
| 682 | goto fill_failed; | ||
| 683 | |||
| 684 | record_end: | 684 | record_end: |
| 685 | furrfu = &lpos; | 685 | fake_offset = 0; |
| 686 | filp->f_pos = cpos; | 686 | ctx->pos = cpos; |
| 687 | goto get_new; | 687 | goto get_new; |
| 688 | end_of_dir: | 688 | end_of_dir: |
| 689 | filp->f_pos = cpos; | 689 | ctx->pos = cpos; |
| 690 | fill_failed: | 690 | fill_failed: |
| 691 | brelse(bh); | 691 | brelse(bh); |
| 692 | if (unicode) | 692 | if (unicode) |
| @@ -696,10 +696,9 @@ out: | |||
| 696 | return ret; | 696 | return ret; |
| 697 | } | 697 | } |
| 698 | 698 | ||
| 699 | static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) | 699 | static int fat_readdir(struct file *file, struct dir_context *ctx) |
| 700 | { | 700 | { |
| 701 | struct inode *inode = file_inode(filp); | 701 | return __fat_readdir(file_inode(file), file, ctx, 0, NULL); |
| 702 | return __fat_readdir(inode, filp, dirent, filldir, 0, 0); | ||
| 703 | } | 702 | } |
| 704 | 703 | ||
| 705 | #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ | 704 | #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ |
| @@ -755,20 +754,25 @@ efault: \ | |||
| 755 | 754 | ||
| 756 | FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) | 755 | FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) |
| 757 | 756 | ||
| 758 | static int fat_ioctl_readdir(struct inode *inode, struct file *filp, | 757 | static int fat_ioctl_readdir(struct inode *inode, struct file *file, |
| 759 | void __user *dirent, filldir_t filldir, | 758 | void __user *dirent, filldir_t filldir, |
| 760 | int short_only, int both) | 759 | int short_only, int both) |
| 761 | { | 760 | { |
| 762 | struct fat_ioctl_filldir_callback buf; | 761 | struct fat_ioctl_filldir_callback buf = { |
| 762 | .ctx.actor = filldir, | ||
| 763 | .dirent = dirent | ||
| 764 | }; | ||
| 763 | int ret; | 765 | int ret; |
| 764 | 766 | ||
| 765 | buf.dirent = dirent; | 767 | buf.dirent = dirent; |
| 766 | buf.result = 0; | 768 | buf.result = 0; |
| 767 | mutex_lock(&inode->i_mutex); | 769 | mutex_lock(&inode->i_mutex); |
| 770 | buf.ctx.pos = file->f_pos; | ||
| 768 | ret = -ENOENT; | 771 | ret = -ENOENT; |
| 769 | if (!IS_DEADDIR(inode)) { | 772 | if (!IS_DEADDIR(inode)) { |
| 770 | ret = __fat_readdir(inode, filp, &buf, filldir, | 773 | ret = __fat_readdir(inode, file, &buf.ctx, |
| 771 | short_only, both); | 774 | short_only, both ? &buf : NULL); |
| 775 | file->f_pos = buf.ctx.pos; | ||
| 772 | } | 776 | } |
| 773 | mutex_unlock(&inode->i_mutex); | 777 | mutex_unlock(&inode->i_mutex); |
| 774 | if (ret >= 0) | 778 | if (ret >= 0) |
| @@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, | |||
| 854 | const struct file_operations fat_dir_operations = { | 858 | const struct file_operations fat_dir_operations = { |
| 855 | .llseek = generic_file_llseek, | 859 | .llseek = generic_file_llseek, |
| 856 | .read = generic_read_dir, | 860 | .read = generic_read_dir, |
| 857 | .readdir = fat_readdir, | 861 | .iterate = fat_readdir, |
| 858 | .unlocked_ioctl = fat_dir_ioctl, | 862 | .unlocked_ioctl = fat_dir_ioctl, |
| 859 | #ifdef CONFIG_COMPAT | 863 | #ifdef CONFIG_COMPAT |
| 860 | .compat_ioctl = fat_compat_dir_ioctl, | 864 | .compat_ioctl = fat_compat_dir_ioctl, |
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 664b07a53870..25d4099a4aea 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c | |||
| @@ -49,7 +49,7 @@ | |||
| 49 | 49 | ||
| 50 | 50 | ||
| 51 | static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); | 51 | static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); |
| 52 | static int vxfs_readdir(struct file *, void *, filldir_t); | 52 | static int vxfs_readdir(struct file *, struct dir_context *); |
| 53 | 53 | ||
| 54 | const struct inode_operations vxfs_dir_inode_ops = { | 54 | const struct inode_operations vxfs_dir_inode_ops = { |
| 55 | .lookup = vxfs_lookup, | 55 | .lookup = vxfs_lookup, |
| @@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = { | |||
| 58 | const struct file_operations vxfs_dir_operations = { | 58 | const struct file_operations vxfs_dir_operations = { |
| 59 | .llseek = generic_file_llseek, | 59 | .llseek = generic_file_llseek, |
| 60 | .read = generic_read_dir, | 60 | .read = generic_read_dir, |
| 61 | .readdir = vxfs_readdir, | 61 | .iterate = vxfs_readdir, |
| 62 | }; | 62 | }; |
| 63 | 63 | ||
| 64 | 64 | ||
| @@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) | |||
| 235 | * Zero. | 235 | * Zero. |
| 236 | */ | 236 | */ |
| 237 | static int | 237 | static int |
| 238 | vxfs_readdir(struct file *fp, void *retp, filldir_t filler) | 238 | vxfs_readdir(struct file *fp, struct dir_context *ctx) |
| 239 | { | 239 | { |
| 240 | struct inode *ip = file_inode(fp); | 240 | struct inode *ip = file_inode(fp); |
| 241 | struct super_block *sbp = ip->i_sb; | 241 | struct super_block *sbp = ip->i_sb; |
| @@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) | |||
| 243 | u_long page, npages, block, pblocks, nblocks, offset; | 243 | u_long page, npages, block, pblocks, nblocks, offset; |
| 244 | loff_t pos; | 244 | loff_t pos; |
| 245 | 245 | ||
| 246 | switch ((long)fp->f_pos) { | 246 | if (ctx->pos == 0) { |
| 247 | case 0: | 247 | if (!dir_emit_dot(fp, ctx)) |
| 248 | if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) | 248 | return 0; |
| 249 | goto out; | 249 | ctx->pos = 1; |
| 250 | fp->f_pos++; | ||
| 251 | /* fallthrough */ | ||
| 252 | case 1: | ||
| 253 | if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0) | ||
| 254 | goto out; | ||
| 255 | fp->f_pos++; | ||
| 256 | /* fallthrough */ | ||
| 257 | } | 250 | } |
| 258 | 251 | if (ctx->pos == 1) { | |
| 259 | pos = fp->f_pos - 2; | 252 | if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR)) |
| 253 | return 0; | ||
| 254 | ctx->pos = 2; | ||
| 255 | } | ||
| 256 | pos = ctx->pos - 2; | ||
| 260 | 257 | ||
| 261 | if (pos > VXFS_DIRROUND(ip->i_size)) | 258 | if (pos > VXFS_DIRROUND(ip->i_size)) |
| 262 | return 0; | 259 | return 0; |
| @@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) | |||
| 270 | block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; | 267 | block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; |
| 271 | 268 | ||
| 272 | for (; page < npages; page++, block = 0) { | 269 | for (; page < npages; page++, block = 0) { |
| 273 | caddr_t kaddr; | 270 | char *kaddr; |
| 274 | struct page *pp; | 271 | struct page *pp; |
| 275 | 272 | ||
| 276 | pp = vxfs_get_page(ip->i_mapping, page); | 273 | pp = vxfs_get_page(ip->i_mapping, page); |
| 277 | if (IS_ERR(pp)) | 274 | if (IS_ERR(pp)) |
| 278 | continue; | 275 | continue; |
| 279 | kaddr = (caddr_t)page_address(pp); | 276 | kaddr = (char *)page_address(pp); |
| 280 | 277 | ||
| 281 | for (; block <= nblocks && block <= pblocks; block++) { | 278 | for (; block <= nblocks && block <= pblocks; block++) { |
| 282 | caddr_t baddr, limit; | 279 | char *baddr, *limit; |
| 283 | struct vxfs_dirblk *dbp; | 280 | struct vxfs_dirblk *dbp; |
| 284 | struct vxfs_direct *de; | 281 | struct vxfs_direct *de; |
| 285 | 282 | ||
| @@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) | |||
| 292 | (kaddr + offset) : | 289 | (kaddr + offset) : |
| 293 | (baddr + VXFS_DIRBLKOV(dbp))); | 290 | (baddr + VXFS_DIRBLKOV(dbp))); |
| 294 | 291 | ||
| 295 | for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { | 292 | for (; (char *)de <= limit; de = vxfs_next_entry(de)) { |
| 296 | int over; | ||
| 297 | |||
| 298 | if (!de->d_reclen) | 293 | if (!de->d_reclen) |
| 299 | break; | 294 | break; |
| 300 | if (!de->d_ino) | 295 | if (!de->d_ino) |
| 301 | continue; | 296 | continue; |
| 302 | 297 | ||
| 303 | offset = (caddr_t)de - kaddr; | 298 | offset = (char *)de - kaddr; |
| 304 | over = filler(retp, de->d_name, de->d_namelen, | 299 | ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; |
| 305 | ((page << PAGE_CACHE_SHIFT) | offset) + 2, | 300 | if (!dir_emit(ctx, de->d_name, de->d_namelen, |
| 306 | de->d_ino, DT_UNKNOWN); | 301 | de->d_ino, DT_UNKNOWN)) { |
| 307 | if (over) { | ||
| 308 | vxfs_put_page(pp); | 302 | vxfs_put_page(pp); |
| 309 | goto done; | 303 | return 0; |
| 310 | } | 304 | } |
| 311 | } | 305 | } |
| 312 | offset = 0; | 306 | offset = 0; |
| @@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) | |||
| 314 | vxfs_put_page(pp); | 308 | vxfs_put_page(pp); |
| 315 | offset = 0; | 309 | offset = 0; |
| 316 | } | 310 | } |
| 317 | 311 | ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; | |
| 318 | done: | ||
| 319 | fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; | ||
| 320 | out: | ||
| 321 | return 0; | 312 | return 0; |
| 322 | } | 313 | } |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3be57189efd5..a85ac4e33436 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
| @@ -45,6 +45,7 @@ struct wb_writeback_work { | |||
| 45 | unsigned int for_kupdate:1; | 45 | unsigned int for_kupdate:1; |
| 46 | unsigned int range_cyclic:1; | 46 | unsigned int range_cyclic:1; |
| 47 | unsigned int for_background:1; | 47 | unsigned int for_background:1; |
| 48 | unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ | ||
| 48 | enum wb_reason reason; /* why was writeback initiated? */ | 49 | enum wb_reason reason; /* why was writeback initiated? */ |
| 49 | 50 | ||
| 50 | struct list_head list; /* pending work list */ | 51 | struct list_head list; /* pending work list */ |
| @@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 443 | /* | 444 | /* |
| 444 | * Make sure to wait on the data before writing out the metadata. | 445 | * Make sure to wait on the data before writing out the metadata. |
| 445 | * This is important for filesystems that modify metadata on data | 446 | * This is important for filesystems that modify metadata on data |
| 446 | * I/O completion. | 447 | * I/O completion. We don't do it for sync(2) writeback because it has a |
| 448 | * separate, external IO completion path and ->sync_fs for guaranteeing | ||
| 449 | * inode metadata is written back correctly. | ||
| 447 | */ | 450 | */ |
| 448 | if (wbc->sync_mode == WB_SYNC_ALL) { | 451 | if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { |
| 449 | int err = filemap_fdatawait(mapping); | 452 | int err = filemap_fdatawait(mapping); |
| 450 | if (ret == 0) | 453 | if (ret == 0) |
| 451 | ret = err; | 454 | ret = err; |
| @@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb, | |||
| 578 | .tagged_writepages = work->tagged_writepages, | 581 | .tagged_writepages = work->tagged_writepages, |
| 579 | .for_kupdate = work->for_kupdate, | 582 | .for_kupdate = work->for_kupdate, |
| 580 | .for_background = work->for_background, | 583 | .for_background = work->for_background, |
| 584 | .for_sync = work->for_sync, | ||
| 581 | .range_cyclic = work->range_cyclic, | 585 | .range_cyclic = work->range_cyclic, |
| 582 | .range_start = 0, | 586 | .range_start = 0, |
| 583 | .range_end = LLONG_MAX, | 587 | .range_end = LLONG_MAX, |
| @@ -1362,6 +1366,7 @@ void sync_inodes_sb(struct super_block *sb) | |||
| 1362 | .range_cyclic = 0, | 1366 | .range_cyclic = 0, |
| 1363 | .done = &done, | 1367 | .done = &done, |
| 1364 | .reason = WB_REASON_SYNC, | 1368 | .reason = WB_REASON_SYNC, |
| 1369 | .for_sync = 1, | ||
| 1365 | }; | 1370 | }; |
| 1366 | 1371 | ||
| 1367 | /* Nothing to do? */ | 1372 | /* Nothing to do? */ |
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index b52aed1dca97..f7cff367db7f 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c | |||
| @@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object( | |||
| 115 | struct fscache_object, cookie_link); | 115 | struct fscache_object, cookie_link); |
| 116 | 116 | ||
| 117 | cache = object->cache; | 117 | cache = object->cache; |
| 118 | if (object->state >= FSCACHE_OBJECT_DYING || | 118 | if (fscache_object_is_dying(object) || |
| 119 | test_bit(FSCACHE_IOERROR, &cache->flags)) | 119 | test_bit(FSCACHE_IOERROR, &cache->flags)) |
| 120 | cache = NULL; | 120 | cache = NULL; |
| 121 | 121 | ||
| @@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache, | |||
| 224 | BUG_ON(!ifsdef); | 224 | BUG_ON(!ifsdef); |
| 225 | 225 | ||
| 226 | cache->flags = 0; | 226 | cache->flags = 0; |
| 227 | ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); | 227 | ifsdef->event_mask = |
| 228 | ifsdef->state = FSCACHE_OBJECT_ACTIVE; | 228 | ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & |
| 229 | ~(1 << FSCACHE_OBJECT_EV_CLEARED); | ||
| 230 | __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); | ||
| 229 | 231 | ||
| 230 | if (!tagname) | 232 | if (!tagname) |
| 231 | tagname = cache->identifier; | 233 | tagname = cache->identifier; |
| @@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, | |||
| 330 | { | 332 | { |
| 331 | struct fscache_object *object; | 333 | struct fscache_object *object; |
| 332 | 334 | ||
| 333 | spin_lock(&cache->object_list_lock); | ||
| 334 | |||
| 335 | while (!list_empty(&cache->object_list)) { | 335 | while (!list_empty(&cache->object_list)) { |
| 336 | object = list_entry(cache->object_list.next, | 336 | spin_lock(&cache->object_list_lock); |
| 337 | struct fscache_object, cache_link); | ||
| 338 | list_move_tail(&object->cache_link, dying_objects); | ||
| 339 | 337 | ||
| 340 | _debug("withdraw %p", object->cookie); | 338 | if (!list_empty(&cache->object_list)) { |
| 339 | object = list_entry(cache->object_list.next, | ||
| 340 | struct fscache_object, cache_link); | ||
| 341 | list_move_tail(&object->cache_link, dying_objects); | ||
| 341 | 342 | ||
| 342 | spin_lock(&object->lock); | 343 | _debug("withdraw %p", object->cookie); |
| 343 | spin_unlock(&cache->object_list_lock); | 344 | |
| 344 | fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); | 345 | /* This must be done under object_list_lock to prevent |
| 345 | spin_unlock(&object->lock); | 346 | * a race with fscache_drop_object(). |
| 347 | */ | ||
| 348 | fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); | ||
| 349 | } | ||
| 346 | 350 | ||
| 351 | spin_unlock(&cache->object_list_lock); | ||
| 347 | cond_resched(); | 352 | cond_resched(); |
| 348 | spin_lock(&cache->object_list_lock); | ||
| 349 | } | 353 | } |
| 350 | |||
| 351 | spin_unlock(&cache->object_list_lock); | ||
| 352 | } | 354 | } |
| 353 | 355 | ||
| 354 | /** | 356 | /** |
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index e2cba1f60c21..0e91a3c9fdb2 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c | |||
| @@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie( | |||
| 95 | atomic_set(&cookie->usage, 1); | 95 | atomic_set(&cookie->usage, 1); |
| 96 | atomic_set(&cookie->n_children, 0); | 96 | atomic_set(&cookie->n_children, 0); |
| 97 | 97 | ||
| 98 | /* We keep the active count elevated until relinquishment to prevent an | ||
| 99 | * attempt to wake up every time the object operations queue quiesces. | ||
| 100 | */ | ||
| 101 | atomic_set(&cookie->n_active, 1); | ||
| 102 | |||
| 98 | atomic_inc(&parent->usage); | 103 | atomic_inc(&parent->usage); |
| 99 | atomic_inc(&parent->n_children); | 104 | atomic_inc(&parent->n_children); |
| 100 | 105 | ||
| @@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) | |||
| 177 | 182 | ||
| 178 | cookie->flags = | 183 | cookie->flags = |
| 179 | (1 << FSCACHE_COOKIE_LOOKING_UP) | | 184 | (1 << FSCACHE_COOKIE_LOOKING_UP) | |
| 180 | (1 << FSCACHE_COOKIE_CREATING) | | ||
| 181 | (1 << FSCACHE_COOKIE_NO_DATA_YET); | 185 | (1 << FSCACHE_COOKIE_NO_DATA_YET); |
| 182 | 186 | ||
| 183 | /* ask the cache to allocate objects for this cookie and its parent | 187 | /* ask the cache to allocate objects for this cookie and its parent |
| @@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) | |||
| 205 | 209 | ||
| 206 | /* initiate the process of looking up all the objects in the chain | 210 | /* initiate the process of looking up all the objects in the chain |
| 207 | * (done by fscache_initialise_object()) */ | 211 | * (done by fscache_initialise_object()) */ |
| 208 | fscache_enqueue_object(object); | 212 | fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); |
| 209 | 213 | ||
| 210 | spin_unlock(&cookie->lock); | 214 | spin_unlock(&cookie->lock); |
| 211 | 215 | ||
| @@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, | |||
| 285 | 289 | ||
| 286 | object_already_extant: | 290 | object_already_extant: |
| 287 | ret = -ENOBUFS; | 291 | ret = -ENOBUFS; |
| 288 | if (object->state >= FSCACHE_OBJECT_DYING) { | 292 | if (fscache_object_is_dead(object)) { |
| 289 | spin_unlock(&cookie->lock); | 293 | spin_unlock(&cookie->lock); |
| 290 | goto error; | 294 | goto error; |
| 291 | } | 295 | } |
| @@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, | |||
| 321 | ret = -EEXIST; | 325 | ret = -EEXIST; |
| 322 | hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { | 326 | hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { |
| 323 | if (p->cache == object->cache) { | 327 | if (p->cache == object->cache) { |
| 324 | if (p->state >= FSCACHE_OBJECT_DYING) | 328 | if (fscache_object_is_dying(p)) |
| 325 | ret = -ENOBUFS; | 329 | ret = -ENOBUFS; |
| 326 | goto cant_attach_object; | 330 | goto cant_attach_object; |
| 327 | } | 331 | } |
| @@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, | |||
| 332 | hlist_for_each_entry(p, &cookie->parent->backing_objects, | 336 | hlist_for_each_entry(p, &cookie->parent->backing_objects, |
| 333 | cookie_link) { | 337 | cookie_link) { |
| 334 | if (p->cache == object->cache) { | 338 | if (p->cache == object->cache) { |
| 335 | if (p->state >= FSCACHE_OBJECT_DYING) { | 339 | if (fscache_object_is_dying(p)) { |
| 336 | ret = -ENOBUFS; | 340 | ret = -ENOBUFS; |
| 337 | spin_unlock(&cookie->parent->lock); | 341 | spin_unlock(&cookie->parent->lock); |
| 338 | goto cant_attach_object; | 342 | goto cant_attach_object; |
| @@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie) | |||
| 400 | object = hlist_entry(cookie->backing_objects.first, | 404 | object = hlist_entry(cookie->backing_objects.first, |
| 401 | struct fscache_object, | 405 | struct fscache_object, |
| 402 | cookie_link); | 406 | cookie_link); |
| 403 | if (object->state < FSCACHE_OBJECT_DYING) | 407 | if (fscache_object_is_live(object)) |
| 404 | fscache_raise_event( | 408 | fscache_raise_event( |
| 405 | object, FSCACHE_OBJECT_EV_INVALIDATE); | 409 | object, FSCACHE_OBJECT_EV_INVALIDATE); |
| 406 | } | 410 | } |
| @@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie); | |||
| 467 | */ | 471 | */ |
| 468 | void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) | 472 | void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) |
| 469 | { | 473 | { |
| 470 | struct fscache_cache *cache; | ||
| 471 | struct fscache_object *object; | 474 | struct fscache_object *object; |
| 472 | unsigned long event; | ||
| 473 | 475 | ||
| 474 | fscache_stat(&fscache_n_relinquishes); | 476 | fscache_stat(&fscache_n_relinquishes); |
| 475 | if (retire) | 477 | if (retire) |
| @@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) | |||
| 481 | return; | 483 | return; |
| 482 | } | 484 | } |
| 483 | 485 | ||
| 484 | _enter("%p{%s,%p},%d", | 486 | _enter("%p{%s,%p,%d},%d", |
| 485 | cookie, cookie->def->name, cookie->netfs_data, retire); | 487 | cookie, cookie->def->name, cookie->netfs_data, |
| 488 | atomic_read(&cookie->n_active), retire); | ||
| 489 | |||
| 490 | ASSERTCMP(atomic_read(&cookie->n_active), >, 0); | ||
| 486 | 491 | ||
| 487 | if (atomic_read(&cookie->n_children) != 0) { | 492 | if (atomic_read(&cookie->n_children) != 0) { |
| 488 | printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", | 493 | printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", |
| @@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) | |||
| 490 | BUG(); | 495 | BUG(); |
| 491 | } | 496 | } |
| 492 | 497 | ||
| 493 | /* wait for the cookie to finish being instantiated (or to fail) */ | 498 | /* No further netfs-accessing operations on this cookie permitted */ |
| 494 | if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { | 499 | set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); |
| 495 | fscache_stat(&fscache_n_relinquishes_waitcrt); | 500 | if (retire) |
| 496 | wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, | 501 | set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); |
| 497 | fscache_wait_bit, TASK_UNINTERRUPTIBLE); | ||
| 498 | } | ||
| 499 | |||
| 500 | event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; | ||
| 501 | 502 | ||
| 502 | try_again: | ||
| 503 | spin_lock(&cookie->lock); | 503 | spin_lock(&cookie->lock); |
| 504 | 504 | hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { | |
| 505 | /* break links with all the active objects */ | 505 | fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); |
| 506 | while (!hlist_empty(&cookie->backing_objects)) { | ||
| 507 | int n_reads; | ||
| 508 | object = hlist_entry(cookie->backing_objects.first, | ||
| 509 | struct fscache_object, | ||
| 510 | cookie_link); | ||
| 511 | |||
| 512 | _debug("RELEASE OBJ%x", object->debug_id); | ||
| 513 | |||
| 514 | set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags); | ||
| 515 | n_reads = atomic_read(&object->n_reads); | ||
| 516 | if (n_reads) { | ||
| 517 | int n_ops = object->n_ops; | ||
| 518 | int n_in_progress = object->n_in_progress; | ||
| 519 | spin_unlock(&cookie->lock); | ||
| 520 | printk(KERN_ERR "FS-Cache:" | ||
| 521 | " Cookie '%s' still has %d outstanding reads (%d,%d)\n", | ||
| 522 | cookie->def->name, | ||
| 523 | n_reads, n_ops, n_in_progress); | ||
| 524 | wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS, | ||
| 525 | fscache_wait_bit, TASK_UNINTERRUPTIBLE); | ||
| 526 | printk("Wait finished\n"); | ||
| 527 | goto try_again; | ||
| 528 | } | ||
| 529 | |||
| 530 | /* detach each cache object from the object cookie */ | ||
| 531 | spin_lock(&object->lock); | ||
| 532 | hlist_del_init(&object->cookie_link); | ||
| 533 | |||
| 534 | cache = object->cache; | ||
| 535 | object->cookie = NULL; | ||
| 536 | fscache_raise_event(object, event); | ||
| 537 | spin_unlock(&object->lock); | ||
| 538 | |||
| 539 | if (atomic_dec_and_test(&cookie->usage)) | ||
| 540 | /* the cookie refcount shouldn't be reduced to 0 yet */ | ||
| 541 | BUG(); | ||
| 542 | } | 506 | } |
| 507 | spin_unlock(&cookie->lock); | ||
| 543 | 508 | ||
| 544 | /* detach pointers back to the netfs */ | 509 | /* Wait for cessation of activity requiring access to the netfs (when |
| 510 | * n_active reaches 0). | ||
| 511 | */ | ||
| 512 | if (!atomic_dec_and_test(&cookie->n_active)) | ||
| 513 | wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, | ||
| 514 | TASK_UNINTERRUPTIBLE); | ||
| 515 | |||
| 516 | /* Clear pointers back to the netfs */ | ||
| 545 | cookie->netfs_data = NULL; | 517 | cookie->netfs_data = NULL; |
| 546 | cookie->def = NULL; | 518 | cookie->def = NULL; |
| 547 | 519 | BUG_ON(cookie->stores.rnode); | |
| 548 | spin_unlock(&cookie->lock); | ||
| 549 | 520 | ||
| 550 | if (cookie->parent) { | 521 | if (cookie->parent) { |
| 551 | ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); | 522 | ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); |
| @@ -553,7 +524,7 @@ try_again: | |||
| 553 | atomic_dec(&cookie->parent->n_children); | 524 | atomic_dec(&cookie->parent->n_children); |
| 554 | } | 525 | } |
| 555 | 526 | ||
| 556 | /* finally dispose of the cookie */ | 527 | /* Dispose of the netfs's link to the cookie */ |
| 557 | ASSERTCMP(atomic_read(&cookie->usage), >, 0); | 528 | ASSERTCMP(atomic_read(&cookie->usage), >, 0); |
| 558 | fscache_cookie_put(cookie); | 529 | fscache_cookie_put(cookie); |
| 559 | 530 | ||
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index f5b4baee7352..10a2ade0bdf8 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c | |||
| @@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = { | |||
| 55 | 55 | ||
| 56 | struct fscache_cookie fscache_fsdef_index = { | 56 | struct fscache_cookie fscache_fsdef_index = { |
| 57 | .usage = ATOMIC_INIT(1), | 57 | .usage = ATOMIC_INIT(1), |
| 58 | .n_active = ATOMIC_INIT(1), | ||
| 58 | .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), | 59 | .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), |
| 59 | .backing_objects = HLIST_HEAD_INIT, | 60 | .backing_objects = HLIST_HEAD_INIT, |
| 60 | .def = &fscache_fsdef_index_def, | 61 | .def = &fscache_fsdef_index_def, |
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ee38fef4be51..12d505bedb5c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h | |||
| @@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void) | |||
| 93 | 93 | ||
| 94 | extern int fscache_wait_bit(void *); | 94 | extern int fscache_wait_bit(void *); |
| 95 | extern int fscache_wait_bit_interruptible(void *); | 95 | extern int fscache_wait_bit_interruptible(void *); |
| 96 | extern int fscache_wait_atomic_t(atomic_t *); | ||
| 96 | 97 | ||
| 97 | /* | 98 | /* |
| 98 | * object.c | 99 | * object.c |
| 99 | */ | 100 | */ |
| 100 | extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; | ||
| 101 | |||
| 102 | extern void fscache_withdrawing_object(struct fscache_cache *, | ||
| 103 | struct fscache_object *); | ||
| 104 | extern void fscache_enqueue_object(struct fscache_object *); | 101 | extern void fscache_enqueue_object(struct fscache_object *); |
| 105 | 102 | ||
| 106 | /* | 103 | /* |
| @@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *); | |||
| 110 | extern const struct file_operations fscache_objlist_fops; | 107 | extern const struct file_operations fscache_objlist_fops; |
| 111 | 108 | ||
| 112 | extern void fscache_objlist_add(struct fscache_object *); | 109 | extern void fscache_objlist_add(struct fscache_object *); |
| 110 | extern void fscache_objlist_remove(struct fscache_object *); | ||
| 113 | #else | 111 | #else |
| 114 | #define fscache_objlist_add(object) do {} while(0) | 112 | #define fscache_objlist_add(object) do {} while(0) |
| 113 | #define fscache_objlist_remove(object) do {} while(0) | ||
| 115 | #endif | 114 | #endif |
| 116 | 115 | ||
| 117 | /* | 116 | /* |
| @@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object, | |||
| 291 | unsigned event) | 290 | unsigned event) |
| 292 | { | 291 | { |
| 293 | BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); | 292 | BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); |
| 293 | #if 0 | ||
| 294 | printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", | ||
| 295 | object->debug_id, object->event_mask, (1 << event)); | ||
| 296 | #endif | ||
| 294 | if (!test_and_set_bit(event, &object->events) && | 297 | if (!test_and_set_bit(event, &object->events) && |
| 295 | test_bit(event, &object->event_mask)) | 298 | test_bit(event, &object->event_mask)) |
| 296 | fscache_enqueue_object(object); | 299 | fscache_enqueue_object(object); |
diff --git a/fs/fscache/main.c b/fs/fscache/main.c index f9d856773f79..7c27907e650c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c | |||
| @@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags) | |||
| 205 | schedule(); | 205 | schedule(); |
| 206 | return 0; | 206 | return 0; |
| 207 | } | 207 | } |
| 208 | EXPORT_SYMBOL(fscache_wait_bit); | ||
| 209 | 208 | ||
| 210 | /* | 209 | /* |
| 211 | * wait_on_bit() sleep function for interruptible waiting | 210 | * wait_on_bit() sleep function for interruptible waiting |
| @@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags) | |||
| 215 | schedule(); | 214 | schedule(); |
| 216 | return signal_pending(current); | 215 | return signal_pending(current); |
| 217 | } | 216 | } |
| 218 | EXPORT_SYMBOL(fscache_wait_bit_interruptible); | 217 | |
| 218 | /* | ||
| 219 | * wait_on_atomic_t() sleep function for uninterruptible waiting | ||
| 220 | */ | ||
| 221 | int fscache_wait_atomic_t(atomic_t *p) | ||
| 222 | { | ||
| 223 | schedule(); | ||
| 224 | return 0; | ||
| 225 | } | ||
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index e028b8eb1c40..b1bb6117473a 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c | |||
| @@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) | |||
| 40 | /* initialise the primary index cookie */ | 40 | /* initialise the primary index cookie */ |
| 41 | atomic_set(&netfs->primary_index->usage, 1); | 41 | atomic_set(&netfs->primary_index->usage, 1); |
| 42 | atomic_set(&netfs->primary_index->n_children, 0); | 42 | atomic_set(&netfs->primary_index->n_children, 0); |
| 43 | atomic_set(&netfs->primary_index->n_active, 1); | ||
| 43 | 44 | ||
| 44 | netfs->primary_index->def = &fscache_fsdef_netfs_def; | 45 | netfs->primary_index->def = &fscache_fsdef_netfs_def; |
| 45 | netfs->primary_index->parent = &fscache_fsdef_index; | 46 | netfs->primary_index->parent = &fscache_fsdef_index; |
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index f27c89d17885..e1959efad64f 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c | |||
| @@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj) | |||
| 70 | write_unlock(&fscache_object_list_lock); | 70 | write_unlock(&fscache_object_list_lock); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | /** | 73 | /* |
| 74 | * fscache_object_destroy - Note that a cache object is about to be destroyed | 74 | * Remove an object from the object list. |
| 75 | * @object: The object to be destroyed | ||
| 76 | * | ||
| 77 | * Note the imminent destruction and deallocation of a cache object record. | ||
| 78 | */ | 75 | */ |
| 79 | void fscache_object_destroy(struct fscache_object *obj) | 76 | void fscache_objlist_remove(struct fscache_object *obj) |
| 80 | { | 77 | { |
| 81 | write_lock(&fscache_object_list_lock); | 78 | write_lock(&fscache_object_list_lock); |
| 82 | 79 | ||
| @@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj) | |||
| 85 | 82 | ||
| 86 | write_unlock(&fscache_object_list_lock); | 83 | write_unlock(&fscache_object_list_lock); |
| 87 | } | 84 | } |
| 88 | EXPORT_SYMBOL(fscache_object_destroy); | ||
| 89 | 85 | ||
| 90 | /* | 86 | /* |
| 91 | * find the object in the tree on or after the specified index | 87 | * find the object in the tree on or after the specified index |
| @@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 166 | { | 162 | { |
| 167 | struct fscache_objlist_data *data = m->private; | 163 | struct fscache_objlist_data *data = m->private; |
| 168 | struct fscache_object *obj = v; | 164 | struct fscache_object *obj = v; |
| 165 | struct fscache_cookie *cookie; | ||
| 169 | unsigned long config = data->config; | 166 | unsigned long config = data->config; |
| 170 | uint16_t keylen, auxlen; | ||
| 171 | char _type[3], *type; | 167 | char _type[3], *type; |
| 172 | bool no_cookie; | ||
| 173 | u8 *buf = data->buf, *p; | 168 | u8 *buf = data->buf, *p; |
| 174 | 169 | ||
| 175 | if ((unsigned long) v == 1) { | 170 | if ((unsigned long) v == 1) { |
| 176 | seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" | 171 | seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" |
| 177 | " EM EV F S" | 172 | " EM EV FL S" |
| 178 | " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); | 173 | " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); |
| 179 | if (config & (FSCACHE_OBJLIST_CONFIG_KEY | | 174 | if (config & (FSCACHE_OBJLIST_CONFIG_KEY | |
| 180 | FSCACHE_OBJLIST_CONFIG_AUX)) | 175 | FSCACHE_OBJLIST_CONFIG_AUX)) |
| @@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 193 | 188 | ||
| 194 | if ((unsigned long) v == 2) { | 189 | if ((unsigned long) v == 2) { |
| 195 | seq_puts(m, "======== ======== ==== ===== === === === == =====" | 190 | seq_puts(m, "======== ======== ==== ===== === === === == =====" |
| 196 | " == == = =" | 191 | " == == == =" |
| 197 | " | ================ == == ================"); | 192 | " | ================ == == ================"); |
| 198 | if (config & (FSCACHE_OBJLIST_CONFIG_KEY | | 193 | if (config & (FSCACHE_OBJLIST_CONFIG_KEY | |
| 199 | FSCACHE_OBJLIST_CONFIG_AUX)) | 194 | FSCACHE_OBJLIST_CONFIG_AUX)) |
| @@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 216 | } \ | 211 | } \ |
| 217 | } while(0) | 212 | } while(0) |
| 218 | 213 | ||
| 214 | cookie = obj->cookie; | ||
| 219 | if (~config) { | 215 | if (~config) { |
| 220 | FILTER(obj->cookie, | 216 | FILTER(cookie->def, |
| 221 | COOKIE, NOCOOKIE); | 217 | COOKIE, NOCOOKIE); |
| 222 | FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || | 218 | FILTER(fscache_object_is_active(obj) || |
| 223 | obj->n_ops != 0 || | 219 | obj->n_ops != 0 || |
| 224 | obj->n_obj_ops != 0 || | 220 | obj->n_obj_ops != 0 || |
| 225 | obj->flags || | 221 | obj->flags || |
| @@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 235 | } | 231 | } |
| 236 | 232 | ||
| 237 | seq_printf(m, | 233 | seq_printf(m, |
| 238 | "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", | 234 | "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", |
| 239 | obj->debug_id, | 235 | obj->debug_id, |
| 240 | obj->parent ? obj->parent->debug_id : -1, | 236 | obj->parent ? obj->parent->debug_id : -1, |
| 241 | fscache_object_states_short[obj->state], | 237 | obj->state->short_name, |
| 242 | obj->n_children, | 238 | obj->n_children, |
| 243 | obj->n_ops, | 239 | obj->n_ops, |
| 244 | obj->n_obj_ops, | 240 | obj->n_obj_ops, |
| @@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 250 | obj->flags, | 246 | obj->flags, |
| 251 | work_busy(&obj->work)); | 247 | work_busy(&obj->work)); |
| 252 | 248 | ||
| 253 | no_cookie = true; | 249 | if (fscache_use_cookie(obj)) { |
| 254 | keylen = auxlen = 0; | 250 | uint16_t keylen = 0, auxlen = 0; |
| 255 | if (obj->cookie) { | ||
| 256 | spin_lock(&obj->lock); | ||
| 257 | if (obj->cookie) { | ||
| 258 | switch (obj->cookie->def->type) { | ||
| 259 | case 0: | ||
| 260 | type = "IX"; | ||
| 261 | break; | ||
| 262 | case 1: | ||
| 263 | type = "DT"; | ||
| 264 | break; | ||
| 265 | default: | ||
| 266 | sprintf(_type, "%02u", | ||
| 267 | obj->cookie->def->type); | ||
| 268 | type = _type; | ||
| 269 | break; | ||
| 270 | } | ||
| 271 | 251 | ||
| 272 | seq_printf(m, "%-16s %s %2lx %16p", | 252 | switch (cookie->def->type) { |
| 273 | obj->cookie->def->name, | 253 | case 0: |
| 274 | type, | 254 | type = "IX"; |
| 275 | obj->cookie->flags, | 255 | break; |
| 276 | obj->cookie->netfs_data); | 256 | case 1: |
| 277 | 257 | type = "DT"; | |
| 278 | if (obj->cookie->def->get_key && | 258 | break; |
| 279 | config & FSCACHE_OBJLIST_CONFIG_KEY) | 259 | default: |
| 280 | keylen = obj->cookie->def->get_key( | 260 | sprintf(_type, "%02u", cookie->def->type); |
| 281 | obj->cookie->netfs_data, | 261 | type = _type; |
| 282 | buf, 400); | 262 | break; |
| 283 | |||
| 284 | if (obj->cookie->def->get_aux && | ||
| 285 | config & FSCACHE_OBJLIST_CONFIG_AUX) | ||
| 286 | auxlen = obj->cookie->def->get_aux( | ||
| 287 | obj->cookie->netfs_data, | ||
| 288 | buf + keylen, 512 - keylen); | ||
| 289 | |||
| 290 | no_cookie = false; | ||
| 291 | } | 263 | } |
| 292 | spin_unlock(&obj->lock); | ||
| 293 | 264 | ||
| 294 | if (!no_cookie && (keylen > 0 || auxlen > 0)) { | 265 | seq_printf(m, "%-16s %s %2lx %16p", |
| 266 | cookie->def->name, | ||
| 267 | type, | ||
| 268 | cookie->flags, | ||
| 269 | cookie->netfs_data); | ||
| 270 | |||
| 271 | if (cookie->def->get_key && | ||
| 272 | config & FSCACHE_OBJLIST_CONFIG_KEY) | ||
| 273 | keylen = cookie->def->get_key(cookie->netfs_data, | ||
| 274 | buf, 400); | ||
| 275 | |||
| 276 | if (cookie->def->get_aux && | ||
| 277 | config & FSCACHE_OBJLIST_CONFIG_AUX) | ||
| 278 | auxlen = cookie->def->get_aux(cookie->netfs_data, | ||
| 279 | buf + keylen, 512 - keylen); | ||
| 280 | fscache_unuse_cookie(obj); | ||
| 281 | |||
| 282 | if (keylen > 0 || auxlen > 0) { | ||
| 295 | seq_printf(m, " "); | 283 | seq_printf(m, " "); |
| 296 | for (p = buf; keylen > 0; keylen--) | 284 | for (p = buf; keylen > 0; keylen--) |
| 297 | seq_printf(m, "%02x", *p++); | 285 | seq_printf(m, "%02x", *p++); |
| @@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) | |||
| 302 | seq_printf(m, "%02x", *p++); | 290 | seq_printf(m, "%02x", *p++); |
| 303 | } | 291 | } |
| 304 | } | 292 | } |
| 305 | } | ||
| 306 | 293 | ||
| 307 | if (no_cookie) | ||
| 308 | seq_printf(m, "<no_cookie>\n"); | ||
| 309 | else | ||
| 310 | seq_printf(m, "\n"); | 294 | seq_printf(m, "\n"); |
| 295 | } else { | ||
| 296 | seq_printf(m, "<no_netfs>\n"); | ||
| 297 | } | ||
| 311 | return 0; | 298 | return 0; |
| 312 | } | 299 | } |
| 313 | 300 | ||
diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 50d41c180211..86d75a60b20c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c | |||
| @@ -15,52 +15,131 @@ | |||
| 15 | #define FSCACHE_DEBUG_LEVEL COOKIE | 15 | #define FSCACHE_DEBUG_LEVEL COOKIE |
| 16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 18 | #include <linux/prefetch.h> | ||
| 18 | #include "internal.h" | 19 | #include "internal.h" |
| 19 | 20 | ||
| 20 | const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { | 21 | static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); |
| 21 | [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", | 22 | static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); |
| 22 | [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", | 23 | static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); |
| 23 | [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", | 24 | static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); |
| 24 | [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", | 25 | static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); |
| 25 | [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", | 26 | static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); |
| 26 | [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING", | 27 | static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); |
| 27 | [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", | 28 | static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); |
| 28 | [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", | 29 | static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); |
| 29 | [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", | 30 | static const struct fscache_state *fscache_object_available(struct fscache_object *, int); |
| 30 | [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", | 31 | static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); |
| 31 | [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", | 32 | static const struct fscache_state *fscache_update_object(struct fscache_object *, int); |
| 32 | [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", | 33 | |
| 33 | [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", | 34 | #define __STATE_NAME(n) fscache_osm_##n |
| 34 | [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", | 35 | #define STATE(n) (&__STATE_NAME(n)) |
| 36 | |||
| 37 | /* | ||
| 38 | * Define a work state. Work states are execution states. No event processing | ||
| 39 | * is performed by them. The function attached to a work state returns a | ||
| 40 | * pointer indicating the next state to which the state machine should | ||
| 41 | * transition. Returning NO_TRANSIT repeats the current state, but goes back | ||
| 42 | * to the scheduler first. | ||
| 43 | */ | ||
| 44 | #define WORK_STATE(n, sn, f) \ | ||
| 45 | const struct fscache_state __STATE_NAME(n) = { \ | ||
| 46 | .name = #n, \ | ||
| 47 | .short_name = sn, \ | ||
| 48 | .work = f \ | ||
| 49 | } | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Returns from work states. | ||
| 53 | */ | ||
| 54 | #define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) | ||
| 55 | |||
| 56 | #define NO_TRANSIT ((struct fscache_state *)NULL) | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Define a wait state. Wait states are event processing states. No execution | ||
| 60 | * is performed by them. Wait states are just tables of "if event X occurs, | ||
| 61 | * clear it and transition to state Y". The dispatcher returns to the | ||
| 62 | * scheduler if none of the events in which the wait state has an interest are | ||
| 63 | * currently pending. | ||
| 64 | */ | ||
| 65 | #define WAIT_STATE(n, sn, ...) \ | ||
| 66 | const struct fscache_state __STATE_NAME(n) = { \ | ||
| 67 | .name = #n, \ | ||
| 68 | .short_name = sn, \ | ||
| 69 | .work = NULL, \ | ||
| 70 | .transitions = { __VA_ARGS__, { 0, NULL } } \ | ||
| 71 | } | ||
| 72 | |||
| 73 | #define TRANSIT_TO(state, emask) \ | ||
| 74 | { .events = (emask), .transit_to = STATE(state) } | ||
| 75 | |||
| 76 | /* | ||
| 77 | * The object state machine. | ||
| 78 | */ | ||
| 79 | static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); | ||
| 80 | static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); | ||
| 81 | static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); | ||
| 82 | static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); | ||
| 83 | static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); | ||
| 84 | static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); | ||
| 85 | static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); | ||
| 86 | |||
| 87 | static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); | ||
| 88 | static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); | ||
| 89 | |||
| 90 | static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); | ||
| 91 | static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); | ||
| 92 | static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); | ||
| 93 | static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); | ||
| 94 | static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); | ||
| 95 | |||
| 96 | static WAIT_STATE(WAIT_FOR_INIT, "?INI", | ||
| 97 | TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); | ||
| 98 | |||
| 99 | static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", | ||
| 100 | TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); | ||
| 101 | |||
| 102 | static WAIT_STATE(WAIT_FOR_CMD, "?CMD", | ||
| 103 | TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), | ||
| 104 | TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), | ||
| 105 | TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); | ||
| 106 | |||
| 107 | static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", | ||
| 108 | TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); | ||
| 109 | |||
| 110 | /* | ||
| 111 | * Out-of-band event transition tables. These are for handling unexpected | ||
| 112 | * events, such as an I/O error. If an OOB event occurs, the state machine | ||
| 113 | * clears and disables the event and forces a transition to the nominated work | ||
| 114 | * state (acurrently executing work states will complete first). | ||
| 115 | * | ||
| 116 | * In such a situation, object->state remembers the state the machine should | ||
| 117 | * have been in/gone to and returning NO_TRANSIT returns to that. | ||
| 118 | */ | ||
| 119 | static const struct fscache_transition fscache_osm_init_oob[] = { | ||
| 120 | TRANSIT_TO(ABORT_INIT, | ||
| 121 | (1 << FSCACHE_OBJECT_EV_ERROR) | | ||
| 122 | (1 << FSCACHE_OBJECT_EV_KILL)), | ||
| 123 | { 0, NULL } | ||
| 124 | }; | ||
| 125 | |||
| 126 | static const struct fscache_transition fscache_osm_lookup_oob[] = { | ||
| 127 | TRANSIT_TO(LOOKUP_FAILURE, | ||
| 128 | (1 << FSCACHE_OBJECT_EV_ERROR) | | ||
| 129 | (1 << FSCACHE_OBJECT_EV_KILL)), | ||
| 130 | { 0, NULL } | ||
| 35 | }; | 131 | }; |
| 36 | EXPORT_SYMBOL(fscache_object_states); | 132 | |
| 37 | 133 | static const struct fscache_transition fscache_osm_run_oob[] = { | |
| 38 | const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { | 134 | TRANSIT_TO(KILL_OBJECT, |
| 39 | [FSCACHE_OBJECT_INIT] = "INIT", | 135 | (1 << FSCACHE_OBJECT_EV_ERROR) | |
| 40 | [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", | 136 | (1 << FSCACHE_OBJECT_EV_KILL)), |
| 41 | [FSCACHE_OBJECT_CREATING] = "CRTN", | 137 | { 0, NULL } |
| 42 | [FSCACHE_OBJECT_AVAILABLE] = "AVBL", | ||
| 43 | [FSCACHE_OBJECT_ACTIVE] = "ACTV", | ||
| 44 | [FSCACHE_OBJECT_INVALIDATING] = "INVL", | ||
| 45 | [FSCACHE_OBJECT_UPDATING] = "UPDT", | ||
| 46 | [FSCACHE_OBJECT_DYING] = "DYNG", | ||
| 47 | [FSCACHE_OBJECT_LC_DYING] = "LCDY", | ||
| 48 | [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", | ||
| 49 | [FSCACHE_OBJECT_RELEASING] = "RELS", | ||
| 50 | [FSCACHE_OBJECT_RECYCLING] = "RCYC", | ||
| 51 | [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", | ||
| 52 | [FSCACHE_OBJECT_DEAD] = "DEAD", | ||
| 53 | }; | 138 | }; |
| 54 | 139 | ||
| 55 | static int fscache_get_object(struct fscache_object *); | 140 | static int fscache_get_object(struct fscache_object *); |
| 56 | static void fscache_put_object(struct fscache_object *); | 141 | static void fscache_put_object(struct fscache_object *); |
| 57 | static void fscache_initialise_object(struct fscache_object *); | 142 | static bool fscache_enqueue_dependents(struct fscache_object *, int); |
| 58 | static void fscache_lookup_object(struct fscache_object *); | ||
| 59 | static void fscache_object_available(struct fscache_object *); | ||
| 60 | static void fscache_invalidate_object(struct fscache_object *); | ||
| 61 | static void fscache_release_object(struct fscache_object *); | ||
| 62 | static void fscache_withdraw_object(struct fscache_object *); | ||
| 63 | static void fscache_enqueue_dependents(struct fscache_object *); | ||
| 64 | static void fscache_dequeue_object(struct fscache_object *); | 143 | static void fscache_dequeue_object(struct fscache_object *); |
| 65 | 144 | ||
| 66 | /* | 145 | /* |
| @@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object) | |||
| 75 | object->debug_id, parent->debug_id, parent->n_ops); | 154 | object->debug_id, parent->debug_id, parent->n_ops); |
| 76 | 155 | ||
| 77 | spin_lock_nested(&parent->lock, 1); | 156 | spin_lock_nested(&parent->lock, 1); |
| 78 | parent->n_ops--; | ||
| 79 | parent->n_obj_ops--; | 157 | parent->n_obj_ops--; |
| 158 | parent->n_ops--; | ||
| 80 | if (parent->n_ops == 0) | 159 | if (parent->n_ops == 0) |
| 81 | fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); | 160 | fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); |
| 82 | spin_unlock(&parent->lock); | 161 | spin_unlock(&parent->lock); |
| 83 | } | 162 | } |
| 84 | 163 | ||
| 85 | /* | 164 | /* |
| 86 | * Notify netfs of invalidation completion. | 165 | * Object state machine dispatcher. |
| 87 | */ | 166 | */ |
| 88 | static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) | 167 | static void fscache_object_sm_dispatcher(struct fscache_object *object) |
| 89 | { | 168 | { |
| 90 | if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) | 169 | const struct fscache_transition *t; |
| 91 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); | 170 | const struct fscache_state *state, *new_state; |
| 92 | } | 171 | unsigned long events, event_mask; |
| 93 | 172 | int event = -1; | |
| 94 | /* | ||
| 95 | * process events that have been sent to an object's state machine | ||
| 96 | * - initiates parent lookup | ||
| 97 | * - does object lookup | ||
| 98 | * - does object creation | ||
| 99 | * - does object recycling and retirement | ||
| 100 | * - does object withdrawal | ||
| 101 | */ | ||
| 102 | static void fscache_object_state_machine(struct fscache_object *object) | ||
| 103 | { | ||
| 104 | enum fscache_object_state new_state; | ||
| 105 | struct fscache_cookie *cookie; | ||
| 106 | int event; | ||
| 107 | 173 | ||
| 108 | ASSERT(object != NULL); | 174 | ASSERT(object != NULL); |
| 109 | 175 | ||
| 110 | _enter("{OBJ%x,%s,%lx}", | 176 | _enter("{OBJ%x,%s,%lx}", |
| 111 | object->debug_id, fscache_object_states[object->state], | 177 | object->debug_id, object->state->name, object->events); |
| 112 | object->events); | 178 | |
| 113 | 179 | event_mask = object->event_mask; | |
| 114 | switch (object->state) { | 180 | restart: |
| 115 | /* wait for the parent object to become ready */ | 181 | object->event_mask = 0; /* Mask normal event handling */ |
| 116 | case FSCACHE_OBJECT_INIT: | 182 | state = object->state; |
| 117 | object->event_mask = | 183 | restart_masked: |
| 118 | FSCACHE_OBJECT_EVENTS_MASK & | 184 | events = object->events; |
| 119 | ~(1 << FSCACHE_OBJECT_EV_CLEARED); | 185 | |
| 120 | fscache_initialise_object(object); | 186 | /* Handle any out-of-band events (typically an error) */ |
| 121 | goto done; | 187 | if (events & object->oob_event_mask) { |
| 122 | 188 | _debug("{OBJ%x} oob %lx", | |
| 123 | /* look up the object metadata on disk */ | 189 | object->debug_id, events & object->oob_event_mask); |
| 124 | case FSCACHE_OBJECT_LOOKING_UP: | 190 | for (t = object->oob_table; t->events; t++) { |
| 125 | fscache_lookup_object(object); | 191 | if (events & t->events) { |
| 126 | goto lookup_transit; | 192 | state = t->transit_to; |
| 127 | 193 | ASSERT(state->work != NULL); | |
| 128 | /* create the object metadata on disk */ | 194 | event = fls(events & t->events) - 1; |
| 129 | case FSCACHE_OBJECT_CREATING: | 195 | __clear_bit(event, &object->oob_event_mask); |
| 130 | fscache_lookup_object(object); | 196 | clear_bit(event, &object->events); |
| 131 | goto lookup_transit; | 197 | goto execute_work_state; |
| 132 | 198 | } | |
| 133 | /* handle an object becoming available; start pending | ||
| 134 | * operations and queue dependent operations for processing */ | ||
| 135 | case FSCACHE_OBJECT_AVAILABLE: | ||
| 136 | fscache_object_available(object); | ||
| 137 | goto active_transit; | ||
| 138 | |||
| 139 | /* normal running state */ | ||
| 140 | case FSCACHE_OBJECT_ACTIVE: | ||
| 141 | goto active_transit; | ||
| 142 | |||
| 143 | /* Invalidate an object on disk */ | ||
| 144 | case FSCACHE_OBJECT_INVALIDATING: | ||
| 145 | clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events); | ||
| 146 | fscache_stat(&fscache_n_invalidates_run); | ||
| 147 | fscache_stat(&fscache_n_cop_invalidate_object); | ||
| 148 | fscache_invalidate_object(object); | ||
| 149 | fscache_stat_d(&fscache_n_cop_invalidate_object); | ||
| 150 | fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); | ||
| 151 | goto active_transit; | ||
| 152 | |||
| 153 | /* update the object metadata on disk */ | ||
| 154 | case FSCACHE_OBJECT_UPDATING: | ||
| 155 | clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); | ||
| 156 | fscache_stat(&fscache_n_updates_run); | ||
| 157 | fscache_stat(&fscache_n_cop_update_object); | ||
| 158 | object->cache->ops->update_object(object); | ||
| 159 | fscache_stat_d(&fscache_n_cop_update_object); | ||
| 160 | goto active_transit; | ||
| 161 | |||
| 162 | /* handle an object dying during lookup or creation */ | ||
| 163 | case FSCACHE_OBJECT_LC_DYING: | ||
| 164 | object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); | ||
| 165 | fscache_stat(&fscache_n_cop_lookup_complete); | ||
| 166 | object->cache->ops->lookup_complete(object); | ||
| 167 | fscache_stat_d(&fscache_n_cop_lookup_complete); | ||
| 168 | |||
| 169 | spin_lock(&object->lock); | ||
| 170 | object->state = FSCACHE_OBJECT_DYING; | ||
| 171 | cookie = object->cookie; | ||
| 172 | if (cookie) { | ||
| 173 | if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, | ||
| 174 | &cookie->flags)) | ||
| 175 | wake_up_bit(&cookie->flags, | ||
| 176 | FSCACHE_COOKIE_LOOKING_UP); | ||
| 177 | if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, | ||
| 178 | &cookie->flags)) | ||
| 179 | wake_up_bit(&cookie->flags, | ||
| 180 | FSCACHE_COOKIE_CREATING); | ||
| 181 | } | 199 | } |
| 182 | spin_unlock(&object->lock); | 200 | } |
| 183 | 201 | ||
| 184 | fscache_done_parent_op(object); | 202 | /* Wait states are just transition tables */ |
| 203 | if (!state->work) { | ||
| 204 | if (events & event_mask) { | ||
| 205 | for (t = state->transitions; t->events; t++) { | ||
| 206 | if (events & t->events) { | ||
| 207 | new_state = t->transit_to; | ||
| 208 | event = fls(events & t->events) - 1; | ||
| 209 | clear_bit(event, &object->events); | ||
| 210 | _debug("{OBJ%x} ev %d: %s -> %s", | ||
| 211 | object->debug_id, event, | ||
| 212 | state->name, new_state->name); | ||
| 213 | object->state = state = new_state; | ||
| 214 | goto execute_work_state; | ||
| 215 | } | ||
| 216 | } | ||
| 185 | 217 | ||
| 186 | /* wait for completion of all active operations on this object | 218 | /* The event mask didn't include all the tabled bits */ |
| 187 | * and the death of all child objects of this object */ | 219 | BUG(); |
| 188 | case FSCACHE_OBJECT_DYING: | ||
| 189 | dying: | ||
| 190 | clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); | ||
| 191 | spin_lock(&object->lock); | ||
| 192 | _debug("dying OBJ%x {%d,%d}", | ||
| 193 | object->debug_id, object->n_ops, object->n_children); | ||
| 194 | if (object->n_ops == 0 && object->n_children == 0) { | ||
| 195 | object->event_mask &= | ||
| 196 | ~(1 << FSCACHE_OBJECT_EV_CLEARED); | ||
| 197 | object->event_mask |= | ||
| 198 | (1 << FSCACHE_OBJECT_EV_WITHDRAW) | | ||
| 199 | (1 << FSCACHE_OBJECT_EV_RETIRE) | | ||
| 200 | (1 << FSCACHE_OBJECT_EV_RELEASE) | | ||
| 201 | (1 << FSCACHE_OBJECT_EV_ERROR); | ||
| 202 | } else { | ||
| 203 | object->event_mask &= | ||
| 204 | ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | | ||
| 205 | (1 << FSCACHE_OBJECT_EV_RETIRE) | | ||
| 206 | (1 << FSCACHE_OBJECT_EV_RELEASE) | | ||
| 207 | (1 << FSCACHE_OBJECT_EV_ERROR)); | ||
| 208 | object->event_mask |= | ||
| 209 | 1 << FSCACHE_OBJECT_EV_CLEARED; | ||
| 210 | } | 220 | } |
| 211 | spin_unlock(&object->lock); | 221 | /* Randomly woke up */ |
| 212 | fscache_enqueue_dependents(object); | 222 | goto unmask_events; |
| 213 | fscache_start_operations(object); | ||
| 214 | goto terminal_transit; | ||
| 215 | |||
| 216 | /* handle an abort during initialisation */ | ||
| 217 | case FSCACHE_OBJECT_ABORT_INIT: | ||
| 218 | _debug("handle abort init %lx", object->events); | ||
| 219 | object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); | ||
| 220 | |||
| 221 | spin_lock(&object->lock); | ||
| 222 | fscache_dequeue_object(object); | ||
| 223 | |||
| 224 | object->state = FSCACHE_OBJECT_DYING; | ||
| 225 | if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, | ||
| 226 | &object->cookie->flags)) | ||
| 227 | wake_up_bit(&object->cookie->flags, | ||
| 228 | FSCACHE_COOKIE_CREATING); | ||
| 229 | spin_unlock(&object->lock); | ||
| 230 | goto dying; | ||
| 231 | |||
| 232 | /* handle the netfs releasing an object and possibly marking it | ||
| 233 | * obsolete too */ | ||
| 234 | case FSCACHE_OBJECT_RELEASING: | ||
| 235 | case FSCACHE_OBJECT_RECYCLING: | ||
| 236 | object->event_mask &= | ||
| 237 | ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | | ||
| 238 | (1 << FSCACHE_OBJECT_EV_RETIRE) | | ||
| 239 | (1 << FSCACHE_OBJECT_EV_RELEASE) | | ||
| 240 | (1 << FSCACHE_OBJECT_EV_ERROR)); | ||
| 241 | fscache_release_object(object); | ||
| 242 | spin_lock(&object->lock); | ||
| 243 | object->state = FSCACHE_OBJECT_DEAD; | ||
| 244 | spin_unlock(&object->lock); | ||
| 245 | fscache_stat(&fscache_n_object_dead); | ||
| 246 | goto terminal_transit; | ||
| 247 | |||
| 248 | /* handle the parent cache of this object being withdrawn from | ||
| 249 | * active service */ | ||
| 250 | case FSCACHE_OBJECT_WITHDRAWING: | ||
| 251 | object->event_mask &= | ||
| 252 | ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | | ||
| 253 | (1 << FSCACHE_OBJECT_EV_RETIRE) | | ||
| 254 | (1 << FSCACHE_OBJECT_EV_RELEASE) | | ||
| 255 | (1 << FSCACHE_OBJECT_EV_ERROR)); | ||
| 256 | fscache_withdraw_object(object); | ||
| 257 | spin_lock(&object->lock); | ||
| 258 | object->state = FSCACHE_OBJECT_DEAD; | ||
| 259 | spin_unlock(&object->lock); | ||
| 260 | fscache_stat(&fscache_n_object_dead); | ||
| 261 | goto terminal_transit; | ||
| 262 | |||
| 263 | /* complain about the object being woken up once it is | ||
| 264 | * deceased */ | ||
| 265 | case FSCACHE_OBJECT_DEAD: | ||
| 266 | printk(KERN_ERR "FS-Cache:" | ||
| 267 | " Unexpected event in dead state %lx\n", | ||
| 268 | object->events & object->event_mask); | ||
| 269 | BUG(); | ||
| 270 | |||
| 271 | default: | ||
| 272 | printk(KERN_ERR "FS-Cache: Unknown object state %u\n", | ||
| 273 | object->state); | ||
| 274 | BUG(); | ||
| 275 | } | ||
| 276 | |||
| 277 | /* determine the transition from a lookup state */ | ||
| 278 | lookup_transit: | ||
| 279 | event = fls(object->events & object->event_mask) - 1; | ||
| 280 | switch (event) { | ||
| 281 | case FSCACHE_OBJECT_EV_WITHDRAW: | ||
| 282 | case FSCACHE_OBJECT_EV_RETIRE: | ||
| 283 | case FSCACHE_OBJECT_EV_RELEASE: | ||
| 284 | case FSCACHE_OBJECT_EV_ERROR: | ||
| 285 | new_state = FSCACHE_OBJECT_LC_DYING; | ||
| 286 | goto change_state; | ||
| 287 | case FSCACHE_OBJECT_EV_INVALIDATE: | ||
| 288 | new_state = FSCACHE_OBJECT_INVALIDATING; | ||
| 289 | goto change_state; | ||
| 290 | case FSCACHE_OBJECT_EV_REQUEUE: | ||
| 291 | goto done; | ||
| 292 | case -1: | ||
| 293 | goto done; /* sleep until event */ | ||
| 294 | default: | ||
| 295 | goto unsupported_event; | ||
| 296 | } | 223 | } |
| 297 | 224 | ||
| 298 | /* determine the transition from an active state */ | 225 | execute_work_state: |
| 299 | active_transit: | 226 | _debug("{OBJ%x} exec %s", object->debug_id, state->name); |
| 300 | event = fls(object->events & object->event_mask) - 1; | ||
| 301 | switch (event) { | ||
| 302 | case FSCACHE_OBJECT_EV_WITHDRAW: | ||
| 303 | case FSCACHE_OBJECT_EV_RETIRE: | ||
| 304 | case FSCACHE_OBJECT_EV_RELEASE: | ||
| 305 | case FSCACHE_OBJECT_EV_ERROR: | ||
| 306 | new_state = FSCACHE_OBJECT_DYING; | ||
| 307 | goto change_state; | ||
| 308 | case FSCACHE_OBJECT_EV_INVALIDATE: | ||
| 309 | new_state = FSCACHE_OBJECT_INVALIDATING; | ||
| 310 | goto change_state; | ||
| 311 | case FSCACHE_OBJECT_EV_UPDATE: | ||
| 312 | new_state = FSCACHE_OBJECT_UPDATING; | ||
| 313 | goto change_state; | ||
| 314 | case -1: | ||
| 315 | new_state = FSCACHE_OBJECT_ACTIVE; | ||
| 316 | goto change_state; /* sleep until event */ | ||
| 317 | default: | ||
| 318 | goto unsupported_event; | ||
| 319 | } | ||
| 320 | 227 | ||
| 321 | /* determine the transition from a terminal state */ | 228 | new_state = state->work(object, event); |
| 322 | terminal_transit: | 229 | event = -1; |
| 323 | event = fls(object->events & object->event_mask) - 1; | 230 | if (new_state == NO_TRANSIT) { |
| 324 | switch (event) { | 231 | _debug("{OBJ%x} %s notrans", object->debug_id, state->name); |
| 325 | case FSCACHE_OBJECT_EV_WITHDRAW: | 232 | fscache_enqueue_object(object); |
| 326 | new_state = FSCACHE_OBJECT_WITHDRAWING; | 233 | event_mask = object->oob_event_mask; |
| 327 | goto change_state; | 234 | goto unmask_events; |
| 328 | case FSCACHE_OBJECT_EV_RETIRE: | ||
| 329 | new_state = FSCACHE_OBJECT_RECYCLING; | ||
| 330 | goto change_state; | ||
| 331 | case FSCACHE_OBJECT_EV_RELEASE: | ||
| 332 | new_state = FSCACHE_OBJECT_RELEASING; | ||
| 333 | goto change_state; | ||
| 334 | case FSCACHE_OBJECT_EV_ERROR: | ||
| 335 | new_state = FSCACHE_OBJECT_WITHDRAWING; | ||
| 336 | goto change_state; | ||
| 337 | case FSCACHE_OBJECT_EV_CLEARED: | ||
| 338 | new_state = FSCACHE_OBJECT_DYING; | ||
| 339 | goto change_state; | ||
| 340 | case -1: | ||
| 341 | goto done; /* sleep until event */ | ||
| 342 | default: | ||
| 343 | goto unsupported_event; | ||
| 344 | } | 235 | } |
| 345 | 236 | ||
| 346 | change_state: | 237 | _debug("{OBJ%x} %s -> %s", |
| 347 | spin_lock(&object->lock); | 238 | object->debug_id, state->name, new_state->name); |
| 348 | object->state = new_state; | 239 | object->state = state = new_state; |
| 349 | spin_unlock(&object->lock); | ||
| 350 | 240 | ||
| 351 | done: | 241 | if (state->work) { |
| 352 | _leave(" [->%s]", fscache_object_states[object->state]); | 242 | if (unlikely(state->work == ((void *)2UL))) { |
| 353 | return; | 243 | _leave(" [dead]"); |
| 244 | return; | ||
| 245 | } | ||
| 246 | goto restart_masked; | ||
| 247 | } | ||
| 354 | 248 | ||
| 355 | unsupported_event: | 249 | /* Transited to wait state */ |
| 356 | printk(KERN_ERR "FS-Cache:" | 250 | event_mask = object->oob_event_mask; |
| 357 | " Unsupported event %d [%lx/%lx] in state %s\n", | 251 | for (t = state->transitions; t->events; t++) |
| 358 | event, object->events, object->event_mask, | 252 | event_mask |= t->events; |
| 359 | fscache_object_states[object->state]); | 253 | |
| 360 | BUG(); | 254 | unmask_events: |
| 255 | object->event_mask = event_mask; | ||
| 256 | smp_mb(); | ||
| 257 | events = object->events; | ||
| 258 | if (events & event_mask) | ||
| 259 | goto restart; | ||
| 260 | _leave(" [msk %lx]", event_mask); | ||
| 361 | } | 261 | } |
| 362 | 262 | ||
| 363 | /* | 263 | /* |
| 364 | * execute an object | 264 | * execute an object |
| 365 | */ | 265 | */ |
| 366 | void fscache_object_work_func(struct work_struct *work) | 266 | static void fscache_object_work_func(struct work_struct *work) |
| 367 | { | 267 | { |
| 368 | struct fscache_object *object = | 268 | struct fscache_object *object = |
| 369 | container_of(work, struct fscache_object, work); | 269 | container_of(work, struct fscache_object, work); |
| @@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work) | |||
| 372 | _enter("{OBJ%x}", object->debug_id); | 272 | _enter("{OBJ%x}", object->debug_id); |
| 373 | 273 | ||
| 374 | start = jiffies; | 274 | start = jiffies; |
| 375 | fscache_object_state_machine(object); | 275 | fscache_object_sm_dispatcher(object); |
| 376 | fscache_hist(fscache_objs_histogram, start); | 276 | fscache_hist(fscache_objs_histogram, start); |
| 377 | if (object->events & object->event_mask) | ||
| 378 | fscache_enqueue_object(object); | ||
| 379 | clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | ||
| 380 | fscache_put_object(object); | 277 | fscache_put_object(object); |
| 381 | } | 278 | } |
| 382 | EXPORT_SYMBOL(fscache_object_work_func); | 279 | |
| 280 | /** | ||
| 281 | * fscache_object_init - Initialise a cache object description | ||
| 282 | * @object: Object description | ||
| 283 | * @cookie: Cookie object will be attached to | ||
| 284 | * @cache: Cache in which backing object will be found | ||
| 285 | * | ||
| 286 | * Initialise a cache object description to its basic values. | ||
| 287 | * | ||
| 288 | * See Documentation/filesystems/caching/backend-api.txt for a complete | ||
| 289 | * description. | ||
| 290 | */ | ||
| 291 | void fscache_object_init(struct fscache_object *object, | ||
| 292 | struct fscache_cookie *cookie, | ||
| 293 | struct fscache_cache *cache) | ||
| 294 | { | ||
| 295 | const struct fscache_transition *t; | ||
| 296 | |||
| 297 | atomic_inc(&cache->object_count); | ||
| 298 | |||
| 299 | object->state = STATE(WAIT_FOR_INIT); | ||
| 300 | object->oob_table = fscache_osm_init_oob; | ||
| 301 | object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; | ||
| 302 | spin_lock_init(&object->lock); | ||
| 303 | INIT_LIST_HEAD(&object->cache_link); | ||
| 304 | INIT_HLIST_NODE(&object->cookie_link); | ||
| 305 | INIT_WORK(&object->work, fscache_object_work_func); | ||
| 306 | INIT_LIST_HEAD(&object->dependents); | ||
| 307 | INIT_LIST_HEAD(&object->dep_link); | ||
| 308 | INIT_LIST_HEAD(&object->pending_ops); | ||
| 309 | object->n_children = 0; | ||
| 310 | object->n_ops = object->n_in_progress = object->n_exclusive = 0; | ||
| 311 | object->events = 0; | ||
| 312 | object->store_limit = 0; | ||
| 313 | object->store_limit_l = 0; | ||
| 314 | object->cache = cache; | ||
| 315 | object->cookie = cookie; | ||
| 316 | object->parent = NULL; | ||
| 317 | |||
| 318 | object->oob_event_mask = 0; | ||
| 319 | for (t = object->oob_table; t->events; t++) | ||
| 320 | object->oob_event_mask |= t->events; | ||
| 321 | object->event_mask = object->oob_event_mask; | ||
| 322 | for (t = object->state->transitions; t->events; t++) | ||
| 323 | object->event_mask |= t->events; | ||
| 324 | } | ||
| 325 | EXPORT_SYMBOL(fscache_object_init); | ||
| 326 | |||
| 327 | /* | ||
| 328 | * Abort object initialisation before we start it. | ||
| 329 | */ | ||
| 330 | static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, | ||
| 331 | int event) | ||
| 332 | { | ||
| 333 | _enter("{OBJ%x},%d", object->debug_id, event); | ||
| 334 | |||
| 335 | object->oob_event_mask = 0; | ||
| 336 | fscache_dequeue_object(object); | ||
| 337 | return transit_to(KILL_OBJECT); | ||
| 338 | } | ||
| 383 | 339 | ||
| 384 | /* | 340 | /* |
| 385 | * initialise an object | 341 | * initialise an object |
| @@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func); | |||
| 387 | * immediately to do a creation | 343 | * immediately to do a creation |
| 388 | * - we may need to start the process of creating a parent and we need to wait | 344 | * - we may need to start the process of creating a parent and we need to wait |
| 389 | * for the parent's lookup and creation to complete if it's not there yet | 345 | * for the parent's lookup and creation to complete if it's not there yet |
| 390 | * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the | ||
| 391 | * leaf-most cookies of the object and all its children | ||
| 392 | */ | 346 | */ |
| 393 | static void fscache_initialise_object(struct fscache_object *object) | 347 | static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, |
| 348 | int event) | ||
| 394 | { | 349 | { |
| 395 | struct fscache_object *parent; | 350 | struct fscache_object *parent; |
| 351 | bool success; | ||
| 396 | 352 | ||
| 397 | _enter(""); | 353 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 398 | ASSERT(object->cookie != NULL); | ||
| 399 | ASSERT(object->cookie->parent != NULL); | ||
| 400 | |||
| 401 | if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | | ||
| 402 | (1 << FSCACHE_OBJECT_EV_RELEASE) | | ||
| 403 | (1 << FSCACHE_OBJECT_EV_RETIRE) | | ||
| 404 | (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { | ||
| 405 | _debug("abort init %lx", object->events); | ||
| 406 | spin_lock(&object->lock); | ||
| 407 | object->state = FSCACHE_OBJECT_ABORT_INIT; | ||
| 408 | spin_unlock(&object->lock); | ||
| 409 | return; | ||
| 410 | } | ||
| 411 | 354 | ||
| 412 | spin_lock(&object->cookie->lock); | 355 | ASSERT(list_empty(&object->dep_link)); |
| 413 | spin_lock_nested(&object->cookie->parent->lock, 1); | ||
| 414 | 356 | ||
| 415 | parent = object->parent; | 357 | parent = object->parent; |
| 416 | if (!parent) { | 358 | if (!parent) { |
| 417 | _debug("no parent"); | 359 | _leave(" [no parent]"); |
| 418 | set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); | 360 | return transit_to(DROP_OBJECT); |
| 419 | } else { | 361 | } |
| 420 | spin_lock(&object->lock); | ||
| 421 | spin_lock_nested(&parent->lock, 1); | ||
| 422 | _debug("parent %s", fscache_object_states[parent->state]); | ||
| 423 | |||
| 424 | if (parent->state >= FSCACHE_OBJECT_DYING) { | ||
| 425 | _debug("bad parent"); | ||
| 426 | set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); | ||
| 427 | } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { | ||
| 428 | _debug("wait"); | ||
| 429 | |||
| 430 | /* we may get woken up in this state by child objects | ||
| 431 | * binding on to us, so we need to make sure we don't | ||
| 432 | * add ourself to the list multiple times */ | ||
| 433 | if (list_empty(&object->dep_link)) { | ||
| 434 | fscache_stat(&fscache_n_cop_grab_object); | ||
| 435 | object->cache->ops->grab_object(object); | ||
| 436 | fscache_stat_d(&fscache_n_cop_grab_object); | ||
| 437 | list_add(&object->dep_link, | ||
| 438 | &parent->dependents); | ||
| 439 | |||
| 440 | /* fscache_acquire_non_index_cookie() uses this | ||
| 441 | * to wake the chain up */ | ||
| 442 | if (parent->state == FSCACHE_OBJECT_INIT) | ||
| 443 | fscache_enqueue_object(parent); | ||
| 444 | } | ||
| 445 | } else { | ||
| 446 | _debug("go"); | ||
| 447 | parent->n_ops++; | ||
| 448 | parent->n_obj_ops++; | ||
| 449 | object->lookup_jif = jiffies; | ||
| 450 | object->state = FSCACHE_OBJECT_LOOKING_UP; | ||
| 451 | set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | ||
| 452 | } | ||
| 453 | 362 | ||
| 454 | spin_unlock(&parent->lock); | 363 | _debug("parent: %s of:%lx", parent->state->name, parent->flags); |
| 455 | spin_unlock(&object->lock); | 364 | |
| 365 | if (fscache_object_is_dying(parent)) { | ||
| 366 | _leave(" [bad parent]"); | ||
| 367 | return transit_to(DROP_OBJECT); | ||
| 456 | } | 368 | } |
| 457 | 369 | ||
| 458 | spin_unlock(&object->cookie->parent->lock); | 370 | if (fscache_object_is_available(parent)) { |
| 459 | spin_unlock(&object->cookie->lock); | 371 | _leave(" [ready]"); |
| 372 | return transit_to(PARENT_READY); | ||
| 373 | } | ||
| 374 | |||
| 375 | _debug("wait"); | ||
| 376 | |||
| 377 | spin_lock(&parent->lock); | ||
| 378 | fscache_stat(&fscache_n_cop_grab_object); | ||
| 379 | success = false; | ||
| 380 | if (fscache_object_is_live(parent) && | ||
| 381 | object->cache->ops->grab_object(object)) { | ||
| 382 | list_add(&object->dep_link, &parent->dependents); | ||
| 383 | success = true; | ||
| 384 | } | ||
| 385 | fscache_stat_d(&fscache_n_cop_grab_object); | ||
| 386 | spin_unlock(&parent->lock); | ||
| 387 | if (!success) { | ||
| 388 | _leave(" [grab failed]"); | ||
| 389 | return transit_to(DROP_OBJECT); | ||
| 390 | } | ||
| 391 | |||
| 392 | /* fscache_acquire_non_index_cookie() uses this | ||
| 393 | * to wake the chain up */ | ||
| 394 | fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); | ||
| 395 | _leave(" [wait]"); | ||
| 396 | return transit_to(WAIT_FOR_PARENT); | ||
| 397 | } | ||
| 398 | |||
| 399 | /* | ||
| 400 | * Once the parent object is ready, we should kick off our lookup op. | ||
| 401 | */ | ||
| 402 | static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, | ||
| 403 | int event) | ||
| 404 | { | ||
| 405 | struct fscache_object *parent = object->parent; | ||
| 406 | |||
| 407 | _enter("{OBJ%x},%d", object->debug_id, event); | ||
| 408 | |||
| 409 | ASSERT(parent != NULL); | ||
| 410 | |||
| 411 | spin_lock(&parent->lock); | ||
| 412 | parent->n_ops++; | ||
| 413 | parent->n_obj_ops++; | ||
| 414 | object->lookup_jif = jiffies; | ||
| 415 | spin_unlock(&parent->lock); | ||
| 416 | |||
| 460 | _leave(""); | 417 | _leave(""); |
| 418 | return transit_to(LOOK_UP_OBJECT); | ||
| 461 | } | 419 | } |
| 462 | 420 | ||
| 463 | /* | 421 | /* |
| 464 | * look an object up in the cache from which it was allocated | 422 | * look an object up in the cache from which it was allocated |
| 465 | * - we hold an "access lock" on the parent object, so the parent object cannot | 423 | * - we hold an "access lock" on the parent object, so the parent object cannot |
| 466 | * be withdrawn by either party till we've finished | 424 | * be withdrawn by either party till we've finished |
| 467 | * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the | ||
| 468 | * leaf-most cookies of the object and all its children | ||
| 469 | */ | 425 | */ |
| 470 | static void fscache_lookup_object(struct fscache_object *object) | 426 | static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, |
| 427 | int event) | ||
| 471 | { | 428 | { |
| 472 | struct fscache_cookie *cookie = object->cookie; | 429 | struct fscache_cookie *cookie = object->cookie; |
| 473 | struct fscache_object *parent; | 430 | struct fscache_object *parent = object->parent; |
| 474 | int ret; | 431 | int ret; |
| 475 | 432 | ||
| 476 | _enter(""); | 433 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 434 | |||
| 435 | object->oob_table = fscache_osm_lookup_oob; | ||
| 477 | 436 | ||
| 478 | parent = object->parent; | ||
| 479 | ASSERT(parent != NULL); | 437 | ASSERT(parent != NULL); |
| 480 | ASSERTCMP(parent->n_ops, >, 0); | 438 | ASSERTCMP(parent->n_ops, >, 0); |
| 481 | ASSERTCMP(parent->n_obj_ops, >, 0); | 439 | ASSERTCMP(parent->n_obj_ops, >, 0); |
| 482 | 440 | ||
| 483 | /* make sure the parent is still available */ | 441 | /* make sure the parent is still available */ |
| 484 | ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); | 442 | ASSERT(fscache_object_is_available(parent)); |
| 485 | 443 | ||
| 486 | if (parent->state >= FSCACHE_OBJECT_DYING || | 444 | if (fscache_object_is_dying(parent) || |
| 487 | test_bit(FSCACHE_IOERROR, &object->cache->flags)) { | 445 | test_bit(FSCACHE_IOERROR, &object->cache->flags) || |
| 488 | _debug("unavailable"); | 446 | !fscache_use_cookie(object)) { |
| 489 | set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); | 447 | _leave(" [unavailable]"); |
| 490 | _leave(""); | 448 | return transit_to(LOOKUP_FAILURE); |
| 491 | return; | ||
| 492 | } | 449 | } |
| 493 | 450 | ||
| 494 | _debug("LOOKUP \"%s/%s\" in \"%s\"", | 451 | _debug("LOOKUP \"%s\" in \"%s\"", |
| 495 | parent->cookie->def->name, cookie->def->name, | 452 | cookie->def->name, object->cache->tag->name); |
| 496 | object->cache->tag->name); | ||
| 497 | 453 | ||
| 498 | fscache_stat(&fscache_n_object_lookups); | 454 | fscache_stat(&fscache_n_object_lookups); |
| 499 | fscache_stat(&fscache_n_cop_lookup_object); | 455 | fscache_stat(&fscache_n_cop_lookup_object); |
| 500 | ret = object->cache->ops->lookup_object(object); | 456 | ret = object->cache->ops->lookup_object(object); |
| 501 | fscache_stat_d(&fscache_n_cop_lookup_object); | 457 | fscache_stat_d(&fscache_n_cop_lookup_object); |
| 502 | 458 | ||
| 503 | if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) | 459 | fscache_unuse_cookie(object); |
| 504 | set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); | ||
| 505 | 460 | ||
| 506 | if (ret == -ETIMEDOUT) { | 461 | if (ret == -ETIMEDOUT) { |
| 507 | /* probably stuck behind another object, so move this one to | 462 | /* probably stuck behind another object, so move this one to |
| 508 | * the back of the queue */ | 463 | * the back of the queue */ |
| 509 | fscache_stat(&fscache_n_object_lookups_timed_out); | 464 | fscache_stat(&fscache_n_object_lookups_timed_out); |
| 510 | set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | 465 | _leave(" [timeout]"); |
| 466 | return NO_TRANSIT; | ||
| 511 | } | 467 | } |
| 512 | 468 | ||
| 513 | _leave(""); | 469 | if (ret < 0) { |
| 470 | _leave(" [error]"); | ||
| 471 | return transit_to(LOOKUP_FAILURE); | ||
| 472 | } | ||
| 473 | |||
| 474 | _leave(" [ok]"); | ||
| 475 | return transit_to(OBJECT_AVAILABLE); | ||
| 514 | } | 476 | } |
| 515 | 477 | ||
| 516 | /** | 478 | /** |
| @@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object) | |||
| 524 | { | 486 | { |
| 525 | struct fscache_cookie *cookie = object->cookie; | 487 | struct fscache_cookie *cookie = object->cookie; |
| 526 | 488 | ||
| 527 | _enter("{OBJ%x,%s}", | 489 | _enter("{OBJ%x,%s}", object->debug_id, object->state->name); |
| 528 | object->debug_id, fscache_object_states[object->state]); | ||
| 529 | 490 | ||
| 530 | spin_lock(&object->lock); | 491 | if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { |
| 531 | if (object->state == FSCACHE_OBJECT_LOOKING_UP) { | ||
| 532 | fscache_stat(&fscache_n_object_lookups_negative); | 492 | fscache_stat(&fscache_n_object_lookups_negative); |
| 533 | 493 | ||
| 534 | /* transit here to allow write requests to begin stacking up | 494 | /* Allow write requests to begin stacking up and read requests to begin |
| 535 | * and read requests to begin returning ENODATA */ | 495 | * returning ENODATA. |
| 536 | object->state = FSCACHE_OBJECT_CREATING; | 496 | */ |
| 537 | spin_unlock(&object->lock); | ||
| 538 | |||
| 539 | set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); | ||
| 540 | set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); | 497 | set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); |
| 541 | 498 | ||
| 542 | _debug("wake up lookup %p", &cookie->flags); | 499 | _debug("wake up lookup %p", &cookie->flags); |
| 543 | smp_mb__before_clear_bit(); | 500 | clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); |
| 544 | clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); | ||
| 545 | smp_mb__after_clear_bit(); | ||
| 546 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); | 501 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); |
| 547 | set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | ||
| 548 | } else { | ||
| 549 | ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); | ||
| 550 | spin_unlock(&object->lock); | ||
| 551 | } | 502 | } |
| 552 | |||
| 553 | _leave(""); | 503 | _leave(""); |
| 554 | } | 504 | } |
| 555 | EXPORT_SYMBOL(fscache_object_lookup_negative); | 505 | EXPORT_SYMBOL(fscache_object_lookup_negative); |
| @@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object) | |||
| 568 | { | 518 | { |
| 569 | struct fscache_cookie *cookie = object->cookie; | 519 | struct fscache_cookie *cookie = object->cookie; |
| 570 | 520 | ||
| 571 | _enter("{OBJ%x,%s}", | 521 | _enter("{OBJ%x,%s}", object->debug_id, object->state->name); |
| 572 | object->debug_id, fscache_object_states[object->state]); | ||
| 573 | 522 | ||
| 574 | /* if we were still looking up, then we must have a positive lookup | 523 | /* if we were still looking up, then we must have a positive lookup |
| 575 | * result, in which case there may be data available */ | 524 | * result, in which case there may be data available */ |
| 576 | spin_lock(&object->lock); | 525 | if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { |
| 577 | if (object->state == FSCACHE_OBJECT_LOOKING_UP) { | ||
| 578 | fscache_stat(&fscache_n_object_lookups_positive); | 526 | fscache_stat(&fscache_n_object_lookups_positive); |
| 579 | 527 | ||
| 580 | clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); | 528 | /* We do (presumably) have data */ |
| 529 | clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); | ||
| 581 | 530 | ||
| 582 | object->state = FSCACHE_OBJECT_AVAILABLE; | 531 | /* Allow write requests to begin stacking up and read requests |
| 583 | spin_unlock(&object->lock); | 532 | * to begin shovelling data. |
| 584 | 533 | */ | |
| 585 | smp_mb__before_clear_bit(); | 534 | clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); |
| 586 | clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); | ||
| 587 | smp_mb__after_clear_bit(); | ||
| 588 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); | 535 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); |
| 589 | set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | ||
| 590 | } else { | 536 | } else { |
| 591 | ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); | ||
| 592 | fscache_stat(&fscache_n_object_created); | 537 | fscache_stat(&fscache_n_object_created); |
| 593 | |||
| 594 | object->state = FSCACHE_OBJECT_AVAILABLE; | ||
| 595 | spin_unlock(&object->lock); | ||
| 596 | set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); | ||
| 597 | smp_wmb(); | ||
| 598 | } | 538 | } |
| 599 | 539 | ||
| 600 | if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) | 540 | set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); |
| 601 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); | ||
| 602 | |||
| 603 | _leave(""); | 541 | _leave(""); |
| 604 | } | 542 | } |
| 605 | EXPORT_SYMBOL(fscache_obtained_object); | 543 | EXPORT_SYMBOL(fscache_obtained_object); |
| @@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object); | |||
| 607 | /* | 545 | /* |
| 608 | * handle an object that has just become available | 546 | * handle an object that has just become available |
| 609 | */ | 547 | */ |
| 610 | static void fscache_object_available(struct fscache_object *object) | 548 | static const struct fscache_state *fscache_object_available(struct fscache_object *object, |
| 549 | int event) | ||
| 611 | { | 550 | { |
| 612 | _enter("{OBJ%x}", object->debug_id); | 551 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 613 | 552 | ||
| 614 | spin_lock(&object->lock); | 553 | object->oob_table = fscache_osm_run_oob; |
| 615 | 554 | ||
| 616 | if (object->cookie && | 555 | spin_lock(&object->lock); |
| 617 | test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) | ||
| 618 | wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); | ||
| 619 | 556 | ||
| 620 | fscache_done_parent_op(object); | 557 | fscache_done_parent_op(object); |
| 621 | if (object->n_in_progress == 0) { | 558 | if (object->n_in_progress == 0) { |
| @@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object) | |||
| 631 | fscache_stat(&fscache_n_cop_lookup_complete); | 568 | fscache_stat(&fscache_n_cop_lookup_complete); |
| 632 | object->cache->ops->lookup_complete(object); | 569 | object->cache->ops->lookup_complete(object); |
| 633 | fscache_stat_d(&fscache_n_cop_lookup_complete); | 570 | fscache_stat_d(&fscache_n_cop_lookup_complete); |
| 634 | fscache_enqueue_dependents(object); | ||
| 635 | 571 | ||
| 636 | fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); | 572 | fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); |
| 637 | fscache_stat(&fscache_n_object_avail); | 573 | fscache_stat(&fscache_n_object_avail); |
| 638 | 574 | ||
| 639 | _leave(""); | 575 | _leave(""); |
| 576 | return transit_to(JUMPSTART_DEPS); | ||
| 640 | } | 577 | } |
| 641 | 578 | ||
| 642 | /* | 579 | /* |
| 643 | * drop an object's attachments | 580 | * Wake up this object's dependent objects now that we've become available. |
| 644 | */ | 581 | */ |
| 645 | static void fscache_drop_object(struct fscache_object *object) | 582 | static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, |
| 583 | int event) | ||
| 646 | { | 584 | { |
| 647 | struct fscache_object *parent = object->parent; | 585 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 648 | struct fscache_cache *cache = object->cache; | ||
| 649 | 586 | ||
| 650 | _enter("{OBJ%x,%d}", object->debug_id, object->n_children); | 587 | if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) |
| 588 | return NO_TRANSIT; /* Not finished; requeue */ | ||
| 589 | return transit_to(WAIT_FOR_CMD); | ||
| 590 | } | ||
| 651 | 591 | ||
| 652 | ASSERTCMP(object->cookie, ==, NULL); | 592 | /* |
| 653 | ASSERT(hlist_unhashed(&object->cookie_link)); | 593 | * Handle lookup or creation failure. |
| 594 | */ | ||
| 595 | static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, | ||
| 596 | int event) | ||
| 597 | { | ||
| 598 | struct fscache_cookie *cookie; | ||
| 654 | 599 | ||
| 655 | spin_lock(&cache->object_list_lock); | 600 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 656 | list_del_init(&object->cache_link); | ||
| 657 | spin_unlock(&cache->object_list_lock); | ||
| 658 | 601 | ||
| 659 | fscache_stat(&fscache_n_cop_drop_object); | 602 | object->oob_event_mask = 0; |
| 660 | cache->ops->drop_object(object); | ||
| 661 | fscache_stat_d(&fscache_n_cop_drop_object); | ||
| 662 | 603 | ||
| 663 | if (parent) { | 604 | fscache_stat(&fscache_n_cop_lookup_complete); |
| 664 | _debug("release parent OBJ%x {%d}", | 605 | object->cache->ops->lookup_complete(object); |
| 665 | parent->debug_id, parent->n_children); | 606 | fscache_stat_d(&fscache_n_cop_lookup_complete); |
| 666 | 607 | ||
| 667 | spin_lock(&parent->lock); | 608 | cookie = object->cookie; |
| 668 | parent->n_children--; | 609 | set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); |
| 669 | if (parent->n_children == 0) | 610 | if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) |
| 670 | fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); | 611 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); |
| 671 | spin_unlock(&parent->lock); | 612 | |
| 672 | object->parent = NULL; | 613 | fscache_done_parent_op(object); |
| 614 | return transit_to(KILL_OBJECT); | ||
| 615 | } | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Wait for completion of all active operations on this object and the death of | ||
| 619 | * all child objects of this object. | ||
| 620 | */ | ||
| 621 | static const struct fscache_state *fscache_kill_object(struct fscache_object *object, | ||
| 622 | int event) | ||
| 623 | { | ||
| 624 | _enter("{OBJ%x,%d,%d},%d", | ||
| 625 | object->debug_id, object->n_ops, object->n_children, event); | ||
| 626 | |||
| 627 | clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); | ||
| 628 | object->oob_event_mask = 0; | ||
| 629 | |||
| 630 | if (list_empty(&object->dependents) && | ||
| 631 | object->n_ops == 0 && | ||
| 632 | object->n_children == 0) | ||
| 633 | return transit_to(DROP_OBJECT); | ||
| 634 | |||
| 635 | if (object->n_in_progress == 0) { | ||
| 636 | spin_lock(&object->lock); | ||
| 637 | if (object->n_ops > 0 && object->n_in_progress == 0) | ||
| 638 | fscache_start_operations(object); | ||
| 639 | spin_unlock(&object->lock); | ||
| 673 | } | 640 | } |
| 674 | 641 | ||
| 675 | /* this just shifts the object release to the work processor */ | 642 | if (!list_empty(&object->dependents)) |
| 676 | fscache_put_object(object); | 643 | return transit_to(KILL_DEPENDENTS); |
| 677 | 644 | ||
| 678 | _leave(""); | 645 | return transit_to(WAIT_FOR_CLEARANCE); |
| 679 | } | 646 | } |
| 680 | 647 | ||
| 681 | /* | 648 | /* |
| 682 | * release or recycle an object that the netfs has discarded | 649 | * Kill dependent objects. |
| 683 | */ | 650 | */ |
| 684 | static void fscache_release_object(struct fscache_object *object) | 651 | static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, |
| 652 | int event) | ||
| 685 | { | 653 | { |
| 686 | _enter(""); | 654 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 687 | 655 | ||
| 688 | fscache_drop_object(object); | 656 | if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) |
| 657 | return NO_TRANSIT; /* Not finished */ | ||
| 658 | return transit_to(WAIT_FOR_CLEARANCE); | ||
| 689 | } | 659 | } |
| 690 | 660 | ||
| 691 | /* | 661 | /* |
| 692 | * withdraw an object from active service | 662 | * Drop an object's attachments |
| 693 | */ | 663 | */ |
| 694 | static void fscache_withdraw_object(struct fscache_object *object) | 664 | static const struct fscache_state *fscache_drop_object(struct fscache_object *object, |
| 665 | int event) | ||
| 695 | { | 666 | { |
| 696 | struct fscache_cookie *cookie; | 667 | struct fscache_object *parent = object->parent; |
| 697 | bool detached; | 668 | struct fscache_cookie *cookie = object->cookie; |
| 669 | struct fscache_cache *cache = object->cache; | ||
| 670 | bool awaken = false; | ||
| 698 | 671 | ||
| 699 | _enter(""); | 672 | _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); |
| 700 | 673 | ||
| 701 | spin_lock(&object->lock); | 674 | ASSERT(cookie != NULL); |
| 702 | cookie = object->cookie; | 675 | ASSERT(!hlist_unhashed(&object->cookie_link)); |
| 703 | if (cookie) { | ||
| 704 | /* need to get the cookie lock before the object lock, starting | ||
| 705 | * from the object pointer */ | ||
| 706 | atomic_inc(&cookie->usage); | ||
| 707 | spin_unlock(&object->lock); | ||
| 708 | 676 | ||
| 709 | detached = false; | 677 | /* Make sure the cookie no longer points here and that the netfs isn't |
| 710 | spin_lock(&cookie->lock); | 678 | * waiting for us. |
| 711 | spin_lock(&object->lock); | 679 | */ |
| 680 | spin_lock(&cookie->lock); | ||
| 681 | hlist_del_init(&object->cookie_link); | ||
| 682 | if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) | ||
| 683 | awaken = true; | ||
| 684 | spin_unlock(&cookie->lock); | ||
| 712 | 685 | ||
| 713 | if (object->cookie == cookie) { | 686 | if (awaken) |
| 714 | hlist_del_init(&object->cookie_link); | 687 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); |
| 715 | object->cookie = NULL; | ||
| 716 | fscache_invalidation_complete(cookie); | ||
| 717 | detached = true; | ||
| 718 | } | ||
| 719 | spin_unlock(&cookie->lock); | ||
| 720 | fscache_cookie_put(cookie); | ||
| 721 | if (detached) | ||
| 722 | fscache_cookie_put(cookie); | ||
| 723 | } | ||
| 724 | 688 | ||
| 689 | /* Prevent a race with our last child, which has to signal EV_CLEARED | ||
| 690 | * before dropping our spinlock. | ||
| 691 | */ | ||
| 692 | spin_lock(&object->lock); | ||
| 725 | spin_unlock(&object->lock); | 693 | spin_unlock(&object->lock); |
| 726 | 694 | ||
| 727 | fscache_drop_object(object); | 695 | /* Discard from the cache's collection of objects */ |
| 728 | } | 696 | spin_lock(&cache->object_list_lock); |
| 697 | list_del_init(&object->cache_link); | ||
| 698 | spin_unlock(&cache->object_list_lock); | ||
| 729 | 699 | ||
| 730 | /* | 700 | fscache_stat(&fscache_n_cop_drop_object); |
| 731 | * withdraw an object from active service at the behest of the cache | 701 | cache->ops->drop_object(object); |
| 732 | * - need break the links to a cached object cookie | 702 | fscache_stat_d(&fscache_n_cop_drop_object); |
| 733 | * - called under two situations: | ||
| 734 | * (1) recycler decides to reclaim an in-use object | ||
| 735 | * (2) a cache is unmounted | ||
| 736 | * - have to take care as the cookie can be being relinquished by the netfs | ||
| 737 | * simultaneously | ||
| 738 | * - the object is pinned by the caller holding a refcount on it | ||
| 739 | */ | ||
| 740 | void fscache_withdrawing_object(struct fscache_cache *cache, | ||
| 741 | struct fscache_object *object) | ||
| 742 | { | ||
| 743 | bool enqueue = false; | ||
| 744 | 703 | ||
| 745 | _enter(",OBJ%x", object->debug_id); | 704 | /* The parent object wants to know when all its dependents have gone */ |
| 705 | if (parent) { | ||
| 706 | _debug("release parent OBJ%x {%d}", | ||
| 707 | parent->debug_id, parent->n_children); | ||
| 746 | 708 | ||
| 747 | spin_lock(&object->lock); | 709 | spin_lock(&parent->lock); |
| 748 | if (object->state < FSCACHE_OBJECT_WITHDRAWING) { | 710 | parent->n_children--; |
| 749 | object->state = FSCACHE_OBJECT_WITHDRAWING; | 711 | if (parent->n_children == 0) |
| 750 | enqueue = true; | 712 | fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); |
| 713 | spin_unlock(&parent->lock); | ||
| 714 | object->parent = NULL; | ||
| 751 | } | 715 | } |
| 752 | spin_unlock(&object->lock); | ||
| 753 | 716 | ||
| 754 | if (enqueue) | 717 | /* this just shifts the object release to the work processor */ |
| 755 | fscache_enqueue_object(object); | 718 | fscache_put_object(object); |
| 719 | fscache_stat(&fscache_n_object_dead); | ||
| 756 | 720 | ||
| 757 | _leave(""); | 721 | _leave(""); |
| 722 | return transit_to(OBJECT_DEAD); | ||
| 758 | } | 723 | } |
| 759 | 724 | ||
| 760 | /* | 725 | /* |
| @@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object) | |||
| 771 | } | 736 | } |
| 772 | 737 | ||
| 773 | /* | 738 | /* |
| 774 | * discard a ref on a work item | 739 | * Discard a ref on an object |
| 775 | */ | 740 | */ |
| 776 | static void fscache_put_object(struct fscache_object *object) | 741 | static void fscache_put_object(struct fscache_object *object) |
| 777 | { | 742 | { |
| @@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object) | |||
| 780 | fscache_stat_d(&fscache_n_cop_put_object); | 745 | fscache_stat_d(&fscache_n_cop_put_object); |
| 781 | } | 746 | } |
| 782 | 747 | ||
| 748 | /** | ||
| 749 | * fscache_object_destroy - Note that a cache object is about to be destroyed | ||
| 750 | * @object: The object to be destroyed | ||
| 751 | * | ||
| 752 | * Note the imminent destruction and deallocation of a cache object record. | ||
| 753 | */ | ||
| 754 | void fscache_object_destroy(struct fscache_object *object) | ||
| 755 | { | ||
| 756 | fscache_objlist_remove(object); | ||
| 757 | |||
| 758 | /* We can get rid of the cookie now */ | ||
| 759 | fscache_cookie_put(object->cookie); | ||
| 760 | object->cookie = NULL; | ||
| 761 | } | ||
| 762 | EXPORT_SYMBOL(fscache_object_destroy); | ||
| 763 | |||
| 783 | /* | 764 | /* |
| 784 | * enqueue an object for metadata-type processing | 765 | * enqueue an object for metadata-type processing |
| 785 | */ | 766 | */ |
| @@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object) | |||
| 803 | 784 | ||
| 804 | /** | 785 | /** |
| 805 | * fscache_object_sleep_till_congested - Sleep until object wq is congested | 786 | * fscache_object_sleep_till_congested - Sleep until object wq is congested |
| 806 | * @timoutp: Scheduler sleep timeout | 787 | * @timeoutp: Scheduler sleep timeout |
| 807 | * | 788 | * |
| 808 | * Allow an object handler to sleep until the object workqueue is congested. | 789 | * Allow an object handler to sleep until the object workqueue is congested. |
| 809 | * | 790 | * |
| @@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp) | |||
| 831 | EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); | 812 | EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); |
| 832 | 813 | ||
| 833 | /* | 814 | /* |
| 834 | * enqueue the dependents of an object for metadata-type processing | 815 | * Enqueue the dependents of an object for metadata-type processing. |
| 835 | * - the caller must hold the object's lock | 816 | * |
| 836 | * - this may cause an already locked object to wind up being processed again | 817 | * If we don't manage to finish the list before the scheduler wants to run |
| 818 | * again then return false immediately. We return true if the list was | ||
| 819 | * cleared. | ||
| 837 | */ | 820 | */ |
| 838 | static void fscache_enqueue_dependents(struct fscache_object *object) | 821 | static bool fscache_enqueue_dependents(struct fscache_object *object, int event) |
| 839 | { | 822 | { |
| 840 | struct fscache_object *dep; | 823 | struct fscache_object *dep; |
| 824 | bool ret = true; | ||
| 841 | 825 | ||
| 842 | _enter("{OBJ%x}", object->debug_id); | 826 | _enter("{OBJ%x}", object->debug_id); |
| 843 | 827 | ||
| 844 | if (list_empty(&object->dependents)) | 828 | if (list_empty(&object->dependents)) |
| 845 | return; | 829 | return true; |
| 846 | 830 | ||
| 847 | spin_lock(&object->lock); | 831 | spin_lock(&object->lock); |
| 848 | 832 | ||
| @@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object) | |||
| 851 | struct fscache_object, dep_link); | 835 | struct fscache_object, dep_link); |
| 852 | list_del_init(&dep->dep_link); | 836 | list_del_init(&dep->dep_link); |
| 853 | 837 | ||
| 854 | 838 | fscache_raise_event(dep, event); | |
| 855 | /* sort onto appropriate lists */ | ||
| 856 | fscache_enqueue_object(dep); | ||
| 857 | fscache_put_object(dep); | 839 | fscache_put_object(dep); |
| 858 | 840 | ||
| 859 | if (!list_empty(&object->dependents)) | 841 | if (!list_empty(&object->dependents) && need_resched()) { |
| 860 | cond_resched_lock(&object->lock); | 842 | ret = false; |
| 843 | break; | ||
| 844 | } | ||
| 861 | } | 845 | } |
| 862 | 846 | ||
| 863 | spin_unlock(&object->lock); | 847 | spin_unlock(&object->lock); |
| 848 | return ret; | ||
| 864 | } | 849 | } |
| 865 | 850 | ||
| 866 | /* | 851 | /* |
| 867 | * remove an object from whatever queue it's waiting on | 852 | * remove an object from whatever queue it's waiting on |
| 868 | * - the caller must hold object->lock | ||
| 869 | */ | 853 | */ |
| 870 | void fscache_dequeue_object(struct fscache_object *object) | 854 | static void fscache_dequeue_object(struct fscache_object *object) |
| 871 | { | 855 | { |
| 872 | _enter("{OBJ%x}", object->debug_id); | 856 | _enter("{OBJ%x}", object->debug_id); |
| 873 | 857 | ||
| @@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object) | |||
| 886 | * @data: The auxiliary data for the object | 870 | * @data: The auxiliary data for the object |
| 887 | * @datalen: The size of the auxiliary data | 871 | * @datalen: The size of the auxiliary data |
| 888 | * | 872 | * |
| 889 | * This function consults the netfs about the coherency state of an object | 873 | * This function consults the netfs about the coherency state of an object. |
| 874 | * The caller must be holding a ref on cookie->n_active (held by | ||
| 875 | * fscache_look_up_object() on behalf of the cache backend during object lookup | ||
| 876 | * and creation). | ||
| 890 | */ | 877 | */ |
| 891 | enum fscache_checkaux fscache_check_aux(struct fscache_object *object, | 878 | enum fscache_checkaux fscache_check_aux(struct fscache_object *object, |
| 892 | const void *data, uint16_t datalen) | 879 | const void *data, uint16_t datalen) |
| @@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux); | |||
| 927 | /* | 914 | /* |
| 928 | * Asynchronously invalidate an object. | 915 | * Asynchronously invalidate an object. |
| 929 | */ | 916 | */ |
| 930 | static void fscache_invalidate_object(struct fscache_object *object) | 917 | static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, |
| 918 | int event) | ||
| 931 | { | 919 | { |
| 932 | struct fscache_operation *op; | 920 | struct fscache_operation *op; |
| 933 | struct fscache_cookie *cookie = object->cookie; | 921 | struct fscache_cookie *cookie = object->cookie; |
| 934 | 922 | ||
| 935 | _enter("{OBJ%x}", object->debug_id); | 923 | _enter("{OBJ%x},%d", object->debug_id, event); |
| 924 | |||
| 925 | /* We're going to need the cookie. If the cookie is not available then | ||
| 926 | * retire the object instead. | ||
| 927 | */ | ||
| 928 | if (!fscache_use_cookie(object)) { | ||
| 929 | ASSERT(object->cookie->stores.rnode == NULL); | ||
| 930 | set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); | ||
| 931 | _leave(" [no cookie]"); | ||
| 932 | return transit_to(KILL_OBJECT); | ||
| 933 | } | ||
| 936 | 934 | ||
| 937 | /* Reject any new read/write ops and abort any that are pending. */ | 935 | /* Reject any new read/write ops and abort any that are pending. */ |
| 938 | fscache_invalidate_writes(cookie); | 936 | fscache_invalidate_writes(cookie); |
| @@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object) | |||
| 941 | 939 | ||
| 942 | /* Now we have to wait for in-progress reads and writes */ | 940 | /* Now we have to wait for in-progress reads and writes */ |
| 943 | op = kzalloc(sizeof(*op), GFP_KERNEL); | 941 | op = kzalloc(sizeof(*op), GFP_KERNEL); |
| 944 | if (!op) { | 942 | if (!op) |
| 945 | fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); | 943 | goto nomem; |
| 946 | _leave(" [ENOMEM]"); | ||
| 947 | return; | ||
| 948 | } | ||
| 949 | 944 | ||
| 950 | fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); | 945 | fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); |
| 951 | op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); | 946 | op->flags = FSCACHE_OP_ASYNC | |
| 947 | (1 << FSCACHE_OP_EXCLUSIVE) | | ||
| 948 | (1 << FSCACHE_OP_UNUSE_COOKIE); | ||
| 952 | 949 | ||
| 953 | spin_lock(&cookie->lock); | 950 | spin_lock(&cookie->lock); |
| 954 | if (fscache_submit_exclusive_op(object, op) < 0) | 951 | if (fscache_submit_exclusive_op(object, op) < 0) |
| @@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object) | |||
| 965 | /* We can allow read and write requests to come in once again. They'll | 962 | /* We can allow read and write requests to come in once again. They'll |
| 966 | * queue up behind our exclusive invalidation operation. | 963 | * queue up behind our exclusive invalidation operation. |
| 967 | */ | 964 | */ |
| 968 | fscache_invalidation_complete(cookie); | 965 | if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) |
| 969 | _leave(""); | 966 | wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); |
| 970 | return; | 967 | _leave(" [ok]"); |
| 968 | return transit_to(UPDATE_OBJECT); | ||
| 969 | |||
| 970 | nomem: | ||
| 971 | clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); | ||
| 972 | fscache_unuse_cookie(object); | ||
| 973 | _leave(" [ENOMEM]"); | ||
| 974 | return transit_to(KILL_OBJECT); | ||
| 971 | 975 | ||
| 972 | submit_op_failed: | 976 | submit_op_failed: |
| 977 | clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); | ||
| 973 | spin_unlock(&cookie->lock); | 978 | spin_unlock(&cookie->lock); |
| 974 | kfree(op); | 979 | kfree(op); |
| 975 | fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); | ||
| 976 | _leave(" [EIO]"); | 980 | _leave(" [EIO]"); |
| 981 | return transit_to(KILL_OBJECT); | ||
| 982 | } | ||
| 983 | |||
| 984 | static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, | ||
| 985 | int event) | ||
| 986 | { | ||
| 987 | const struct fscache_state *s; | ||
| 988 | |||
| 989 | fscache_stat(&fscache_n_invalidates_run); | ||
| 990 | fscache_stat(&fscache_n_cop_invalidate_object); | ||
| 991 | s = _fscache_invalidate_object(object, event); | ||
| 992 | fscache_stat_d(&fscache_n_cop_invalidate_object); | ||
| 993 | return s; | ||
| 994 | } | ||
| 995 | |||
| 996 | /* | ||
| 997 | * Asynchronously update an object. | ||
| 998 | */ | ||
| 999 | static const struct fscache_state *fscache_update_object(struct fscache_object *object, | ||
| 1000 | int event) | ||
| 1001 | { | ||
| 1002 | _enter("{OBJ%x},%d", object->debug_id, event); | ||
| 1003 | |||
| 1004 | fscache_stat(&fscache_n_updates_run); | ||
| 1005 | fscache_stat(&fscache_n_cop_update_object); | ||
| 1006 | object->cache->ops->update_object(object); | ||
| 1007 | fscache_stat_d(&fscache_n_cop_update_object); | ||
| 1008 | |||
| 1009 | _leave(""); | ||
| 1010 | return transit_to(WAIT_FOR_CMD); | ||
| 977 | } | 1011 | } |
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 762a9ec4ffa4..318071aca217 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c | |||
| @@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) | |||
| 35 | 35 | ||
| 36 | ASSERT(list_empty(&op->pend_link)); | 36 | ASSERT(list_empty(&op->pend_link)); |
| 37 | ASSERT(op->processor != NULL); | 37 | ASSERT(op->processor != NULL); |
| 38 | ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); | 38 | ASSERT(fscache_object_is_available(op->object)); |
| 39 | ASSERTCMP(atomic_read(&op->usage), >, 0); | 39 | ASSERTCMP(atomic_read(&op->usage), >, 0); |
| 40 | ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); | 40 | ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); |
| 41 | 41 | ||
| @@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, | |||
| 119 | /* need to issue a new write op after this */ | 119 | /* need to issue a new write op after this */ |
| 120 | clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); | 120 | clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); |
| 121 | ret = 0; | 121 | ret = 0; |
| 122 | } else if (object->state == FSCACHE_OBJECT_CREATING) { | 122 | } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { |
| 123 | op->object = object; | 123 | op->object = object; |
| 124 | object->n_ops++; | 124 | object->n_ops++; |
| 125 | object->n_exclusive++; /* reads and writes must wait */ | 125 | object->n_exclusive++; /* reads and writes must wait */ |
| @@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, | |||
| 144 | */ | 144 | */ |
| 145 | static void fscache_report_unexpected_submission(struct fscache_object *object, | 145 | static void fscache_report_unexpected_submission(struct fscache_object *object, |
| 146 | struct fscache_operation *op, | 146 | struct fscache_operation *op, |
| 147 | unsigned long ostate) | 147 | const struct fscache_state *ostate) |
| 148 | { | 148 | { |
| 149 | static bool once_only; | 149 | static bool once_only; |
| 150 | struct fscache_operation *p; | 150 | struct fscache_operation *p; |
| @@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, | |||
| 155 | once_only = true; | 155 | once_only = true; |
| 156 | 156 | ||
| 157 | kdebug("unexpected submission OP%x [OBJ%x %s]", | 157 | kdebug("unexpected submission OP%x [OBJ%x %s]", |
| 158 | op->debug_id, object->debug_id, | 158 | op->debug_id, object->debug_id, object->state->name); |
| 159 | fscache_object_states[object->state]); | 159 | kdebug("objstate=%s [%s]", object->state->name, ostate->name); |
| 160 | kdebug("objstate=%s [%s]", | ||
| 161 | fscache_object_states[object->state], | ||
| 162 | fscache_object_states[ostate]); | ||
| 163 | kdebug("objflags=%lx", object->flags); | 160 | kdebug("objflags=%lx", object->flags); |
| 164 | kdebug("objevent=%lx [%lx]", object->events, object->event_mask); | 161 | kdebug("objevent=%lx [%lx]", object->events, object->event_mask); |
| 165 | kdebug("ops=%u inp=%u exc=%u", | 162 | kdebug("ops=%u inp=%u exc=%u", |
| @@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, | |||
| 190 | int fscache_submit_op(struct fscache_object *object, | 187 | int fscache_submit_op(struct fscache_object *object, |
| 191 | struct fscache_operation *op) | 188 | struct fscache_operation *op) |
| 192 | { | 189 | { |
| 193 | unsigned long ostate; | 190 | const struct fscache_state *ostate; |
| 194 | int ret; | 191 | int ret; |
| 195 | 192 | ||
| 196 | _enter("{OBJ%x OP%x},{%u}", | 193 | _enter("{OBJ%x OP%x},{%u}", |
| @@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object, | |||
| 226 | fscache_run_op(object, op); | 223 | fscache_run_op(object, op); |
| 227 | } | 224 | } |
| 228 | ret = 0; | 225 | ret = 0; |
| 229 | } else if (object->state == FSCACHE_OBJECT_CREATING) { | 226 | } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { |
| 230 | op->object = object; | 227 | op->object = object; |
| 231 | object->n_ops++; | 228 | object->n_ops++; |
| 232 | atomic_inc(&op->usage); | 229 | atomic_inc(&op->usage); |
| 233 | list_add_tail(&op->pend_link, &object->pending_ops); | 230 | list_add_tail(&op->pend_link, &object->pending_ops); |
| 234 | fscache_stat(&fscache_n_op_pend); | 231 | fscache_stat(&fscache_n_op_pend); |
| 235 | ret = 0; | 232 | ret = 0; |
| 236 | } else if (object->state == FSCACHE_OBJECT_DYING || | 233 | } else if (fscache_object_is_dying(object)) { |
| 237 | object->state == FSCACHE_OBJECT_LC_DYING || | ||
| 238 | object->state == FSCACHE_OBJECT_WITHDRAWING) { | ||
| 239 | fscache_stat(&fscache_n_op_rejected); | 234 | fscache_stat(&fscache_n_op_rejected); |
| 240 | op->state = FSCACHE_OP_ST_CANCELLED; | 235 | op->state = FSCACHE_OP_ST_CANCELLED; |
| 241 | ret = -ENOBUFS; | 236 | ret = -ENOBUFS; |
| @@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object) | |||
| 265 | } | 260 | } |
| 266 | 261 | ||
| 267 | /* | 262 | /* |
| 268 | * jump start the operation processing on an object | 263 | * Jump start the operation processing on an object. The caller must hold |
| 269 | * - caller must hold object->lock | 264 | * object->lock. |
| 270 | */ | 265 | */ |
| 271 | void fscache_start_operations(struct fscache_object *object) | 266 | void fscache_start_operations(struct fscache_object *object) |
| 272 | { | 267 | { |
| @@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op) | |||
| 428 | 423 | ||
| 429 | object = op->object; | 424 | object = op->object; |
| 430 | 425 | ||
| 431 | if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { | 426 | if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) |
| 432 | if (atomic_dec_and_test(&object->n_reads)) { | 427 | atomic_dec(&object->n_reads); |
| 433 | clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, | 428 | if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) |
| 434 | &object->cookie->flags); | 429 | fscache_unuse_cookie(object); |
| 435 | wake_up_bit(&object->cookie->flags, | ||
| 436 | FSCACHE_COOKIE_WAITING_ON_READS); | ||
| 437 | } | ||
| 438 | } | ||
| 439 | 430 | ||
| 440 | /* now... we may get called with the object spinlock held, so we | 431 | /* now... we may get called with the object spinlock held, so we |
| 441 | * complete the cleanup here only if we can immediately acquire the | 432 | * complete the cleanup here only if we can immediately acquire the |
diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ff000e52072d..d479ab3c63e4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c | |||
| @@ -109,7 +109,7 @@ page_busy: | |||
| 109 | * allocator as the work threads writing to the cache may all end up | 109 | * allocator as the work threads writing to the cache may all end up |
| 110 | * sleeping on memory allocation, so we may need to impose a timeout | 110 | * sleeping on memory allocation, so we may need to impose a timeout |
| 111 | * too. */ | 111 | * too. */ |
| 112 | if (!(gfp & __GFP_WAIT)) { | 112 | if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { |
| 113 | fscache_stat(&fscache_n_store_vmscan_busy); | 113 | fscache_stat(&fscache_n_store_vmscan_busy); |
| 114 | return false; | 114 | return false; |
| 115 | } | 115 | } |
| @@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op) | |||
| 163 | 163 | ||
| 164 | fscache_stat(&fscache_n_attr_changed_calls); | 164 | fscache_stat(&fscache_n_attr_changed_calls); |
| 165 | 165 | ||
| 166 | if (fscache_object_is_active(object)) { | 166 | if (fscache_object_is_active(object) && |
| 167 | fscache_use_cookie(object)) { | ||
| 167 | fscache_stat(&fscache_n_cop_attr_changed); | 168 | fscache_stat(&fscache_n_cop_attr_changed); |
| 168 | ret = object->cache->ops->attr_changed(object); | 169 | ret = object->cache->ops->attr_changed(object); |
| 169 | fscache_stat_d(&fscache_n_cop_attr_changed); | 170 | fscache_stat_d(&fscache_n_cop_attr_changed); |
| 171 | fscache_unuse_cookie(object); | ||
| 170 | if (ret < 0) | 172 | if (ret < 0) |
| 171 | fscache_abort_object(object); | 173 | fscache_abort_object(object); |
| 172 | } | 174 | } |
| @@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) | |||
| 233 | 235 | ||
| 234 | _enter("{OP%x}", op->op.debug_id); | 236 | _enter("{OP%x}", op->op.debug_id); |
| 235 | 237 | ||
| 236 | ASSERTCMP(op->n_pages, ==, 0); | 238 | ASSERTCMP(atomic_read(&op->n_pages), ==, 0); |
| 237 | 239 | ||
| 238 | fscache_hist(fscache_retrieval_histogram, op->start_time); | 240 | fscache_hist(fscache_retrieval_histogram, op->start_time); |
| 239 | if (op->context) | 241 | if (op->context) |
| @@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) | |||
| 246 | * allocate a retrieval op | 248 | * allocate a retrieval op |
| 247 | */ | 249 | */ |
| 248 | static struct fscache_retrieval *fscache_alloc_retrieval( | 250 | static struct fscache_retrieval *fscache_alloc_retrieval( |
| 251 | struct fscache_cookie *cookie, | ||
| 249 | struct address_space *mapping, | 252 | struct address_space *mapping, |
| 250 | fscache_rw_complete_t end_io_func, | 253 | fscache_rw_complete_t end_io_func, |
| 251 | void *context) | 254 | void *context) |
| @@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval( | |||
| 260 | } | 263 | } |
| 261 | 264 | ||
| 262 | fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); | 265 | fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); |
| 263 | op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); | 266 | atomic_inc(&cookie->n_active); |
| 267 | op->op.flags = FSCACHE_OP_MYTHREAD | | ||
| 268 | (1UL << FSCACHE_OP_WAITING) | | ||
| 269 | (1UL << FSCACHE_OP_UNUSE_COOKIE); | ||
| 264 | op->mapping = mapping; | 270 | op->mapping = mapping; |
| 265 | op->end_io_func = end_io_func; | 271 | op->end_io_func = end_io_func; |
| 266 | op->context = context; | 272 | op->context = context; |
| @@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op) | |||
| 310 | struct fscache_retrieval *op = | 316 | struct fscache_retrieval *op = |
| 311 | container_of(_op, struct fscache_retrieval, op); | 317 | container_of(_op, struct fscache_retrieval, op); |
| 312 | 318 | ||
| 313 | op->n_pages = 0; | 319 | atomic_set(&op->n_pages, 0); |
| 314 | } | 320 | } |
| 315 | 321 | ||
| 316 | /* | 322 | /* |
| @@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, | |||
| 394 | if (fscache_wait_for_deferred_lookup(cookie) < 0) | 400 | if (fscache_wait_for_deferred_lookup(cookie) < 0) |
| 395 | return -ERESTARTSYS; | 401 | return -ERESTARTSYS; |
| 396 | 402 | ||
| 397 | op = fscache_alloc_retrieval(page->mapping, end_io_func, context); | 403 | op = fscache_alloc_retrieval(cookie, page->mapping, |
| 404 | end_io_func,context); | ||
| 398 | if (!op) { | 405 | if (!op) { |
| 399 | _leave(" = -ENOMEM"); | 406 | _leave(" = -ENOMEM"); |
| 400 | return -ENOMEM; | 407 | return -ENOMEM; |
| 401 | } | 408 | } |
| 402 | op->n_pages = 1; | 409 | atomic_set(&op->n_pages, 1); |
| 403 | 410 | ||
| 404 | spin_lock(&cookie->lock); | 411 | spin_lock(&cookie->lock); |
| 405 | 412 | ||
| @@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, | |||
| 408 | object = hlist_entry(cookie->backing_objects.first, | 415 | object = hlist_entry(cookie->backing_objects.first, |
| 409 | struct fscache_object, cookie_link); | 416 | struct fscache_object, cookie_link); |
| 410 | 417 | ||
| 411 | ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); | 418 | ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); |
| 412 | 419 | ||
| 413 | atomic_inc(&object->n_reads); | 420 | atomic_inc(&object->n_reads); |
| 414 | __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); | 421 | __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); |
| @@ -465,6 +472,7 @@ nobufs_unlock_dec: | |||
| 465 | atomic_dec(&object->n_reads); | 472 | atomic_dec(&object->n_reads); |
| 466 | nobufs_unlock: | 473 | nobufs_unlock: |
| 467 | spin_unlock(&cookie->lock); | 474 | spin_unlock(&cookie->lock); |
| 475 | atomic_dec(&cookie->n_active); | ||
| 468 | kfree(op); | 476 | kfree(op); |
| 469 | nobufs: | 477 | nobufs: |
| 470 | fscache_stat(&fscache_n_retrievals_nobufs); | 478 | fscache_stat(&fscache_n_retrievals_nobufs); |
| @@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, | |||
| 522 | if (fscache_wait_for_deferred_lookup(cookie) < 0) | 530 | if (fscache_wait_for_deferred_lookup(cookie) < 0) |
| 523 | return -ERESTARTSYS; | 531 | return -ERESTARTSYS; |
| 524 | 532 | ||
| 525 | op = fscache_alloc_retrieval(mapping, end_io_func, context); | 533 | op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); |
| 526 | if (!op) | 534 | if (!op) |
| 527 | return -ENOMEM; | 535 | return -ENOMEM; |
| 528 | op->n_pages = *nr_pages; | 536 | atomic_set(&op->n_pages, *nr_pages); |
| 529 | 537 | ||
| 530 | spin_lock(&cookie->lock); | 538 | spin_lock(&cookie->lock); |
| 531 | 539 | ||
| @@ -589,6 +597,7 @@ nobufs_unlock_dec: | |||
| 589 | atomic_dec(&object->n_reads); | 597 | atomic_dec(&object->n_reads); |
| 590 | nobufs_unlock: | 598 | nobufs_unlock: |
| 591 | spin_unlock(&cookie->lock); | 599 | spin_unlock(&cookie->lock); |
| 600 | atomic_dec(&cookie->n_active); | ||
| 592 | kfree(op); | 601 | kfree(op); |
| 593 | nobufs: | 602 | nobufs: |
| 594 | fscache_stat(&fscache_n_retrievals_nobufs); | 603 | fscache_stat(&fscache_n_retrievals_nobufs); |
| @@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, | |||
| 631 | if (fscache_wait_for_deferred_lookup(cookie) < 0) | 640 | if (fscache_wait_for_deferred_lookup(cookie) < 0) |
| 632 | return -ERESTARTSYS; | 641 | return -ERESTARTSYS; |
| 633 | 642 | ||
| 634 | op = fscache_alloc_retrieval(page->mapping, NULL, NULL); | 643 | op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); |
| 635 | if (!op) | 644 | if (!op) |
| 636 | return -ENOMEM; | 645 | return -ENOMEM; |
| 637 | op->n_pages = 1; | 646 | atomic_set(&op->n_pages, 1); |
| 638 | 647 | ||
| 639 | spin_lock(&cookie->lock); | 648 | spin_lock(&cookie->lock); |
| 640 | 649 | ||
| @@ -675,6 +684,7 @@ error: | |||
| 675 | 684 | ||
| 676 | nobufs_unlock: | 685 | nobufs_unlock: |
| 677 | spin_unlock(&cookie->lock); | 686 | spin_unlock(&cookie->lock); |
| 687 | atomic_dec(&cookie->n_active); | ||
| 678 | kfree(op); | 688 | kfree(op); |
| 679 | nobufs: | 689 | nobufs: |
| 680 | fscache_stat(&fscache_n_allocs_nobufs); | 690 | fscache_stat(&fscache_n_allocs_nobufs); |
| @@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op) | |||
| 729 | */ | 739 | */ |
| 730 | spin_unlock(&object->lock); | 740 | spin_unlock(&object->lock); |
| 731 | fscache_op_complete(&op->op, false); | 741 | fscache_op_complete(&op->op, false); |
| 732 | _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", | 742 | _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", |
| 733 | _op->flags, _op->state, object->state, object->flags); | 743 | _op->flags, _op->state, object->state->short_name, |
| 744 | object->flags); | ||
| 734 | return; | 745 | return; |
| 735 | } | 746 | } |
| 736 | 747 | ||
| @@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) | |||
| 796 | 807 | ||
| 797 | _enter(""); | 808 | _enter(""); |
| 798 | 809 | ||
| 799 | while (spin_lock(&cookie->stores_lock), | 810 | for (;;) { |
| 800 | n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, | 811 | spin_lock(&cookie->stores_lock); |
| 801 | ARRAY_SIZE(results), | 812 | n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, |
| 802 | FSCACHE_COOKIE_PENDING_TAG), | 813 | ARRAY_SIZE(results), |
| 803 | n > 0) { | 814 | FSCACHE_COOKIE_PENDING_TAG); |
| 815 | if (n == 0) { | ||
| 816 | spin_unlock(&cookie->stores_lock); | ||
| 817 | break; | ||
| 818 | } | ||
| 819 | |||
| 804 | for (i = n - 1; i >= 0; i--) { | 820 | for (i = n - 1; i >= 0; i--) { |
| 805 | page = results[i]; | 821 | page = results[i]; |
| 806 | radix_tree_delete(&cookie->stores, page->index); | 822 | radix_tree_delete(&cookie->stores, page->index); |
| @@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) | |||
| 812 | page_cache_release(results[i]); | 828 | page_cache_release(results[i]); |
| 813 | } | 829 | } |
| 814 | 830 | ||
| 815 | spin_unlock(&cookie->stores_lock); | ||
| 816 | _leave(""); | 831 | _leave(""); |
| 817 | } | 832 | } |
| 818 | 833 | ||
| @@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) | |||
| 829 | * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is | 844 | * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is |
| 830 | * set) | 845 | * set) |
| 831 | * | 846 | * |
| 832 | * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred | 847 | * (a) no writes yet |
| 833 | * fill op) | ||
| 834 | * | 848 | * |
| 835 | * (b) writes deferred till post-creation (mark page for writing and | 849 | * (b) writes deferred till post-creation (mark page for writing and |
| 836 | * return immediately) | 850 | * return immediately) |
| 837 | * | 851 | * |
| 838 | * (2) negative lookup, object created, initial fill being made from netfs | 852 | * (2) negative lookup, object created, initial fill being made from netfs |
| 839 | * (FSCACHE_COOKIE_INITIAL_FILL is set) | ||
| 840 | * | 853 | * |
| 841 | * (a) fill point not yet reached this page (mark page for writing and | 854 | * (a) fill point not yet reached this page (mark page for writing and |
| 842 | * return) | 855 | * return) |
| @@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, | |||
| 873 | 886 | ||
| 874 | fscache_operation_init(&op->op, fscache_write_op, | 887 | fscache_operation_init(&op->op, fscache_write_op, |
| 875 | fscache_release_write_op); | 888 | fscache_release_write_op); |
| 876 | op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); | 889 | op->op.flags = FSCACHE_OP_ASYNC | |
| 890 | (1 << FSCACHE_OP_WAITING) | | ||
| 891 | (1 << FSCACHE_OP_UNUSE_COOKIE); | ||
| 877 | 892 | ||
| 878 | ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | 893 | ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); |
| 879 | if (ret < 0) | 894 | if (ret < 0) |
| @@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, | |||
| 919 | op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); | 934 | op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); |
| 920 | op->store_limit = object->store_limit; | 935 | op->store_limit = object->store_limit; |
| 921 | 936 | ||
| 937 | atomic_inc(&cookie->n_active); | ||
| 922 | if (fscache_submit_op(object, &op->op) < 0) | 938 | if (fscache_submit_op(object, &op->op) < 0) |
| 923 | goto submit_failed; | 939 | goto submit_failed; |
| 924 | 940 | ||
| @@ -945,6 +961,7 @@ already_pending: | |||
| 945 | return 0; | 961 | return 0; |
| 946 | 962 | ||
| 947 | submit_failed: | 963 | submit_failed: |
| 964 | atomic_dec(&cookie->n_active); | ||
| 948 | spin_lock(&cookie->stores_lock); | 965 | spin_lock(&cookie->stores_lock); |
| 949 | radix_tree_delete(&cookie->stores, page->index); | 966 | radix_tree_delete(&cookie->stores, page->index); |
| 950 | spin_unlock(&cookie->stores_lock); | 967 | spin_unlock(&cookie->stores_lock); |
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f3f783dc4f75..0eda52738ec4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | #include <linux/namei.h> | 14 | #include <linux/namei.h> |
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | 16 | ||
| 17 | static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) | 17 | static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) |
| 18 | { | 18 | { |
| 19 | struct fuse_conn *fc = get_fuse_conn(dir); | 19 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 20 | struct fuse_inode *fi = get_fuse_inode(dir); | 20 | struct fuse_inode *fi = get_fuse_inode(dir); |
| @@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) | |||
| 25 | return true; | 25 | return true; |
| 26 | if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) | 26 | if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) |
| 27 | return true; | 27 | return true; |
| 28 | if (filp->f_pos == 0) | 28 | if (ctx->pos == 0) |
| 29 | return true; | 29 | return true; |
| 30 | return false; | 30 | return false; |
| 31 | } | 31 | } |
| @@ -1165,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask) | |||
| 1165 | } | 1165 | } |
| 1166 | 1166 | ||
| 1167 | static int parse_dirfile(char *buf, size_t nbytes, struct file *file, | 1167 | static int parse_dirfile(char *buf, size_t nbytes, struct file *file, |
| 1168 | void *dstbuf, filldir_t filldir) | 1168 | struct dir_context *ctx) |
| 1169 | { | 1169 | { |
| 1170 | while (nbytes >= FUSE_NAME_OFFSET) { | 1170 | while (nbytes >= FUSE_NAME_OFFSET) { |
| 1171 | struct fuse_dirent *dirent = (struct fuse_dirent *) buf; | 1171 | struct fuse_dirent *dirent = (struct fuse_dirent *) buf; |
| 1172 | size_t reclen = FUSE_DIRENT_SIZE(dirent); | 1172 | size_t reclen = FUSE_DIRENT_SIZE(dirent); |
| 1173 | int over; | ||
| 1174 | if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) | 1173 | if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) |
| 1175 | return -EIO; | 1174 | return -EIO; |
| 1176 | if (reclen > nbytes) | 1175 | if (reclen > nbytes) |
| 1177 | break; | 1176 | break; |
| 1178 | 1177 | ||
| 1179 | over = filldir(dstbuf, dirent->name, dirent->namelen, | 1178 | if (!dir_emit(ctx, dirent->name, dirent->namelen, |
| 1180 | file->f_pos, dirent->ino, dirent->type); | 1179 | dirent->ino, dirent->type)) |
| 1181 | if (over) | ||
| 1182 | break; | 1180 | break; |
| 1183 | 1181 | ||
| 1184 | buf += reclen; | 1182 | buf += reclen; |
| 1185 | nbytes -= reclen; | 1183 | nbytes -= reclen; |
| 1186 | file->f_pos = dirent->off; | 1184 | ctx->pos = dirent->off; |
| 1187 | } | 1185 | } |
| 1188 | 1186 | ||
| 1189 | return 0; | 1187 | return 0; |
| @@ -1284,7 +1282,7 @@ out: | |||
| 1284 | } | 1282 | } |
| 1285 | 1283 | ||
| 1286 | static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, | 1284 | static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, |
| 1287 | void *dstbuf, filldir_t filldir, u64 attr_version) | 1285 | struct dir_context *ctx, u64 attr_version) |
| 1288 | { | 1286 | { |
| 1289 | struct fuse_direntplus *direntplus; | 1287 | struct fuse_direntplus *direntplus; |
| 1290 | struct fuse_dirent *dirent; | 1288 | struct fuse_dirent *dirent; |
| @@ -1309,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, | |||
| 1309 | we need to send a FORGET for each of those | 1307 | we need to send a FORGET for each of those |
| 1310 | which we did not link. | 1308 | which we did not link. |
| 1311 | */ | 1309 | */ |
| 1312 | over = filldir(dstbuf, dirent->name, dirent->namelen, | 1310 | over = !dir_emit(ctx, dirent->name, dirent->namelen, |
| 1313 | file->f_pos, dirent->ino, | 1311 | dirent->ino, dirent->type); |
| 1314 | dirent->type); | 1312 | ctx->pos = dirent->off; |
| 1315 | file->f_pos = dirent->off; | ||
| 1316 | } | 1313 | } |
| 1317 | 1314 | ||
| 1318 | buf += reclen; | 1315 | buf += reclen; |
| @@ -1326,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, | |||
| 1326 | return 0; | 1323 | return 0; |
| 1327 | } | 1324 | } |
| 1328 | 1325 | ||
| 1329 | static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | 1326 | static int fuse_readdir(struct file *file, struct dir_context *ctx) |
| 1330 | { | 1327 | { |
| 1331 | int plus, err; | 1328 | int plus, err; |
| 1332 | size_t nbytes; | 1329 | size_t nbytes; |
| @@ -1349,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | |||
| 1349 | return -ENOMEM; | 1346 | return -ENOMEM; |
| 1350 | } | 1347 | } |
| 1351 | 1348 | ||
| 1352 | plus = fuse_use_readdirplus(inode, file); | 1349 | plus = fuse_use_readdirplus(inode, ctx); |
| 1353 | req->out.argpages = 1; | 1350 | req->out.argpages = 1; |
| 1354 | req->num_pages = 1; | 1351 | req->num_pages = 1; |
| 1355 | req->pages[0] = page; | 1352 | req->pages[0] = page; |
| 1356 | req->page_descs[0].length = PAGE_SIZE; | 1353 | req->page_descs[0].length = PAGE_SIZE; |
| 1357 | if (plus) { | 1354 | if (plus) { |
| 1358 | attr_version = fuse_get_attr_version(fc); | 1355 | attr_version = fuse_get_attr_version(fc); |
| 1359 | fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, | 1356 | fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, |
| 1360 | FUSE_READDIRPLUS); | 1357 | FUSE_READDIRPLUS); |
| 1361 | } else { | 1358 | } else { |
| 1362 | fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, | 1359 | fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, |
| 1363 | FUSE_READDIR); | 1360 | FUSE_READDIR); |
| 1364 | } | 1361 | } |
| 1365 | fuse_request_send(fc, req); | 1362 | fuse_request_send(fc, req); |
| @@ -1369,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | |||
| 1369 | if (!err) { | 1366 | if (!err) { |
| 1370 | if (plus) { | 1367 | if (plus) { |
| 1371 | err = parse_dirplusfile(page_address(page), nbytes, | 1368 | err = parse_dirplusfile(page_address(page), nbytes, |
| 1372 | file, dstbuf, filldir, | 1369 | file, ctx, |
| 1373 | attr_version); | 1370 | attr_version); |
| 1374 | } else { | 1371 | } else { |
| 1375 | err = parse_dirfile(page_address(page), nbytes, file, | 1372 | err = parse_dirfile(page_address(page), nbytes, file, |
| 1376 | dstbuf, filldir); | 1373 | ctx); |
| 1377 | } | 1374 | } |
| 1378 | } | 1375 | } |
| 1379 | 1376 | ||
| @@ -1886,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = { | |||
| 1886 | static const struct file_operations fuse_dir_operations = { | 1883 | static const struct file_operations fuse_dir_operations = { |
| 1887 | .llseek = generic_file_llseek, | 1884 | .llseek = generic_file_llseek, |
| 1888 | .read = generic_read_dir, | 1885 | .read = generic_read_dir, |
| 1889 | .readdir = fuse_readdir, | 1886 | .iterate = fuse_readdir, |
| 1890 | .open = fuse_dir_open, | 1887 | .open = fuse_dir_open, |
| 1891 | .release = fuse_dir_release, | 1888 | .release = fuse_dir_release, |
| 1892 | .fsync = fuse_dir_fsync, | 1889 | .fsync = fuse_dir_fsync, |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e570081f9f76..35f281033142 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
| @@ -2470,13 +2470,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, | |||
| 2470 | .mode = mode | 2470 | .mode = mode |
| 2471 | }; | 2471 | }; |
| 2472 | int err; | 2472 | int err; |
| 2473 | bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || | ||
| 2474 | (mode & FALLOC_FL_PUNCH_HOLE); | ||
| 2473 | 2475 | ||
| 2474 | if (fc->no_fallocate) | 2476 | if (fc->no_fallocate) |
| 2475 | return -EOPNOTSUPP; | 2477 | return -EOPNOTSUPP; |
| 2476 | 2478 | ||
| 2477 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 2479 | if (lock_inode) { |
| 2478 | mutex_lock(&inode->i_mutex); | 2480 | mutex_lock(&inode->i_mutex); |
| 2479 | fuse_set_nowrite(inode); | 2481 | if (mode & FALLOC_FL_PUNCH_HOLE) |
| 2482 | fuse_set_nowrite(inode); | ||
| 2480 | } | 2483 | } |
| 2481 | 2484 | ||
| 2482 | req = fuse_get_req_nopages(fc); | 2485 | req = fuse_get_req_nopages(fc); |
| @@ -2511,8 +2514,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, | |||
| 2511 | fuse_invalidate_attr(inode); | 2514 | fuse_invalidate_attr(inode); |
| 2512 | 2515 | ||
| 2513 | out: | 2516 | out: |
| 2514 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 2517 | if (lock_inode) { |
| 2515 | fuse_release_nowrite(inode); | 2518 | if (mode & FALLOC_FL_PUNCH_HOLE) |
| 2519 | fuse_release_nowrite(inode); | ||
| 2516 | mutex_unlock(&inode->i_mutex); | 2520 | mutex_unlock(&inode->i_mutex); |
| 2517 | } | 2521 | } |
| 2518 | 2522 | ||
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 5a376ab81feb..90c6a8faaecb 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
| @@ -20,13 +20,12 @@ config GFS2_FS | |||
| 20 | be found here: http://sources.redhat.com/cluster | 20 | be found here: http://sources.redhat.com/cluster |
| 21 | 21 | ||
| 22 | The "nolock" lock module is now built in to GFS2 by default. If | 22 | The "nolock" lock module is now built in to GFS2 by default. If |
| 23 | you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 | 23 | you want to use the DLM, be sure to enable IPv4/6 networking. |
| 24 | networking. | ||
| 25 | 24 | ||
| 26 | config GFS2_FS_LOCKING_DLM | 25 | config GFS2_FS_LOCKING_DLM |
| 27 | bool "GFS2 DLM locking" | 26 | bool "GFS2 DLM locking" |
| 28 | depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ | 27 | depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ |
| 29 | HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) | 28 | CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) |
| 30 | help | 29 | help |
| 31 | Multiple node locking module for GFS2 | 30 | Multiple node locking module for GFS2 |
| 32 | 31 | ||
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0bad69ed6336..ee48ad37d9c0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
| @@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page, | |||
| 110 | /* Is the page fully outside i_size? (truncate in progress) */ | 110 | /* Is the page fully outside i_size? (truncate in progress) */ |
| 111 | offset = i_size & (PAGE_CACHE_SIZE-1); | 111 | offset = i_size & (PAGE_CACHE_SIZE-1); |
| 112 | if (page->index > end_index || (page->index == end_index && !offset)) { | 112 | if (page->index > end_index || (page->index == end_index && !offset)) { |
| 113 | page->mapping->a_ops->invalidatepage(page, 0); | 113 | page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 114 | goto out; | 114 | goto out; |
| 115 | } | 115 | } |
| 116 | return 1; | 116 | return 1; |
| @@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, | |||
| 299 | 299 | ||
| 300 | /* Is the page fully outside i_size? (truncate in progress) */ | 300 | /* Is the page fully outside i_size? (truncate in progress) */ |
| 301 | if (page->index > end_index || (page->index == end_index && !offset)) { | 301 | if (page->index > end_index || (page->index == end_index && !offset)) { |
| 302 | page->mapping->a_ops->invalidatepage(page, 0); | 302 | page->mapping->a_ops->invalidatepage(page, 0, |
| 303 | PAGE_CACHE_SIZE); | ||
| 303 | unlock_page(page); | 304 | unlock_page(page); |
| 304 | continue; | 305 | continue; |
| 305 | } | 306 | } |
| @@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) | |||
| 943 | unlock_buffer(bh); | 944 | unlock_buffer(bh); |
| 944 | } | 945 | } |
| 945 | 946 | ||
| 946 | static void gfs2_invalidatepage(struct page *page, unsigned long offset) | 947 | static void gfs2_invalidatepage(struct page *page, unsigned int offset, |
| 948 | unsigned int length) | ||
| 947 | { | 949 | { |
| 948 | struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); | 950 | struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); |
| 951 | unsigned int stop = offset + length; | ||
| 952 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
| 949 | struct buffer_head *bh, *head; | 953 | struct buffer_head *bh, *head; |
| 950 | unsigned long pos = 0; | 954 | unsigned long pos = 0; |
| 951 | 955 | ||
| 952 | BUG_ON(!PageLocked(page)); | 956 | BUG_ON(!PageLocked(page)); |
| 953 | if (offset == 0) | 957 | if (!partial_page) |
| 954 | ClearPageChecked(page); | 958 | ClearPageChecked(page); |
| 955 | if (!page_has_buffers(page)) | 959 | if (!page_has_buffers(page)) |
| 956 | goto out; | 960 | goto out; |
| 957 | 961 | ||
| 958 | bh = head = page_buffers(page); | 962 | bh = head = page_buffers(page); |
| 959 | do { | 963 | do { |
| 964 | if (pos + bh->b_size > stop) | ||
| 965 | return; | ||
| 966 | |||
| 960 | if (offset <= pos) | 967 | if (offset <= pos) |
| 961 | gfs2_discard(sdp, bh); | 968 | gfs2_discard(sdp, bh); |
| 962 | pos += bh->b_size; | 969 | pos += bh->b_size; |
| 963 | bh = bh->b_this_page; | 970 | bh = bh->b_this_page; |
| 964 | } while (bh != head); | 971 | } while (bh != head); |
| 965 | out: | 972 | out: |
| 966 | if (offset == 0) | 973 | if (!partial_page) |
| 967 | try_to_release_page(page, 0); | 974 | try_to_release_page(page, 0); |
| 968 | } | 975 | } |
| 969 | 976 | ||
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 93b5809c20bb..5e2f56fccf6b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size) | |||
| 1232 | unstuff = 1; | 1232 | unstuff = 1; |
| 1233 | } | 1233 | } |
| 1234 | 1234 | ||
| 1235 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); | 1235 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + |
| 1236 | (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? | ||
| 1237 | 0 : RES_QUOTA), 0); | ||
| 1236 | if (error) | 1238 | if (error) |
| 1237 | goto do_grow_release; | 1239 | goto do_grow_release; |
| 1238 | 1240 | ||
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b631c9043460..0cb4c1557f20 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
| @@ -1125,13 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
| 1125 | if (IS_ERR(hc)) | 1125 | if (IS_ERR(hc)) |
| 1126 | return PTR_ERR(hc); | 1126 | return PTR_ERR(hc); |
| 1127 | 1127 | ||
| 1128 | h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); | 1128 | hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); |
| 1129 | if (hc2 == NULL) | 1129 | if (hc2 == NULL) |
| 1130 | hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); | 1130 | hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); |
| 1131 | 1131 | ||
| 1132 | if (!hc2) | 1132 | if (!hc2) |
| 1133 | return -ENOMEM; | 1133 | return -ENOMEM; |
| 1134 | 1134 | ||
| 1135 | h = hc2; | ||
| 1135 | error = gfs2_meta_inode_buffer(dip, &dibh); | 1136 | error = gfs2_meta_inode_buffer(dip, &dibh); |
| 1136 | if (error) | 1137 | if (error) |
| 1137 | goto out_kfree; | 1138 | goto out_kfree; |
| @@ -1212,9 +1213,7 @@ static int compare_dents(const void *a, const void *b) | |||
| 1212 | /** | 1213 | /** |
| 1213 | * do_filldir_main - read out directory entries | 1214 | * do_filldir_main - read out directory entries |
| 1214 | * @dip: The GFS2 inode | 1215 | * @dip: The GFS2 inode |
| 1215 | * @offset: The offset in the file to read from | 1216 | * @ctx: what to feed the entries to |
| 1216 | * @opaque: opaque data to pass to filldir | ||
| 1217 | * @filldir: The function to pass entries to | ||
| 1218 | * @darr: an array of struct gfs2_dirent pointers to read | 1217 | * @darr: an array of struct gfs2_dirent pointers to read |
| 1219 | * @entries: the number of entries in darr | 1218 | * @entries: the number of entries in darr |
| 1220 | * @copied: pointer to int that's non-zero if a entry has been copied out | 1219 | * @copied: pointer to int that's non-zero if a entry has been copied out |
| @@ -1224,11 +1223,10 @@ static int compare_dents(const void *a, const void *b) | |||
| 1224 | * the possibility that they will fall into different readdir buffers or | 1223 | * the possibility that they will fall into different readdir buffers or |
| 1225 | * that someone will want to seek to that location. | 1224 | * that someone will want to seek to that location. |
| 1226 | * | 1225 | * |
| 1227 | * Returns: errno, >0 on exception from filldir | 1226 | * Returns: errno, >0 if the actor tells you to stop |
| 1228 | */ | 1227 | */ |
| 1229 | 1228 | ||
| 1230 | static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, | 1229 | static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, |
| 1231 | void *opaque, filldir_t filldir, | ||
| 1232 | const struct gfs2_dirent **darr, u32 entries, | 1230 | const struct gfs2_dirent **darr, u32 entries, |
| 1233 | int *copied) | 1231 | int *copied) |
| 1234 | { | 1232 | { |
| @@ -1236,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, | |||
| 1236 | u64 off, off_next; | 1234 | u64 off, off_next; |
| 1237 | unsigned int x, y; | 1235 | unsigned int x, y; |
| 1238 | int run = 0; | 1236 | int run = 0; |
| 1239 | int error = 0; | ||
| 1240 | 1237 | ||
| 1241 | sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); | 1238 | sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); |
| 1242 | 1239 | ||
| @@ -1253,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, | |||
| 1253 | off_next = be32_to_cpu(dent_next->de_hash); | 1250 | off_next = be32_to_cpu(dent_next->de_hash); |
| 1254 | off_next = gfs2_disk_hash2offset(off_next); | 1251 | off_next = gfs2_disk_hash2offset(off_next); |
| 1255 | 1252 | ||
| 1256 | if (off < *offset) | 1253 | if (off < ctx->pos) |
| 1257 | continue; | 1254 | continue; |
| 1258 | *offset = off; | 1255 | ctx->pos = off; |
| 1259 | 1256 | ||
| 1260 | if (off_next == off) { | 1257 | if (off_next == off) { |
| 1261 | if (*copied && !run) | 1258 | if (*copied && !run) |
| @@ -1264,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, | |||
| 1264 | } else | 1261 | } else |
| 1265 | run = 0; | 1262 | run = 0; |
| 1266 | } else { | 1263 | } else { |
| 1267 | if (off < *offset) | 1264 | if (off < ctx->pos) |
| 1268 | continue; | 1265 | continue; |
| 1269 | *offset = off; | 1266 | ctx->pos = off; |
| 1270 | } | 1267 | } |
| 1271 | 1268 | ||
| 1272 | error = filldir(opaque, (const char *)(dent + 1), | 1269 | if (!dir_emit(ctx, (const char *)(dent + 1), |
| 1273 | be16_to_cpu(dent->de_name_len), | 1270 | be16_to_cpu(dent->de_name_len), |
| 1274 | off, be64_to_cpu(dent->de_inum.no_addr), | 1271 | be64_to_cpu(dent->de_inum.no_addr), |
| 1275 | be16_to_cpu(dent->de_type)); | 1272 | be16_to_cpu(dent->de_type))) |
| 1276 | if (error) | ||
| 1277 | return 1; | 1273 | return 1; |
| 1278 | 1274 | ||
| 1279 | *copied = 1; | 1275 | *copied = 1; |
| 1280 | } | 1276 | } |
| 1281 | 1277 | ||
| 1282 | /* Increment the *offset by one, so the next time we come into the | 1278 | /* Increment the ctx->pos by one, so the next time we come into the |
| 1283 | do_filldir fxn, we get the next entry instead of the last one in the | 1279 | do_filldir fxn, we get the next entry instead of the last one in the |
| 1284 | current leaf */ | 1280 | current leaf */ |
| 1285 | 1281 | ||
| 1286 | (*offset)++; | 1282 | ctx->pos++; |
| 1287 | 1283 | ||
| 1288 | return 0; | 1284 | return 0; |
| 1289 | } | 1285 | } |
| @@ -1307,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr) | |||
| 1307 | kfree(ptr); | 1303 | kfree(ptr); |
| 1308 | } | 1304 | } |
| 1309 | 1305 | ||
| 1310 | static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, | 1306 | static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, |
| 1311 | filldir_t filldir, int *copied, unsigned *depth, | 1307 | int *copied, unsigned *depth, |
| 1312 | u64 leaf_no) | 1308 | u64 leaf_no) |
| 1313 | { | 1309 | { |
| 1314 | struct gfs2_inode *ip = GFS2_I(inode); | 1310 | struct gfs2_inode *ip = GFS2_I(inode); |
| @@ -1386,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, | |||
| 1386 | } while(lfn); | 1382 | } while(lfn); |
| 1387 | 1383 | ||
| 1388 | BUG_ON(entries2 != entries); | 1384 | BUG_ON(entries2 != entries); |
| 1389 | error = do_filldir_main(ip, offset, opaque, filldir, darr, | 1385 | error = do_filldir_main(ip, ctx, darr, entries, copied); |
| 1390 | entries, copied); | ||
| 1391 | out_free: | 1386 | out_free: |
| 1392 | for(i = 0; i < leaf; i++) | 1387 | for(i = 0; i < leaf; i++) |
| 1393 | brelse(larr[i]); | 1388 | brelse(larr[i]); |
| @@ -1446,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, | |||
| 1446 | /** | 1441 | /** |
| 1447 | * dir_e_read - Reads the entries from a directory into a filldir buffer | 1442 | * dir_e_read - Reads the entries from a directory into a filldir buffer |
| 1448 | * @dip: dinode pointer | 1443 | * @dip: dinode pointer |
| 1449 | * @offset: the hash of the last entry read shifted to the right once | 1444 | * @ctx: actor to feed the entries to |
| 1450 | * @opaque: buffer for the filldir function to fill | ||
| 1451 | * @filldir: points to the filldir function to use | ||
| 1452 | * | 1445 | * |
| 1453 | * Returns: errno | 1446 | * Returns: errno |
| 1454 | */ | 1447 | */ |
| 1455 | 1448 | ||
| 1456 | static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | 1449 | static int dir_e_read(struct inode *inode, struct dir_context *ctx, |
| 1457 | filldir_t filldir, struct file_ra_state *f_ra) | 1450 | struct file_ra_state *f_ra) |
| 1458 | { | 1451 | { |
| 1459 | struct gfs2_inode *dip = GFS2_I(inode); | 1452 | struct gfs2_inode *dip = GFS2_I(inode); |
| 1460 | u32 hsize, len = 0; | 1453 | u32 hsize, len = 0; |
| @@ -1465,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1465 | unsigned depth = 0; | 1458 | unsigned depth = 0; |
| 1466 | 1459 | ||
| 1467 | hsize = 1 << dip->i_depth; | 1460 | hsize = 1 << dip->i_depth; |
| 1468 | hash = gfs2_dir_offset2hash(*offset); | 1461 | hash = gfs2_dir_offset2hash(ctx->pos); |
| 1469 | index = hash >> (32 - dip->i_depth); | 1462 | index = hash >> (32 - dip->i_depth); |
| 1470 | 1463 | ||
| 1471 | if (dip->i_hash_cache == NULL) | 1464 | if (dip->i_hash_cache == NULL) |
| @@ -1477,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1477 | gfs2_dir_readahead(inode, hsize, index, f_ra); | 1470 | gfs2_dir_readahead(inode, hsize, index, f_ra); |
| 1478 | 1471 | ||
| 1479 | while (index < hsize) { | 1472 | while (index < hsize) { |
| 1480 | error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, | 1473 | error = gfs2_dir_read_leaf(inode, ctx, |
| 1481 | &copied, &depth, | 1474 | &copied, &depth, |
| 1482 | be64_to_cpu(lp[index])); | 1475 | be64_to_cpu(lp[index])); |
| 1483 | if (error) | 1476 | if (error) |
| @@ -1492,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1492 | return error; | 1485 | return error; |
| 1493 | } | 1486 | } |
| 1494 | 1487 | ||
| 1495 | int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | 1488 | int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, |
| 1496 | filldir_t filldir, struct file_ra_state *f_ra) | 1489 | struct file_ra_state *f_ra) |
| 1497 | { | 1490 | { |
| 1498 | struct gfs2_inode *dip = GFS2_I(inode); | 1491 | struct gfs2_inode *dip = GFS2_I(inode); |
| 1499 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 1492 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
| @@ -1507,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1507 | return 0; | 1500 | return 0; |
| 1508 | 1501 | ||
| 1509 | if (dip->i_diskflags & GFS2_DIF_EXHASH) | 1502 | if (dip->i_diskflags & GFS2_DIF_EXHASH) |
| 1510 | return dir_e_read(inode, offset, opaque, filldir, f_ra); | 1503 | return dir_e_read(inode, ctx, f_ra); |
| 1511 | 1504 | ||
| 1512 | if (!gfs2_is_stuffed(dip)) { | 1505 | if (!gfs2_is_stuffed(dip)) { |
| 1513 | gfs2_consist_inode(dip); | 1506 | gfs2_consist_inode(dip); |
| @@ -1539,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1539 | error = -EIO; | 1532 | error = -EIO; |
| 1540 | goto out; | 1533 | goto out; |
| 1541 | } | 1534 | } |
| 1542 | error = do_filldir_main(dip, offset, opaque, filldir, darr, | 1535 | error = do_filldir_main(dip, ctx, darr, |
| 1543 | dip->i_entries, &copied); | 1536 | dip->i_entries, &copied); |
| 1544 | out: | 1537 | out: |
| 1545 | kfree(darr); | 1538 | kfree(darr); |
| @@ -1555,9 +1548,9 @@ out: | |||
| 1555 | 1548 | ||
| 1556 | /** | 1549 | /** |
| 1557 | * gfs2_dir_search - Search a directory | 1550 | * gfs2_dir_search - Search a directory |
| 1558 | * @dip: The GFS2 inode | 1551 | * @dip: The GFS2 dir inode |
| 1559 | * @filename: | 1552 | * @name: The name we are looking up |
| 1560 | * @inode: | 1553 | * @fail_on_exist: Fail if the name exists rather than looking it up |
| 1561 | * | 1554 | * |
| 1562 | * This routine searches a directory for a file or another directory. | 1555 | * This routine searches a directory for a file or another directory. |
| 1563 | * Assumes a glock is held on dip. | 1556 | * Assumes a glock is held on dip. |
| @@ -1565,22 +1558,25 @@ out: | |||
| 1565 | * Returns: errno | 1558 | * Returns: errno |
| 1566 | */ | 1559 | */ |
| 1567 | 1560 | ||
| 1568 | struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) | 1561 | struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, |
| 1562 | bool fail_on_exist) | ||
| 1569 | { | 1563 | { |
| 1570 | struct buffer_head *bh; | 1564 | struct buffer_head *bh; |
| 1571 | struct gfs2_dirent *dent; | 1565 | struct gfs2_dirent *dent; |
| 1572 | struct inode *inode; | 1566 | u64 addr, formal_ino; |
| 1567 | u16 dtype; | ||
| 1573 | 1568 | ||
| 1574 | dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); | 1569 | dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); |
| 1575 | if (dent) { | 1570 | if (dent) { |
| 1576 | if (IS_ERR(dent)) | 1571 | if (IS_ERR(dent)) |
| 1577 | return ERR_CAST(dent); | 1572 | return ERR_CAST(dent); |
| 1578 | inode = gfs2_inode_lookup(dir->i_sb, | 1573 | dtype = be16_to_cpu(dent->de_type); |
| 1579 | be16_to_cpu(dent->de_type), | 1574 | addr = be64_to_cpu(dent->de_inum.no_addr); |
| 1580 | be64_to_cpu(dent->de_inum.no_addr), | 1575 | formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); |
| 1581 | be64_to_cpu(dent->de_inum.no_formal_ino), 0); | ||
| 1582 | brelse(bh); | 1576 | brelse(bh); |
| 1583 | return inode; | 1577 | if (fail_on_exist) |
| 1578 | return ERR_PTR(-EEXIST); | ||
| 1579 | return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); | ||
| 1584 | } | 1580 | } |
| 1585 | return ERR_PTR(-ENOENT); | 1581 | return ERR_PTR(-ENOENT); |
| 1586 | } | 1582 | } |
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 98c960beab35..4f03bbd1873f 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h | |||
| @@ -18,14 +18,15 @@ struct gfs2_inode; | |||
| 18 | struct gfs2_inum; | 18 | struct gfs2_inum; |
| 19 | 19 | ||
| 20 | extern struct inode *gfs2_dir_search(struct inode *dir, | 20 | extern struct inode *gfs2_dir_search(struct inode *dir, |
| 21 | const struct qstr *filename); | 21 | const struct qstr *filename, |
| 22 | bool fail_on_exist); | ||
| 22 | extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, | 23 | extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, |
| 23 | const struct gfs2_inode *ip); | 24 | const struct gfs2_inode *ip); |
| 24 | extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, | 25 | extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, |
| 25 | const struct gfs2_inode *ip); | 26 | const struct gfs2_inode *ip); |
| 26 | extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); | 27 | extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); |
| 27 | extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | 28 | extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, |
| 28 | filldir_t filldir, struct file_ra_state *f_ra); | 29 | struct file_ra_state *f_ra); |
| 29 | extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, | 30 | extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, |
| 30 | const struct gfs2_inode *nip, unsigned int new_type); | 31 | const struct gfs2_inode *nip, unsigned int new_type); |
| 31 | 32 | ||
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9973df4ff565..8b9b3775e2e7 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c | |||
| @@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, | |||
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | struct get_name_filldir { | 66 | struct get_name_filldir { |
| 67 | struct dir_context ctx; | ||
| 67 | struct gfs2_inum_host inum; | 68 | struct gfs2_inum_host inum; |
| 68 | char *name; | 69 | char *name; |
| 69 | }; | 70 | }; |
| @@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name, | |||
| 88 | struct inode *dir = parent->d_inode; | 89 | struct inode *dir = parent->d_inode; |
| 89 | struct inode *inode = child->d_inode; | 90 | struct inode *inode = child->d_inode; |
| 90 | struct gfs2_inode *dip, *ip; | 91 | struct gfs2_inode *dip, *ip; |
| 91 | struct get_name_filldir gnfd; | 92 | struct get_name_filldir gnfd = { |
| 93 | .ctx.actor = get_name_filldir, | ||
| 94 | .name = name | ||
| 95 | }; | ||
| 92 | struct gfs2_holder gh; | 96 | struct gfs2_holder gh; |
| 93 | u64 offset = 0; | ||
| 94 | int error; | 97 | int error; |
| 95 | struct file_ra_state f_ra = { .start = 0 }; | 98 | struct file_ra_state f_ra = { .start = 0 }; |
| 96 | 99 | ||
| @@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name, | |||
| 106 | *name = 0; | 109 | *name = 0; |
| 107 | gnfd.inum.no_addr = ip->i_no_addr; | 110 | gnfd.inum.no_addr = ip->i_no_addr; |
| 108 | gnfd.inum.no_formal_ino = ip->i_no_formal_ino; | 111 | gnfd.inum.no_formal_ino = ip->i_no_formal_ino; |
| 109 | gnfd.name = name; | ||
| 110 | 112 | ||
| 111 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); | 113 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); |
| 112 | if (error) | 114 | if (error) |
| 113 | return error; | 115 | return error; |
| 114 | 116 | ||
| 115 | error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); | 117 | error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); |
| 116 | 118 | ||
| 117 | gfs2_glock_dq_uninit(&gh); | 119 | gfs2_glock_dq_uninit(&gh); |
| 118 | 120 | ||
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ad0dc38d87ab..f99f9e8a325f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
| @@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | /** | 84 | /** |
| 85 | * gfs2_readdir - Read directory entries from a directory | 85 | * gfs2_readdir - Iterator for a directory |
| 86 | * @file: The directory to read from | 86 | * @file: The directory to read from |
| 87 | * @dirent: Buffer for dirents | 87 | * @ctx: What to feed directory entries to |
| 88 | * @filldir: Function used to do the copying | ||
| 89 | * | 88 | * |
| 90 | * Returns: errno | 89 | * Returns: errno |
| 91 | */ | 90 | */ |
| 92 | 91 | ||
| 93 | static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) | 92 | static int gfs2_readdir(struct file *file, struct dir_context *ctx) |
| 94 | { | 93 | { |
| 95 | struct inode *dir = file->f_mapping->host; | 94 | struct inode *dir = file->f_mapping->host; |
| 96 | struct gfs2_inode *dip = GFS2_I(dir); | 95 | struct gfs2_inode *dip = GFS2_I(dir); |
| 97 | struct gfs2_holder d_gh; | 96 | struct gfs2_holder d_gh; |
| 98 | u64 offset = file->f_pos; | ||
| 99 | int error; | 97 | int error; |
| 100 | 98 | ||
| 101 | gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); | 99 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); |
| 102 | error = gfs2_glock_nq(&d_gh); | 100 | if (error) |
| 103 | if (error) { | ||
| 104 | gfs2_holder_uninit(&d_gh); | ||
| 105 | return error; | 101 | return error; |
| 106 | } | ||
| 107 | 102 | ||
| 108 | error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); | 103 | error = gfs2_dir_read(dir, ctx, &file->f_ra); |
| 109 | 104 | ||
| 110 | gfs2_glock_dq_uninit(&d_gh); | 105 | gfs2_glock_dq_uninit(&d_gh); |
| 111 | 106 | ||
| 112 | file->f_pos = offset; | ||
| 113 | |||
| 114 | return error; | 107 | return error; |
| 115 | } | 108 | } |
| 116 | 109 | ||
| @@ -538,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 538 | } | 531 | } |
| 539 | 532 | ||
| 540 | /** | 533 | /** |
| 541 | * gfs2_open - open a file | 534 | * gfs2_open_common - This is common to open and atomic_open |
| 542 | * @inode: the inode to open | 535 | * @inode: The inode being opened |
| 543 | * @file: the struct file for this opening | 536 | * @file: The file being opened |
| 544 | * | 537 | * |
| 545 | * Returns: errno | 538 | * This maybe called under a glock or not depending upon how it has |
| 539 | * been called. We must always be called under a glock for regular | ||
| 540 | * files, however. For other file types, it does not matter whether | ||
| 541 | * we hold the glock or not. | ||
| 542 | * | ||
| 543 | * Returns: Error code or 0 for success | ||
| 546 | */ | 544 | */ |
| 547 | 545 | ||
| 548 | static int gfs2_open(struct inode *inode, struct file *file) | 546 | int gfs2_open_common(struct inode *inode, struct file *file) |
| 549 | { | 547 | { |
| 550 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 551 | struct gfs2_holder i_gh; | ||
| 552 | struct gfs2_file *fp; | 548 | struct gfs2_file *fp; |
| 553 | int error; | 549 | int ret; |
| 550 | |||
| 551 | if (S_ISREG(inode->i_mode)) { | ||
| 552 | ret = generic_file_open(inode, file); | ||
| 553 | if (ret) | ||
| 554 | return ret; | ||
| 555 | } | ||
| 554 | 556 | ||
| 555 | fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); | 557 | fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); |
| 556 | if (!fp) | 558 | if (!fp) |
| 557 | return -ENOMEM; | 559 | return -ENOMEM; |
| 558 | 560 | ||
| @@ -560,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file) | |||
| 560 | 562 | ||
| 561 | gfs2_assert_warn(GFS2_SB(inode), !file->private_data); | 563 | gfs2_assert_warn(GFS2_SB(inode), !file->private_data); |
| 562 | file->private_data = fp; | 564 | file->private_data = fp; |
| 565 | return 0; | ||
| 566 | } | ||
| 567 | |||
| 568 | /** | ||
| 569 | * gfs2_open - open a file | ||
| 570 | * @inode: the inode to open | ||
| 571 | * @file: the struct file for this opening | ||
| 572 | * | ||
| 573 | * After atomic_open, this function is only used for opening files | ||
| 574 | * which are already cached. We must still get the glock for regular | ||
| 575 | * files to ensure that we have the file size uptodate for the large | ||
| 576 | * file check which is in the common code. That is only an issue for | ||
| 577 | * regular files though. | ||
| 578 | * | ||
| 579 | * Returns: errno | ||
| 580 | */ | ||
| 581 | |||
| 582 | static int gfs2_open(struct inode *inode, struct file *file) | ||
| 583 | { | ||
| 584 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 585 | struct gfs2_holder i_gh; | ||
| 586 | int error; | ||
| 587 | bool need_unlock = false; | ||
| 563 | 588 | ||
| 564 | if (S_ISREG(ip->i_inode.i_mode)) { | 589 | if (S_ISREG(ip->i_inode.i_mode)) { |
| 565 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, | 590 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, |
| 566 | &i_gh); | 591 | &i_gh); |
| 567 | if (error) | 592 | if (error) |
| 568 | goto fail; | 593 | return error; |
| 594 | need_unlock = true; | ||
| 595 | } | ||
| 569 | 596 | ||
| 570 | if (!(file->f_flags & O_LARGEFILE) && | 597 | error = gfs2_open_common(inode, file); |
| 571 | i_size_read(inode) > MAX_NON_LFS) { | ||
| 572 | error = -EOVERFLOW; | ||
| 573 | goto fail_gunlock; | ||
| 574 | } | ||
| 575 | 598 | ||
| 599 | if (need_unlock) | ||
| 576 | gfs2_glock_dq_uninit(&i_gh); | 600 | gfs2_glock_dq_uninit(&i_gh); |
| 577 | } | ||
| 578 | 601 | ||
| 579 | return 0; | ||
| 580 | |||
| 581 | fail_gunlock: | ||
| 582 | gfs2_glock_dq_uninit(&i_gh); | ||
| 583 | fail: | ||
| 584 | file->private_data = NULL; | ||
| 585 | kfree(fp); | ||
| 586 | return error; | 602 | return error; |
| 587 | } | 603 | } |
| 588 | 604 | ||
| @@ -1048,7 +1064,7 @@ const struct file_operations gfs2_file_fops = { | |||
| 1048 | }; | 1064 | }; |
| 1049 | 1065 | ||
| 1050 | const struct file_operations gfs2_dir_fops = { | 1066 | const struct file_operations gfs2_dir_fops = { |
| 1051 | .readdir = gfs2_readdir, | 1067 | .iterate = gfs2_readdir, |
| 1052 | .unlocked_ioctl = gfs2_ioctl, | 1068 | .unlocked_ioctl = gfs2_ioctl, |
| 1053 | .open = gfs2_open, | 1069 | .open = gfs2_open, |
| 1054 | .release = gfs2_release, | 1070 | .release = gfs2_release, |
| @@ -1078,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = { | |||
| 1078 | }; | 1094 | }; |
| 1079 | 1095 | ||
| 1080 | const struct file_operations gfs2_dir_fops_nolock = { | 1096 | const struct file_operations gfs2_dir_fops_nolock = { |
| 1081 | .readdir = gfs2_readdir, | 1097 | .iterate = gfs2_readdir, |
| 1082 | .unlocked_ioctl = gfs2_ioctl, | 1098 | .unlocked_ioctl = gfs2_ioctl, |
| 1083 | .open = gfs2_open, | 1099 | .open = gfs2_open, |
| 1084 | .release = gfs2_release, | 1100 | .release = gfs2_release, |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c66e99c97571..5f2e5224c51c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
| @@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) | |||
| 54 | struct gfs2_bufdata *bd, *tmp; | 54 | struct gfs2_bufdata *bd, *tmp; |
| 55 | struct buffer_head *bh; | 55 | struct buffer_head *bh; |
| 56 | const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); | 56 | const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); |
| 57 | sector_t blocknr; | ||
| 58 | 57 | ||
| 59 | gfs2_log_lock(sdp); | 58 | gfs2_log_lock(sdp); |
| 60 | spin_lock(&sdp->sd_ail_lock); | 59 | spin_lock(&sdp->sd_ail_lock); |
| @@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) | |||
| 65 | continue; | 64 | continue; |
| 66 | gfs2_ail_error(gl, bh); | 65 | gfs2_ail_error(gl, bh); |
| 67 | } | 66 | } |
| 68 | blocknr = bh->b_blocknr; | ||
| 69 | bh->b_private = NULL; | ||
| 70 | gfs2_remove_from_ail(bd); /* drops ref on bh */ | ||
| 71 | |||
| 72 | bd->bd_bh = NULL; | ||
| 73 | bd->bd_blkno = blocknr; | ||
| 74 | |||
| 75 | gfs2_trans_add_revoke(sdp, bd); | 67 | gfs2_trans_add_revoke(sdp, bd); |
| 76 | } | 68 | } |
| 77 | GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); | 69 | GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); |
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 62b484e4a9e4..bbb2715171cd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -313,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | |||
| 313 | goto out; | 313 | goto out; |
| 314 | } | 314 | } |
| 315 | 315 | ||
| 316 | inode = gfs2_dir_search(dir, name); | 316 | inode = gfs2_dir_search(dir, name, false); |
| 317 | if (IS_ERR(inode)) | 317 | if (IS_ERR(inode)) |
| 318 | error = PTR_ERR(inode); | 318 | error = PTR_ERR(inode); |
| 319 | out: | 319 | out: |
| @@ -346,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
| 346 | if (!dip->i_inode.i_nlink) | 346 | if (!dip->i_inode.i_nlink) |
| 347 | return -ENOENT; | 347 | return -ENOENT; |
| 348 | 348 | ||
| 349 | error = gfs2_dir_check(&dip->i_inode, name, NULL); | ||
| 350 | switch (error) { | ||
| 351 | case -ENOENT: | ||
| 352 | error = 0; | ||
| 353 | break; | ||
| 354 | case 0: | ||
| 355 | return -EEXIST; | ||
| 356 | default: | ||
| 357 | return error; | ||
| 358 | } | ||
| 359 | |||
| 360 | if (dip->i_entries == (u32)-1) | 349 | if (dip->i_entries == (u32)-1) |
| 361 | return -EFBIG; | 350 | return -EFBIG; |
| 362 | if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) | 351 | if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) |
| @@ -546,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, | |||
| 546 | * gfs2_create_inode - Create a new inode | 535 | * gfs2_create_inode - Create a new inode |
| 547 | * @dir: The parent directory | 536 | * @dir: The parent directory |
| 548 | * @dentry: The new dentry | 537 | * @dentry: The new dentry |
| 538 | * @file: If non-NULL, the file which is being opened | ||
| 549 | * @mode: The permissions on the new inode | 539 | * @mode: The permissions on the new inode |
| 550 | * @dev: For device nodes, this is the device number | 540 | * @dev: For device nodes, this is the device number |
| 551 | * @symname: For symlinks, this is the link destination | 541 | * @symname: For symlinks, this is the link destination |
| @@ -555,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, | |||
| 555 | */ | 545 | */ |
| 556 | 546 | ||
| 557 | static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | 547 | static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, |
| 548 | struct file *file, | ||
| 558 | umode_t mode, dev_t dev, const char *symname, | 549 | umode_t mode, dev_t dev, const char *symname, |
| 559 | unsigned int size, int excl) | 550 | unsigned int size, int excl, int *opened) |
| 560 | { | 551 | { |
| 561 | const struct qstr *name = &dentry->d_name; | 552 | const struct qstr *name = &dentry->d_name; |
| 562 | struct gfs2_holder ghs[2]; | 553 | struct gfs2_holder ghs[2]; |
| @@ -564,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
| 564 | struct gfs2_inode *dip = GFS2_I(dir), *ip; | 555 | struct gfs2_inode *dip = GFS2_I(dir), *ip; |
| 565 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 556 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
| 566 | struct gfs2_glock *io_gl; | 557 | struct gfs2_glock *io_gl; |
| 558 | struct dentry *d; | ||
| 567 | int error; | 559 | int error; |
| 568 | u32 aflags = 0; | 560 | u32 aflags = 0; |
| 569 | int arq; | 561 | int arq; |
| @@ -584,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
| 584 | goto fail; | 576 | goto fail; |
| 585 | 577 | ||
| 586 | error = create_ok(dip, name, mode); | 578 | error = create_ok(dip, name, mode); |
| 587 | if ((error == -EEXIST) && S_ISREG(mode) && !excl) { | ||
| 588 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); | ||
| 589 | gfs2_glock_dq_uninit(ghs); | ||
| 590 | d_instantiate(dentry, inode); | ||
| 591 | return IS_ERR(inode) ? PTR_ERR(inode) : 0; | ||
| 592 | } | ||
| 593 | if (error) | 579 | if (error) |
| 594 | goto fail_gunlock; | 580 | goto fail_gunlock; |
| 595 | 581 | ||
| 582 | inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); | ||
| 583 | error = PTR_ERR(inode); | ||
| 584 | if (!IS_ERR(inode)) { | ||
| 585 | d = d_splice_alias(inode, dentry); | ||
| 586 | error = 0; | ||
| 587 | if (file && !IS_ERR(d)) { | ||
| 588 | if (d == NULL) | ||
| 589 | d = dentry; | ||
| 590 | if (S_ISREG(inode->i_mode)) | ||
| 591 | error = finish_open(file, d, gfs2_open_common, opened); | ||
| 592 | else | ||
| 593 | error = finish_no_open(file, d); | ||
| 594 | } | ||
| 595 | gfs2_glock_dq_uninit(ghs); | ||
| 596 | if (IS_ERR(d)) | ||
| 597 | return PTR_RET(d); | ||
| 598 | return error; | ||
| 599 | } else if (error != -ENOENT) { | ||
| 600 | goto fail_gunlock; | ||
| 601 | } | ||
| 602 | |||
| 596 | arq = error = gfs2_diradd_alloc_required(dir, name); | 603 | arq = error = gfs2_diradd_alloc_required(dir, name); |
| 597 | if (error < 0) | 604 | if (error < 0) |
| 598 | goto fail_gunlock; | 605 | goto fail_gunlock; |
| @@ -686,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
| 686 | goto fail_gunlock3; | 693 | goto fail_gunlock3; |
| 687 | 694 | ||
| 688 | mark_inode_dirty(inode); | 695 | mark_inode_dirty(inode); |
| 696 | d_instantiate(dentry, inode); | ||
| 697 | if (file) | ||
| 698 | error = finish_open(file, dentry, gfs2_open_common, opened); | ||
| 689 | gfs2_glock_dq_uninit(ghs); | 699 | gfs2_glock_dq_uninit(ghs); |
| 690 | gfs2_glock_dq_uninit(ghs + 1); | 700 | gfs2_glock_dq_uninit(ghs + 1); |
| 691 | d_instantiate(dentry, inode); | 701 | return error; |
| 692 | return 0; | ||
| 693 | 702 | ||
| 694 | fail_gunlock3: | 703 | fail_gunlock3: |
| 695 | gfs2_glock_dq_uninit(ghs + 1); | 704 | gfs2_glock_dq_uninit(ghs + 1); |
| @@ -729,36 +738,56 @@ fail: | |||
| 729 | static int gfs2_create(struct inode *dir, struct dentry *dentry, | 738 | static int gfs2_create(struct inode *dir, struct dentry *dentry, |
| 730 | umode_t mode, bool excl) | 739 | umode_t mode, bool excl) |
| 731 | { | 740 | { |
| 732 | return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); | 741 | return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); |
| 733 | } | 742 | } |
| 734 | 743 | ||
| 735 | /** | 744 | /** |
| 736 | * gfs2_lookup - Look up a filename in a directory and return its inode | 745 | * __gfs2_lookup - Look up a filename in a directory and return its inode |
| 737 | * @dir: The directory inode | 746 | * @dir: The directory inode |
| 738 | * @dentry: The dentry of the new inode | 747 | * @dentry: The dentry of the new inode |
| 739 | * @nd: passed from Linux VFS, ignored by us | 748 | * @file: File to be opened |
| 749 | * @opened: atomic_open flags | ||
| 740 | * | 750 | * |
| 741 | * Called by the VFS layer. Lock dir and call gfs2_lookupi() | ||
| 742 | * | 751 | * |
| 743 | * Returns: errno | 752 | * Returns: errno |
| 744 | */ | 753 | */ |
| 745 | 754 | ||
| 746 | static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, | 755 | static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, |
| 747 | unsigned int flags) | 756 | struct file *file, int *opened) |
| 748 | { | 757 | { |
| 749 | struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); | 758 | struct inode *inode; |
| 750 | if (inode && !IS_ERR(inode)) { | 759 | struct dentry *d; |
| 751 | struct gfs2_glock *gl = GFS2_I(inode)->i_gl; | 760 | struct gfs2_holder gh; |
| 752 | struct gfs2_holder gh; | 761 | struct gfs2_glock *gl; |
| 753 | int error; | 762 | int error; |
| 754 | error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); | 763 | |
| 755 | if (error) { | 764 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); |
| 756 | iput(inode); | 765 | if (!inode) |
| 757 | return ERR_PTR(error); | 766 | return NULL; |
| 758 | } | 767 | if (IS_ERR(inode)) |
| 759 | gfs2_glock_dq_uninit(&gh); | 768 | return ERR_CAST(inode); |
| 769 | |||
| 770 | gl = GFS2_I(inode)->i_gl; | ||
| 771 | error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); | ||
| 772 | if (error) { | ||
| 773 | iput(inode); | ||
| 774 | return ERR_PTR(error); | ||
| 760 | } | 775 | } |
| 761 | return d_splice_alias(inode, dentry); | 776 | |
| 777 | d = d_splice_alias(inode, dentry); | ||
| 778 | if (file && S_ISREG(inode->i_mode)) | ||
| 779 | error = finish_open(file, dentry, gfs2_open_common, opened); | ||
| 780 | |||
| 781 | gfs2_glock_dq_uninit(&gh); | ||
| 782 | if (error) | ||
| 783 | return ERR_PTR(error); | ||
| 784 | return d; | ||
| 785 | } | ||
| 786 | |||
| 787 | static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, | ||
| 788 | unsigned flags) | ||
| 789 | { | ||
| 790 | return __gfs2_lookup(dir, dentry, NULL, NULL); | ||
| 762 | } | 791 | } |
| 763 | 792 | ||
| 764 | /** | 793 | /** |
| @@ -1076,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, | |||
| 1076 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) | 1105 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) |
| 1077 | return -ENAMETOOLONG; | 1106 | return -ENAMETOOLONG; |
| 1078 | 1107 | ||
| 1079 | return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); | 1108 | return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); |
| 1080 | } | 1109 | } |
| 1081 | 1110 | ||
| 1082 | /** | 1111 | /** |
| @@ -1092,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 1092 | { | 1121 | { |
| 1093 | struct gfs2_sbd *sdp = GFS2_SB(dir); | 1122 | struct gfs2_sbd *sdp = GFS2_SB(dir); |
| 1094 | unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); | 1123 | unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); |
| 1095 | return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); | 1124 | return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); |
| 1096 | } | 1125 | } |
| 1097 | 1126 | ||
| 1098 | /** | 1127 | /** |
| @@ -1107,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 1107 | static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, | 1136 | static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, |
| 1108 | dev_t dev) | 1137 | dev_t dev) |
| 1109 | { | 1138 | { |
| 1110 | return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); | 1139 | return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); |
| 1140 | } | ||
| 1141 | |||
| 1142 | /** | ||
| 1143 | * gfs2_atomic_open - Atomically open a file | ||
| 1144 | * @dir: The directory | ||
| 1145 | * @dentry: The proposed new entry | ||
| 1146 | * @file: The proposed new struct file | ||
| 1147 | * @flags: open flags | ||
| 1148 | * @mode: File mode | ||
| 1149 | * @opened: Flag to say whether the file has been opened or not | ||
| 1150 | * | ||
| 1151 | * Returns: error code or 0 for success | ||
| 1152 | */ | ||
| 1153 | |||
| 1154 | static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, | ||
| 1155 | struct file *file, unsigned flags, | ||
| 1156 | umode_t mode, int *opened) | ||
| 1157 | { | ||
| 1158 | struct dentry *d; | ||
| 1159 | bool excl = !!(flags & O_EXCL); | ||
| 1160 | |||
| 1161 | d = __gfs2_lookup(dir, dentry, file, opened); | ||
| 1162 | if (IS_ERR(d)) | ||
| 1163 | return PTR_ERR(d); | ||
| 1164 | if (d == NULL) | ||
| 1165 | d = dentry; | ||
| 1166 | if (d->d_inode) { | ||
| 1167 | if (!(*opened & FILE_OPENED)) | ||
| 1168 | return finish_no_open(file, d); | ||
| 1169 | return 0; | ||
| 1170 | } | ||
| 1171 | |||
| 1172 | if (!(flags & O_CREAT)) | ||
| 1173 | return -ENOENT; | ||
| 1174 | |||
| 1175 | return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); | ||
| 1111 | } | 1176 | } |
| 1112 | 1177 | ||
| 1113 | /* | 1178 | /* |
| @@ -1787,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
| 1787 | .removexattr = gfs2_removexattr, | 1852 | .removexattr = gfs2_removexattr, |
| 1788 | .fiemap = gfs2_fiemap, | 1853 | .fiemap = gfs2_fiemap, |
| 1789 | .get_acl = gfs2_get_acl, | 1854 | .get_acl = gfs2_get_acl, |
| 1855 | .atomic_open = gfs2_atomic_open, | ||
| 1790 | }; | 1856 | }; |
| 1791 | 1857 | ||
| 1792 | const struct inode_operations gfs2_symlink_iops = { | 1858 | const struct inode_operations gfs2_symlink_iops = { |
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c53c7477f6da..ba4d9492d422 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
| @@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask); | |||
| 109 | extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); | 109 | extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); |
| 110 | extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); | 110 | extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); |
| 111 | extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); | 111 | extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); |
| 112 | extern int gfs2_open_common(struct inode *inode, struct file *file); | ||
| 112 | 113 | ||
| 113 | extern const struct inode_operations gfs2_file_iops; | 114 | extern const struct inode_operations gfs2_file_iops; |
| 114 | extern const struct inode_operations gfs2_dir_iops; | 115 | extern const struct inode_operations gfs2_dir_iops; |
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index b404f4853034..610613fb65b5 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
| @@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | |||
| 211 | static int gfs2_ail1_empty(struct gfs2_sbd *sdp) | 211 | static int gfs2_ail1_empty(struct gfs2_sbd *sdp) |
| 212 | { | 212 | { |
| 213 | struct gfs2_trans *tr, *s; | 213 | struct gfs2_trans *tr, *s; |
| 214 | int oldest_tr = 1; | ||
| 214 | int ret; | 215 | int ret; |
| 215 | 216 | ||
| 216 | spin_lock(&sdp->sd_ail_lock); | 217 | spin_lock(&sdp->sd_ail_lock); |
| 217 | list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { | 218 | list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { |
| 218 | gfs2_ail1_empty_one(sdp, tr); | 219 | gfs2_ail1_empty_one(sdp, tr); |
| 219 | if (list_empty(&tr->tr_ail1_list)) | 220 | if (list_empty(&tr->tr_ail1_list) && oldest_tr) |
| 220 | list_move(&tr->tr_list, &sdp->sd_ail2_list); | 221 | list_move(&tr->tr_list, &sdp->sd_ail2_list); |
| 221 | else | 222 | else |
| 222 | break; | 223 | oldest_tr = 0; |
| 223 | } | 224 | } |
| 224 | ret = list_empty(&sdp->sd_ail1_list); | 225 | ret = list_empty(&sdp->sd_ail1_list); |
| 225 | spin_unlock(&sdp->sd_ail_lock); | 226 | spin_unlock(&sdp->sd_ail_lock); |
| @@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) | |||
| 317 | 318 | ||
| 318 | int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) | 319 | int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) |
| 319 | { | 320 | { |
| 320 | unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); | 321 | unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); |
| 321 | unsigned wanted = blks + reserved_blks; | 322 | unsigned wanted = blks + reserved_blks; |
| 322 | DEFINE_WAIT(wait); | 323 | DEFINE_WAIT(wait); |
| 323 | int did_wait = 0; | 324 | int did_wait = 0; |
| @@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip) | |||
| 545 | spin_unlock(&sdp->sd_ordered_lock); | 546 | spin_unlock(&sdp->sd_ordered_lock); |
| 546 | } | 547 | } |
| 547 | 548 | ||
| 549 | void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 550 | { | ||
| 551 | struct buffer_head *bh = bd->bd_bh; | ||
| 552 | struct gfs2_glock *gl = bd->bd_gl; | ||
| 553 | |||
| 554 | gfs2_remove_from_ail(bd); | ||
| 555 | bd->bd_bh = NULL; | ||
| 556 | bh->b_private = NULL; | ||
| 557 | bd->bd_blkno = bh->b_blocknr; | ||
| 558 | bd->bd_ops = &gfs2_revoke_lops; | ||
| 559 | sdp->sd_log_num_revoke++; | ||
| 560 | atomic_inc(&gl->gl_revokes); | ||
| 561 | set_bit(GLF_LFLUSH, &gl->gl_flags); | ||
| 562 | list_add(&bd->bd_list, &sdp->sd_log_le_revoke); | ||
| 563 | } | ||
| 564 | |||
| 565 | void gfs2_write_revokes(struct gfs2_sbd *sdp) | ||
| 566 | { | ||
| 567 | struct gfs2_trans *tr; | ||
| 568 | struct gfs2_bufdata *bd, *tmp; | ||
| 569 | int have_revokes = 0; | ||
| 570 | int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); | ||
| 571 | |||
| 572 | gfs2_ail1_empty(sdp); | ||
| 573 | spin_lock(&sdp->sd_ail_lock); | ||
| 574 | list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { | ||
| 575 | list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { | ||
| 576 | if (list_empty(&bd->bd_list)) { | ||
| 577 | have_revokes = 1; | ||
| 578 | goto done; | ||
| 579 | } | ||
| 580 | } | ||
| 581 | } | ||
| 582 | done: | ||
| 583 | spin_unlock(&sdp->sd_ail_lock); | ||
| 584 | if (have_revokes == 0) | ||
| 585 | return; | ||
| 586 | while (sdp->sd_log_num_revoke > max_revokes) | ||
| 587 | max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); | ||
| 588 | max_revokes -= sdp->sd_log_num_revoke; | ||
| 589 | if (!sdp->sd_log_num_revoke) { | ||
| 590 | atomic_dec(&sdp->sd_log_blks_free); | ||
| 591 | /* If no blocks have been reserved, we need to also | ||
| 592 | * reserve a block for the header */ | ||
| 593 | if (!sdp->sd_log_blks_reserved) | ||
| 594 | atomic_dec(&sdp->sd_log_blks_free); | ||
| 595 | } | ||
| 596 | gfs2_log_lock(sdp); | ||
| 597 | spin_lock(&sdp->sd_ail_lock); | ||
| 598 | list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { | ||
| 599 | list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { | ||
| 600 | if (max_revokes == 0) | ||
| 601 | goto out_of_blocks; | ||
| 602 | if (!list_empty(&bd->bd_list)) | ||
| 603 | continue; | ||
| 604 | gfs2_add_revoke(sdp, bd); | ||
| 605 | max_revokes--; | ||
| 606 | } | ||
| 607 | } | ||
| 608 | out_of_blocks: | ||
| 609 | spin_unlock(&sdp->sd_ail_lock); | ||
| 610 | gfs2_log_unlock(sdp); | ||
| 611 | |||
| 612 | if (!sdp->sd_log_num_revoke) { | ||
| 613 | atomic_inc(&sdp->sd_log_blks_free); | ||
| 614 | if (!sdp->sd_log_blks_reserved) | ||
| 615 | atomic_inc(&sdp->sd_log_blks_free); | ||
| 616 | } | ||
| 617 | } | ||
| 618 | |||
| 548 | /** | 619 | /** |
| 549 | * log_write_header - Get and initialize a journal header buffer | 620 | * log_write_header - Get and initialize a journal header buffer |
| 550 | * @sdp: The GFS2 superblock | 621 | * @sdp: The GFS2 superblock |
| @@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) | |||
| 562 | lh = page_address(page); | 633 | lh = page_address(page); |
| 563 | clear_page(lh); | 634 | clear_page(lh); |
| 564 | 635 | ||
| 565 | gfs2_ail1_empty(sdp); | ||
| 566 | tail = current_tail(sdp); | 636 | tail = current_tail(sdp); |
| 567 | 637 | ||
| 568 | lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); | 638 | lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); |
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3566f35915e0..37216634f0aa 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h | |||
| @@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) | |||
| 72 | extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); | 72 | extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); |
| 73 | extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); | 73 | extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); |
| 74 | extern int gfs2_logd(void *data); | 74 | extern int gfs2_logd(void *data); |
| 75 | extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); | ||
| 76 | extern void gfs2_write_revokes(struct gfs2_sbd *sdp); | ||
| 75 | 77 | ||
| 76 | #endif /* __LOG_DOT_H__ */ | 78 | #endif /* __LOG_DOT_H__ */ |
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6c33d7b6e0c4..17c5b5d7dc88 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/gfs2_ondisk.h> | 16 | #include <linux/gfs2_ondisk.h> |
| 17 | #include <linux/bio.h> | 17 | #include <linux/bio.h> |
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/list_sort.h> | ||
| 19 | 20 | ||
| 20 | #include "gfs2.h" | 21 | #include "gfs2.h" |
| 21 | #include "incore.h" | 22 | #include "incore.h" |
| @@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh) | |||
| 401 | kunmap_atomic(kaddr); | 402 | kunmap_atomic(kaddr); |
| 402 | } | 403 | } |
| 403 | 404 | ||
| 405 | static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
| 406 | { | ||
| 407 | struct gfs2_bufdata *bda, *bdb; | ||
| 408 | |||
| 409 | bda = list_entry(a, struct gfs2_bufdata, bd_list); | ||
| 410 | bdb = list_entry(b, struct gfs2_bufdata, bd_list); | ||
| 411 | |||
| 412 | if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) | ||
| 413 | return -1; | ||
| 414 | if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) | ||
| 415 | return 1; | ||
| 416 | return 0; | ||
| 417 | } | ||
| 418 | |||
| 404 | static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, | 419 | static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, |
| 405 | unsigned int total, struct list_head *blist, | 420 | unsigned int total, struct list_head *blist, |
| 406 | bool is_databuf) | 421 | bool is_databuf) |
| @@ -413,6 +428,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, | |||
| 413 | __be64 *ptr; | 428 | __be64 *ptr; |
| 414 | 429 | ||
| 415 | gfs2_log_lock(sdp); | 430 | gfs2_log_lock(sdp); |
| 431 | list_sort(NULL, blist, blocknr_cmp); | ||
| 416 | bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); | 432 | bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); |
| 417 | while(total) { | 433 | while(total) { |
| 418 | num = total; | 434 | num = total; |
| @@ -590,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) | |||
| 590 | struct page *page; | 606 | struct page *page; |
| 591 | unsigned int length; | 607 | unsigned int length; |
| 592 | 608 | ||
| 609 | gfs2_write_revokes(sdp); | ||
| 593 | if (!sdp->sd_log_num_revoke) | 610 | if (!sdp->sd_log_num_revoke) |
| 594 | return; | 611 | return; |
| 595 | 612 | ||
| @@ -836,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { | |||
| 836 | .lo_name = "revoke", | 853 | .lo_name = "revoke", |
| 837 | }; | 854 | }; |
| 838 | 855 | ||
| 839 | const struct gfs2_log_operations gfs2_rg_lops = { | ||
| 840 | .lo_name = "rg", | ||
| 841 | }; | ||
| 842 | |||
| 843 | const struct gfs2_log_operations gfs2_databuf_lops = { | 856 | const struct gfs2_log_operations gfs2_databuf_lops = { |
| 844 | .lo_before_commit = databuf_lo_before_commit, | 857 | .lo_before_commit = databuf_lo_before_commit, |
| 845 | .lo_after_commit = databuf_lo_after_commit, | 858 | .lo_after_commit = databuf_lo_after_commit, |
| @@ -851,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = { | |||
| 851 | const struct gfs2_log_operations *gfs2_log_ops[] = { | 864 | const struct gfs2_log_operations *gfs2_log_ops[] = { |
| 852 | &gfs2_databuf_lops, | 865 | &gfs2_databuf_lops, |
| 853 | &gfs2_buf_lops, | 866 | &gfs2_buf_lops, |
| 854 | &gfs2_rg_lops, | ||
| 855 | &gfs2_revoke_lops, | 867 | &gfs2_revoke_lops, |
| 856 | NULL, | 868 | NULL, |
| 857 | }; | 869 | }; |
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 87e062e05c92..9ca2e6438419 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h | |||
| @@ -23,7 +23,6 @@ | |||
| 23 | extern const struct gfs2_log_operations gfs2_glock_lops; | 23 | extern const struct gfs2_log_operations gfs2_glock_lops; |
| 24 | extern const struct gfs2_log_operations gfs2_buf_lops; | 24 | extern const struct gfs2_log_operations gfs2_buf_lops; |
| 25 | extern const struct gfs2_log_operations gfs2_revoke_lops; | 25 | extern const struct gfs2_log_operations gfs2_revoke_lops; |
| 26 | extern const struct gfs2_log_operations gfs2_rg_lops; | ||
| 27 | extern const struct gfs2_log_operations gfs2_databuf_lops; | 26 | extern const struct gfs2_log_operations gfs2_databuf_lops; |
| 28 | 27 | ||
| 29 | extern const struct gfs2_log_operations *gfs2_log_ops[]; | 28 | extern const struct gfs2_log_operations *gfs2_log_ops[]; |
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 1a89afb68472..0da390686c08 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c | |||
| @@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int | |||
| 296 | if (bd) { | 296 | if (bd) { |
| 297 | spin_lock(&sdp->sd_ail_lock); | 297 | spin_lock(&sdp->sd_ail_lock); |
| 298 | if (bd->bd_tr) { | 298 | if (bd->bd_tr) { |
| 299 | gfs2_remove_from_ail(bd); | ||
| 300 | bh->b_private = NULL; | ||
| 301 | bd->bd_bh = NULL; | ||
| 302 | bd->bd_blkno = bh->b_blocknr; | ||
| 303 | gfs2_trans_add_revoke(sdp, bd); | 299 | gfs2_trans_add_revoke(sdp, bd); |
| 304 | } | 300 | } |
| 305 | spin_unlock(&sdp->sd_ail_lock); | 301 | spin_unlock(&sdp->sd_ail_lock); |
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a0f43f..0262c190b6f9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
| @@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) | |||
| 916 | goto fail_quotad; | 916 | goto fail_quotad; |
| 917 | 917 | ||
| 918 | p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); | 918 | p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); |
| 919 | error = IS_ERR(p); | 919 | if (IS_ERR(p)) { |
| 920 | if (error) { | 920 | error = PTR_ERR(p); |
| 921 | fs_err(sdp, "can't start logd thread: %d\n", error); | 921 | fs_err(sdp, "can't start logd thread: %d\n", error); |
| 922 | return error; | 922 | return error; |
| 923 | } | 923 | } |
| 924 | sdp->sd_logd_process = p; | 924 | sdp->sd_logd_process = p; |
| 925 | 925 | ||
| 926 | p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); | 926 | p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); |
| 927 | error = IS_ERR(p); | 927 | if (IS_ERR(p)) { |
| 928 | if (error) { | 928 | error = PTR_ERR(p); |
| 929 | fs_err(sdp, "can't start quotad thread: %d\n", error); | 929 | fs_err(sdp, "can't start quotad thread: %d\n", error); |
| 930 | goto fail; | 930 | goto fail; |
| 931 | } | 931 | } |
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c253b13722e8..3768c2f40e43 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
| @@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type) | |||
| 1154 | return error; | 1154 | return error; |
| 1155 | } | 1155 | } |
| 1156 | 1156 | ||
| 1157 | static int gfs2_quota_sync_timeo(struct super_block *sb, int type) | ||
| 1158 | { | ||
| 1159 | return gfs2_quota_sync(sb, type); | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) | 1157 | int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) |
| 1163 | { | 1158 | { |
| 1164 | struct gfs2_quota_data *qd; | 1159 | struct gfs2_quota_data *qd; |
| @@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data) | |||
| 1414 | &tune->gt_statfs_quantum); | 1409 | &tune->gt_statfs_quantum); |
| 1415 | 1410 | ||
| 1416 | /* Update quota file */ | 1411 | /* Update quota file */ |
| 1417 | quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, | 1412 | quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, |
| 1418 | "ad_timeo, &tune->gt_quota_quantum); | 1413 | "ad_timeo, &tune->gt_quota_quantum); |
| 1419 | 1414 | ||
| 1420 | /* Check for & recover partially truncated inodes */ | 1415 | /* Check for & recover partially truncated inodes */ |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9809156e3d04..69317435faa7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -1288,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp) | |||
| 1288 | minlen = max_t(u64, r.minlen, | 1288 | minlen = max_t(u64, r.minlen, |
| 1289 | q->limits.discard_granularity) >> bs_shift; | 1289 | q->limits.discard_granularity) >> bs_shift; |
| 1290 | 1290 | ||
| 1291 | if (end <= start || minlen > sdp->sd_max_rg_data) | ||
| 1292 | return -EINVAL; | ||
| 1293 | |||
| 1291 | rgd = gfs2_blk2rgrpd(sdp, start, 0); | 1294 | rgd = gfs2_blk2rgrpd(sdp, start, 0); |
| 1292 | rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); | 1295 | rgd_end = gfs2_blk2rgrpd(sdp, end, 0); |
| 1293 | 1296 | ||
| 1294 | if (end <= start || | 1297 | if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end)) |
| 1295 | minlen > sdp->sd_max_rg_data || | 1298 | && (start > rgd_end->rd_data0 + rgd_end->rd_data)) |
| 1296 | start > rgd_end->rd_data0 + rgd_end->rd_data) | 1299 | return -EINVAL; /* start is beyond the end of the fs */ |
| 1297 | return -EINVAL; | ||
| 1298 | 1300 | ||
| 1299 | while (1) { | 1301 | while (1) { |
| 1300 | 1302 | ||
| @@ -1336,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) | |||
| 1336 | } | 1338 | } |
| 1337 | 1339 | ||
| 1338 | out: | 1340 | out: |
| 1339 | r.len = trimmed << 9; | 1341 | r.len = trimmed << bs_shift; |
| 1340 | if (copy_to_user(argp, &r, sizeof(r))) | 1342 | if (copy_to_user(argp, &r, sizeof(r))) |
| 1341 | return -EFAULT; | 1343 | return -EFAULT; |
| 1342 | 1344 | ||
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 7374907742a8..2b20d7046bf3 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c | |||
| @@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) | |||
| 270 | 270 | ||
| 271 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | 271 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) |
| 272 | { | 272 | { |
| 273 | struct gfs2_glock *gl = bd->bd_gl; | ||
| 274 | struct gfs2_trans *tr = current->journal_info; | 273 | struct gfs2_trans *tr = current->journal_info; |
| 275 | 274 | ||
| 276 | BUG_ON(!list_empty(&bd->bd_list)); | 275 | BUG_ON(!list_empty(&bd->bd_list)); |
| 277 | BUG_ON(!list_empty(&bd->bd_ail_st_list)); | 276 | gfs2_add_revoke(sdp, bd); |
| 278 | BUG_ON(!list_empty(&bd->bd_ail_gl_list)); | ||
| 279 | bd->bd_ops = &gfs2_revoke_lops; | ||
| 280 | tr->tr_touched = 1; | 277 | tr->tr_touched = 1; |
| 281 | tr->tr_num_revoke++; | 278 | tr->tr_num_revoke++; |
| 282 | sdp->sd_log_num_revoke++; | ||
| 283 | atomic_inc(&gl->gl_revokes); | ||
| 284 | set_bit(GLF_LFLUSH, &gl->gl_flags); | ||
| 285 | list_add(&bd->bd_list, &sdp->sd_log_le_revoke); | ||
| 286 | } | 279 | } |
| 287 | 280 | ||
| 288 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) | 281 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) |
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index e0101b6fb0d7..145566851e7a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c | |||
| @@ -51,9 +51,9 @@ done: | |||
| 51 | /* | 51 | /* |
| 52 | * hfs_readdir | 52 | * hfs_readdir |
| 53 | */ | 53 | */ |
| 54 | static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 54 | static int hfs_readdir(struct file *file, struct dir_context *ctx) |
| 55 | { | 55 | { |
| 56 | struct inode *inode = file_inode(filp); | 56 | struct inode *inode = file_inode(file); |
| 57 | struct super_block *sb = inode->i_sb; | 57 | struct super_block *sb = inode->i_sb; |
| 58 | int len, err; | 58 | int len, err; |
| 59 | char strbuf[HFS_MAX_NAMELEN]; | 59 | char strbuf[HFS_MAX_NAMELEN]; |
| @@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 62 | struct hfs_readdir_data *rd; | 62 | struct hfs_readdir_data *rd; |
| 63 | u16 type; | 63 | u16 type; |
| 64 | 64 | ||
| 65 | if (filp->f_pos >= inode->i_size) | 65 | if (ctx->pos >= inode->i_size) |
| 66 | return 0; | 66 | return 0; |
| 67 | 67 | ||
| 68 | err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); | 68 | err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); |
| @@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 73 | if (err) | 73 | if (err) |
| 74 | goto out; | 74 | goto out; |
| 75 | 75 | ||
| 76 | switch ((u32)filp->f_pos) { | 76 | if (ctx->pos == 0) { |
| 77 | case 0: | ||
| 78 | /* This is completely artificial... */ | 77 | /* This is completely artificial... */ |
| 79 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) | 78 | if (!dir_emit_dot(file, ctx)) |
| 80 | goto out; | 79 | goto out; |
| 81 | filp->f_pos++; | 80 | ctx->pos = 1; |
| 82 | /* fall through */ | 81 | } |
| 83 | case 1: | 82 | if (ctx->pos == 1) { |
| 84 | if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { | 83 | if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { |
| 85 | err = -EIO; | 84 | err = -EIO; |
| 86 | goto out; | 85 | goto out; |
| @@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 97 | // err = -EIO; | 96 | // err = -EIO; |
| 98 | // goto out; | 97 | // goto out; |
| 99 | //} | 98 | //} |
| 100 | if (filldir(dirent, "..", 2, 1, | 99 | if (!dir_emit(ctx, "..", 2, |
| 101 | be32_to_cpu(entry.thread.ParID), DT_DIR)) | 100 | be32_to_cpu(entry.thread.ParID), DT_DIR)) |
| 102 | goto out; | 101 | goto out; |
| 103 | filp->f_pos++; | 102 | ctx->pos = 2; |
| 104 | /* fall through */ | ||
| 105 | default: | ||
| 106 | if (filp->f_pos >= inode->i_size) | ||
| 107 | goto out; | ||
| 108 | err = hfs_brec_goto(&fd, filp->f_pos - 1); | ||
| 109 | if (err) | ||
| 110 | goto out; | ||
| 111 | } | 103 | } |
| 104 | if (ctx->pos >= inode->i_size) | ||
| 105 | goto out; | ||
| 106 | err = hfs_brec_goto(&fd, ctx->pos - 1); | ||
| 107 | if (err) | ||
| 108 | goto out; | ||
| 112 | 109 | ||
| 113 | for (;;) { | 110 | for (;;) { |
| 114 | if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { | 111 | if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { |
| @@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 131 | err = -EIO; | 128 | err = -EIO; |
| 132 | goto out; | 129 | goto out; |
| 133 | } | 130 | } |
| 134 | if (filldir(dirent, strbuf, len, filp->f_pos, | 131 | if (!dir_emit(ctx, strbuf, len, |
| 135 | be32_to_cpu(entry.dir.DirID), DT_DIR)) | 132 | be32_to_cpu(entry.dir.DirID), DT_DIR)) |
| 136 | break; | 133 | break; |
| 137 | } else if (type == HFS_CDR_FIL) { | 134 | } else if (type == HFS_CDR_FIL) { |
| @@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 140 | err = -EIO; | 137 | err = -EIO; |
| 141 | goto out; | 138 | goto out; |
| 142 | } | 139 | } |
| 143 | if (filldir(dirent, strbuf, len, filp->f_pos, | 140 | if (!dir_emit(ctx, strbuf, len, |
| 144 | be32_to_cpu(entry.file.FlNum), DT_REG)) | 141 | be32_to_cpu(entry.file.FlNum), DT_REG)) |
| 145 | break; | 142 | break; |
| 146 | } else { | 143 | } else { |
| @@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 148 | err = -EIO; | 145 | err = -EIO; |
| 149 | goto out; | 146 | goto out; |
| 150 | } | 147 | } |
| 151 | filp->f_pos++; | 148 | ctx->pos++; |
| 152 | if (filp->f_pos >= inode->i_size) | 149 | if (ctx->pos >= inode->i_size) |
| 153 | goto out; | 150 | goto out; |
| 154 | err = hfs_brec_goto(&fd, 1); | 151 | err = hfs_brec_goto(&fd, 1); |
| 155 | if (err) | 152 | if (err) |
| 156 | goto out; | 153 | goto out; |
| 157 | } | 154 | } |
| 158 | rd = filp->private_data; | 155 | rd = file->private_data; |
| 159 | if (!rd) { | 156 | if (!rd) { |
| 160 | rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); | 157 | rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); |
| 161 | if (!rd) { | 158 | if (!rd) { |
| 162 | err = -ENOMEM; | 159 | err = -ENOMEM; |
| 163 | goto out; | 160 | goto out; |
| 164 | } | 161 | } |
| 165 | filp->private_data = rd; | 162 | file->private_data = rd; |
| 166 | rd->file = filp; | 163 | rd->file = file; |
| 167 | list_add(&rd->list, &HFS_I(inode)->open_dir_list); | 164 | list_add(&rd->list, &HFS_I(inode)->open_dir_list); |
| 168 | } | 165 | } |
| 169 | memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); | 166 | memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); |
| @@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 306 | 303 | ||
| 307 | const struct file_operations hfs_dir_operations = { | 304 | const struct file_operations hfs_dir_operations = { |
| 308 | .read = generic_read_dir, | 305 | .read = generic_read_dir, |
| 309 | .readdir = hfs_readdir, | 306 | .iterate = hfs_readdir, |
| 310 | .llseek = generic_file_llseek, | 307 | .llseek = generic_file_llseek, |
| 311 | .release = hfs_dir_release, | 308 | .release = hfs_dir_release, |
| 312 | }; | 309 | }; |
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index a37ac934732f..d8ce4bd17fc5 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c | |||
| @@ -121,9 +121,9 @@ fail: | |||
| 121 | return ERR_PTR(err); | 121 | return ERR_PTR(err); |
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | 124 | static int hfsplus_readdir(struct file *file, struct dir_context *ctx) |
| 125 | { | 125 | { |
| 126 | struct inode *inode = file_inode(filp); | 126 | struct inode *inode = file_inode(file); |
| 127 | struct super_block *sb = inode->i_sb; | 127 | struct super_block *sb = inode->i_sb; |
| 128 | int len, err; | 128 | int len, err; |
| 129 | char strbuf[HFSPLUS_MAX_STRLEN + 1]; | 129 | char strbuf[HFSPLUS_MAX_STRLEN + 1]; |
| @@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 132 | struct hfsplus_readdir_data *rd; | 132 | struct hfsplus_readdir_data *rd; |
| 133 | u16 type; | 133 | u16 type; |
| 134 | 134 | ||
| 135 | if (filp->f_pos >= inode->i_size) | 135 | if (file->f_pos >= inode->i_size) |
| 136 | return 0; | 136 | return 0; |
| 137 | 137 | ||
| 138 | err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); | 138 | err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
| @@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 143 | if (err) | 143 | if (err) |
| 144 | goto out; | 144 | goto out; |
| 145 | 145 | ||
| 146 | switch ((u32)filp->f_pos) { | 146 | if (ctx->pos == 0) { |
| 147 | case 0: | ||
| 148 | /* This is completely artificial... */ | 147 | /* This is completely artificial... */ |
| 149 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) | 148 | if (!dir_emit_dot(file, ctx)) |
| 150 | goto out; | 149 | goto out; |
| 151 | filp->f_pos++; | 150 | ctx->pos = 1; |
| 152 | /* fall through */ | 151 | } |
| 153 | case 1: | 152 | if (ctx->pos == 1) { |
| 154 | if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { | 153 | if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { |
| 155 | err = -EIO; | 154 | err = -EIO; |
| 156 | goto out; | 155 | goto out; |
| @@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 168 | err = -EIO; | 167 | err = -EIO; |
| 169 | goto out; | 168 | goto out; |
| 170 | } | 169 | } |
| 171 | if (filldir(dirent, "..", 2, 1, | 170 | if (!dir_emit(ctx, "..", 2, |
| 172 | be32_to_cpu(entry.thread.parentID), DT_DIR)) | 171 | be32_to_cpu(entry.thread.parentID), DT_DIR)) |
| 173 | goto out; | 172 | goto out; |
| 174 | filp->f_pos++; | 173 | ctx->pos = 2; |
| 175 | /* fall through */ | ||
| 176 | default: | ||
| 177 | if (filp->f_pos >= inode->i_size) | ||
| 178 | goto out; | ||
| 179 | err = hfs_brec_goto(&fd, filp->f_pos - 1); | ||
| 180 | if (err) | ||
| 181 | goto out; | ||
| 182 | } | 174 | } |
| 183 | 175 | if (ctx->pos >= inode->i_size) | |
| 176 | goto out; | ||
| 177 | err = hfs_brec_goto(&fd, ctx->pos - 1); | ||
| 178 | if (err) | ||
| 179 | goto out; | ||
| 184 | for (;;) { | 180 | for (;;) { |
| 185 | if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { | 181 | if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { |
| 186 | pr_err("walked past end of dir\n"); | 182 | pr_err("walked past end of dir\n"); |
| @@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 211 | HFSPLUS_SB(sb)->hidden_dir->i_ino == | 207 | HFSPLUS_SB(sb)->hidden_dir->i_ino == |
| 212 | be32_to_cpu(entry.folder.id)) | 208 | be32_to_cpu(entry.folder.id)) |
| 213 | goto next; | 209 | goto next; |
| 214 | if (filldir(dirent, strbuf, len, filp->f_pos, | 210 | if (!dir_emit(ctx, strbuf, len, |
| 215 | be32_to_cpu(entry.folder.id), DT_DIR)) | 211 | be32_to_cpu(entry.folder.id), DT_DIR)) |
| 216 | break; | 212 | break; |
| 217 | } else if (type == HFSPLUS_FILE) { | 213 | } else if (type == HFSPLUS_FILE) { |
| @@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 220 | err = -EIO; | 216 | err = -EIO; |
| 221 | goto out; | 217 | goto out; |
| 222 | } | 218 | } |
| 223 | if (filldir(dirent, strbuf, len, filp->f_pos, | 219 | if (!dir_emit(ctx, strbuf, len, |
| 224 | be32_to_cpu(entry.file.id), DT_REG)) | 220 | be32_to_cpu(entry.file.id), DT_REG)) |
| 225 | break; | 221 | break; |
| 226 | } else { | 222 | } else { |
| @@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 229 | goto out; | 225 | goto out; |
| 230 | } | 226 | } |
| 231 | next: | 227 | next: |
| 232 | filp->f_pos++; | 228 | ctx->pos++; |
| 233 | if (filp->f_pos >= inode->i_size) | 229 | if (ctx->pos >= inode->i_size) |
| 234 | goto out; | 230 | goto out; |
| 235 | err = hfs_brec_goto(&fd, 1); | 231 | err = hfs_brec_goto(&fd, 1); |
| 236 | if (err) | 232 | if (err) |
| 237 | goto out; | 233 | goto out; |
| 238 | } | 234 | } |
| 239 | rd = filp->private_data; | 235 | rd = file->private_data; |
| 240 | if (!rd) { | 236 | if (!rd) { |
| 241 | rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); | 237 | rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); |
| 242 | if (!rd) { | 238 | if (!rd) { |
| 243 | err = -ENOMEM; | 239 | err = -ENOMEM; |
| 244 | goto out; | 240 | goto out; |
| 245 | } | 241 | } |
| 246 | filp->private_data = rd; | 242 | file->private_data = rd; |
| 247 | rd->file = filp; | 243 | rd->file = file; |
| 248 | list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); | 244 | list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); |
| 249 | } | 245 | } |
| 250 | memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); | 246 | memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); |
| @@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { | |||
| 538 | const struct file_operations hfsplus_dir_operations = { | 534 | const struct file_operations hfsplus_dir_operations = { |
| 539 | .fsync = hfsplus_file_fsync, | 535 | .fsync = hfsplus_file_fsync, |
| 540 | .read = generic_read_dir, | 536 | .read = generic_read_dir, |
| 541 | .readdir = hfsplus_readdir, | 537 | .iterate = hfsplus_readdir, |
| 542 | .unlocked_ioctl = hfsplus_ioctl, | 538 | .unlocked_ioctl = hfsplus_ioctl, |
| 543 | .llseek = generic_file_llseek, | 539 | .llseek = generic_file_llseek, |
| 544 | .release = hfsplus_dir_release, | 540 | .release = hfsplus_dir_release, |
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 32f35f187989..cddb05217512 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c | |||
| @@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = { | |||
| 277 | .show_options = hostfs_show_options, | 277 | .show_options = hostfs_show_options, |
| 278 | }; | 278 | }; |
| 279 | 279 | ||
| 280 | int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) | 280 | int hostfs_readdir(struct file *file, struct dir_context *ctx) |
| 281 | { | 281 | { |
| 282 | void *dir; | 282 | void *dir; |
| 283 | char *name; | 283 | char *name; |
| @@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) | |||
| 292 | __putname(name); | 292 | __putname(name); |
| 293 | if (dir == NULL) | 293 | if (dir == NULL) |
| 294 | return -error; | 294 | return -error; |
| 295 | next = file->f_pos; | 295 | next = ctx->pos; |
| 296 | while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { | 296 | while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { |
| 297 | error = (*filldir)(ent, name, len, file->f_pos, | 297 | if (!dir_emit(ctx, name, len, ino, type)) |
| 298 | ino, type); | 298 | break; |
| 299 | if (error) break; | 299 | ctx->pos = next; |
| 300 | file->f_pos = next; | ||
| 301 | } | 300 | } |
| 302 | close_dir(dir); | 301 | close_dir(dir); |
| 303 | return 0; | 302 | return 0; |
| @@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = { | |||
| 393 | 392 | ||
| 394 | static const struct file_operations hostfs_dir_fops = { | 393 | static const struct file_operations hostfs_dir_fops = { |
| 395 | .llseek = generic_file_llseek, | 394 | .llseek = generic_file_llseek, |
| 396 | .readdir = hostfs_readdir, | 395 | .iterate = hostfs_readdir, |
| 397 | .read = generic_read_dir, | 396 | .read = generic_read_dir, |
| 398 | }; | 397 | }; |
| 399 | 398 | ||
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 834ac13c04b7..292b1acb9b81 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c | |||
| @@ -57,14 +57,14 @@ fail: | |||
| 57 | return -ESPIPE; | 57 | return -ESPIPE; |
| 58 | } | 58 | } |
| 59 | 59 | ||
| 60 | static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 60 | static int hpfs_readdir(struct file *file, struct dir_context *ctx) |
| 61 | { | 61 | { |
| 62 | struct inode *inode = file_inode(filp); | 62 | struct inode *inode = file_inode(file); |
| 63 | struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); | 63 | struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); |
| 64 | struct quad_buffer_head qbh; | 64 | struct quad_buffer_head qbh; |
| 65 | struct hpfs_dirent *de; | 65 | struct hpfs_dirent *de; |
| 66 | int lc; | 66 | int lc; |
| 67 | long old_pos; | 67 | loff_t next_pos; |
| 68 | unsigned char *tempname; | 68 | unsigned char *tempname; |
| 69 | int c1, c2 = 0; | 69 | int c1, c2 = 0; |
| 70 | int ret = 0; | 70 | int ret = 0; |
| @@ -105,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 105 | } | 105 | } |
| 106 | } | 106 | } |
| 107 | lc = hpfs_sb(inode->i_sb)->sb_lowercase; | 107 | lc = hpfs_sb(inode->i_sb)->sb_lowercase; |
| 108 | if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ | 108 | if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */ |
| 109 | filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ | 109 | ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */ |
| 110 | goto out; | 110 | goto out; |
| 111 | } | 111 | } |
| 112 | if (filp->f_pos == 13) { | 112 | if (ctx->pos == 13) { |
| 113 | ret = -ENOENT; | 113 | ret = -ENOENT; |
| 114 | goto out; | 114 | goto out; |
| 115 | } | 115 | } |
| @@ -120,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 120 | accepted by filldir, but what can I do? | 120 | accepted by filldir, but what can I do? |
| 121 | maybe killall -9 ls helps */ | 121 | maybe killall -9 ls helps */ |
| 122 | if (hpfs_sb(inode->i_sb)->sb_chk) | 122 | if (hpfs_sb(inode->i_sb)->sb_chk) |
| 123 | if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { | 123 | if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) { |
| 124 | ret = -EFSERROR; | 124 | ret = -EFSERROR; |
| 125 | goto out; | 125 | goto out; |
| 126 | } | 126 | } |
| 127 | if (filp->f_pos == 12) | 127 | if (ctx->pos == 12) |
| 128 | goto out; | 128 | goto out; |
| 129 | if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { | 129 | if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { |
| 130 | printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); | 130 | printk("HPFS: warning: pos==%d\n",(int)ctx->pos); |
| 131 | goto out; | 131 | goto out; |
| 132 | } | 132 | } |
| 133 | if (filp->f_pos == 0) { | 133 | if (ctx->pos == 0) { |
| 134 | if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) | 134 | if (!dir_emit_dot(file, ctx)) |
| 135 | goto out; | 135 | goto out; |
| 136 | filp->f_pos = 11; | 136 | ctx->pos = 11; |
| 137 | } | 137 | } |
| 138 | if (filp->f_pos == 11) { | 138 | if (ctx->pos == 11) { |
| 139 | if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) | 139 | if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR)) |
| 140 | goto out; | 140 | goto out; |
| 141 | filp->f_pos = 1; | 141 | ctx->pos = 1; |
| 142 | } | 142 | } |
| 143 | if (filp->f_pos == 1) { | 143 | if (ctx->pos == 1) { |
| 144 | filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; | 144 | ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; |
| 145 | hpfs_add_pos(inode, &filp->f_pos); | 145 | hpfs_add_pos(inode, &file->f_pos); |
| 146 | filp->f_version = inode->i_version; | 146 | file->f_version = inode->i_version; |
| 147 | } | 147 | } |
| 148 | old_pos = filp->f_pos; | 148 | next_pos = ctx->pos; |
| 149 | if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { | 149 | if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { |
| 150 | ctx->pos = next_pos; | ||
| 150 | ret = -EIOERROR; | 151 | ret = -EIOERROR; |
| 151 | goto out; | 152 | goto out; |
| 152 | } | 153 | } |
| @@ -154,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 154 | if (hpfs_sb(inode->i_sb)->sb_chk) { | 155 | if (hpfs_sb(inode->i_sb)->sb_chk) { |
| 155 | if (de->first && !de->last && (de->namelen != 2 | 156 | if (de->first && !de->last && (de->namelen != 2 |
| 156 | || de ->name[0] != 1 || de->name[1] != 1)) | 157 | || de ->name[0] != 1 || de->name[1] != 1)) |
| 157 | hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); | 158 | hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos); |
| 158 | if (de->last && (de->namelen != 1 || de ->name[0] != 255)) | 159 | if (de->last && (de->namelen != 1 || de ->name[0] != 255)) |
| 159 | hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); | 160 | hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos); |
| 160 | } | 161 | } |
| 161 | hpfs_brelse4(&qbh); | 162 | hpfs_brelse4(&qbh); |
| 163 | ctx->pos = next_pos; | ||
| 162 | goto again; | 164 | goto again; |
| 163 | } | 165 | } |
| 164 | tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); | 166 | tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); |
| 165 | if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { | 167 | if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) { |
| 166 | filp->f_pos = old_pos; | ||
| 167 | if (tempname != de->name) kfree(tempname); | 168 | if (tempname != de->name) kfree(tempname); |
| 168 | hpfs_brelse4(&qbh); | 169 | hpfs_brelse4(&qbh); |
| 169 | goto out; | 170 | goto out; |
| 170 | } | 171 | } |
| 172 | ctx->pos = next_pos; | ||
| 171 | if (tempname != de->name) kfree(tempname); | 173 | if (tempname != de->name) kfree(tempname); |
| 172 | hpfs_brelse4(&qbh); | 174 | hpfs_brelse4(&qbh); |
| 173 | } | 175 | } |
| @@ -322,7 +324,7 @@ const struct file_operations hpfs_dir_ops = | |||
| 322 | { | 324 | { |
| 323 | .llseek = hpfs_dir_lseek, | 325 | .llseek = hpfs_dir_lseek, |
| 324 | .read = generic_read_dir, | 326 | .read = generic_read_dir, |
| 325 | .readdir = hpfs_readdir, | 327 | .iterate = hpfs_readdir, |
| 326 | .release = hpfs_dir_release, | 328 | .release = hpfs_dir_release, |
| 327 | .fsync = hpfs_file_fsync, | 329 | .fsync = hpfs_file_fsync, |
| 328 | }; | 330 | }; |
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index cd3e38972c86..fc90ab11c340 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c | |||
| @@ -542,8 +542,8 @@ static const struct file_operations hppfs_file_fops = { | |||
| 542 | }; | 542 | }; |
| 543 | 543 | ||
| 544 | struct hppfs_dirent { | 544 | struct hppfs_dirent { |
| 545 | void *vfs_dirent; | 545 | struct dir_context ctx; |
| 546 | filldir_t filldir; | 546 | struct dir_context *caller; |
| 547 | struct dentry *dentry; | 547 | struct dentry *dentry; |
| 548 | }; | 548 | }; |
| 549 | 549 | ||
| @@ -555,34 +555,29 @@ static int hppfs_filldir(void *d, const char *name, int size, | |||
| 555 | if (file_removed(dirent->dentry, name)) | 555 | if (file_removed(dirent->dentry, name)) |
| 556 | return 0; | 556 | return 0; |
| 557 | 557 | ||
| 558 | return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, | 558 | dirent->caller->pos = dirent->ctx.pos; |
| 559 | inode, type); | 559 | return !dir_emit(dirent->caller, name, size, inode, type); |
| 560 | } | 560 | } |
| 561 | 561 | ||
| 562 | static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) | 562 | static int hppfs_readdir(struct file *file, struct dir_context *ctx) |
| 563 | { | 563 | { |
| 564 | struct hppfs_private *data = file->private_data; | 564 | struct hppfs_private *data = file->private_data; |
| 565 | struct file *proc_file = data->proc_file; | 565 | struct file *proc_file = data->proc_file; |
| 566 | int (*readdir)(struct file *, void *, filldir_t); | 566 | struct hppfs_dirent d = { |
| 567 | struct hppfs_dirent dirent = ((struct hppfs_dirent) | 567 | .ctx.actor = hppfs_filldir, |
| 568 | { .vfs_dirent = ent, | 568 | .caller = ctx, |
| 569 | .filldir = filldir, | 569 | .dentry = file->f_path.dentry |
| 570 | .dentry = file->f_path.dentry | 570 | }; |
| 571 | }); | ||
| 572 | int err; | 571 | int err; |
| 573 | 572 | proc_file->f_pos = ctx->pos; | |
| 574 | readdir = file_inode(proc_file)->i_fop->readdir; | 573 | err = iterate_dir(proc_file, &d.ctx); |
| 575 | 574 | ctx->pos = d.ctx.pos; | |
| 576 | proc_file->f_pos = file->f_pos; | ||
| 577 | err = (*readdir)(proc_file, &dirent, hppfs_filldir); | ||
| 578 | file->f_pos = proc_file->f_pos; | ||
| 579 | |||
| 580 | return err; | 575 | return err; |
| 581 | } | 576 | } |
| 582 | 577 | ||
| 583 | static const struct file_operations hppfs_dir_fops = { | 578 | static const struct file_operations hppfs_dir_fops = { |
| 584 | .owner = NULL, | 579 | .owner = NULL, |
| 585 | .readdir = hppfs_readdir, | 580 | .iterate = hppfs_readdir, |
| 586 | .open = hppfs_dir_open, | 581 | .open = hppfs_dir_open, |
| 587 | .llseek = default_llseek, | 582 | .llseek = default_llseek, |
| 588 | .release = hppfs_release, | 583 | .release = hppfs_release, |
diff --git a/fs/internal.h b/fs/internal.h index eaa75f75b625..68121584ae37 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
| @@ -132,6 +132,12 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); | |||
| 132 | extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); | 132 | extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); |
| 133 | 133 | ||
| 134 | /* | 134 | /* |
| 135 | * splice.c | ||
| 136 | */ | ||
| 137 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | ||
| 138 | loff_t *opos, size_t len, unsigned int flags); | ||
| 139 | |||
| 140 | /* | ||
| 135 | * pipe.c | 141 | * pipe.c |
| 136 | */ | 142 | */ |
| 137 | extern const struct file_operations pipefifo_fops; | 143 | extern const struct file_operations pipefifo_fops; |
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index a7d5c3c3d4e6..b943cbd963bb 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c | |||
| @@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de, | |||
| 78 | /* | 78 | /* |
| 79 | * This should _really_ be cleaned up some day.. | 79 | * This should _really_ be cleaned up some day.. |
| 80 | */ | 80 | */ |
| 81 | static int do_isofs_readdir(struct inode *inode, struct file *filp, | 81 | static int do_isofs_readdir(struct inode *inode, struct file *file, |
| 82 | void *dirent, filldir_t filldir, | 82 | struct dir_context *ctx, |
| 83 | char *tmpname, struct iso_directory_record *tmpde) | 83 | char *tmpname, struct iso_directory_record *tmpde) |
| 84 | { | 84 | { |
| 85 | unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); | 85 | unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); |
| @@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 94 | struct iso_directory_record *de; | 94 | struct iso_directory_record *de; |
| 95 | struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); | 95 | struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); |
| 96 | 96 | ||
| 97 | offset = filp->f_pos & (bufsize - 1); | 97 | offset = ctx->pos & (bufsize - 1); |
| 98 | block = filp->f_pos >> bufbits; | 98 | block = ctx->pos >> bufbits; |
| 99 | 99 | ||
| 100 | while (filp->f_pos < inode->i_size) { | 100 | while (ctx->pos < inode->i_size) { |
| 101 | int de_len; | 101 | int de_len; |
| 102 | 102 | ||
| 103 | if (!bh) { | 103 | if (!bh) { |
| @@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 108 | 108 | ||
| 109 | de = (struct iso_directory_record *) (bh->b_data + offset); | 109 | de = (struct iso_directory_record *) (bh->b_data + offset); |
| 110 | 110 | ||
| 111 | de_len = *(unsigned char *) de; | 111 | de_len = *(unsigned char *)de; |
| 112 | 112 | ||
| 113 | /* | 113 | /* |
| 114 | * If the length byte is zero, we should move on to the next | 114 | * If the length byte is zero, we should move on to the next |
| @@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 119 | if (de_len == 0) { | 119 | if (de_len == 0) { |
| 120 | brelse(bh); | 120 | brelse(bh); |
| 121 | bh = NULL; | 121 | bh = NULL; |
| 122 | filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); | 122 | ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); |
| 123 | block = filp->f_pos >> bufbits; | 123 | block = ctx->pos >> bufbits; |
| 124 | offset = 0; | 124 | offset = 0; |
| 125 | continue; | 125 | continue; |
| 126 | } | 126 | } |
| @@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 164 | 164 | ||
| 165 | if (de->flags[-sbi->s_high_sierra] & 0x80) { | 165 | if (de->flags[-sbi->s_high_sierra] & 0x80) { |
| 166 | first_de = 0; | 166 | first_de = 0; |
| 167 | filp->f_pos += de_len; | 167 | ctx->pos += de_len; |
| 168 | continue; | 168 | continue; |
| 169 | } | 169 | } |
| 170 | first_de = 1; | 170 | first_de = 1; |
| 171 | 171 | ||
| 172 | /* Handle the case of the '.' directory */ | 172 | /* Handle the case of the '.' directory */ |
| 173 | if (de->name_len[0] == 1 && de->name[0] == 0) { | 173 | if (de->name_len[0] == 1 && de->name[0] == 0) { |
| 174 | if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) | 174 | if (!dir_emit_dot(file, ctx)) |
| 175 | break; | 175 | break; |
| 176 | filp->f_pos += de_len; | 176 | ctx->pos += de_len; |
| 177 | continue; | 177 | continue; |
| 178 | } | 178 | } |
| 179 | 179 | ||
| @@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 181 | 181 | ||
| 182 | /* Handle the case of the '..' directory */ | 182 | /* Handle the case of the '..' directory */ |
| 183 | if (de->name_len[0] == 1 && de->name[0] == 1) { | 183 | if (de->name_len[0] == 1 && de->name[0] == 1) { |
| 184 | inode_number = parent_ino(filp->f_path.dentry); | 184 | if (!dir_emit_dotdot(file, ctx)) |
| 185 | if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) | ||
| 186 | break; | 185 | break; |
| 187 | filp->f_pos += de_len; | 186 | ctx->pos += de_len; |
| 188 | continue; | 187 | continue; |
| 189 | } | 188 | } |
| 190 | 189 | ||
| @@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 198 | if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || | 197 | if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || |
| 199 | (!sbi->s_showassoc && | 198 | (!sbi->s_showassoc && |
| 200 | (de->flags[-sbi->s_high_sierra] & 4))) { | 199 | (de->flags[-sbi->s_high_sierra] & 4))) { |
| 201 | filp->f_pos += de_len; | 200 | ctx->pos += de_len; |
| 202 | continue; | 201 | continue; |
| 203 | } | 202 | } |
| 204 | 203 | ||
| @@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 230 | } | 229 | } |
| 231 | } | 230 | } |
| 232 | if (len > 0) { | 231 | if (len > 0) { |
| 233 | if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) | 232 | if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN)) |
| 234 | break; | 233 | break; |
| 235 | } | 234 | } |
| 236 | filp->f_pos += de_len; | 235 | ctx->pos += de_len; |
| 237 | 236 | ||
| 238 | continue; | 237 | continue; |
| 239 | } | 238 | } |
| @@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, | |||
| 247 | * handling split directory entries.. The real work is done by | 246 | * handling split directory entries.. The real work is done by |
| 248 | * "do_isofs_readdir()". | 247 | * "do_isofs_readdir()". |
| 249 | */ | 248 | */ |
| 250 | static int isofs_readdir(struct file *filp, | 249 | static int isofs_readdir(struct file *file, struct dir_context *ctx) |
| 251 | void *dirent, filldir_t filldir) | ||
| 252 | { | 250 | { |
| 253 | int result; | 251 | int result; |
| 254 | char *tmpname; | 252 | char *tmpname; |
| 255 | struct iso_directory_record *tmpde; | 253 | struct iso_directory_record *tmpde; |
| 256 | struct inode *inode = file_inode(filp); | 254 | struct inode *inode = file_inode(file); |
| 257 | 255 | ||
| 258 | tmpname = (char *)__get_free_page(GFP_KERNEL); | 256 | tmpname = (char *)__get_free_page(GFP_KERNEL); |
| 259 | if (tmpname == NULL) | 257 | if (tmpname == NULL) |
| @@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp, | |||
| 261 | 259 | ||
| 262 | tmpde = (struct iso_directory_record *) (tmpname+1024); | 260 | tmpde = (struct iso_directory_record *) (tmpname+1024); |
| 263 | 261 | ||
| 264 | result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); | 262 | result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde); |
| 265 | 263 | ||
| 266 | free_page((unsigned long) tmpname); | 264 | free_page((unsigned long) tmpname); |
| 267 | return result; | 265 | return result; |
| @@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations = | |||
| 271 | { | 269 | { |
| 272 | .llseek = generic_file_llseek, | 270 | .llseek = generic_file_llseek, |
| 273 | .read = generic_read_dir, | 271 | .read = generic_read_dir, |
| 274 | .readdir = isofs_readdir, | 272 | .iterate = isofs_readdir, |
| 275 | }; | 273 | }; |
| 276 | 274 | ||
| 277 | /* | 275 | /* |
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e3e255c0a509..be0c39b66fe0 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
| @@ -2019,16 +2019,20 @@ zap_buffer_unlocked: | |||
| 2019 | * void journal_invalidatepage() - invalidate a journal page | 2019 | * void journal_invalidatepage() - invalidate a journal page |
| 2020 | * @journal: journal to use for flush | 2020 | * @journal: journal to use for flush |
| 2021 | * @page: page to flush | 2021 | * @page: page to flush |
| 2022 | * @offset: length of page to invalidate. | 2022 | * @offset: offset of the range to invalidate |
| 2023 | * @length: length of the range to invalidate | ||
| 2023 | * | 2024 | * |
| 2024 | * Reap page buffers containing data after offset in page. | 2025 | * Reap page buffers containing data in specified range in page. |
| 2025 | */ | 2026 | */ |
| 2026 | void journal_invalidatepage(journal_t *journal, | 2027 | void journal_invalidatepage(journal_t *journal, |
| 2027 | struct page *page, | 2028 | struct page *page, |
| 2028 | unsigned long offset) | 2029 | unsigned int offset, |
| 2030 | unsigned int length) | ||
| 2029 | { | 2031 | { |
| 2030 | struct buffer_head *head, *bh, *next; | 2032 | struct buffer_head *head, *bh, *next; |
| 2033 | unsigned int stop = offset + length; | ||
| 2031 | unsigned int curr_off = 0; | 2034 | unsigned int curr_off = 0; |
| 2035 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
| 2032 | int may_free = 1; | 2036 | int may_free = 1; |
| 2033 | 2037 | ||
| 2034 | if (!PageLocked(page)) | 2038 | if (!PageLocked(page)) |
| @@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, | |||
| 2036 | if (!page_has_buffers(page)) | 2040 | if (!page_has_buffers(page)) |
| 2037 | return; | 2041 | return; |
| 2038 | 2042 | ||
| 2043 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
| 2044 | |||
| 2039 | /* We will potentially be playing with lists other than just the | 2045 | /* We will potentially be playing with lists other than just the |
| 2040 | * data lists (especially for journaled data mode), so be | 2046 | * data lists (especially for journaled data mode), so be |
| 2041 | * cautious in our locking. */ | 2047 | * cautious in our locking. */ |
| @@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal, | |||
| 2045 | unsigned int next_off = curr_off + bh->b_size; | 2051 | unsigned int next_off = curr_off + bh->b_size; |
| 2046 | next = bh->b_this_page; | 2052 | next = bh->b_this_page; |
| 2047 | 2053 | ||
| 2054 | if (next_off > stop) | ||
| 2055 | return; | ||
| 2056 | |||
| 2048 | if (offset <= curr_off) { | 2057 | if (offset <= curr_off) { |
| 2049 | /* This block is wholly outside the truncation point */ | 2058 | /* This block is wholly outside the truncation point */ |
| 2050 | lock_buffer(bh); | 2059 | lock_buffer(bh); |
| 2051 | may_free &= journal_unmap_buffer(journal, bh, | 2060 | may_free &= journal_unmap_buffer(journal, bh, |
| 2052 | offset > 0); | 2061 | partial_page); |
| 2053 | unlock_buffer(bh); | 2062 | unlock_buffer(bh); |
| 2054 | } | 2063 | } |
| 2055 | curr_off = next_off; | 2064 | curr_off = next_off; |
| @@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, | |||
| 2057 | 2066 | ||
| 2058 | } while (bh != head); | 2067 | } while (bh != head); |
| 2059 | 2068 | ||
| 2060 | if (!offset) { | 2069 | if (!partial_page) { |
| 2061 | if (may_free && try_to_free_buffers(page)) | 2070 | if (may_free && try_to_free_buffers(page)) |
| 2062 | J_ASSERT(!page_has_buffers(page)); | 2071 | J_ASSERT(!page_has_buffers(page)); |
| 2063 | } | 2072 | } |
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 69a48c2944da..5a9f5534d57b 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig | |||
| @@ -20,7 +20,7 @@ config JBD2 | |||
| 20 | 20 | ||
| 21 | config JBD2_DEBUG | 21 | config JBD2_DEBUG |
| 22 | bool "JBD2 (ext4) debugging support" | 22 | bool "JBD2 (ext4) debugging support" |
| 23 | depends on JBD2 && DEBUG_FS | 23 | depends on JBD2 |
| 24 | help | 24 | help |
| 25 | If you are using the ext4 journaled file system (or | 25 | If you are using the ext4 journaled file system (or |
| 26 | potentially any other filesystem/device using JBD2), this option | 26 | potentially any other filesystem/device using JBD2), this option |
| @@ -29,7 +29,7 @@ config JBD2_DEBUG | |||
| 29 | By default, the debugging output will be turned off. | 29 | By default, the debugging output will be turned off. |
| 30 | 30 | ||
| 31 | If you select Y here, then you will be able to turn on debugging | 31 | If you select Y here, then you will be able to turn on debugging |
| 32 | with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a | 32 | with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a |
| 33 | number between 1 and 5. The higher the number, the more debugging | 33 | number between 1 and 5. The higher the number, the more debugging |
| 34 | output is generated. To turn debugging off again, do | 34 | output is generated. To turn debugging off again, do |
| 35 | "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". | 35 | "echo 0 > /sys/module/jbd2/parameters/jbd2_debug". |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c78841ee81cf..7f34f4716165 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
| @@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 120 | int nblocks, space_left; | 120 | int nblocks, space_left; |
| 121 | /* assert_spin_locked(&journal->j_state_lock); */ | 121 | /* assert_spin_locked(&journal->j_state_lock); */ |
| 122 | 122 | ||
| 123 | nblocks = jbd_space_needed(journal); | 123 | nblocks = jbd2_space_needed(journal); |
| 124 | while (__jbd2_log_space_left(journal) < nblocks) { | 124 | while (jbd2_log_space_left(journal) < nblocks) { |
| 125 | if (journal->j_flags & JBD2_ABORT) | 125 | if (journal->j_flags & JBD2_ABORT) |
| 126 | return; | 126 | return; |
| 127 | write_unlock(&journal->j_state_lock); | 127 | write_unlock(&journal->j_state_lock); |
| @@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 140 | */ | 140 | */ |
| 141 | write_lock(&journal->j_state_lock); | 141 | write_lock(&journal->j_state_lock); |
| 142 | spin_lock(&journal->j_list_lock); | 142 | spin_lock(&journal->j_list_lock); |
| 143 | nblocks = jbd_space_needed(journal); | 143 | nblocks = jbd2_space_needed(journal); |
| 144 | space_left = __jbd2_log_space_left(journal); | 144 | space_left = jbd2_log_space_left(journal); |
| 145 | if (space_left < nblocks) { | 145 | if (space_left < nblocks) { |
| 146 | int chkpt = journal->j_checkpoint_transactions != NULL; | 146 | int chkpt = journal->j_checkpoint_transactions != NULL; |
| 147 | tid_t tid = 0; | 147 | tid_t tid = 0; |
| @@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 156 | /* We were able to recover space; yay! */ | 156 | /* We were able to recover space; yay! */ |
| 157 | ; | 157 | ; |
| 158 | } else if (tid) { | 158 | } else if (tid) { |
| 159 | /* | ||
| 160 | * jbd2_journal_commit_transaction() may want | ||
| 161 | * to take the checkpoint_mutex if JBD2_FLUSHED | ||
| 162 | * is set. So we need to temporarily drop it. | ||
| 163 | */ | ||
| 164 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
| 159 | jbd2_log_wait_commit(journal, tid); | 165 | jbd2_log_wait_commit(journal, tid); |
| 166 | write_lock(&journal->j_state_lock); | ||
| 167 | continue; | ||
| 160 | } else { | 168 | } else { |
| 161 | printk(KERN_ERR "%s: needed %d blocks and " | 169 | printk(KERN_ERR "%s: needed %d blocks and " |
| 162 | "only had %d space available\n", | 170 | "only had %d space available\n", |
| @@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | |||
| 625 | 633 | ||
| 626 | __jbd2_journal_drop_transaction(journal, transaction); | 634 | __jbd2_journal_drop_transaction(journal, transaction); |
| 627 | jbd2_journal_free_transaction(transaction); | 635 | jbd2_journal_free_transaction(transaction); |
| 628 | |||
| 629 | /* Just in case anybody was waiting for more transactions to be | ||
| 630 | checkpointed... */ | ||
| 631 | wake_up(&journal->j_wait_logspace); | ||
| 632 | ret = 1; | 636 | ret = 1; |
| 633 | out: | 637 | out: |
| 634 | return ret; | 638 | return ret; |
| @@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
| 690 | J_ASSERT(transaction->t_state == T_FINISHED); | 694 | J_ASSERT(transaction->t_state == T_FINISHED); |
| 691 | J_ASSERT(transaction->t_buffers == NULL); | 695 | J_ASSERT(transaction->t_buffers == NULL); |
| 692 | J_ASSERT(transaction->t_forget == NULL); | 696 | J_ASSERT(transaction->t_forget == NULL); |
| 693 | J_ASSERT(transaction->t_iobuf_list == NULL); | ||
| 694 | J_ASSERT(transaction->t_shadow_list == NULL); | 697 | J_ASSERT(transaction->t_shadow_list == NULL); |
| 695 | J_ASSERT(transaction->t_log_list == NULL); | ||
| 696 | J_ASSERT(transaction->t_checkpoint_list == NULL); | 698 | J_ASSERT(transaction->t_checkpoint_list == NULL); |
| 697 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | 699 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); |
| 698 | J_ASSERT(atomic_read(&transaction->t_updates) == 0); | 700 | J_ASSERT(atomic_read(&transaction->t_updates) == 0); |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0f53946f13c1..559bec1a37b4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -30,15 +30,22 @@ | |||
| 30 | #include <trace/events/jbd2.h> | 30 | #include <trace/events/jbd2.h> |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * Default IO end handler for temporary BJ_IO buffer_heads. | 33 | * IO end handler for temporary buffer_heads handling writes to the journal. |
| 34 | */ | 34 | */ |
| 35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
| 36 | { | 36 | { |
| 37 | struct buffer_head *orig_bh = bh->b_private; | ||
| 38 | |||
| 37 | BUFFER_TRACE(bh, ""); | 39 | BUFFER_TRACE(bh, ""); |
| 38 | if (uptodate) | 40 | if (uptodate) |
| 39 | set_buffer_uptodate(bh); | 41 | set_buffer_uptodate(bh); |
| 40 | else | 42 | else |
| 41 | clear_buffer_uptodate(bh); | 43 | clear_buffer_uptodate(bh); |
| 44 | if (orig_bh) { | ||
| 45 | clear_bit_unlock(BH_Shadow, &orig_bh->b_state); | ||
| 46 | smp_mb__after_clear_bit(); | ||
| 47 | wake_up_bit(&orig_bh->b_state, BH_Shadow); | ||
| 48 | } | ||
| 42 | unlock_buffer(bh); | 49 | unlock_buffer(bh); |
| 43 | } | 50 | } |
| 44 | 51 | ||
| @@ -85,8 +92,7 @@ nope: | |||
| 85 | __brelse(bh); | 92 | __brelse(bh); |
| 86 | } | 93 | } |
| 87 | 94 | ||
| 88 | static void jbd2_commit_block_csum_set(journal_t *j, | 95 | static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) |
| 89 | struct journal_head *descriptor) | ||
| 90 | { | 96 | { |
| 91 | struct commit_header *h; | 97 | struct commit_header *h; |
| 92 | __u32 csum; | 98 | __u32 csum; |
| @@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j, | |||
| 94 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 100 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 95 | return; | 101 | return; |
| 96 | 102 | ||
| 97 | h = (struct commit_header *)(jh2bh(descriptor)->b_data); | 103 | h = (struct commit_header *)(bh->b_data); |
| 98 | h->h_chksum_type = 0; | 104 | h->h_chksum_type = 0; |
| 99 | h->h_chksum_size = 0; | 105 | h->h_chksum_size = 0; |
| 100 | h->h_chksum[0] = 0; | 106 | h->h_chksum[0] = 0; |
| 101 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 107 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
| 102 | j->j_blocksize); | ||
| 103 | h->h_chksum[0] = cpu_to_be32(csum); | 108 | h->h_chksum[0] = cpu_to_be32(csum); |
| 104 | } | 109 | } |
| 105 | 110 | ||
| @@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal, | |||
| 116 | struct buffer_head **cbh, | 121 | struct buffer_head **cbh, |
| 117 | __u32 crc32_sum) | 122 | __u32 crc32_sum) |
| 118 | { | 123 | { |
| 119 | struct journal_head *descriptor; | ||
| 120 | struct commit_header *tmp; | 124 | struct commit_header *tmp; |
| 121 | struct buffer_head *bh; | 125 | struct buffer_head *bh; |
| 122 | int ret; | 126 | int ret; |
| @@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal, | |||
| 127 | if (is_journal_aborted(journal)) | 131 | if (is_journal_aborted(journal)) |
| 128 | return 0; | 132 | return 0; |
| 129 | 133 | ||
| 130 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 134 | bh = jbd2_journal_get_descriptor_buffer(journal); |
| 131 | if (!descriptor) | 135 | if (!bh) |
| 132 | return 1; | 136 | return 1; |
| 133 | 137 | ||
| 134 | bh = jh2bh(descriptor); | ||
| 135 | |||
| 136 | tmp = (struct commit_header *)bh->b_data; | 138 | tmp = (struct commit_header *)bh->b_data; |
| 137 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 139 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
| 138 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 140 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
| @@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal, | |||
| 146 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | 148 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; |
| 147 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | 149 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); |
| 148 | } | 150 | } |
| 149 | jbd2_commit_block_csum_set(journal, descriptor); | 151 | jbd2_commit_block_csum_set(journal, bh); |
| 150 | 152 | ||
| 151 | JBUFFER_TRACE(descriptor, "submit commit block"); | 153 | BUFFER_TRACE(bh, "submit commit block"); |
| 152 | lock_buffer(bh); | 154 | lock_buffer(bh); |
| 153 | clear_buffer_dirty(bh); | 155 | clear_buffer_dirty(bh); |
| 154 | set_buffer_uptodate(bh); | 156 | set_buffer_uptodate(bh); |
| @@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal, | |||
| 180 | if (unlikely(!buffer_uptodate(bh))) | 182 | if (unlikely(!buffer_uptodate(bh))) |
| 181 | ret = -EIO; | 183 | ret = -EIO; |
| 182 | put_bh(bh); /* One for getblk() */ | 184 | put_bh(bh); /* One for getblk() */ |
| 183 | jbd2_journal_put_journal_head(bh2jh(bh)); | ||
| 184 | 185 | ||
| 185 | return ret; | 186 | return ret; |
| 186 | } | 187 | } |
| @@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | |||
| 321 | } | 322 | } |
| 322 | 323 | ||
| 323 | static void jbd2_descr_block_csum_set(journal_t *j, | 324 | static void jbd2_descr_block_csum_set(journal_t *j, |
| 324 | struct journal_head *descriptor) | 325 | struct buffer_head *bh) |
| 325 | { | 326 | { |
| 326 | struct jbd2_journal_block_tail *tail; | 327 | struct jbd2_journal_block_tail *tail; |
| 327 | __u32 csum; | 328 | __u32 csum; |
| @@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j, | |||
| 329 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 330 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 330 | return; | 331 | return; |
| 331 | 332 | ||
| 332 | tail = (struct jbd2_journal_block_tail *) | 333 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - |
| 333 | (jh2bh(descriptor)->b_data + j->j_blocksize - | ||
| 334 | sizeof(struct jbd2_journal_block_tail)); | 334 | sizeof(struct jbd2_journal_block_tail)); |
| 335 | tail->t_checksum = 0; | 335 | tail->t_checksum = 0; |
| 336 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 336 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
| 337 | j->j_blocksize); | ||
| 338 | tail->t_checksum = cpu_to_be32(csum); | 337 | tail->t_checksum = cpu_to_be32(csum); |
| 339 | } | 338 | } |
| 340 | 339 | ||
| @@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, | |||
| 343 | { | 342 | { |
| 344 | struct page *page = bh->b_page; | 343 | struct page *page = bh->b_page; |
| 345 | __u8 *addr; | 344 | __u8 *addr; |
| 346 | __u32 csum; | 345 | __u32 csum32; |
| 347 | 346 | ||
| 348 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 347 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 349 | return; | 348 | return; |
| 350 | 349 | ||
| 351 | sequence = cpu_to_be32(sequence); | 350 | sequence = cpu_to_be32(sequence); |
| 352 | addr = kmap_atomic(page); | 351 | addr = kmap_atomic(page); |
| 353 | csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, | 352 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, |
| 354 | sizeof(sequence)); | 353 | sizeof(sequence)); |
| 355 | csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), | 354 | csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), |
| 356 | bh->b_size); | 355 | bh->b_size); |
| 357 | kunmap_atomic(addr); | 356 | kunmap_atomic(addr); |
| 358 | 357 | ||
| 359 | tag->t_checksum = cpu_to_be32(csum); | 358 | /* We only have space to store the lower 16 bits of the crc32c. */ |
| 359 | tag->t_checksum = cpu_to_be16(csum32); | ||
| 360 | } | 360 | } |
| 361 | /* | 361 | /* |
| 362 | * jbd2_journal_commit_transaction | 362 | * jbd2_journal_commit_transaction |
| @@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 368 | { | 368 | { |
| 369 | struct transaction_stats_s stats; | 369 | struct transaction_stats_s stats; |
| 370 | transaction_t *commit_transaction; | 370 | transaction_t *commit_transaction; |
| 371 | struct journal_head *jh, *new_jh, *descriptor; | 371 | struct journal_head *jh; |
| 372 | struct buffer_head *descriptor; | ||
| 372 | struct buffer_head **wbuf = journal->j_wbuf; | 373 | struct buffer_head **wbuf = journal->j_wbuf; |
| 373 | int bufs; | 374 | int bufs; |
| 374 | int flags; | 375 | int flags; |
| @@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 392 | tid_t first_tid; | 393 | tid_t first_tid; |
| 393 | int update_tail; | 394 | int update_tail; |
| 394 | int csum_size = 0; | 395 | int csum_size = 0; |
| 396 | LIST_HEAD(io_bufs); | ||
| 397 | LIST_HEAD(log_bufs); | ||
| 395 | 398 | ||
| 396 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 399 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 397 | csum_size = sizeof(struct jbd2_journal_block_tail); | 400 | csum_size = sizeof(struct jbd2_journal_block_tail); |
| @@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 424 | J_ASSERT(journal->j_committing_transaction == NULL); | 427 | J_ASSERT(journal->j_committing_transaction == NULL); |
| 425 | 428 | ||
| 426 | commit_transaction = journal->j_running_transaction; | 429 | commit_transaction = journal->j_running_transaction; |
| 427 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
| 428 | 430 | ||
| 429 | trace_jbd2_start_commit(journal, commit_transaction); | 431 | trace_jbd2_start_commit(journal, commit_transaction); |
| 430 | jbd_debug(1, "JBD2: starting commit of transaction %d\n", | 432 | jbd_debug(1, "JBD2: starting commit of transaction %d\n", |
| 431 | commit_transaction->t_tid); | 433 | commit_transaction->t_tid); |
| 432 | 434 | ||
| 433 | write_lock(&journal->j_state_lock); | 435 | write_lock(&journal->j_state_lock); |
| 436 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | ||
| 434 | commit_transaction->t_state = T_LOCKED; | 437 | commit_transaction->t_state = T_LOCKED; |
| 435 | 438 | ||
| 436 | trace_jbd2_commit_locking(journal, commit_transaction); | 439 | trace_jbd2_commit_locking(journal, commit_transaction); |
| @@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 520 | */ | 523 | */ |
| 521 | jbd2_journal_switch_revoke_table(journal); | 524 | jbd2_journal_switch_revoke_table(journal); |
| 522 | 525 | ||
| 526 | /* | ||
| 527 | * Reserved credits cannot be claimed anymore, free them | ||
| 528 | */ | ||
| 529 | atomic_sub(atomic_read(&journal->j_reserved_credits), | ||
| 530 | &commit_transaction->t_outstanding_credits); | ||
| 531 | |||
| 523 | trace_jbd2_commit_flushing(journal, commit_transaction); | 532 | trace_jbd2_commit_flushing(journal, commit_transaction); |
| 524 | stats.run.rs_flushing = jiffies; | 533 | stats.run.rs_flushing = jiffies; |
| 525 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, | 534 | stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, |
| @@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 533 | wake_up(&journal->j_wait_transaction_locked); | 542 | wake_up(&journal->j_wait_transaction_locked); |
| 534 | write_unlock(&journal->j_state_lock); | 543 | write_unlock(&journal->j_state_lock); |
| 535 | 544 | ||
| 536 | jbd_debug(3, "JBD2: commit phase 2\n"); | 545 | jbd_debug(3, "JBD2: commit phase 2a\n"); |
| 537 | 546 | ||
| 538 | /* | 547 | /* |
| 539 | * Now start flushing things to disk, in the order they appear | 548 | * Now start flushing things to disk, in the order they appear |
| @@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 545 | 554 | ||
| 546 | blk_start_plug(&plug); | 555 | blk_start_plug(&plug); |
| 547 | jbd2_journal_write_revoke_records(journal, commit_transaction, | 556 | jbd2_journal_write_revoke_records(journal, commit_transaction, |
| 548 | WRITE_SYNC); | 557 | &log_bufs, WRITE_SYNC); |
| 549 | blk_finish_plug(&plug); | 558 | blk_finish_plug(&plug); |
| 550 | 559 | ||
| 551 | jbd_debug(3, "JBD2: commit phase 2\n"); | 560 | jbd_debug(3, "JBD2: commit phase 2b\n"); |
| 552 | 561 | ||
| 553 | /* | 562 | /* |
| 554 | * Way to go: we have now written out all of the data for a | 563 | * Way to go: we have now written out all of the data for a |
| @@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 571 | atomic_read(&commit_transaction->t_outstanding_credits)); | 580 | atomic_read(&commit_transaction->t_outstanding_credits)); |
| 572 | 581 | ||
| 573 | err = 0; | 582 | err = 0; |
| 574 | descriptor = NULL; | ||
| 575 | bufs = 0; | 583 | bufs = 0; |
| 584 | descriptor = NULL; | ||
| 576 | blk_start_plug(&plug); | 585 | blk_start_plug(&plug); |
| 577 | while (commit_transaction->t_buffers) { | 586 | while (commit_transaction->t_buffers) { |
| 578 | 587 | ||
| @@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 604 | record the metadata buffer. */ | 613 | record the metadata buffer. */ |
| 605 | 614 | ||
| 606 | if (!descriptor) { | 615 | if (!descriptor) { |
| 607 | struct buffer_head *bh; | ||
| 608 | |||
| 609 | J_ASSERT (bufs == 0); | 616 | J_ASSERT (bufs == 0); |
| 610 | 617 | ||
| 611 | jbd_debug(4, "JBD2: get descriptor\n"); | 618 | jbd_debug(4, "JBD2: get descriptor\n"); |
| @@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 616 | continue; | 623 | continue; |
| 617 | } | 624 | } |
| 618 | 625 | ||
| 619 | bh = jh2bh(descriptor); | ||
| 620 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", | 626 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", |
| 621 | (unsigned long long)bh->b_blocknr, bh->b_data); | 627 | (unsigned long long)descriptor->b_blocknr, |
| 622 | header = (journal_header_t *)&bh->b_data[0]; | 628 | descriptor->b_data); |
| 629 | header = (journal_header_t *)descriptor->b_data; | ||
| 623 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 630 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
| 624 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); | 631 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); |
| 625 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 632 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
| 626 | 633 | ||
| 627 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 634 | tagp = &descriptor->b_data[sizeof(journal_header_t)]; |
| 628 | space_left = bh->b_size - sizeof(journal_header_t); | 635 | space_left = descriptor->b_size - |
| 636 | sizeof(journal_header_t); | ||
| 629 | first_tag = 1; | 637 | first_tag = 1; |
| 630 | set_buffer_jwrite(bh); | 638 | set_buffer_jwrite(descriptor); |
| 631 | set_buffer_dirty(bh); | 639 | set_buffer_dirty(descriptor); |
| 632 | wbuf[bufs++] = bh; | 640 | wbuf[bufs++] = descriptor; |
| 633 | 641 | ||
| 634 | /* Record it so that we can wait for IO | 642 | /* Record it so that we can wait for IO |
| 635 | completion later */ | 643 | completion later */ |
| 636 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 644 | BUFFER_TRACE(descriptor, "ph3: file as descriptor"); |
| 637 | jbd2_journal_file_buffer(descriptor, commit_transaction, | 645 | jbd2_file_log_bh(&log_bufs, descriptor); |
| 638 | BJ_LogCtl); | ||
| 639 | } | 646 | } |
| 640 | 647 | ||
| 641 | /* Where is the buffer to be written? */ | 648 | /* Where is the buffer to be written? */ |
| @@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 658 | 665 | ||
| 659 | /* Bump b_count to prevent truncate from stumbling over | 666 | /* Bump b_count to prevent truncate from stumbling over |
| 660 | the shadowed buffer! @@@ This can go if we ever get | 667 | the shadowed buffer! @@@ This can go if we ever get |
| 661 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 668 | rid of the shadow pairing of buffers. */ |
| 662 | atomic_inc(&jh2bh(jh)->b_count); | 669 | atomic_inc(&jh2bh(jh)->b_count); |
| 663 | 670 | ||
| 664 | /* Make a temporary IO buffer with which to write it out | ||
| 665 | (this will requeue both the metadata buffer and the | ||
| 666 | temporary IO buffer). new_bh goes on BJ_IO*/ | ||
| 667 | |||
| 668 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
| 669 | /* | 671 | /* |
| 670 | * akpm: jbd2_journal_write_metadata_buffer() sets | 672 | * Make a temporary IO buffer with which to write it out |
| 671 | * new_bh->b_transaction to commit_transaction. | 673 | * (this will requeue the metadata buffer to BJ_Shadow). |
| 672 | * We need to clean this up before we release new_bh | ||
| 673 | * (which is of type BJ_IO) | ||
| 674 | */ | 674 | */ |
| 675 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | ||
| 675 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 676 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
| 676 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, | 677 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, |
| 677 | jh, &new_jh, blocknr); | 678 | jh, &wbuf[bufs], blocknr); |
| 678 | if (flags < 0) { | 679 | if (flags < 0) { |
| 679 | jbd2_journal_abort(journal, flags); | 680 | jbd2_journal_abort(journal, flags); |
| 680 | continue; | 681 | continue; |
| 681 | } | 682 | } |
| 682 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 683 | jbd2_file_log_bh(&io_bufs, wbuf[bufs]); |
| 683 | wbuf[bufs++] = jh2bh(new_jh); | ||
| 684 | 684 | ||
| 685 | /* Record the new block's tag in the current descriptor | 685 | /* Record the new block's tag in the current descriptor |
| 686 | buffer */ | 686 | buffer */ |
| @@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 694 | tag = (journal_block_tag_t *) tagp; | 694 | tag = (journal_block_tag_t *) tagp; |
| 695 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); | 695 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); |
| 696 | tag->t_flags = cpu_to_be16(tag_flag); | 696 | tag->t_flags = cpu_to_be16(tag_flag); |
| 697 | jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), | 697 | jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], |
| 698 | commit_transaction->t_tid); | 698 | commit_transaction->t_tid); |
| 699 | tagp += tag_bytes; | 699 | tagp += tag_bytes; |
| 700 | space_left -= tag_bytes; | 700 | space_left -= tag_bytes; |
| 701 | bufs++; | ||
| 701 | 702 | ||
| 702 | if (first_tag) { | 703 | if (first_tag) { |
| 703 | memcpy (tagp, journal->j_uuid, 16); | 704 | memcpy (tagp, journal->j_uuid, 16); |
| @@ -809,7 +810,7 @@ start_journal_io: | |||
| 809 | the log. Before we can commit it, wait for the IO so far to | 810 | the log. Before we can commit it, wait for the IO so far to |
| 810 | complete. Control buffers being written are on the | 811 | complete. Control buffers being written are on the |
| 811 | transaction's t_log_list queue, and metadata buffers are on | 812 | transaction's t_log_list queue, and metadata buffers are on |
| 812 | the t_iobuf_list queue. | 813 | the io_bufs list. |
| 813 | 814 | ||
| 814 | Wait for the buffers in reverse order. That way we are | 815 | Wait for the buffers in reverse order. That way we are |
| 815 | less likely to be woken up until all IOs have completed, and | 816 | less likely to be woken up until all IOs have completed, and |
| @@ -818,47 +819,33 @@ start_journal_io: | |||
| 818 | 819 | ||
| 819 | jbd_debug(3, "JBD2: commit phase 3\n"); | 820 | jbd_debug(3, "JBD2: commit phase 3\n"); |
| 820 | 821 | ||
| 821 | /* | 822 | while (!list_empty(&io_bufs)) { |
| 822 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 823 | struct buffer_head *bh = list_entry(io_bufs.prev, |
| 823 | * See __journal_try_to_free_buffer. | 824 | struct buffer_head, |
| 824 | */ | 825 | b_assoc_buffers); |
| 825 | wait_for_iobuf: | ||
| 826 | while (commit_transaction->t_iobuf_list != NULL) { | ||
| 827 | struct buffer_head *bh; | ||
| 828 | 826 | ||
| 829 | jh = commit_transaction->t_iobuf_list->b_tprev; | 827 | wait_on_buffer(bh); |
| 830 | bh = jh2bh(jh); | 828 | cond_resched(); |
| 831 | if (buffer_locked(bh)) { | ||
| 832 | wait_on_buffer(bh); | ||
| 833 | goto wait_for_iobuf; | ||
| 834 | } | ||
| 835 | if (cond_resched()) | ||
| 836 | goto wait_for_iobuf; | ||
| 837 | 829 | ||
| 838 | if (unlikely(!buffer_uptodate(bh))) | 830 | if (unlikely(!buffer_uptodate(bh))) |
| 839 | err = -EIO; | 831 | err = -EIO; |
| 840 | 832 | jbd2_unfile_log_bh(bh); | |
| 841 | clear_buffer_jwrite(bh); | ||
| 842 | |||
| 843 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | ||
| 844 | jbd2_journal_unfile_buffer(journal, jh); | ||
| 845 | 833 | ||
| 846 | /* | 834 | /* |
| 847 | * ->t_iobuf_list should contain only dummy buffer_heads | 835 | * The list contains temporary buffer heads created by |
| 848 | * which were created by jbd2_journal_write_metadata_buffer(). | 836 | * jbd2_journal_write_metadata_buffer(). |
| 849 | */ | 837 | */ |
| 850 | BUFFER_TRACE(bh, "dumping temporary bh"); | 838 | BUFFER_TRACE(bh, "dumping temporary bh"); |
| 851 | jbd2_journal_put_journal_head(jh); | ||
| 852 | __brelse(bh); | 839 | __brelse(bh); |
| 853 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 840 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
| 854 | free_buffer_head(bh); | 841 | free_buffer_head(bh); |
| 855 | 842 | ||
| 856 | /* We also have to unlock and free the corresponding | 843 | /* We also have to refile the corresponding shadowed buffer */ |
| 857 | shadowed buffer */ | ||
| 858 | jh = commit_transaction->t_shadow_list->b_tprev; | 844 | jh = commit_transaction->t_shadow_list->b_tprev; |
| 859 | bh = jh2bh(jh); | 845 | bh = jh2bh(jh); |
| 860 | clear_bit(BH_JWrite, &bh->b_state); | 846 | clear_buffer_jwrite(bh); |
| 861 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 847 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
| 848 | J_ASSERT_BH(bh, !buffer_shadow(bh)); | ||
| 862 | 849 | ||
| 863 | /* The metadata is now released for reuse, but we need | 850 | /* The metadata is now released for reuse, but we need |
| 864 | to remember it against this transaction so that when | 851 | to remember it against this transaction so that when |
| @@ -866,14 +853,6 @@ wait_for_iobuf: | |||
| 866 | required. */ | 853 | required. */ |
| 867 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 854 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
| 868 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 855 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
| 869 | /* | ||
| 870 | * Wake up any transactions which were waiting for this IO to | ||
| 871 | * complete. The barrier must be here so that changes by | ||
| 872 | * jbd2_journal_file_buffer() take effect before wake_up_bit() | ||
| 873 | * does the waitqueue check. | ||
| 874 | */ | ||
| 875 | smp_mb(); | ||
| 876 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
| 877 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 856 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
| 878 | __brelse(bh); | 857 | __brelse(bh); |
| 879 | } | 858 | } |
| @@ -883,26 +862,19 @@ wait_for_iobuf: | |||
| 883 | jbd_debug(3, "JBD2: commit phase 4\n"); | 862 | jbd_debug(3, "JBD2: commit phase 4\n"); |
| 884 | 863 | ||
| 885 | /* Here we wait for the revoke record and descriptor record buffers */ | 864 | /* Here we wait for the revoke record and descriptor record buffers */ |
| 886 | wait_for_ctlbuf: | 865 | while (!list_empty(&log_bufs)) { |
| 887 | while (commit_transaction->t_log_list != NULL) { | ||
| 888 | struct buffer_head *bh; | 866 | struct buffer_head *bh; |
| 889 | 867 | ||
| 890 | jh = commit_transaction->t_log_list->b_tprev; | 868 | bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); |
| 891 | bh = jh2bh(jh); | 869 | wait_on_buffer(bh); |
| 892 | if (buffer_locked(bh)) { | 870 | cond_resched(); |
| 893 | wait_on_buffer(bh); | ||
| 894 | goto wait_for_ctlbuf; | ||
| 895 | } | ||
| 896 | if (cond_resched()) | ||
| 897 | goto wait_for_ctlbuf; | ||
| 898 | 871 | ||
| 899 | if (unlikely(!buffer_uptodate(bh))) | 872 | if (unlikely(!buffer_uptodate(bh))) |
| 900 | err = -EIO; | 873 | err = -EIO; |
| 901 | 874 | ||
| 902 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 875 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
| 903 | clear_buffer_jwrite(bh); | 876 | clear_buffer_jwrite(bh); |
| 904 | jbd2_journal_unfile_buffer(journal, jh); | 877 | jbd2_unfile_log_bh(bh); |
| 905 | jbd2_journal_put_journal_head(jh); | ||
| 906 | __brelse(bh); /* One for getblk */ | 878 | __brelse(bh); /* One for getblk */ |
| 907 | /* AKPM: bforget here */ | 879 | /* AKPM: bforget here */ |
| 908 | } | 880 | } |
| @@ -952,9 +924,7 @@ wait_for_iobuf: | |||
| 952 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); | 924 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
| 953 | J_ASSERT(commit_transaction->t_buffers == NULL); | 925 | J_ASSERT(commit_transaction->t_buffers == NULL); |
| 954 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 926 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
| 955 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | ||
| 956 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 927 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
| 957 | J_ASSERT(commit_transaction->t_log_list == NULL); | ||
| 958 | 928 | ||
| 959 | restart_loop: | 929 | restart_loop: |
| 960 | /* | 930 | /* |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 95457576e434..02c7ad9d7a41 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache); | |||
| 103 | static void __journal_abort_soft (journal_t *journal, int errno); | 103 | static void __journal_abort_soft (journal_t *journal, int errno); |
| 104 | static int jbd2_journal_create_slab(size_t slab_size); | 104 | static int jbd2_journal_create_slab(size_t slab_size); |
| 105 | 105 | ||
| 106 | #ifdef CONFIG_JBD2_DEBUG | ||
| 107 | void __jbd2_debug(int level, const char *file, const char *func, | ||
| 108 | unsigned int line, const char *fmt, ...) | ||
| 109 | { | ||
| 110 | struct va_format vaf; | ||
| 111 | va_list args; | ||
| 112 | |||
| 113 | if (level > jbd2_journal_enable_debug) | ||
| 114 | return; | ||
| 115 | va_start(args, fmt); | ||
| 116 | vaf.fmt = fmt; | ||
| 117 | vaf.va = &args; | ||
| 118 | printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); | ||
| 119 | va_end(args); | ||
| 120 | } | ||
| 121 | EXPORT_SYMBOL(__jbd2_debug); | ||
| 122 | #endif | ||
| 123 | |||
| 106 | /* Checksumming functions */ | 124 | /* Checksumming functions */ |
| 107 | int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) | 125 | int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) |
| 108 | { | 126 | { |
| @@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal) | |||
| 310 | * | 328 | * |
| 311 | * If the source buffer has already been modified by a new transaction | 329 | * If the source buffer has already been modified by a new transaction |
| 312 | * since we took the last commit snapshot, we use the frozen copy of | 330 | * since we took the last commit snapshot, we use the frozen copy of |
| 313 | * that data for IO. If we end up using the existing buffer_head's data | 331 | * that data for IO. If we end up using the existing buffer_head's data |
| 314 | * for the write, then we *have* to lock the buffer to prevent anyone | 332 | * for the write, then we have to make sure nobody modifies it while the |
| 315 | * else from using and possibly modifying it while the IO is in | 333 | * IO is in progress. do_get_write_access() handles this. |
| 316 | * progress. | ||
| 317 | * | 334 | * |
| 318 | * The function returns a pointer to the buffer_heads to be used for IO. | 335 | * The function returns a pointer to the buffer_head to be used for IO. |
| 319 | * | 336 | * |
| 320 | * We assume that the journal has already been locked in this function. | ||
| 321 | * | 337 | * |
| 322 | * Return value: | 338 | * Return value: |
| 323 | * <0: Error | 339 | * <0: Error |
| @@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal) | |||
| 330 | 346 | ||
| 331 | int jbd2_journal_write_metadata_buffer(transaction_t *transaction, | 347 | int jbd2_journal_write_metadata_buffer(transaction_t *transaction, |
| 332 | struct journal_head *jh_in, | 348 | struct journal_head *jh_in, |
| 333 | struct journal_head **jh_out, | 349 | struct buffer_head **bh_out, |
| 334 | unsigned long long blocknr) | 350 | sector_t blocknr) |
| 335 | { | 351 | { |
| 336 | int need_copy_out = 0; | 352 | int need_copy_out = 0; |
| 337 | int done_copy_out = 0; | 353 | int done_copy_out = 0; |
| 338 | int do_escape = 0; | 354 | int do_escape = 0; |
| 339 | char *mapped_data; | 355 | char *mapped_data; |
| 340 | struct buffer_head *new_bh; | 356 | struct buffer_head *new_bh; |
| 341 | struct journal_head *new_jh; | ||
| 342 | struct page *new_page; | 357 | struct page *new_page; |
| 343 | unsigned int new_offset; | 358 | unsigned int new_offset; |
| 344 | struct buffer_head *bh_in = jh2bh(jh_in); | 359 | struct buffer_head *bh_in = jh2bh(jh_in); |
| @@ -368,14 +383,13 @@ retry_alloc: | |||
| 368 | 383 | ||
| 369 | /* keep subsequent assertions sane */ | 384 | /* keep subsequent assertions sane */ |
| 370 | atomic_set(&new_bh->b_count, 1); | 385 | atomic_set(&new_bh->b_count, 1); |
| 371 | new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ | ||
| 372 | 386 | ||
| 387 | jbd_lock_bh_state(bh_in); | ||
| 388 | repeat: | ||
| 373 | /* | 389 | /* |
| 374 | * If a new transaction has already done a buffer copy-out, then | 390 | * If a new transaction has already done a buffer copy-out, then |
| 375 | * we use that version of the data for the commit. | 391 | * we use that version of the data for the commit. |
| 376 | */ | 392 | */ |
| 377 | jbd_lock_bh_state(bh_in); | ||
| 378 | repeat: | ||
| 379 | if (jh_in->b_frozen_data) { | 393 | if (jh_in->b_frozen_data) { |
| 380 | done_copy_out = 1; | 394 | done_copy_out = 1; |
| 381 | new_page = virt_to_page(jh_in->b_frozen_data); | 395 | new_page = virt_to_page(jh_in->b_frozen_data); |
| @@ -415,7 +429,7 @@ repeat: | |||
| 415 | jbd_unlock_bh_state(bh_in); | 429 | jbd_unlock_bh_state(bh_in); |
| 416 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); | 430 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); |
| 417 | if (!tmp) { | 431 | if (!tmp) { |
| 418 | jbd2_journal_put_journal_head(new_jh); | 432 | brelse(new_bh); |
| 419 | return -ENOMEM; | 433 | return -ENOMEM; |
| 420 | } | 434 | } |
| 421 | jbd_lock_bh_state(bh_in); | 435 | jbd_lock_bh_state(bh_in); |
| @@ -426,7 +440,7 @@ repeat: | |||
| 426 | 440 | ||
| 427 | jh_in->b_frozen_data = tmp; | 441 | jh_in->b_frozen_data = tmp; |
| 428 | mapped_data = kmap_atomic(new_page); | 442 | mapped_data = kmap_atomic(new_page); |
| 429 | memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); | 443 | memcpy(tmp, mapped_data + new_offset, bh_in->b_size); |
| 430 | kunmap_atomic(mapped_data); | 444 | kunmap_atomic(mapped_data); |
| 431 | 445 | ||
| 432 | new_page = virt_to_page(tmp); | 446 | new_page = virt_to_page(tmp); |
| @@ -452,14 +466,14 @@ repeat: | |||
| 452 | } | 466 | } |
| 453 | 467 | ||
| 454 | set_bh_page(new_bh, new_page, new_offset); | 468 | set_bh_page(new_bh, new_page, new_offset); |
| 455 | new_jh->b_transaction = NULL; | 469 | new_bh->b_size = bh_in->b_size; |
| 456 | new_bh->b_size = jh2bh(jh_in)->b_size; | 470 | new_bh->b_bdev = journal->j_dev; |
| 457 | new_bh->b_bdev = transaction->t_journal->j_dev; | ||
| 458 | new_bh->b_blocknr = blocknr; | 471 | new_bh->b_blocknr = blocknr; |
| 472 | new_bh->b_private = bh_in; | ||
| 459 | set_buffer_mapped(new_bh); | 473 | set_buffer_mapped(new_bh); |
| 460 | set_buffer_dirty(new_bh); | 474 | set_buffer_dirty(new_bh); |
| 461 | 475 | ||
| 462 | *jh_out = new_jh; | 476 | *bh_out = new_bh; |
| 463 | 477 | ||
| 464 | /* | 478 | /* |
| 465 | * The to-be-written buffer needs to get moved to the io queue, | 479 | * The to-be-written buffer needs to get moved to the io queue, |
| @@ -470,11 +484,9 @@ repeat: | |||
| 470 | spin_lock(&journal->j_list_lock); | 484 | spin_lock(&journal->j_list_lock); |
| 471 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | 485 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); |
| 472 | spin_unlock(&journal->j_list_lock); | 486 | spin_unlock(&journal->j_list_lock); |
| 487 | set_buffer_shadow(bh_in); | ||
| 473 | jbd_unlock_bh_state(bh_in); | 488 | jbd_unlock_bh_state(bh_in); |
| 474 | 489 | ||
| 475 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); | ||
| 476 | jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); | ||
| 477 | |||
| 478 | return do_escape | (done_copy_out << 1); | 490 | return do_escape | (done_copy_out << 1); |
| 479 | } | 491 | } |
| 480 | 492 | ||
| @@ -484,35 +496,6 @@ repeat: | |||
| 484 | */ | 496 | */ |
| 485 | 497 | ||
| 486 | /* | 498 | /* |
| 487 | * __jbd2_log_space_left: Return the number of free blocks left in the journal. | ||
| 488 | * | ||
| 489 | * Called with the journal already locked. | ||
| 490 | * | ||
| 491 | * Called under j_state_lock | ||
| 492 | */ | ||
| 493 | |||
| 494 | int __jbd2_log_space_left(journal_t *journal) | ||
| 495 | { | ||
| 496 | int left = journal->j_free; | ||
| 497 | |||
| 498 | /* assert_spin_locked(&journal->j_state_lock); */ | ||
| 499 | |||
| 500 | /* | ||
| 501 | * Be pessimistic here about the number of those free blocks which | ||
| 502 | * might be required for log descriptor control blocks. | ||
| 503 | */ | ||
| 504 | |||
| 505 | #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ | ||
| 506 | |||
| 507 | left -= MIN_LOG_RESERVED_BLOCKS; | ||
| 508 | |||
| 509 | if (left <= 0) | ||
| 510 | return 0; | ||
| 511 | left -= (left >> 3); | ||
| 512 | return left; | ||
| 513 | } | ||
| 514 | |||
| 515 | /* | ||
| 516 | * Called with j_state_lock locked for writing. | 499 | * Called with j_state_lock locked for writing. |
| 517 | * Returns true if a transaction commit was started. | 500 | * Returns true if a transaction commit was started. |
| 518 | */ | 501 | */ |
| @@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) | |||
| 564 | } | 547 | } |
| 565 | 548 | ||
| 566 | /* | 549 | /* |
| 567 | * Force and wait upon a commit if the calling process is not within | 550 | * Force and wait any uncommitted transactions. We can only force the running |
| 568 | * transaction. This is used for forcing out undo-protected data which contains | 551 | * transaction if we don't have an active handle, otherwise, we will deadlock. |
| 569 | * bitmaps, when the fs is running out of space. | 552 | * Returns: <0 in case of error, |
| 570 | * | 553 | * 0 if nothing to commit, |
| 571 | * We can only force the running transaction if we don't have an active handle; | 554 | * 1 if transaction was successfully committed. |
| 572 | * otherwise, we will deadlock. | ||
| 573 | * | ||
| 574 | * Returns true if a transaction was started. | ||
| 575 | */ | 555 | */ |
| 576 | int jbd2_journal_force_commit_nested(journal_t *journal) | 556 | static int __jbd2_journal_force_commit(journal_t *journal) |
| 577 | { | 557 | { |
| 578 | transaction_t *transaction = NULL; | 558 | transaction_t *transaction = NULL; |
| 579 | tid_t tid; | 559 | tid_t tid; |
| 580 | int need_to_start = 0; | 560 | int need_to_start = 0, ret = 0; |
| 581 | 561 | ||
| 582 | read_lock(&journal->j_state_lock); | 562 | read_lock(&journal->j_state_lock); |
| 583 | if (journal->j_running_transaction && !current->journal_info) { | 563 | if (journal->j_running_transaction && !current->journal_info) { |
| @@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal) | |||
| 588 | transaction = journal->j_committing_transaction; | 568 | transaction = journal->j_committing_transaction; |
| 589 | 569 | ||
| 590 | if (!transaction) { | 570 | if (!transaction) { |
| 571 | /* Nothing to commit */ | ||
| 591 | read_unlock(&journal->j_state_lock); | 572 | read_unlock(&journal->j_state_lock); |
| 592 | return 0; /* Nothing to retry */ | 573 | return 0; |
| 593 | } | 574 | } |
| 594 | |||
| 595 | tid = transaction->t_tid; | 575 | tid = transaction->t_tid; |
| 596 | read_unlock(&journal->j_state_lock); | 576 | read_unlock(&journal->j_state_lock); |
| 597 | if (need_to_start) | 577 | if (need_to_start) |
| 598 | jbd2_log_start_commit(journal, tid); | 578 | jbd2_log_start_commit(journal, tid); |
| 599 | jbd2_log_wait_commit(journal, tid); | 579 | ret = jbd2_log_wait_commit(journal, tid); |
| 600 | return 1; | 580 | if (!ret) |
| 581 | ret = 1; | ||
| 582 | |||
| 583 | return ret; | ||
| 584 | } | ||
| 585 | |||
| 586 | /** | ||
| 587 | * Force and wait upon a commit if the calling process is not within | ||
| 588 | * transaction. This is used for forcing out undo-protected data which contains | ||
| 589 | * bitmaps, when the fs is running out of space. | ||
| 590 | * | ||
| 591 | * @journal: journal to force | ||
| 592 | * Returns true if progress was made. | ||
| 593 | */ | ||
| 594 | int jbd2_journal_force_commit_nested(journal_t *journal) | ||
| 595 | { | ||
| 596 | int ret; | ||
| 597 | |||
| 598 | ret = __jbd2_journal_force_commit(journal); | ||
| 599 | return ret > 0; | ||
| 600 | } | ||
| 601 | |||
| 602 | /** | ||
| 603 | * int journal_force_commit() - force any uncommitted transactions | ||
| 604 | * @journal: journal to force | ||
| 605 | * | ||
| 606 | * Caller want unconditional commit. We can only force the running transaction | ||
| 607 | * if we don't have an active handle, otherwise, we will deadlock. | ||
| 608 | */ | ||
| 609 | int jbd2_journal_force_commit(journal_t *journal) | ||
| 610 | { | ||
| 611 | int ret; | ||
| 612 | |||
| 613 | J_ASSERT(!current->journal_info); | ||
| 614 | ret = __jbd2_journal_force_commit(journal); | ||
| 615 | if (ret > 0) | ||
| 616 | ret = 0; | ||
| 617 | return ret; | ||
| 601 | } | 618 | } |
| 602 | 619 | ||
| 603 | /* | 620 | /* |
| @@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, | |||
| 798 | * But we don't bother doing that, so there will be coherency problems with | 815 | * But we don't bother doing that, so there will be coherency problems with |
| 799 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | 816 | * mmaps of blockdevs which hold live JBD-controlled filesystems. |
| 800 | */ | 817 | */ |
| 801 | struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | 818 | struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) |
| 802 | { | 819 | { |
| 803 | struct buffer_head *bh; | 820 | struct buffer_head *bh; |
| 804 | unsigned long long blocknr; | 821 | unsigned long long blocknr; |
| @@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
| 817 | set_buffer_uptodate(bh); | 834 | set_buffer_uptodate(bh); |
| 818 | unlock_buffer(bh); | 835 | unlock_buffer(bh); |
| 819 | BUFFER_TRACE(bh, "return this buffer"); | 836 | BUFFER_TRACE(bh, "return this buffer"); |
| 820 | return jbd2_journal_add_journal_head(bh); | 837 | return bh; |
| 821 | } | 838 | } |
| 822 | 839 | ||
| 823 | /* | 840 | /* |
| @@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void) | |||
| 1062 | return NULL; | 1079 | return NULL; |
| 1063 | 1080 | ||
| 1064 | init_waitqueue_head(&journal->j_wait_transaction_locked); | 1081 | init_waitqueue_head(&journal->j_wait_transaction_locked); |
| 1065 | init_waitqueue_head(&journal->j_wait_logspace); | ||
| 1066 | init_waitqueue_head(&journal->j_wait_done_commit); | 1082 | init_waitqueue_head(&journal->j_wait_done_commit); |
| 1067 | init_waitqueue_head(&journal->j_wait_checkpoint); | ||
| 1068 | init_waitqueue_head(&journal->j_wait_commit); | 1083 | init_waitqueue_head(&journal->j_wait_commit); |
| 1069 | init_waitqueue_head(&journal->j_wait_updates); | 1084 | init_waitqueue_head(&journal->j_wait_updates); |
| 1085 | init_waitqueue_head(&journal->j_wait_reserved); | ||
| 1070 | mutex_init(&journal->j_barrier); | 1086 | mutex_init(&journal->j_barrier); |
| 1071 | mutex_init(&journal->j_checkpoint_mutex); | 1087 | mutex_init(&journal->j_checkpoint_mutex); |
| 1072 | spin_lock_init(&journal->j_revoke_lock); | 1088 | spin_lock_init(&journal->j_revoke_lock); |
| @@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void) | |||
| 1076 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); | 1092 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); |
| 1077 | journal->j_min_batch_time = 0; | 1093 | journal->j_min_batch_time = 0; |
| 1078 | journal->j_max_batch_time = 15000; /* 15ms */ | 1094 | journal->j_max_batch_time = 15000; /* 15ms */ |
| 1095 | atomic_set(&journal->j_reserved_credits, 0); | ||
| 1079 | 1096 | ||
| 1080 | /* The journal is marked for error until we succeed with recovery! */ | 1097 | /* The journal is marked for error until we succeed with recovery! */ |
| 1081 | journal->j_flags = JBD2_ABORT; | 1098 | journal->j_flags = JBD2_ABORT; |
| @@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal) | |||
| 1318 | static void jbd2_write_superblock(journal_t *journal, int write_op) | 1335 | static void jbd2_write_superblock(journal_t *journal, int write_op) |
| 1319 | { | 1336 | { |
| 1320 | struct buffer_head *bh = journal->j_sb_buffer; | 1337 | struct buffer_head *bh = journal->j_sb_buffer; |
| 1338 | journal_superblock_t *sb = journal->j_superblock; | ||
| 1321 | int ret; | 1339 | int ret; |
| 1322 | 1340 | ||
| 1323 | trace_jbd2_write_superblock(journal, write_op); | 1341 | trace_jbd2_write_superblock(journal, write_op); |
| @@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) | |||
| 1339 | clear_buffer_write_io_error(bh); | 1357 | clear_buffer_write_io_error(bh); |
| 1340 | set_buffer_uptodate(bh); | 1358 | set_buffer_uptodate(bh); |
| 1341 | } | 1359 | } |
| 1360 | jbd2_superblock_csum_set(journal, sb); | ||
| 1342 | get_bh(bh); | 1361 | get_bh(bh); |
| 1343 | bh->b_end_io = end_buffer_write_sync; | 1362 | bh->b_end_io = end_buffer_write_sync; |
| 1344 | ret = submit_bh(write_op, bh); | 1363 | ret = submit_bh(write_op, bh); |
| @@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) | |||
| 1435 | jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", | 1454 | jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", |
| 1436 | journal->j_errno); | 1455 | journal->j_errno); |
| 1437 | sb->s_errno = cpu_to_be32(journal->j_errno); | 1456 | sb->s_errno = cpu_to_be32(journal->j_errno); |
| 1438 | jbd2_superblock_csum_set(journal, sb); | ||
| 1439 | read_unlock(&journal->j_state_lock); | 1457 | read_unlock(&journal->j_state_lock); |
| 1440 | 1458 | ||
| 1441 | jbd2_write_superblock(journal, WRITE_SYNC); | 1459 | jbd2_write_superblock(journal, WRITE_SYNC); |
| @@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void) | |||
| 2325 | #ifdef CONFIG_JBD2_DEBUG | 2343 | #ifdef CONFIG_JBD2_DEBUG |
| 2326 | atomic_inc(&nr_journal_heads); | 2344 | atomic_inc(&nr_journal_heads); |
| 2327 | #endif | 2345 | #endif |
| 2328 | ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); | 2346 | ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); |
| 2329 | if (!ret) { | 2347 | if (!ret) { |
| 2330 | jbd_debug(1, "out of memory for journal_head\n"); | 2348 | jbd_debug(1, "out of memory for journal_head\n"); |
| 2331 | pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); | 2349 | pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); |
| 2332 | while (!ret) { | 2350 | while (!ret) { |
| 2333 | yield(); | 2351 | yield(); |
| 2334 | ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); | 2352 | ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); |
| 2335 | } | 2353 | } |
| 2336 | } | 2354 | } |
| 2337 | return ret; | 2355 | return ret; |
| @@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) | |||
| 2393 | struct journal_head *new_jh = NULL; | 2411 | struct journal_head *new_jh = NULL; |
| 2394 | 2412 | ||
| 2395 | repeat: | 2413 | repeat: |
| 2396 | if (!buffer_jbd(bh)) { | 2414 | if (!buffer_jbd(bh)) |
| 2397 | new_jh = journal_alloc_journal_head(); | 2415 | new_jh = journal_alloc_journal_head(); |
| 2398 | memset(new_jh, 0, sizeof(*new_jh)); | ||
| 2399 | } | ||
| 2400 | 2416 | ||
| 2401 | jbd_lock_bh_journal_head(bh); | 2417 | jbd_lock_bh_journal_head(bh); |
| 2402 | if (buffer_jbd(bh)) { | 2418 | if (buffer_jbd(bh)) { |
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 626846bac32f..d4851464b57e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
| @@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) | |||
| 399 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, | 399 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, |
| 400 | void *buf, __u32 sequence) | 400 | void *buf, __u32 sequence) |
| 401 | { | 401 | { |
| 402 | __u32 provided, calculated; | 402 | __u32 csum32; |
| 403 | 403 | ||
| 404 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 404 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 405 | return 1; | 405 | return 1; |
| 406 | 406 | ||
| 407 | sequence = cpu_to_be32(sequence); | 407 | sequence = cpu_to_be32(sequence); |
| 408 | calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, | 408 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, |
| 409 | sizeof(sequence)); | 409 | sizeof(sequence)); |
| 410 | calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); | 410 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); |
| 411 | provided = be32_to_cpu(tag->t_checksum); | ||
| 412 | 411 | ||
| 413 | return provided == cpu_to_be32(calculated); | 412 | return tag->t_checksum == cpu_to_be16(csum32); |
| 414 | } | 413 | } |
| 415 | 414 | ||
| 416 | static int do_one_pass(journal_t *journal, | 415 | static int do_one_pass(journal_t *journal, |
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f30b80b4ce8b..198c9c10276d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
| @@ -122,9 +122,10 @@ struct jbd2_revoke_table_s | |||
| 122 | 122 | ||
| 123 | #ifdef __KERNEL__ | 123 | #ifdef __KERNEL__ |
| 124 | static void write_one_revoke_record(journal_t *, transaction_t *, | 124 | static void write_one_revoke_record(journal_t *, transaction_t *, |
| 125 | struct journal_head **, int *, | 125 | struct list_head *, |
| 126 | struct buffer_head **, int *, | ||
| 126 | struct jbd2_revoke_record_s *, int); | 127 | struct jbd2_revoke_record_s *, int); |
| 127 | static void flush_descriptor(journal_t *, struct journal_head *, int, int); | 128 | static void flush_descriptor(journal_t *, struct buffer_head *, int, int); |
| 128 | #endif | 129 | #endif |
| 129 | 130 | ||
| 130 | /* Utility functions to maintain the revoke table */ | 131 | /* Utility functions to maintain the revoke table */ |
| @@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) | |||
| 531 | */ | 532 | */ |
| 532 | void jbd2_journal_write_revoke_records(journal_t *journal, | 533 | void jbd2_journal_write_revoke_records(journal_t *journal, |
| 533 | transaction_t *transaction, | 534 | transaction_t *transaction, |
| 535 | struct list_head *log_bufs, | ||
| 534 | int write_op) | 536 | int write_op) |
| 535 | { | 537 | { |
| 536 | struct journal_head *descriptor; | 538 | struct buffer_head *descriptor; |
| 537 | struct jbd2_revoke_record_s *record; | 539 | struct jbd2_revoke_record_s *record; |
| 538 | struct jbd2_revoke_table_s *revoke; | 540 | struct jbd2_revoke_table_s *revoke; |
| 539 | struct list_head *hash_list; | 541 | struct list_head *hash_list; |
| @@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
| 553 | while (!list_empty(hash_list)) { | 555 | while (!list_empty(hash_list)) { |
| 554 | record = (struct jbd2_revoke_record_s *) | 556 | record = (struct jbd2_revoke_record_s *) |
| 555 | hash_list->next; | 557 | hash_list->next; |
| 556 | write_one_revoke_record(journal, transaction, | 558 | write_one_revoke_record(journal, transaction, log_bufs, |
| 557 | &descriptor, &offset, | 559 | &descriptor, &offset, |
| 558 | record, write_op); | 560 | record, write_op); |
| 559 | count++; | 561 | count++; |
| @@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
| 573 | 575 | ||
| 574 | static void write_one_revoke_record(journal_t *journal, | 576 | static void write_one_revoke_record(journal_t *journal, |
| 575 | transaction_t *transaction, | 577 | transaction_t *transaction, |
| 576 | struct journal_head **descriptorp, | 578 | struct list_head *log_bufs, |
| 579 | struct buffer_head **descriptorp, | ||
| 577 | int *offsetp, | 580 | int *offsetp, |
| 578 | struct jbd2_revoke_record_s *record, | 581 | struct jbd2_revoke_record_s *record, |
| 579 | int write_op) | 582 | int write_op) |
| 580 | { | 583 | { |
| 581 | int csum_size = 0; | 584 | int csum_size = 0; |
| 582 | struct journal_head *descriptor; | 585 | struct buffer_head *descriptor; |
| 583 | int offset; | 586 | int offset; |
| 584 | journal_header_t *header; | 587 | journal_header_t *header; |
| 585 | 588 | ||
| @@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal, | |||
| 609 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 612 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
| 610 | if (!descriptor) | 613 | if (!descriptor) |
| 611 | return; | 614 | return; |
| 612 | header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; | 615 | header = (journal_header_t *)descriptor->b_data; |
| 613 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 616 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
| 614 | header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); | 617 | header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); |
| 615 | header->h_sequence = cpu_to_be32(transaction->t_tid); | 618 | header->h_sequence = cpu_to_be32(transaction->t_tid); |
| 616 | 619 | ||
| 617 | /* Record it so that we can wait for IO completion later */ | 620 | /* Record it so that we can wait for IO completion later */ |
| 618 | JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); | 621 | BUFFER_TRACE(descriptor, "file in log_bufs"); |
| 619 | jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); | 622 | jbd2_file_log_bh(log_bufs, descriptor); |
| 620 | 623 | ||
| 621 | offset = sizeof(jbd2_journal_revoke_header_t); | 624 | offset = sizeof(jbd2_journal_revoke_header_t); |
| 622 | *descriptorp = descriptor; | 625 | *descriptorp = descriptor; |
| 623 | } | 626 | } |
| 624 | 627 | ||
| 625 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { | 628 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { |
| 626 | * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = | 629 | * ((__be64 *)(&descriptor->b_data[offset])) = |
| 627 | cpu_to_be64(record->blocknr); | 630 | cpu_to_be64(record->blocknr); |
| 628 | offset += 8; | 631 | offset += 8; |
| 629 | 632 | ||
| 630 | } else { | 633 | } else { |
| 631 | * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = | 634 | * ((__be32 *)(&descriptor->b_data[offset])) = |
| 632 | cpu_to_be32(record->blocknr); | 635 | cpu_to_be32(record->blocknr); |
| 633 | offset += 4; | 636 | offset += 4; |
| 634 | } | 637 | } |
| @@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal, | |||
| 636 | *offsetp = offset; | 639 | *offsetp = offset; |
| 637 | } | 640 | } |
| 638 | 641 | ||
| 639 | static void jbd2_revoke_csum_set(journal_t *j, | 642 | static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) |
| 640 | struct journal_head *descriptor) | ||
| 641 | { | 643 | { |
| 642 | struct jbd2_journal_revoke_tail *tail; | 644 | struct jbd2_journal_revoke_tail *tail; |
| 643 | __u32 csum; | 645 | __u32 csum; |
| @@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j, | |||
| 645 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 647 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 646 | return; | 648 | return; |
| 647 | 649 | ||
| 648 | tail = (struct jbd2_journal_revoke_tail *) | 650 | tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - |
| 649 | (jh2bh(descriptor)->b_data + j->j_blocksize - | ||
| 650 | sizeof(struct jbd2_journal_revoke_tail)); | 651 | sizeof(struct jbd2_journal_revoke_tail)); |
| 651 | tail->r_checksum = 0; | 652 | tail->r_checksum = 0; |
| 652 | csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, | 653 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); |
| 653 | j->j_blocksize); | ||
| 654 | tail->r_checksum = cpu_to_be32(csum); | 654 | tail->r_checksum = cpu_to_be32(csum); |
| 655 | } | 655 | } |
| 656 | 656 | ||
| @@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j, | |||
| 662 | */ | 662 | */ |
| 663 | 663 | ||
| 664 | static void flush_descriptor(journal_t *journal, | 664 | static void flush_descriptor(journal_t *journal, |
| 665 | struct journal_head *descriptor, | 665 | struct buffer_head *descriptor, |
| 666 | int offset, int write_op) | 666 | int offset, int write_op) |
| 667 | { | 667 | { |
| 668 | jbd2_journal_revoke_header_t *header; | 668 | jbd2_journal_revoke_header_t *header; |
| 669 | struct buffer_head *bh = jh2bh(descriptor); | ||
| 670 | 669 | ||
| 671 | if (is_journal_aborted(journal)) { | 670 | if (is_journal_aborted(journal)) { |
| 672 | put_bh(bh); | 671 | put_bh(descriptor); |
| 673 | return; | 672 | return; |
| 674 | } | 673 | } |
| 675 | 674 | ||
| 676 | header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; | 675 | header = (jbd2_journal_revoke_header_t *)descriptor->b_data; |
| 677 | header->r_count = cpu_to_be32(offset); | 676 | header->r_count = cpu_to_be32(offset); |
| 678 | jbd2_revoke_csum_set(journal, descriptor); | 677 | jbd2_revoke_csum_set(journal, descriptor); |
| 679 | 678 | ||
| 680 | set_buffer_jwrite(bh); | 679 | set_buffer_jwrite(descriptor); |
| 681 | BUFFER_TRACE(bh, "write"); | 680 | BUFFER_TRACE(descriptor, "write"); |
| 682 | set_buffer_dirty(bh); | 681 | set_buffer_dirty(descriptor); |
| 683 | write_dirty_buffer(bh, write_op); | 682 | write_dirty_buffer(descriptor, write_op); |
| 684 | } | 683 | } |
| 685 | #endif | 684 | #endif |
| 686 | 685 | ||
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c59ea8..7aa9a32573bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
| @@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
| 89 | transaction->t_expires = jiffies + journal->j_commit_interval; | 89 | transaction->t_expires = jiffies + journal->j_commit_interval; |
| 90 | spin_lock_init(&transaction->t_handle_lock); | 90 | spin_lock_init(&transaction->t_handle_lock); |
| 91 | atomic_set(&transaction->t_updates, 0); | 91 | atomic_set(&transaction->t_updates, 0); |
| 92 | atomic_set(&transaction->t_outstanding_credits, 0); | 92 | atomic_set(&transaction->t_outstanding_credits, |
| 93 | atomic_read(&journal->j_reserved_credits)); | ||
| 93 | atomic_set(&transaction->t_handle_count, 0); | 94 | atomic_set(&transaction->t_handle_count, 0); |
| 94 | INIT_LIST_HEAD(&transaction->t_inode_list); | 95 | INIT_LIST_HEAD(&transaction->t_inode_list); |
| 95 | INIT_LIST_HEAD(&transaction->t_private_list); | 96 | INIT_LIST_HEAD(&transaction->t_private_list); |
| @@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction, | |||
| 141 | } | 142 | } |
| 142 | 143 | ||
| 143 | /* | 144 | /* |
| 145 | * Wait until running transaction passes T_LOCKED state. Also starts the commit | ||
| 146 | * if needed. The function expects running transaction to exist and releases | ||
| 147 | * j_state_lock. | ||
| 148 | */ | ||
| 149 | static void wait_transaction_locked(journal_t *journal) | ||
| 150 | __releases(journal->j_state_lock) | ||
| 151 | { | ||
| 152 | DEFINE_WAIT(wait); | ||
| 153 | int need_to_start; | ||
| 154 | tid_t tid = journal->j_running_transaction->t_tid; | ||
| 155 | |||
| 156 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
| 157 | TASK_UNINTERRUPTIBLE); | ||
| 158 | need_to_start = !tid_geq(journal->j_commit_request, tid); | ||
| 159 | read_unlock(&journal->j_state_lock); | ||
| 160 | if (need_to_start) | ||
| 161 | jbd2_log_start_commit(journal, tid); | ||
| 162 | schedule(); | ||
| 163 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
| 164 | } | ||
| 165 | |||
| 166 | static void sub_reserved_credits(journal_t *journal, int blocks) | ||
| 167 | { | ||
| 168 | atomic_sub(blocks, &journal->j_reserved_credits); | ||
| 169 | wake_up(&journal->j_wait_reserved); | ||
| 170 | } | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Wait until we can add credits for handle to the running transaction. Called | ||
| 174 | * with j_state_lock held for reading. Returns 0 if handle joined the running | ||
| 175 | * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and | ||
| 176 | * caller must retry. | ||
| 177 | */ | ||
| 178 | static int add_transaction_credits(journal_t *journal, int blocks, | ||
| 179 | int rsv_blocks) | ||
| 180 | { | ||
| 181 | transaction_t *t = journal->j_running_transaction; | ||
| 182 | int needed; | ||
| 183 | int total = blocks + rsv_blocks; | ||
| 184 | |||
| 185 | /* | ||
| 186 | * If the current transaction is locked down for commit, wait | ||
| 187 | * for the lock to be released. | ||
| 188 | */ | ||
| 189 | if (t->t_state == T_LOCKED) { | ||
| 190 | wait_transaction_locked(journal); | ||
| 191 | return 1; | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 195 | * If there is not enough space left in the log to write all | ||
| 196 | * potential buffers requested by this operation, we need to | ||
| 197 | * stall pending a log checkpoint to free some more log space. | ||
| 198 | */ | ||
| 199 | needed = atomic_add_return(total, &t->t_outstanding_credits); | ||
| 200 | if (needed > journal->j_max_transaction_buffers) { | ||
| 201 | /* | ||
| 202 | * If the current transaction is already too large, | ||
| 203 | * then start to commit it: we can then go back and | ||
| 204 | * attach this handle to a new transaction. | ||
| 205 | */ | ||
| 206 | atomic_sub(total, &t->t_outstanding_credits); | ||
| 207 | wait_transaction_locked(journal); | ||
| 208 | return 1; | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 212 | * The commit code assumes that it can get enough log space | ||
| 213 | * without forcing a checkpoint. This is *critical* for | ||
| 214 | * correctness: a checkpoint of a buffer which is also | ||
| 215 | * associated with a committing transaction creates a deadlock, | ||
| 216 | * so commit simply cannot force through checkpoints. | ||
| 217 | * | ||
| 218 | * We must therefore ensure the necessary space in the journal | ||
| 219 | * *before* starting to dirty potentially checkpointed buffers | ||
| 220 | * in the new transaction. | ||
| 221 | */ | ||
| 222 | if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { | ||
| 223 | atomic_sub(total, &t->t_outstanding_credits); | ||
| 224 | read_unlock(&journal->j_state_lock); | ||
| 225 | write_lock(&journal->j_state_lock); | ||
| 226 | if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) | ||
| 227 | __jbd2_log_wait_for_space(journal); | ||
| 228 | write_unlock(&journal->j_state_lock); | ||
| 229 | return 1; | ||
| 230 | } | ||
| 231 | |||
| 232 | /* No reservation? We are done... */ | ||
| 233 | if (!rsv_blocks) | ||
| 234 | return 0; | ||
| 235 | |||
| 236 | needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); | ||
| 237 | /* We allow at most half of a transaction to be reserved */ | ||
| 238 | if (needed > journal->j_max_transaction_buffers / 2) { | ||
| 239 | sub_reserved_credits(journal, rsv_blocks); | ||
| 240 | atomic_sub(total, &t->t_outstanding_credits); | ||
| 241 | read_unlock(&journal->j_state_lock); | ||
| 242 | wait_event(journal->j_wait_reserved, | ||
| 243 | atomic_read(&journal->j_reserved_credits) + rsv_blocks | ||
| 244 | <= journal->j_max_transaction_buffers / 2); | ||
| 245 | return 1; | ||
| 246 | } | ||
| 247 | return 0; | ||
| 248 | } | ||
| 249 | |||
| 250 | /* | ||
| 144 | * start_this_handle: Given a handle, deal with any locking or stalling | 251 | * start_this_handle: Given a handle, deal with any locking or stalling |
| 145 | * needed to make sure that there is enough journal space for the handle | 252 | * needed to make sure that there is enough journal space for the handle |
| 146 | * to begin. Attach the handle to a transaction and set up the | 253 | * to begin. Attach the handle to a transaction and set up the |
| @@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, | |||
| 151 | gfp_t gfp_mask) | 258 | gfp_t gfp_mask) |
| 152 | { | 259 | { |
| 153 | transaction_t *transaction, *new_transaction = NULL; | 260 | transaction_t *transaction, *new_transaction = NULL; |
| 154 | tid_t tid; | 261 | int blocks = handle->h_buffer_credits; |
| 155 | int needed, need_to_start; | 262 | int rsv_blocks = 0; |
| 156 | int nblocks = handle->h_buffer_credits; | ||
| 157 | unsigned long ts = jiffies; | 263 | unsigned long ts = jiffies; |
| 158 | 264 | ||
| 159 | if (nblocks > journal->j_max_transaction_buffers) { | 265 | /* |
| 266 | * 1/2 of transaction can be reserved so we can practically handle | ||
| 267 | * only 1/2 of maximum transaction size per operation | ||
| 268 | */ | ||
| 269 | if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { | ||
| 160 | printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", | 270 | printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", |
| 161 | current->comm, nblocks, | 271 | current->comm, blocks, |
| 162 | journal->j_max_transaction_buffers); | 272 | journal->j_max_transaction_buffers / 2); |
| 163 | return -ENOSPC; | 273 | return -ENOSPC; |
| 164 | } | 274 | } |
| 165 | 275 | ||
| 276 | if (handle->h_rsv_handle) | ||
| 277 | rsv_blocks = handle->h_rsv_handle->h_buffer_credits; | ||
| 278 | |||
| 166 | alloc_transaction: | 279 | alloc_transaction: |
| 167 | if (!journal->j_running_transaction) { | 280 | if (!journal->j_running_transaction) { |
| 168 | new_transaction = kmem_cache_zalloc(transaction_cache, | 281 | new_transaction = kmem_cache_zalloc(transaction_cache, |
| @@ -199,8 +312,12 @@ repeat: | |||
| 199 | return -EROFS; | 312 | return -EROFS; |
| 200 | } | 313 | } |
| 201 | 314 | ||
| 202 | /* Wait on the journal's transaction barrier if necessary */ | 315 | /* |
| 203 | if (journal->j_barrier_count) { | 316 | * Wait on the journal's transaction barrier if necessary. Specifically |
| 317 | * we allow reserved handles to proceed because otherwise commit could | ||
| 318 | * deadlock on page writeback not being able to complete. | ||
| 319 | */ | ||
| 320 | if (!handle->h_reserved && journal->j_barrier_count) { | ||
| 204 | read_unlock(&journal->j_state_lock); | 321 | read_unlock(&journal->j_state_lock); |
| 205 | wait_event(journal->j_wait_transaction_locked, | 322 | wait_event(journal->j_wait_transaction_locked, |
| 206 | journal->j_barrier_count == 0); | 323 | journal->j_barrier_count == 0); |
| @@ -213,7 +330,7 @@ repeat: | |||
| 213 | goto alloc_transaction; | 330 | goto alloc_transaction; |
| 214 | write_lock(&journal->j_state_lock); | 331 | write_lock(&journal->j_state_lock); |
| 215 | if (!journal->j_running_transaction && | 332 | if (!journal->j_running_transaction && |
| 216 | !journal->j_barrier_count) { | 333 | (handle->h_reserved || !journal->j_barrier_count)) { |
| 217 | jbd2_get_transaction(journal, new_transaction); | 334 | jbd2_get_transaction(journal, new_transaction); |
| 218 | new_transaction = NULL; | 335 | new_transaction = NULL; |
| 219 | } | 336 | } |
| @@ -223,85 +340,18 @@ repeat: | |||
| 223 | 340 | ||
| 224 | transaction = journal->j_running_transaction; | 341 | transaction = journal->j_running_transaction; |
| 225 | 342 | ||
| 226 | /* | 343 | if (!handle->h_reserved) { |
| 227 | * If the current transaction is locked down for commit, wait for the | 344 | /* We may have dropped j_state_lock - restart in that case */ |
| 228 | * lock to be released. | 345 | if (add_transaction_credits(journal, blocks, rsv_blocks)) |
| 229 | */ | 346 | goto repeat; |
| 230 | if (transaction->t_state == T_LOCKED) { | 347 | } else { |
| 231 | DEFINE_WAIT(wait); | ||
| 232 | |||
| 233 | prepare_to_wait(&journal->j_wait_transaction_locked, | ||
| 234 | &wait, TASK_UNINTERRUPTIBLE); | ||
| 235 | read_unlock(&journal->j_state_lock); | ||
| 236 | schedule(); | ||
| 237 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
| 238 | goto repeat; | ||
| 239 | } | ||
| 240 | |||
| 241 | /* | ||
| 242 | * If there is not enough space left in the log to write all potential | ||
| 243 | * buffers requested by this operation, we need to stall pending a log | ||
| 244 | * checkpoint to free some more log space. | ||
| 245 | */ | ||
| 246 | needed = atomic_add_return(nblocks, | ||
| 247 | &transaction->t_outstanding_credits); | ||
| 248 | |||
| 249 | if (needed > journal->j_max_transaction_buffers) { | ||
| 250 | /* | 348 | /* |
| 251 | * If the current transaction is already too large, then start | 349 | * We have handle reserved so we are allowed to join T_LOCKED |
| 252 | * to commit it: we can then go back and attach this handle to | 350 | * transaction and we don't have to check for transaction size |
| 253 | * a new transaction. | 351 | * and journal space. |
| 254 | */ | 352 | */ |
| 255 | DEFINE_WAIT(wait); | 353 | sub_reserved_credits(journal, blocks); |
| 256 | 354 | handle->h_reserved = 0; | |
| 257 | jbd_debug(2, "Handle %p starting new commit...\n", handle); | ||
| 258 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
| 259 | prepare_to_wait(&journal->j_wait_transaction_locked, &wait, | ||
| 260 | TASK_UNINTERRUPTIBLE); | ||
| 261 | tid = transaction->t_tid; | ||
| 262 | need_to_start = !tid_geq(journal->j_commit_request, tid); | ||
| 263 | read_unlock(&journal->j_state_lock); | ||
| 264 | if (need_to_start) | ||
| 265 | jbd2_log_start_commit(journal, tid); | ||
| 266 | schedule(); | ||
| 267 | finish_wait(&journal->j_wait_transaction_locked, &wait); | ||
| 268 | goto repeat; | ||
| 269 | } | ||
| 270 | |||
| 271 | /* | ||
| 272 | * The commit code assumes that it can get enough log space | ||
| 273 | * without forcing a checkpoint. This is *critical* for | ||
| 274 | * correctness: a checkpoint of a buffer which is also | ||
| 275 | * associated with a committing transaction creates a deadlock, | ||
| 276 | * so commit simply cannot force through checkpoints. | ||
| 277 | * | ||
| 278 | * We must therefore ensure the necessary space in the journal | ||
| 279 | * *before* starting to dirty potentially checkpointed buffers | ||
| 280 | * in the new transaction. | ||
| 281 | * | ||
| 282 | * The worst part is, any transaction currently committing can | ||
| 283 | * reduce the free space arbitrarily. Be careful to account for | ||
| 284 | * those buffers when checkpointing. | ||
| 285 | */ | ||
| 286 | |||
| 287 | /* | ||
| 288 | * @@@ AKPM: This seems rather over-defensive. We're giving commit | ||
| 289 | * a _lot_ of headroom: 1/4 of the journal plus the size of | ||
| 290 | * the committing transaction. Really, we only need to give it | ||
| 291 | * committing_transaction->t_outstanding_credits plus "enough" for | ||
| 292 | * the log control blocks. | ||
| 293 | * Also, this test is inconsistent with the matching one in | ||
| 294 | * jbd2_journal_extend(). | ||
| 295 | */ | ||
| 296 | if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { | ||
| 297 | jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); | ||
| 298 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
| 299 | read_unlock(&journal->j_state_lock); | ||
| 300 | write_lock(&journal->j_state_lock); | ||
| 301 | if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) | ||
| 302 | __jbd2_log_wait_for_space(journal); | ||
| 303 | write_unlock(&journal->j_state_lock); | ||
| 304 | goto repeat; | ||
| 305 | } | 355 | } |
| 306 | 356 | ||
| 307 | /* OK, account for the buffers that this operation expects to | 357 | /* OK, account for the buffers that this operation expects to |
| @@ -309,15 +359,16 @@ repeat: | |||
| 309 | */ | 359 | */ |
| 310 | update_t_max_wait(transaction, ts); | 360 | update_t_max_wait(transaction, ts); |
| 311 | handle->h_transaction = transaction; | 361 | handle->h_transaction = transaction; |
| 312 | handle->h_requested_credits = nblocks; | 362 | handle->h_requested_credits = blocks; |
| 313 | handle->h_start_jiffies = jiffies; | 363 | handle->h_start_jiffies = jiffies; |
| 314 | atomic_inc(&transaction->t_updates); | 364 | atomic_inc(&transaction->t_updates); |
| 315 | atomic_inc(&transaction->t_handle_count); | 365 | atomic_inc(&transaction->t_handle_count); |
| 316 | jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", | 366 | jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", |
| 317 | handle, nblocks, | 367 | handle, blocks, |
| 318 | atomic_read(&transaction->t_outstanding_credits), | 368 | atomic_read(&transaction->t_outstanding_credits), |
| 319 | __jbd2_log_space_left(journal)); | 369 | jbd2_log_space_left(journal)); |
| 320 | read_unlock(&journal->j_state_lock); | 370 | read_unlock(&journal->j_state_lock); |
| 371 | current->journal_info = handle; | ||
| 321 | 372 | ||
| 322 | lock_map_acquire(&handle->h_lockdep_map); | 373 | lock_map_acquire(&handle->h_lockdep_map); |
| 323 | jbd2_journal_free_transaction(new_transaction); | 374 | jbd2_journal_free_transaction(new_transaction); |
| @@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks) | |||
| 348 | * | 399 | * |
| 349 | * We make sure that the transaction can guarantee at least nblocks of | 400 | * We make sure that the transaction can guarantee at least nblocks of |
| 350 | * modified buffers in the log. We block until the log can guarantee | 401 | * modified buffers in the log. We block until the log can guarantee |
| 351 | * that much space. | 402 | * that much space. Additionally, if rsv_blocks > 0, we also create another |
| 352 | * | 403 | * handle with rsv_blocks reserved blocks in the journal. This handle is |
| 353 | * This function is visible to journal users (like ext3fs), so is not | 404 | * is stored in h_rsv_handle. It is not attached to any particular transaction |
| 354 | * called with the journal already locked. | 405 | * and thus doesn't block transaction commit. If the caller uses this reserved |
| 406 | * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() | ||
| 407 | * on the parent handle will dispose the reserved one. Reserved handle has to | ||
| 408 | * be converted to a normal handle using jbd2_journal_start_reserved() before | ||
| 409 | * it can be used. | ||
| 355 | * | 410 | * |
| 356 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value | 411 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value |
| 357 | * on failure. | 412 | * on failure. |
| 358 | */ | 413 | */ |
| 359 | handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, | 414 | handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, |
| 360 | unsigned int type, unsigned int line_no) | 415 | gfp_t gfp_mask, unsigned int type, |
| 416 | unsigned int line_no) | ||
| 361 | { | 417 | { |
| 362 | handle_t *handle = journal_current_handle(); | 418 | handle_t *handle = journal_current_handle(); |
| 363 | int err; | 419 | int err; |
| @@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, | |||
| 374 | handle = new_handle(nblocks); | 430 | handle = new_handle(nblocks); |
| 375 | if (!handle) | 431 | if (!handle) |
| 376 | return ERR_PTR(-ENOMEM); | 432 | return ERR_PTR(-ENOMEM); |
| 433 | if (rsv_blocks) { | ||
| 434 | handle_t *rsv_handle; | ||
| 377 | 435 | ||
| 378 | current->journal_info = handle; | 436 | rsv_handle = new_handle(rsv_blocks); |
| 437 | if (!rsv_handle) { | ||
| 438 | jbd2_free_handle(handle); | ||
| 439 | return ERR_PTR(-ENOMEM); | ||
| 440 | } | ||
| 441 | rsv_handle->h_reserved = 1; | ||
| 442 | rsv_handle->h_journal = journal; | ||
| 443 | handle->h_rsv_handle = rsv_handle; | ||
| 444 | } | ||
| 379 | 445 | ||
| 380 | err = start_this_handle(journal, handle, gfp_mask); | 446 | err = start_this_handle(journal, handle, gfp_mask); |
| 381 | if (err < 0) { | 447 | if (err < 0) { |
| 448 | if (handle->h_rsv_handle) | ||
| 449 | jbd2_free_handle(handle->h_rsv_handle); | ||
| 382 | jbd2_free_handle(handle); | 450 | jbd2_free_handle(handle); |
| 383 | current->journal_info = NULL; | ||
| 384 | return ERR_PTR(err); | 451 | return ERR_PTR(err); |
| 385 | } | 452 | } |
| 386 | handle->h_type = type; | 453 | handle->h_type = type; |
| @@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start); | |||
| 395 | 462 | ||
| 396 | handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | 463 | handle_t *jbd2_journal_start(journal_t *journal, int nblocks) |
| 397 | { | 464 | { |
| 398 | return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); | 465 | return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); |
| 399 | } | 466 | } |
| 400 | EXPORT_SYMBOL(jbd2_journal_start); | 467 | EXPORT_SYMBOL(jbd2_journal_start); |
| 401 | 468 | ||
| 469 | void jbd2_journal_free_reserved(handle_t *handle) | ||
| 470 | { | ||
| 471 | journal_t *journal = handle->h_journal; | ||
| 472 | |||
| 473 | WARN_ON(!handle->h_reserved); | ||
| 474 | sub_reserved_credits(journal, handle->h_buffer_credits); | ||
| 475 | jbd2_free_handle(handle); | ||
| 476 | } | ||
| 477 | EXPORT_SYMBOL(jbd2_journal_free_reserved); | ||
| 478 | |||
| 479 | /** | ||
| 480 | * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle | ||
| 481 | * @handle: handle to start | ||
| 482 | * | ||
| 483 | * Start handle that has been previously reserved with jbd2_journal_reserve(). | ||
| 484 | * This attaches @handle to the running transaction (or creates one if there's | ||
| 485 | * not transaction running). Unlike jbd2_journal_start() this function cannot | ||
| 486 | * block on journal commit, checkpointing, or similar stuff. It can block on | ||
| 487 | * memory allocation or frozen journal though. | ||
| 488 | * | ||
| 489 | * Return 0 on success, non-zero on error - handle is freed in that case. | ||
| 490 | */ | ||
| 491 | int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, | ||
| 492 | unsigned int line_no) | ||
| 493 | { | ||
| 494 | journal_t *journal = handle->h_journal; | ||
| 495 | int ret = -EIO; | ||
| 496 | |||
| 497 | if (WARN_ON(!handle->h_reserved)) { | ||
| 498 | /* Someone passed in normal handle? Just stop it. */ | ||
| 499 | jbd2_journal_stop(handle); | ||
| 500 | return ret; | ||
| 501 | } | ||
| 502 | /* | ||
| 503 | * Usefulness of mixing of reserved and unreserved handles is | ||
| 504 | * questionable. So far nobody seems to need it so just error out. | ||
| 505 | */ | ||
| 506 | if (WARN_ON(current->journal_info)) { | ||
| 507 | jbd2_journal_free_reserved(handle); | ||
| 508 | return ret; | ||
| 509 | } | ||
| 510 | |||
| 511 | handle->h_journal = NULL; | ||
| 512 | /* | ||
| 513 | * GFP_NOFS is here because callers are likely from writeback or | ||
| 514 | * similarly constrained call sites | ||
| 515 | */ | ||
| 516 | ret = start_this_handle(journal, handle, GFP_NOFS); | ||
| 517 | if (ret < 0) | ||
| 518 | jbd2_journal_free_reserved(handle); | ||
| 519 | handle->h_type = type; | ||
| 520 | handle->h_line_no = line_no; | ||
| 521 | return ret; | ||
| 522 | } | ||
| 523 | EXPORT_SYMBOL(jbd2_journal_start_reserved); | ||
| 402 | 524 | ||
| 403 | /** | 525 | /** |
| 404 | * int jbd2_journal_extend() - extend buffer credits. | 526 | * int jbd2_journal_extend() - extend buffer credits. |
| @@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start); | |||
| 423 | int jbd2_journal_extend(handle_t *handle, int nblocks) | 545 | int jbd2_journal_extend(handle_t *handle, int nblocks) |
| 424 | { | 546 | { |
| 425 | transaction_t *transaction = handle->h_transaction; | 547 | transaction_t *transaction = handle->h_transaction; |
| 426 | journal_t *journal = transaction->t_journal; | 548 | journal_t *journal; |
| 427 | int result; | 549 | int result; |
| 428 | int wanted; | 550 | int wanted; |
| 429 | 551 | ||
| 430 | result = -EIO; | 552 | WARN_ON(!transaction); |
| 431 | if (is_handle_aborted(handle)) | 553 | if (is_handle_aborted(handle)) |
| 432 | goto out; | 554 | return -EROFS; |
| 555 | journal = transaction->t_journal; | ||
| 433 | 556 | ||
| 434 | result = 1; | 557 | result = 1; |
| 435 | 558 | ||
| 436 | read_lock(&journal->j_state_lock); | 559 | read_lock(&journal->j_state_lock); |
| 437 | 560 | ||
| 438 | /* Don't extend a locked-down transaction! */ | 561 | /* Don't extend a locked-down transaction! */ |
| 439 | if (handle->h_transaction->t_state != T_RUNNING) { | 562 | if (transaction->t_state != T_RUNNING) { |
| 440 | jbd_debug(3, "denied handle %p %d blocks: " | 563 | jbd_debug(3, "denied handle %p %d blocks: " |
| 441 | "transaction not running\n", handle, nblocks); | 564 | "transaction not running\n", handle, nblocks); |
| 442 | goto error_out; | 565 | goto error_out; |
| 443 | } | 566 | } |
| 444 | 567 | ||
| 445 | spin_lock(&transaction->t_handle_lock); | 568 | spin_lock(&transaction->t_handle_lock); |
| 446 | wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; | 569 | wanted = atomic_add_return(nblocks, |
| 570 | &transaction->t_outstanding_credits); | ||
| 447 | 571 | ||
| 448 | if (wanted > journal->j_max_transaction_buffers) { | 572 | if (wanted > journal->j_max_transaction_buffers) { |
| 449 | jbd_debug(3, "denied handle %p %d blocks: " | 573 | jbd_debug(3, "denied handle %p %d blocks: " |
| 450 | "transaction too large\n", handle, nblocks); | 574 | "transaction too large\n", handle, nblocks); |
| 575 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
| 451 | goto unlock; | 576 | goto unlock; |
| 452 | } | 577 | } |
| 453 | 578 | ||
| 454 | if (wanted > __jbd2_log_space_left(journal)) { | 579 | if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > |
| 580 | jbd2_log_space_left(journal)) { | ||
| 455 | jbd_debug(3, "denied handle %p %d blocks: " | 581 | jbd_debug(3, "denied handle %p %d blocks: " |
| 456 | "insufficient log space\n", handle, nblocks); | 582 | "insufficient log space\n", handle, nblocks); |
| 583 | atomic_sub(nblocks, &transaction->t_outstanding_credits); | ||
| 457 | goto unlock; | 584 | goto unlock; |
| 458 | } | 585 | } |
| 459 | 586 | ||
| 460 | trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, | 587 | trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, |
| 461 | handle->h_transaction->t_tid, | 588 | transaction->t_tid, |
| 462 | handle->h_type, handle->h_line_no, | 589 | handle->h_type, handle->h_line_no, |
| 463 | handle->h_buffer_credits, | 590 | handle->h_buffer_credits, |
| 464 | nblocks); | 591 | nblocks); |
| 465 | 592 | ||
| 466 | handle->h_buffer_credits += nblocks; | 593 | handle->h_buffer_credits += nblocks; |
| 467 | handle->h_requested_credits += nblocks; | 594 | handle->h_requested_credits += nblocks; |
| 468 | atomic_add(nblocks, &transaction->t_outstanding_credits); | ||
| 469 | result = 0; | 595 | result = 0; |
| 470 | 596 | ||
| 471 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); | 597 | jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); |
| @@ -473,7 +599,6 @@ unlock: | |||
| 473 | spin_unlock(&transaction->t_handle_lock); | 599 | spin_unlock(&transaction->t_handle_lock); |
| 474 | error_out: | 600 | error_out: |
| 475 | read_unlock(&journal->j_state_lock); | 601 | read_unlock(&journal->j_state_lock); |
| 476 | out: | ||
| 477 | return result; | 602 | return result; |
| 478 | } | 603 | } |
| 479 | 604 | ||
| @@ -490,19 +615,22 @@ out: | |||
| 490 | * to a running handle, a call to jbd2_journal_restart will commit the | 615 | * to a running handle, a call to jbd2_journal_restart will commit the |
| 491 | * handle's transaction so far and reattach the handle to a new | 616 | * handle's transaction so far and reattach the handle to a new |
| 492 | * transaction capabable of guaranteeing the requested number of | 617 | * transaction capabable of guaranteeing the requested number of |
| 493 | * credits. | 618 | * credits. We preserve reserved handle if there's any attached to the |
| 619 | * passed in handle. | ||
| 494 | */ | 620 | */ |
| 495 | int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) | 621 | int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) |
| 496 | { | 622 | { |
| 497 | transaction_t *transaction = handle->h_transaction; | 623 | transaction_t *transaction = handle->h_transaction; |
| 498 | journal_t *journal = transaction->t_journal; | 624 | journal_t *journal; |
| 499 | tid_t tid; | 625 | tid_t tid; |
| 500 | int need_to_start, ret; | 626 | int need_to_start, ret; |
| 501 | 627 | ||
| 628 | WARN_ON(!transaction); | ||
| 502 | /* If we've had an abort of any type, don't even think about | 629 | /* If we've had an abort of any type, don't even think about |
| 503 | * actually doing the restart! */ | 630 | * actually doing the restart! */ |
| 504 | if (is_handle_aborted(handle)) | 631 | if (is_handle_aborted(handle)) |
| 505 | return 0; | 632 | return 0; |
| 633 | journal = transaction->t_journal; | ||
| 506 | 634 | ||
| 507 | /* | 635 | /* |
| 508 | * First unlink the handle from its current transaction, and start the | 636 | * First unlink the handle from its current transaction, and start the |
| @@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) | |||
| 515 | spin_lock(&transaction->t_handle_lock); | 643 | spin_lock(&transaction->t_handle_lock); |
| 516 | atomic_sub(handle->h_buffer_credits, | 644 | atomic_sub(handle->h_buffer_credits, |
| 517 | &transaction->t_outstanding_credits); | 645 | &transaction->t_outstanding_credits); |
| 646 | if (handle->h_rsv_handle) { | ||
| 647 | sub_reserved_credits(journal, | ||
| 648 | handle->h_rsv_handle->h_buffer_credits); | ||
| 649 | } | ||
| 518 | if (atomic_dec_and_test(&transaction->t_updates)) | 650 | if (atomic_dec_and_test(&transaction->t_updates)) |
| 519 | wake_up(&journal->j_wait_updates); | 651 | wake_up(&journal->j_wait_updates); |
| 652 | tid = transaction->t_tid; | ||
| 520 | spin_unlock(&transaction->t_handle_lock); | 653 | spin_unlock(&transaction->t_handle_lock); |
| 654 | handle->h_transaction = NULL; | ||
| 655 | current->journal_info = NULL; | ||
| 521 | 656 | ||
| 522 | jbd_debug(2, "restarting handle %p\n", handle); | 657 | jbd_debug(2, "restarting handle %p\n", handle); |
| 523 | tid = transaction->t_tid; | ||
| 524 | need_to_start = !tid_geq(journal->j_commit_request, tid); | 658 | need_to_start = !tid_geq(journal->j_commit_request, tid); |
| 525 | read_unlock(&journal->j_state_lock); | 659 | read_unlock(&journal->j_state_lock); |
| 526 | if (need_to_start) | 660 | if (need_to_start) |
| @@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal) | |||
| 557 | write_lock(&journal->j_state_lock); | 691 | write_lock(&journal->j_state_lock); |
| 558 | ++journal->j_barrier_count; | 692 | ++journal->j_barrier_count; |
| 559 | 693 | ||
| 694 | /* Wait until there are no reserved handles */ | ||
| 695 | if (atomic_read(&journal->j_reserved_credits)) { | ||
| 696 | write_unlock(&journal->j_state_lock); | ||
| 697 | wait_event(journal->j_wait_reserved, | ||
| 698 | atomic_read(&journal->j_reserved_credits) == 0); | ||
| 699 | write_lock(&journal->j_state_lock); | ||
| 700 | } | ||
| 701 | |||
| 560 | /* Wait until there are no running updates */ | 702 | /* Wait until there are no running updates */ |
| 561 | while (1) { | 703 | while (1) { |
| 562 | transaction_t *transaction = journal->j_running_transaction; | 704 | transaction_t *transaction = journal->j_running_transaction; |
| @@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) | |||
| 619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | 761 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); |
| 620 | } | 762 | } |
| 621 | 763 | ||
| 764 | static int sleep_on_shadow_bh(void *word) | ||
| 765 | { | ||
| 766 | io_schedule(); | ||
| 767 | return 0; | ||
| 768 | } | ||
| 769 | |||
| 622 | /* | 770 | /* |
| 623 | * If the buffer is already part of the current transaction, then there | 771 | * If the buffer is already part of the current transaction, then there |
| 624 | * is nothing we need to do. If it is already part of a prior | 772 | * is nothing we need to do. If it is already part of a prior |
| @@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, | |||
| 634 | int force_copy) | 782 | int force_copy) |
| 635 | { | 783 | { |
| 636 | struct buffer_head *bh; | 784 | struct buffer_head *bh; |
| 637 | transaction_t *transaction; | 785 | transaction_t *transaction = handle->h_transaction; |
| 638 | journal_t *journal; | 786 | journal_t *journal; |
| 639 | int error; | 787 | int error; |
| 640 | char *frozen_buffer = NULL; | 788 | char *frozen_buffer = NULL; |
| 641 | int need_copy = 0; | 789 | int need_copy = 0; |
| 642 | unsigned long start_lock, time_lock; | 790 | unsigned long start_lock, time_lock; |
| 643 | 791 | ||
| 792 | WARN_ON(!transaction); | ||
| 644 | if (is_handle_aborted(handle)) | 793 | if (is_handle_aborted(handle)) |
| 645 | return -EROFS; | 794 | return -EROFS; |
| 646 | |||
| 647 | transaction = handle->h_transaction; | ||
| 648 | journal = transaction->t_journal; | 795 | journal = transaction->t_journal; |
| 649 | 796 | ||
| 650 | jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); | 797 | jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); |
| @@ -754,41 +901,29 @@ repeat: | |||
| 754 | * journaled. If the primary copy is already going to | 901 | * journaled. If the primary copy is already going to |
| 755 | * disk then we cannot do copy-out here. */ | 902 | * disk then we cannot do copy-out here. */ |
| 756 | 903 | ||
| 757 | if (jh->b_jlist == BJ_Shadow) { | 904 | if (buffer_shadow(bh)) { |
| 758 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
| 759 | wait_queue_head_t *wqh; | ||
| 760 | |||
| 761 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
| 762 | |||
| 763 | JBUFFER_TRACE(jh, "on shadow: sleep"); | 905 | JBUFFER_TRACE(jh, "on shadow: sleep"); |
| 764 | jbd_unlock_bh_state(bh); | 906 | jbd_unlock_bh_state(bh); |
| 765 | /* commit wakes up all shadow buffers after IO */ | 907 | wait_on_bit(&bh->b_state, BH_Shadow, |
| 766 | for ( ; ; ) { | 908 | sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); |
| 767 | prepare_to_wait(wqh, &wait.wait, | ||
| 768 | TASK_UNINTERRUPTIBLE); | ||
| 769 | if (jh->b_jlist != BJ_Shadow) | ||
| 770 | break; | ||
| 771 | schedule(); | ||
| 772 | } | ||
| 773 | finish_wait(wqh, &wait.wait); | ||
| 774 | goto repeat; | 909 | goto repeat; |
| 775 | } | 910 | } |
| 776 | 911 | ||
| 777 | /* Only do the copy if the currently-owning transaction | 912 | /* |
| 778 | * still needs it. If it is on the Forget list, the | 913 | * Only do the copy if the currently-owning transaction still |
| 779 | * committing transaction is past that stage. The | 914 | * needs it. If buffer isn't on BJ_Metadata list, the |
| 780 | * buffer had better remain locked during the kmalloc, | 915 | * committing transaction is past that stage (here we use the |
| 781 | * but that should be true --- we hold the journal lock | 916 | * fact that BH_Shadow is set under bh_state lock together with |
| 782 | * still and the buffer is already on the BUF_JOURNAL | 917 | * refiling to BJ_Shadow list and at this point we know the |
| 783 | * list so won't be flushed. | 918 | * buffer doesn't have BH_Shadow set). |
| 784 | * | 919 | * |
| 785 | * Subtle point, though: if this is a get_undo_access, | 920 | * Subtle point, though: if this is a get_undo_access, |
| 786 | * then we will be relying on the frozen_data to contain | 921 | * then we will be relying on the frozen_data to contain |
| 787 | * the new value of the committed_data record after the | 922 | * the new value of the committed_data record after the |
| 788 | * transaction, so we HAVE to force the frozen_data copy | 923 | * transaction, so we HAVE to force the frozen_data copy |
| 789 | * in that case. */ | 924 | * in that case. |
| 790 | 925 | */ | |
| 791 | if (jh->b_jlist != BJ_Forget || force_copy) { | 926 | if (jh->b_jlist == BJ_Metadata || force_copy) { |
| 792 | JBUFFER_TRACE(jh, "generate frozen data"); | 927 | JBUFFER_TRACE(jh, "generate frozen data"); |
| 793 | if (!frozen_buffer) { | 928 | if (!frozen_buffer) { |
| 794 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | 929 | JBUFFER_TRACE(jh, "allocate memory for buffer"); |
| @@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) | |||
| 915 | int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) | 1050 | int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) |
| 916 | { | 1051 | { |
| 917 | transaction_t *transaction = handle->h_transaction; | 1052 | transaction_t *transaction = handle->h_transaction; |
| 918 | journal_t *journal = transaction->t_journal; | 1053 | journal_t *journal; |
| 919 | struct journal_head *jh = jbd2_journal_add_journal_head(bh); | 1054 | struct journal_head *jh = jbd2_journal_add_journal_head(bh); |
| 920 | int err; | 1055 | int err; |
| 921 | 1056 | ||
| 922 | jbd_debug(5, "journal_head %p\n", jh); | 1057 | jbd_debug(5, "journal_head %p\n", jh); |
| 1058 | WARN_ON(!transaction); | ||
| 923 | err = -EROFS; | 1059 | err = -EROFS; |
| 924 | if (is_handle_aborted(handle)) | 1060 | if (is_handle_aborted(handle)) |
| 925 | goto out; | 1061 | goto out; |
| 1062 | journal = transaction->t_journal; | ||
| 926 | err = 0; | 1063 | err = 0; |
| 927 | 1064 | ||
| 928 | JBUFFER_TRACE(jh, "entry"); | 1065 | JBUFFER_TRACE(jh, "entry"); |
| @@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, | |||
| 1128 | int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | 1265 | int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) |
| 1129 | { | 1266 | { |
| 1130 | transaction_t *transaction = handle->h_transaction; | 1267 | transaction_t *transaction = handle->h_transaction; |
| 1131 | journal_t *journal = transaction->t_journal; | 1268 | journal_t *journal; |
| 1132 | struct journal_head *jh; | 1269 | struct journal_head *jh; |
| 1133 | int ret = 0; | 1270 | int ret = 0; |
| 1134 | 1271 | ||
| 1272 | WARN_ON(!transaction); | ||
| 1135 | if (is_handle_aborted(handle)) | 1273 | if (is_handle_aborted(handle)) |
| 1136 | goto out; | 1274 | return -EROFS; |
| 1275 | journal = transaction->t_journal; | ||
| 1137 | jh = jbd2_journal_grab_journal_head(bh); | 1276 | jh = jbd2_journal_grab_journal_head(bh); |
| 1138 | if (!jh) { | 1277 | if (!jh) { |
| 1139 | ret = -EUCLEAN; | 1278 | ret = -EUCLEAN; |
| @@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | |||
| 1227 | 1366 | ||
| 1228 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | 1367 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); |
| 1229 | spin_lock(&journal->j_list_lock); | 1368 | spin_lock(&journal->j_list_lock); |
| 1230 | __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); | 1369 | __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); |
| 1231 | spin_unlock(&journal->j_list_lock); | 1370 | spin_unlock(&journal->j_list_lock); |
| 1232 | out_unlock_bh: | 1371 | out_unlock_bh: |
| 1233 | jbd_unlock_bh_state(bh); | 1372 | jbd_unlock_bh_state(bh); |
| @@ -1258,12 +1397,17 @@ out: | |||
| 1258 | int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) | 1397 | int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) |
| 1259 | { | 1398 | { |
| 1260 | transaction_t *transaction = handle->h_transaction; | 1399 | transaction_t *transaction = handle->h_transaction; |
| 1261 | journal_t *journal = transaction->t_journal; | 1400 | journal_t *journal; |
| 1262 | struct journal_head *jh; | 1401 | struct journal_head *jh; |
| 1263 | int drop_reserve = 0; | 1402 | int drop_reserve = 0; |
| 1264 | int err = 0; | 1403 | int err = 0; |
| 1265 | int was_modified = 0; | 1404 | int was_modified = 0; |
| 1266 | 1405 | ||
| 1406 | WARN_ON(!transaction); | ||
| 1407 | if (is_handle_aborted(handle)) | ||
| 1408 | return -EROFS; | ||
| 1409 | journal = transaction->t_journal; | ||
| 1410 | |||
| 1267 | BUFFER_TRACE(bh, "entry"); | 1411 | BUFFER_TRACE(bh, "entry"); |
| 1268 | 1412 | ||
| 1269 | jbd_lock_bh_state(bh); | 1413 | jbd_lock_bh_state(bh); |
| @@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) | |||
| 1290 | */ | 1434 | */ |
| 1291 | jh->b_modified = 0; | 1435 | jh->b_modified = 0; |
| 1292 | 1436 | ||
| 1293 | if (jh->b_transaction == handle->h_transaction) { | 1437 | if (jh->b_transaction == transaction) { |
| 1294 | J_ASSERT_JH(jh, !jh->b_frozen_data); | 1438 | J_ASSERT_JH(jh, !jh->b_frozen_data); |
| 1295 | 1439 | ||
| 1296 | /* If we are forgetting a buffer which is already part | 1440 | /* If we are forgetting a buffer which is already part |
| @@ -1385,19 +1529,21 @@ drop: | |||
| 1385 | int jbd2_journal_stop(handle_t *handle) | 1529 | int jbd2_journal_stop(handle_t *handle) |
| 1386 | { | 1530 | { |
| 1387 | transaction_t *transaction = handle->h_transaction; | 1531 | transaction_t *transaction = handle->h_transaction; |
| 1388 | journal_t *journal = transaction->t_journal; | 1532 | journal_t *journal; |
| 1389 | int err, wait_for_commit = 0; | 1533 | int err = 0, wait_for_commit = 0; |
| 1390 | tid_t tid; | 1534 | tid_t tid; |
| 1391 | pid_t pid; | 1535 | pid_t pid; |
| 1392 | 1536 | ||
| 1537 | if (!transaction) | ||
| 1538 | goto free_and_exit; | ||
| 1539 | journal = transaction->t_journal; | ||
| 1540 | |||
| 1393 | J_ASSERT(journal_current_handle() == handle); | 1541 | J_ASSERT(journal_current_handle() == handle); |
| 1394 | 1542 | ||
| 1395 | if (is_handle_aborted(handle)) | 1543 | if (is_handle_aborted(handle)) |
| 1396 | err = -EIO; | 1544 | err = -EIO; |
| 1397 | else { | 1545 | else |
| 1398 | J_ASSERT(atomic_read(&transaction->t_updates) > 0); | 1546 | J_ASSERT(atomic_read(&transaction->t_updates) > 0); |
| 1399 | err = 0; | ||
| 1400 | } | ||
| 1401 | 1547 | ||
| 1402 | if (--handle->h_ref > 0) { | 1548 | if (--handle->h_ref > 0) { |
| 1403 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, | 1549 | jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, |
| @@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle) | |||
| 1407 | 1553 | ||
| 1408 | jbd_debug(4, "Handle %p going down\n", handle); | 1554 | jbd_debug(4, "Handle %p going down\n", handle); |
| 1409 | trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, | 1555 | trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, |
| 1410 | handle->h_transaction->t_tid, | 1556 | transaction->t_tid, |
| 1411 | handle->h_type, handle->h_line_no, | 1557 | handle->h_type, handle->h_line_no, |
| 1412 | jiffies - handle->h_start_jiffies, | 1558 | jiffies - handle->h_start_jiffies, |
| 1413 | handle->h_sync, handle->h_requested_credits, | 1559 | handle->h_sync, handle->h_requested_credits, |
| @@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle) | |||
| 1518 | 1664 | ||
| 1519 | lock_map_release(&handle->h_lockdep_map); | 1665 | lock_map_release(&handle->h_lockdep_map); |
| 1520 | 1666 | ||
| 1667 | if (handle->h_rsv_handle) | ||
| 1668 | jbd2_journal_free_reserved(handle->h_rsv_handle); | ||
| 1669 | free_and_exit: | ||
| 1521 | jbd2_free_handle(handle); | 1670 | jbd2_free_handle(handle); |
| 1522 | return err; | 1671 | return err; |
| 1523 | } | 1672 | } |
| 1524 | 1673 | ||
| 1525 | /** | ||
| 1526 | * int jbd2_journal_force_commit() - force any uncommitted transactions | ||
| 1527 | * @journal: journal to force | ||
| 1528 | * | ||
| 1529 | * For synchronous operations: force any uncommitted transactions | ||
| 1530 | * to disk. May seem kludgy, but it reuses all the handle batching | ||
| 1531 | * code in a very simple manner. | ||
| 1532 | */ | ||
| 1533 | int jbd2_journal_force_commit(journal_t *journal) | ||
| 1534 | { | ||
| 1535 | handle_t *handle; | ||
| 1536 | int ret; | ||
| 1537 | |||
| 1538 | handle = jbd2_journal_start(journal, 1); | ||
| 1539 | if (IS_ERR(handle)) { | ||
| 1540 | ret = PTR_ERR(handle); | ||
| 1541 | } else { | ||
| 1542 | handle->h_sync = 1; | ||
| 1543 | ret = jbd2_journal_stop(handle); | ||
| 1544 | } | ||
| 1545 | return ret; | ||
| 1546 | } | ||
| 1547 | |||
| 1548 | /* | 1674 | /* |
| 1549 | * | 1675 | * |
| 1550 | * List management code snippets: various functions for manipulating the | 1676 | * List management code snippets: various functions for manipulating the |
| @@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
| 1601 | * Remove a buffer from the appropriate transaction list. | 1727 | * Remove a buffer from the appropriate transaction list. |
| 1602 | * | 1728 | * |
| 1603 | * Note that this function can *change* the value of | 1729 | * Note that this function can *change* the value of |
| 1604 | * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, | 1730 | * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or |
| 1605 | * t_log_list or t_reserved_list. If the caller is holding onto a copy of one | 1731 | * t_reserved_list. If the caller is holding onto a copy of one of these |
| 1606 | * of these pointers, it could go bad. Generally the caller needs to re-read | 1732 | * pointers, it could go bad. Generally the caller needs to re-read the |
| 1607 | * the pointer from the transaction_t. | 1733 | * pointer from the transaction_t. |
| 1608 | * | 1734 | * |
| 1609 | * Called under j_list_lock. | 1735 | * Called under j_list_lock. |
| 1610 | */ | 1736 | */ |
| @@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
| 1634 | case BJ_Forget: | 1760 | case BJ_Forget: |
| 1635 | list = &transaction->t_forget; | 1761 | list = &transaction->t_forget; |
| 1636 | break; | 1762 | break; |
| 1637 | case BJ_IO: | ||
| 1638 | list = &transaction->t_iobuf_list; | ||
| 1639 | break; | ||
| 1640 | case BJ_Shadow: | 1763 | case BJ_Shadow: |
| 1641 | list = &transaction->t_shadow_list; | 1764 | list = &transaction->t_shadow_list; |
| 1642 | break; | 1765 | break; |
| 1643 | case BJ_LogCtl: | ||
| 1644 | list = &transaction->t_log_list; | ||
| 1645 | break; | ||
| 1646 | case BJ_Reserved: | 1766 | case BJ_Reserved: |
| 1647 | list = &transaction->t_reserved_list; | 1767 | list = &transaction->t_reserved_list; |
| 1648 | break; | 1768 | break; |
| @@ -2034,18 +2154,23 @@ zap_buffer_unlocked: | |||
| 2034 | * void jbd2_journal_invalidatepage() | 2154 | * void jbd2_journal_invalidatepage() |
| 2035 | * @journal: journal to use for flush... | 2155 | * @journal: journal to use for flush... |
| 2036 | * @page: page to flush | 2156 | * @page: page to flush |
| 2037 | * @offset: length of page to invalidate. | 2157 | * @offset: start of the range to invalidate |
| 2158 | * @length: length of the range to invalidate | ||
| 2038 | * | 2159 | * |
| 2039 | * Reap page buffers containing data after offset in page. Can return -EBUSY | 2160 | * Reap page buffers containing data after in the specified range in page. |
| 2040 | * if buffers are part of the committing transaction and the page is straddling | 2161 | * Can return -EBUSY if buffers are part of the committing transaction and |
| 2041 | * i_size. Caller then has to wait for current commit and try again. | 2162 | * the page is straddling i_size. Caller then has to wait for current commit |
| 2163 | * and try again. | ||
| 2042 | */ | 2164 | */ |
| 2043 | int jbd2_journal_invalidatepage(journal_t *journal, | 2165 | int jbd2_journal_invalidatepage(journal_t *journal, |
| 2044 | struct page *page, | 2166 | struct page *page, |
| 2045 | unsigned long offset) | 2167 | unsigned int offset, |
| 2168 | unsigned int length) | ||
| 2046 | { | 2169 | { |
| 2047 | struct buffer_head *head, *bh, *next; | 2170 | struct buffer_head *head, *bh, *next; |
| 2171 | unsigned int stop = offset + length; | ||
| 2048 | unsigned int curr_off = 0; | 2172 | unsigned int curr_off = 0; |
| 2173 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
| 2049 | int may_free = 1; | 2174 | int may_free = 1; |
| 2050 | int ret = 0; | 2175 | int ret = 0; |
| 2051 | 2176 | ||
| @@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
| 2054 | if (!page_has_buffers(page)) | 2179 | if (!page_has_buffers(page)) |
| 2055 | return 0; | 2180 | return 0; |
| 2056 | 2181 | ||
| 2182 | BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); | ||
| 2183 | |||
| 2057 | /* We will potentially be playing with lists other than just the | 2184 | /* We will potentially be playing with lists other than just the |
| 2058 | * data lists (especially for journaled data mode), so be | 2185 | * data lists (especially for journaled data mode), so be |
| 2059 | * cautious in our locking. */ | 2186 | * cautious in our locking. */ |
| @@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
| 2063 | unsigned int next_off = curr_off + bh->b_size; | 2190 | unsigned int next_off = curr_off + bh->b_size; |
| 2064 | next = bh->b_this_page; | 2191 | next = bh->b_this_page; |
| 2065 | 2192 | ||
| 2193 | if (next_off > stop) | ||
| 2194 | return 0; | ||
| 2195 | |||
| 2066 | if (offset <= curr_off) { | 2196 | if (offset <= curr_off) { |
| 2067 | /* This block is wholly outside the truncation point */ | 2197 | /* This block is wholly outside the truncation point */ |
| 2068 | lock_buffer(bh); | 2198 | lock_buffer(bh); |
| 2069 | ret = journal_unmap_buffer(journal, bh, offset > 0); | 2199 | ret = journal_unmap_buffer(journal, bh, partial_page); |
| 2070 | unlock_buffer(bh); | 2200 | unlock_buffer(bh); |
| 2071 | if (ret < 0) | 2201 | if (ret < 0) |
| 2072 | return ret; | 2202 | return ret; |
| @@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, | |||
| 2077 | 2207 | ||
| 2078 | } while (bh != head); | 2208 | } while (bh != head); |
| 2079 | 2209 | ||
| 2080 | if (!offset) { | 2210 | if (!partial_page) { |
| 2081 | if (may_free && try_to_free_buffers(page)) | 2211 | if (may_free && try_to_free_buffers(page)) |
| 2082 | J_ASSERT(!page_has_buffers(page)); | 2212 | J_ASSERT(!page_has_buffers(page)); |
| 2083 | } | 2213 | } |
| @@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
| 2138 | case BJ_Forget: | 2268 | case BJ_Forget: |
| 2139 | list = &transaction->t_forget; | 2269 | list = &transaction->t_forget; |
| 2140 | break; | 2270 | break; |
| 2141 | case BJ_IO: | ||
| 2142 | list = &transaction->t_iobuf_list; | ||
| 2143 | break; | ||
| 2144 | case BJ_Shadow: | 2271 | case BJ_Shadow: |
| 2145 | list = &transaction->t_shadow_list; | 2272 | list = &transaction->t_shadow_list; |
| 2146 | break; | 2273 | break; |
| 2147 | case BJ_LogCtl: | ||
| 2148 | list = &transaction->t_log_list; | ||
| 2149 | break; | ||
| 2150 | case BJ_Reserved: | 2274 | case BJ_Reserved: |
| 2151 | list = &transaction->t_reserved_list; | 2275 | list = &transaction->t_reserved_list; |
| 2152 | break; | 2276 | break; |
| @@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |||
| 2248 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | 2372 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) |
| 2249 | { | 2373 | { |
| 2250 | transaction_t *transaction = handle->h_transaction; | 2374 | transaction_t *transaction = handle->h_transaction; |
| 2251 | journal_t *journal = transaction->t_journal; | 2375 | journal_t *journal; |
| 2252 | 2376 | ||
| 2377 | WARN_ON(!transaction); | ||
| 2253 | if (is_handle_aborted(handle)) | 2378 | if (is_handle_aborted(handle)) |
| 2254 | return -EIO; | 2379 | return -EROFS; |
| 2380 | journal = transaction->t_journal; | ||
| 2255 | 2381 | ||
| 2256 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | 2382 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, |
| 2257 | transaction->t_tid); | 2383 | transaction->t_tid); |
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index acd46a4160cb..e3aac222472e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | #include <linux/time.h> | 22 | #include <linux/time.h> |
| 23 | #include "nodelist.h" | 23 | #include "nodelist.h" |
| 24 | 24 | ||
| 25 | static int jffs2_readdir (struct file *, void *, filldir_t); | 25 | static int jffs2_readdir (struct file *, struct dir_context *); |
| 26 | 26 | ||
| 27 | static int jffs2_create (struct inode *,struct dentry *,umode_t, | 27 | static int jffs2_create (struct inode *,struct dentry *,umode_t, |
| 28 | bool); | 28 | bool); |
| @@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *, | |||
| 40 | const struct file_operations jffs2_dir_operations = | 40 | const struct file_operations jffs2_dir_operations = |
| 41 | { | 41 | { |
| 42 | .read = generic_read_dir, | 42 | .read = generic_read_dir, |
| 43 | .readdir = jffs2_readdir, | 43 | .iterate = jffs2_readdir, |
| 44 | .unlocked_ioctl=jffs2_ioctl, | 44 | .unlocked_ioctl=jffs2_ioctl, |
| 45 | .fsync = jffs2_fsync, | 45 | .fsync = jffs2_fsync, |
| 46 | .llseek = generic_file_llseek, | 46 | .llseek = generic_file_llseek, |
| @@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, | |||
| 114 | /***********************************************************************/ | 114 | /***********************************************************************/ |
| 115 | 115 | ||
| 116 | 116 | ||
| 117 | static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) | 117 | static int jffs2_readdir(struct file *file, struct dir_context *ctx) |
| 118 | { | 118 | { |
| 119 | struct jffs2_inode_info *f; | 119 | struct inode *inode = file_inode(file); |
| 120 | struct inode *inode = file_inode(filp); | 120 | struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); |
| 121 | struct jffs2_full_dirent *fd; | 121 | struct jffs2_full_dirent *fd; |
| 122 | unsigned long offset, curofs; | 122 | unsigned long curofs = 1; |
| 123 | 123 | ||
| 124 | jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", | 124 | jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); |
| 125 | file_inode(filp)->i_ino); | ||
| 126 | 125 | ||
| 127 | f = JFFS2_INODE_INFO(inode); | 126 | if (!dir_emit_dots(file, ctx)) |
| 128 | 127 | return 0; | |
| 129 | offset = filp->f_pos; | ||
| 130 | |||
| 131 | if (offset == 0) { | ||
| 132 | jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino); | ||
| 133 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) | ||
| 134 | goto out; | ||
| 135 | offset++; | ||
| 136 | } | ||
| 137 | if (offset == 1) { | ||
| 138 | unsigned long pino = parent_ino(filp->f_path.dentry); | ||
| 139 | jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino); | ||
| 140 | if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) | ||
| 141 | goto out; | ||
| 142 | offset++; | ||
| 143 | } | ||
| 144 | 128 | ||
| 145 | curofs=1; | ||
| 146 | mutex_lock(&f->sem); | 129 | mutex_lock(&f->sem); |
| 147 | for (fd = f->dents; fd; fd = fd->next) { | 130 | for (fd = f->dents; fd; fd = fd->next) { |
| 148 | |||
| 149 | curofs++; | 131 | curofs++; |
| 150 | /* First loop: curofs = 2; offset = 2 */ | 132 | /* First loop: curofs = 2; pos = 2 */ |
| 151 | if (curofs < offset) { | 133 | if (curofs < ctx->pos) { |
| 152 | jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", | 134 | jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", |
| 153 | fd->name, fd->ino, fd->type, curofs, offset); | 135 | fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos); |
| 154 | continue; | 136 | continue; |
| 155 | } | 137 | } |
| 156 | if (!fd->ino) { | 138 | if (!fd->ino) { |
| 157 | jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", | 139 | jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", |
| 158 | fd->name); | 140 | fd->name); |
| 159 | offset++; | 141 | ctx->pos++; |
| 160 | continue; | 142 | continue; |
| 161 | } | 143 | } |
| 162 | jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", | 144 | jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", |
| 163 | offset, fd->name, fd->ino, fd->type); | 145 | (unsigned long)ctx->pos, fd->name, fd->ino, fd->type); |
| 164 | if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) | 146 | if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type)) |
| 165 | break; | 147 | break; |
| 166 | offset++; | 148 | ctx->pos++; |
| 167 | } | 149 | } |
| 168 | mutex_unlock(&f->sem); | 150 | mutex_unlock(&f->sem); |
| 169 | out: | ||
| 170 | filp->f_pos = offset; | ||
| 171 | return 0; | 151 | return 0; |
| 172 | } | 152 | } |
| 173 | 153 | ||
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 0ddbeceafc62..9f4ed13d9f15 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c | |||
| @@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) | |||
| 3002 | * return: offset = (pn, index) of start entry | 3002 | * return: offset = (pn, index) of start entry |
| 3003 | * of next jfs_readdir()/dtRead() | 3003 | * of next jfs_readdir()/dtRead() |
| 3004 | */ | 3004 | */ |
| 3005 | int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 3005 | int jfs_readdir(struct file *file, struct dir_context *ctx) |
| 3006 | { | 3006 | { |
| 3007 | struct inode *ip = file_inode(filp); | 3007 | struct inode *ip = file_inode(file); |
| 3008 | struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; | 3008 | struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; |
| 3009 | int rc = 0; | 3009 | int rc = 0; |
| 3010 | loff_t dtpos; /* legacy OS/2 style position */ | 3010 | loff_t dtpos; /* legacy OS/2 style position */ |
| @@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3033 | int overflow, fix_page, page_fixed = 0; | 3033 | int overflow, fix_page, page_fixed = 0; |
| 3034 | static int unique_pos = 2; /* If we can't fix broken index */ | 3034 | static int unique_pos = 2; /* If we can't fix broken index */ |
| 3035 | 3035 | ||
| 3036 | if (filp->f_pos == DIREND) | 3036 | if (ctx->pos == DIREND) |
| 3037 | return 0; | 3037 | return 0; |
| 3038 | 3038 | ||
| 3039 | if (DO_INDEX(ip)) { | 3039 | if (DO_INDEX(ip)) { |
| @@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3045 | */ | 3045 | */ |
| 3046 | do_index = 1; | 3046 | do_index = 1; |
| 3047 | 3047 | ||
| 3048 | dir_index = (u32) filp->f_pos; | 3048 | dir_index = (u32) ctx->pos; |
| 3049 | 3049 | ||
| 3050 | if (dir_index > 1) { | 3050 | if (dir_index > 1) { |
| 3051 | struct dir_table_slot dirtab_slot; | 3051 | struct dir_table_slot dirtab_slot; |
| @@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3053 | if (dtEmpty(ip) || | 3053 | if (dtEmpty(ip) || |
| 3054 | (dir_index >= JFS_IP(ip)->next_index)) { | 3054 | (dir_index >= JFS_IP(ip)->next_index)) { |
| 3055 | /* Stale position. Directory has shrunk */ | 3055 | /* Stale position. Directory has shrunk */ |
| 3056 | filp->f_pos = DIREND; | 3056 | ctx->pos = DIREND; |
| 3057 | return 0; | 3057 | return 0; |
| 3058 | } | 3058 | } |
| 3059 | repeat: | 3059 | repeat: |
| 3060 | rc = read_index(ip, dir_index, &dirtab_slot); | 3060 | rc = read_index(ip, dir_index, &dirtab_slot); |
| 3061 | if (rc) { | 3061 | if (rc) { |
| 3062 | filp->f_pos = DIREND; | 3062 | ctx->pos = DIREND; |
| 3063 | return rc; | 3063 | return rc; |
| 3064 | } | 3064 | } |
| 3065 | if (dirtab_slot.flag == DIR_INDEX_FREE) { | 3065 | if (dirtab_slot.flag == DIR_INDEX_FREE) { |
| 3066 | if (loop_count++ > JFS_IP(ip)->next_index) { | 3066 | if (loop_count++ > JFS_IP(ip)->next_index) { |
| 3067 | jfs_err("jfs_readdir detected " | 3067 | jfs_err("jfs_readdir detected " |
| 3068 | "infinite loop!"); | 3068 | "infinite loop!"); |
| 3069 | filp->f_pos = DIREND; | 3069 | ctx->pos = DIREND; |
| 3070 | return 0; | 3070 | return 0; |
| 3071 | } | 3071 | } |
| 3072 | dir_index = le32_to_cpu(dirtab_slot.addr2); | 3072 | dir_index = le32_to_cpu(dirtab_slot.addr2); |
| 3073 | if (dir_index == -1) { | 3073 | if (dir_index == -1) { |
| 3074 | filp->f_pos = DIREND; | 3074 | ctx->pos = DIREND; |
| 3075 | return 0; | 3075 | return 0; |
| 3076 | } | 3076 | } |
| 3077 | goto repeat; | 3077 | goto repeat; |
| @@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3080 | index = dirtab_slot.slot; | 3080 | index = dirtab_slot.slot; |
| 3081 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | 3081 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); |
| 3082 | if (rc) { | 3082 | if (rc) { |
| 3083 | filp->f_pos = DIREND; | 3083 | ctx->pos = DIREND; |
| 3084 | return 0; | 3084 | return 0; |
| 3085 | } | 3085 | } |
| 3086 | if (p->header.flag & BT_INTERNAL) { | 3086 | if (p->header.flag & BT_INTERNAL) { |
| 3087 | jfs_err("jfs_readdir: bad index table"); | 3087 | jfs_err("jfs_readdir: bad index table"); |
| 3088 | DT_PUTPAGE(mp); | 3088 | DT_PUTPAGE(mp); |
| 3089 | filp->f_pos = -1; | 3089 | ctx->pos = -1; |
| 3090 | return 0; | 3090 | return 0; |
| 3091 | } | 3091 | } |
| 3092 | } else { | 3092 | } else { |
| @@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3094 | /* | 3094 | /* |
| 3095 | * self "." | 3095 | * self "." |
| 3096 | */ | 3096 | */ |
| 3097 | filp->f_pos = 0; | 3097 | ctx->pos = 0; |
| 3098 | if (filldir(dirent, ".", 1, 0, ip->i_ino, | 3098 | if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) |
| 3099 | DT_DIR)) | ||
| 3100 | return 0; | 3099 | return 0; |
| 3101 | } | 3100 | } |
| 3102 | /* | 3101 | /* |
| 3103 | * parent ".." | 3102 | * parent ".." |
| 3104 | */ | 3103 | */ |
| 3105 | filp->f_pos = 1; | 3104 | ctx->pos = 1; |
| 3106 | if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) | 3105 | if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) |
| 3107 | return 0; | 3106 | return 0; |
| 3108 | 3107 | ||
| 3109 | /* | 3108 | /* |
| 3110 | * Find first entry of left-most leaf | 3109 | * Find first entry of left-most leaf |
| 3111 | */ | 3110 | */ |
| 3112 | if (dtEmpty(ip)) { | 3111 | if (dtEmpty(ip)) { |
| 3113 | filp->f_pos = DIREND; | 3112 | ctx->pos = DIREND; |
| 3114 | return 0; | 3113 | return 0; |
| 3115 | } | 3114 | } |
| 3116 | 3115 | ||
| @@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3128 | * pn > 0: Real entries, pn=1 -> leftmost page | 3127 | * pn > 0: Real entries, pn=1 -> leftmost page |
| 3129 | * pn = index = -1: No more entries | 3128 | * pn = index = -1: No more entries |
| 3130 | */ | 3129 | */ |
| 3131 | dtpos = filp->f_pos; | 3130 | dtpos = ctx->pos; |
| 3132 | if (dtpos == 0) { | 3131 | if (dtpos == 0) { |
| 3133 | /* build "." entry */ | 3132 | /* build "." entry */ |
| 3134 | 3133 | if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) | |
| 3135 | if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, | ||
| 3136 | DT_DIR)) | ||
| 3137 | return 0; | 3134 | return 0; |
| 3138 | dtoffset->index = 1; | 3135 | dtoffset->index = 1; |
| 3139 | filp->f_pos = dtpos; | 3136 | ctx->pos = dtpos; |
| 3140 | } | 3137 | } |
| 3141 | 3138 | ||
| 3142 | if (dtoffset->pn == 0) { | 3139 | if (dtoffset->pn == 0) { |
| 3143 | if (dtoffset->index == 1) { | 3140 | if (dtoffset->index == 1) { |
| 3144 | /* build ".." entry */ | 3141 | /* build ".." entry */ |
| 3145 | 3142 | if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) | |
| 3146 | if (filldir(dirent, "..", 2, filp->f_pos, | ||
| 3147 | PARENT(ip), DT_DIR)) | ||
| 3148 | return 0; | 3143 | return 0; |
| 3149 | } else { | 3144 | } else { |
| 3150 | jfs_err("jfs_readdir called with " | 3145 | jfs_err("jfs_readdir called with " |
| @@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3152 | } | 3147 | } |
| 3153 | dtoffset->pn = 1; | 3148 | dtoffset->pn = 1; |
| 3154 | dtoffset->index = 0; | 3149 | dtoffset->index = 0; |
| 3155 | filp->f_pos = dtpos; | 3150 | ctx->pos = dtpos; |
| 3156 | } | 3151 | } |
| 3157 | 3152 | ||
| 3158 | if (dtEmpty(ip)) { | 3153 | if (dtEmpty(ip)) { |
| 3159 | filp->f_pos = DIREND; | 3154 | ctx->pos = DIREND; |
| 3160 | return 0; | 3155 | return 0; |
| 3161 | } | 3156 | } |
| 3162 | 3157 | ||
| 3163 | if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { | 3158 | if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { |
| 3164 | jfs_err("jfs_readdir: unexpected rc = %d " | 3159 | jfs_err("jfs_readdir: unexpected rc = %d " |
| 3165 | "from dtReadNext", rc); | 3160 | "from dtReadNext", rc); |
| 3166 | filp->f_pos = DIREND; | 3161 | ctx->pos = DIREND; |
| 3167 | return 0; | 3162 | return 0; |
| 3168 | } | 3163 | } |
| 3169 | /* get start leaf page and index */ | 3164 | /* get start leaf page and index */ |
| @@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3171 | 3166 | ||
| 3172 | /* offset beyond directory eof ? */ | 3167 | /* offset beyond directory eof ? */ |
| 3173 | if (bn < 0) { | 3168 | if (bn < 0) { |
| 3174 | filp->f_pos = DIREND; | 3169 | ctx->pos = DIREND; |
| 3175 | return 0; | 3170 | return 0; |
| 3176 | } | 3171 | } |
| 3177 | } | 3172 | } |
| @@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 3180 | if (dirent_buf == 0) { | 3175 | if (dirent_buf == 0) { |
| 3181 | DT_PUTPAGE(mp); | 3176 | DT_PUTPAGE(mp); |
| 3182 | jfs_warn("jfs_readdir: __get_free_page failed!"); | 3177 | jfs_warn("jfs_readdir: __get_free_page failed!"); |
| 3183 | filp->f_pos = DIREND; | 3178 | ctx->pos = DIREND; |
| 3184 | return -ENOMEM; | 3179 | return -ENOMEM; |
| 3185 | } | 3180 | } |
| 3186 | 3181 | ||
| @@ -3295,9 +3290,9 @@ skip_one: | |||
| 3295 | 3290 | ||
| 3296 | jfs_dirent = (struct jfs_dirent *) dirent_buf; | 3291 | jfs_dirent = (struct jfs_dirent *) dirent_buf; |
| 3297 | while (jfs_dirents--) { | 3292 | while (jfs_dirents--) { |
| 3298 | filp->f_pos = jfs_dirent->position; | 3293 | ctx->pos = jfs_dirent->position; |
| 3299 | if (filldir(dirent, jfs_dirent->name, | 3294 | if (!dir_emit(ctx, jfs_dirent->name, |
| 3300 | jfs_dirent->name_len, filp->f_pos, | 3295 | jfs_dirent->name_len, |
| 3301 | jfs_dirent->ino, DT_UNKNOWN)) | 3296 | jfs_dirent->ino, DT_UNKNOWN)) |
| 3302 | goto out; | 3297 | goto out; |
| 3303 | jfs_dirent = next_jfs_dirent(jfs_dirent); | 3298 | jfs_dirent = next_jfs_dirent(jfs_dirent); |
| @@ -3309,7 +3304,7 @@ skip_one: | |||
| 3309 | } | 3304 | } |
| 3310 | 3305 | ||
| 3311 | if (!overflow && (bn == 0)) { | 3306 | if (!overflow && (bn == 0)) { |
| 3312 | filp->f_pos = DIREND; | 3307 | ctx->pos = DIREND; |
| 3313 | break; | 3308 | break; |
| 3314 | } | 3309 | } |
| 3315 | 3310 | ||
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 2545bb317235..fd4169e6e698 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h | |||
| @@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, | |||
| 265 | extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, | 265 | extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, |
| 266 | ino_t * orig_ino, ino_t new_ino, int flag); | 266 | ino_t * orig_ino, ino_t new_ino, int flag); |
| 267 | 267 | ||
| 268 | extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); | 268 | extern int jfs_readdir(struct file *file, struct dir_context *ctx); |
| 269 | #endif /* !_H_JFS_DTREE */ | 269 | #endif /* !_H_JFS_DTREE */ |
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 6740d34cd82b..9e3aaff11f89 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c | |||
| @@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) | |||
| 571 | return ret; | 571 | return ret; |
| 572 | } | 572 | } |
| 573 | 573 | ||
| 574 | static void metapage_invalidatepage(struct page *page, unsigned long offset) | 574 | static void metapage_invalidatepage(struct page *page, unsigned int offset, |
| 575 | unsigned int length) | ||
| 575 | { | 576 | { |
| 576 | BUG_ON(offset); | 577 | BUG_ON(offset || length < PAGE_CACHE_SIZE); |
| 577 | 578 | ||
| 578 | BUG_ON(PageWriteback(page)); | 579 | BUG_ON(PageWriteback(page)); |
| 579 | 580 | ||
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 3b91a7ad6086..89186b7b9002 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c | |||
| @@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = { | |||
| 1529 | 1529 | ||
| 1530 | const struct file_operations jfs_dir_operations = { | 1530 | const struct file_operations jfs_dir_operations = { |
| 1531 | .read = generic_read_dir, | 1531 | .read = generic_read_dir, |
| 1532 | .readdir = jfs_readdir, | 1532 | .iterate = jfs_readdir, |
| 1533 | .fsync = jfs_fsync, | 1533 | .fsync = jfs_fsync, |
| 1534 | .unlocked_ioctl = jfs_ioctl, | 1534 | .unlocked_ioctl = jfs_ioctl, |
| 1535 | #ifdef CONFIG_COMPAT | 1535 | #ifdef CONFIG_COMPAT |
diff --git a/fs/libfs.c b/fs/libfs.c index 916da8c4158b..c3a0837fb861 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
| @@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode) | |||
| 135 | * both impossible due to the lock on directory. | 135 | * both impossible due to the lock on directory. |
| 136 | */ | 136 | */ |
| 137 | 137 | ||
| 138 | int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) | 138 | int dcache_readdir(struct file *file, struct dir_context *ctx) |
| 139 | { | 139 | { |
| 140 | struct dentry *dentry = filp->f_path.dentry; | 140 | struct dentry *dentry = file->f_path.dentry; |
| 141 | struct dentry *cursor = filp->private_data; | 141 | struct dentry *cursor = file->private_data; |
| 142 | struct list_head *p, *q = &cursor->d_u.d_child; | 142 | struct list_head *p, *q = &cursor->d_u.d_child; |
| 143 | ino_t ino; | ||
| 144 | int i = filp->f_pos; | ||
| 145 | 143 | ||
| 146 | switch (i) { | 144 | if (!dir_emit_dots(file, ctx)) |
| 147 | case 0: | 145 | return 0; |
| 148 | ino = dentry->d_inode->i_ino; | 146 | spin_lock(&dentry->d_lock); |
| 149 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | 147 | if (ctx->pos == 2) |
| 150 | break; | 148 | list_move(q, &dentry->d_subdirs); |
| 151 | filp->f_pos++; | 149 | |
| 152 | i++; | 150 | for (p = q->next; p != &dentry->d_subdirs; p = p->next) { |
| 153 | /* fallthrough */ | 151 | struct dentry *next = list_entry(p, struct dentry, d_u.d_child); |
| 154 | case 1: | 152 | spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); |
| 155 | ino = parent_ino(dentry); | 153 | if (!simple_positive(next)) { |
| 156 | if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) | 154 | spin_unlock(&next->d_lock); |
| 157 | break; | 155 | continue; |
| 158 | filp->f_pos++; | 156 | } |
| 159 | i++; | ||
| 160 | /* fallthrough */ | ||
| 161 | default: | ||
| 162 | spin_lock(&dentry->d_lock); | ||
| 163 | if (filp->f_pos == 2) | ||
| 164 | list_move(q, &dentry->d_subdirs); | ||
| 165 | |||
| 166 | for (p=q->next; p != &dentry->d_subdirs; p=p->next) { | ||
| 167 | struct dentry *next; | ||
| 168 | next = list_entry(p, struct dentry, d_u.d_child); | ||
| 169 | spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); | ||
| 170 | if (!simple_positive(next)) { | ||
| 171 | spin_unlock(&next->d_lock); | ||
| 172 | continue; | ||
| 173 | } | ||
| 174 | 157 | ||
| 175 | spin_unlock(&next->d_lock); | 158 | spin_unlock(&next->d_lock); |
| 176 | spin_unlock(&dentry->d_lock); | 159 | spin_unlock(&dentry->d_lock); |
| 177 | if (filldir(dirent, next->d_name.name, | 160 | if (!dir_emit(ctx, next->d_name.name, next->d_name.len, |
| 178 | next->d_name.len, filp->f_pos, | 161 | next->d_inode->i_ino, dt_type(next->d_inode))) |
| 179 | next->d_inode->i_ino, | 162 | return 0; |
| 180 | dt_type(next->d_inode)) < 0) | 163 | spin_lock(&dentry->d_lock); |
| 181 | return 0; | 164 | spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); |
| 182 | spin_lock(&dentry->d_lock); | 165 | /* next is still alive */ |
| 183 | spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); | 166 | list_move(q, p); |
| 184 | /* next is still alive */ | 167 | spin_unlock(&next->d_lock); |
| 185 | list_move(q, p); | 168 | p = q; |
| 186 | spin_unlock(&next->d_lock); | 169 | ctx->pos++; |
| 187 | p = q; | ||
| 188 | filp->f_pos++; | ||
| 189 | } | ||
| 190 | spin_unlock(&dentry->d_lock); | ||
| 191 | } | 170 | } |
| 171 | spin_unlock(&dentry->d_lock); | ||
| 192 | return 0; | 172 | return 0; |
| 193 | } | 173 | } |
| 194 | 174 | ||
| @@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = { | |||
| 202 | .release = dcache_dir_close, | 182 | .release = dcache_dir_close, |
| 203 | .llseek = dcache_dir_lseek, | 183 | .llseek = dcache_dir_lseek, |
| 204 | .read = generic_read_dir, | 184 | .read = generic_read_dir, |
| 205 | .readdir = dcache_readdir, | 185 | .iterate = dcache_readdir, |
| 206 | .fsync = noop_fsync, | 186 | .fsync = noop_fsync, |
| 207 | }; | 187 | }; |
| 208 | 188 | ||
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b82751082112..6bdc347008f5 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c | |||
| @@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 281 | 281 | ||
| 282 | /* FIXME: readdir currently has it's own dir_walk code. I don't see a good | 282 | /* FIXME: readdir currently has it's own dir_walk code. I don't see a good |
| 283 | * way to combine the two copies */ | 283 | * way to combine the two copies */ |
| 284 | #define IMPLICIT_NODES 2 | 284 | static int logfs_readdir(struct file *file, struct dir_context *ctx) |
| 285 | static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) | ||
| 286 | { | 285 | { |
| 287 | struct inode *dir = file_inode(file); | 286 | struct inode *dir = file_inode(file); |
| 288 | loff_t pos = file->f_pos - IMPLICIT_NODES; | 287 | loff_t pos; |
| 289 | struct page *page; | 288 | struct page *page; |
| 290 | struct logfs_disk_dentry *dd; | 289 | struct logfs_disk_dentry *dd; |
| 291 | int full; | ||
| 292 | 290 | ||
| 291 | if (ctx->pos < 0) | ||
| 292 | return -EINVAL; | ||
| 293 | |||
| 294 | if (!dir_emit_dots(file, ctx)) | ||
| 295 | return 0; | ||
| 296 | |||
| 297 | pos = ctx->pos - 2; | ||
| 293 | BUG_ON(pos < 0); | 298 | BUG_ON(pos < 0); |
| 294 | for (;; pos++) { | 299 | for (;; pos++, ctx->pos++) { |
| 300 | bool full; | ||
| 295 | if (beyond_eof(dir, pos)) | 301 | if (beyond_eof(dir, pos)) |
| 296 | break; | 302 | break; |
| 297 | if (!logfs_exist_block(dir, pos)) { | 303 | if (!logfs_exist_block(dir, pos)) { |
| @@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) | |||
| 306 | dd = kmap(page); | 312 | dd = kmap(page); |
| 307 | BUG_ON(dd->namelen == 0); | 313 | BUG_ON(dd->namelen == 0); |
| 308 | 314 | ||
| 309 | full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), | 315 | full = !dir_emit(ctx, (char *)dd->name, |
| 310 | pos, be64_to_cpu(dd->ino), dd->type); | 316 | be16_to_cpu(dd->namelen), |
| 317 | be64_to_cpu(dd->ino), dd->type); | ||
| 311 | kunmap(page); | 318 | kunmap(page); |
| 312 | page_cache_release(page); | 319 | page_cache_release(page); |
| 313 | if (full) | 320 | if (full) |
| 314 | break; | 321 | break; |
| 315 | } | 322 | } |
| 316 | |||
| 317 | file->f_pos = pos + IMPLICIT_NODES; | ||
| 318 | return 0; | 323 | return 0; |
| 319 | } | 324 | } |
| 320 | 325 | ||
| 321 | static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) | ||
| 322 | { | ||
| 323 | struct inode *inode = file_inode(file); | ||
| 324 | ino_t pino = parent_ino(file->f_dentry); | ||
| 325 | int err; | ||
| 326 | |||
| 327 | if (file->f_pos < 0) | ||
| 328 | return -EINVAL; | ||
| 329 | |||
| 330 | if (file->f_pos == 0) { | ||
| 331 | if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) | ||
| 332 | return 0; | ||
| 333 | file->f_pos++; | ||
| 334 | } | ||
| 335 | if (file->f_pos == 1) { | ||
| 336 | if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) | ||
| 337 | return 0; | ||
| 338 | file->f_pos++; | ||
| 339 | } | ||
| 340 | |||
| 341 | err = __logfs_readdir(file, buf, filldir); | ||
| 342 | return err; | ||
| 343 | } | ||
| 344 | |||
| 345 | static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) | 326 | static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) |
| 346 | { | 327 | { |
| 347 | dd->namelen = cpu_to_be16(name->len); | 328 | dd->namelen = cpu_to_be16(name->len); |
| @@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = { | |||
| 814 | const struct file_operations logfs_dir_fops = { | 795 | const struct file_operations logfs_dir_fops = { |
| 815 | .fsync = logfs_fsync, | 796 | .fsync = logfs_fsync, |
| 816 | .unlocked_ioctl = logfs_ioctl, | 797 | .unlocked_ioctl = logfs_ioctl, |
| 817 | .readdir = logfs_readdir, | 798 | .iterate = logfs_readdir, |
| 818 | .read = generic_read_dir, | 799 | .read = generic_read_dir, |
| 819 | .llseek = default_llseek, | 800 | .llseek = default_llseek, |
| 820 | }; | 801 | }; |
diff --git a/fs/logfs/file.c b/fs/logfs/file.c index c2219a6dd3c8..57914fc32b62 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c | |||
| @@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) | |||
| 159 | return __logfs_writepage(page); | 159 | return __logfs_writepage(page); |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | static void logfs_invalidatepage(struct page *page, unsigned long offset) | 162 | static void logfs_invalidatepage(struct page *page, unsigned int offset, |
| 163 | unsigned int length) | ||
| 163 | { | 164 | { |
| 164 | struct logfs_block *block = logfs_block(page); | 165 | struct logfs_block *block = logfs_block(page); |
| 165 | 166 | ||
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 038da0991794..d448a777166b 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c | |||
| @@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb) | |||
| 884 | return area; | 884 | return area; |
| 885 | } | 885 | } |
| 886 | 886 | ||
| 887 | static void map_invalidatepage(struct page *page, unsigned long l) | 887 | static void map_invalidatepage(struct page *page, unsigned int o, |
| 888 | unsigned int l) | ||
| 888 | { | 889 | { |
| 889 | return; | 890 | return; |
| 890 | } | 891 | } |
diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a9ed6f36e6ea..08c442902fcd 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c | |||
| @@ -16,12 +16,12 @@ | |||
| 16 | typedef struct minix_dir_entry minix_dirent; | 16 | typedef struct minix_dir_entry minix_dirent; |
| 17 | typedef struct minix3_dir_entry minix3_dirent; | 17 | typedef struct minix3_dir_entry minix3_dirent; |
| 18 | 18 | ||
| 19 | static int minix_readdir(struct file *, void *, filldir_t); | 19 | static int minix_readdir(struct file *, struct dir_context *); |
| 20 | 20 | ||
| 21 | const struct file_operations minix_dir_operations = { | 21 | const struct file_operations minix_dir_operations = { |
| 22 | .llseek = generic_file_llseek, | 22 | .llseek = generic_file_llseek, |
| 23 | .read = generic_read_dir, | 23 | .read = generic_read_dir, |
| 24 | .readdir = minix_readdir, | 24 | .iterate = minix_readdir, |
| 25 | .fsync = generic_file_fsync, | 25 | .fsync = generic_file_fsync, |
| 26 | }; | 26 | }; |
| 27 | 27 | ||
| @@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) | |||
| 82 | return (void*)((char*)de + sbi->s_dirsize); | 82 | return (void*)((char*)de + sbi->s_dirsize); |
| 83 | } | 83 | } |
| 84 | 84 | ||
| 85 | static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) | 85 | static int minix_readdir(struct file *file, struct dir_context *ctx) |
| 86 | { | 86 | { |
| 87 | unsigned long pos = filp->f_pos; | 87 | struct inode *inode = file_inode(file); |
| 88 | struct inode *inode = file_inode(filp); | ||
| 89 | struct super_block *sb = inode->i_sb; | 88 | struct super_block *sb = inode->i_sb; |
| 90 | unsigned offset = pos & ~PAGE_CACHE_MASK; | ||
| 91 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | ||
| 92 | unsigned long npages = dir_pages(inode); | ||
| 93 | struct minix_sb_info *sbi = minix_sb(sb); | 89 | struct minix_sb_info *sbi = minix_sb(sb); |
| 94 | unsigned chunk_size = sbi->s_dirsize; | 90 | unsigned chunk_size = sbi->s_dirsize; |
| 95 | char *name; | 91 | unsigned long npages = dir_pages(inode); |
| 96 | __u32 inumber; | 92 | unsigned long pos = ctx->pos; |
| 93 | unsigned offset; | ||
| 94 | unsigned long n; | ||
| 97 | 95 | ||
| 98 | pos = (pos + chunk_size-1) & ~(chunk_size-1); | 96 | ctx->pos = pos = (pos + chunk_size-1) & ~(chunk_size-1); |
| 99 | if (pos >= inode->i_size) | 97 | if (pos >= inode->i_size) |
| 100 | goto done; | 98 | return 0; |
| 99 | |||
| 100 | offset = pos & ~PAGE_CACHE_MASK; | ||
| 101 | n = pos >> PAGE_CACHE_SHIFT; | ||
| 101 | 102 | ||
| 102 | for ( ; n < npages; n++, offset = 0) { | 103 | for ( ; n < npages; n++, offset = 0) { |
| 103 | char *p, *kaddr, *limit; | 104 | char *p, *kaddr, *limit; |
| @@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
| 109 | p = kaddr+offset; | 110 | p = kaddr+offset; |
| 110 | limit = kaddr + minix_last_byte(inode, n) - chunk_size; | 111 | limit = kaddr + minix_last_byte(inode, n) - chunk_size; |
| 111 | for ( ; p <= limit; p = minix_next_entry(p, sbi)) { | 112 | for ( ; p <= limit; p = minix_next_entry(p, sbi)) { |
| 113 | const char *name; | ||
| 114 | __u32 inumber; | ||
| 112 | if (sbi->s_version == MINIX_V3) { | 115 | if (sbi->s_version == MINIX_V3) { |
| 113 | minix3_dirent *de3 = (minix3_dirent *)p; | 116 | minix3_dirent *de3 = (minix3_dirent *)p; |
| 114 | name = de3->name; | 117 | name = de3->name; |
| @@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
| 119 | inumber = de->inode; | 122 | inumber = de->inode; |
| 120 | } | 123 | } |
| 121 | if (inumber) { | 124 | if (inumber) { |
| 122 | int over; | ||
| 123 | |||
| 124 | unsigned l = strnlen(name, sbi->s_namelen); | 125 | unsigned l = strnlen(name, sbi->s_namelen); |
| 125 | offset = p - kaddr; | 126 | if (!dir_emit(ctx, name, l, |
| 126 | over = filldir(dirent, name, l, | 127 | inumber, DT_UNKNOWN)) { |
| 127 | (n << PAGE_CACHE_SHIFT) | offset, | ||
| 128 | inumber, DT_UNKNOWN); | ||
| 129 | if (over) { | ||
| 130 | dir_put_page(page); | 128 | dir_put_page(page); |
| 131 | goto done; | 129 | return 0; |
| 132 | } | 130 | } |
| 133 | } | 131 | } |
| 132 | ctx->pos += chunk_size; | ||
| 134 | } | 133 | } |
| 135 | dir_put_page(page); | 134 | dir_put_page(page); |
| 136 | } | 135 | } |
| 137 | |||
| 138 | done: | ||
| 139 | filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; | ||
| 140 | return 0; | 136 | return 0; |
| 141 | } | 137 | } |
| 142 | 138 | ||
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 6792ce11f2bf..0e7f00298213 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c | |||
| @@ -23,12 +23,12 @@ | |||
| 23 | 23 | ||
| 24 | #include "ncp_fs.h" | 24 | #include "ncp_fs.h" |
| 25 | 25 | ||
| 26 | static void ncp_read_volume_list(struct file *, void *, filldir_t, | 26 | static void ncp_read_volume_list(struct file *, struct dir_context *, |
| 27 | struct ncp_cache_control *); | 27 | struct ncp_cache_control *); |
| 28 | static void ncp_do_readdir(struct file *, void *, filldir_t, | 28 | static void ncp_do_readdir(struct file *, struct dir_context *, |
| 29 | struct ncp_cache_control *); | 29 | struct ncp_cache_control *); |
| 30 | 30 | ||
| 31 | static int ncp_readdir(struct file *, void *, filldir_t); | 31 | static int ncp_readdir(struct file *, struct dir_context *); |
| 32 | 32 | ||
| 33 | static int ncp_create(struct inode *, struct dentry *, umode_t, bool); | 33 | static int ncp_create(struct inode *, struct dentry *, umode_t, bool); |
| 34 | static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); | 34 | static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); |
| @@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations = | |||
| 49 | { | 49 | { |
| 50 | .llseek = generic_file_llseek, | 50 | .llseek = generic_file_llseek, |
| 51 | .read = generic_read_dir, | 51 | .read = generic_read_dir, |
| 52 | .readdir = ncp_readdir, | 52 | .iterate = ncp_readdir, |
| 53 | .unlocked_ioctl = ncp_ioctl, | 53 | .unlocked_ioctl = ncp_ioctl, |
| 54 | #ifdef CONFIG_COMPAT | 54 | #ifdef CONFIG_COMPAT |
| 55 | .compat_ioctl = ncp_compat_ioctl, | 55 | .compat_ioctl = ncp_compat_ioctl, |
| @@ -424,9 +424,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) | |||
| 424 | return ncp_date_dos2unix(i.modifyTime, i.modifyDate); | 424 | return ncp_date_dos2unix(i.modifyTime, i.modifyDate); |
| 425 | } | 425 | } |
| 426 | 426 | ||
| 427 | static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | 427 | static int ncp_readdir(struct file *file, struct dir_context *ctx) |
| 428 | { | 428 | { |
| 429 | struct dentry *dentry = filp->f_path.dentry; | 429 | struct dentry *dentry = file->f_path.dentry; |
| 430 | struct inode *inode = dentry->d_inode; | 430 | struct inode *inode = dentry->d_inode; |
| 431 | struct page *page = NULL; | 431 | struct page *page = NULL; |
| 432 | struct ncp_server *server = NCP_SERVER(inode); | 432 | struct ncp_server *server = NCP_SERVER(inode); |
| @@ -440,7 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 440 | 440 | ||
| 441 | DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", | 441 | DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", |
| 442 | dentry->d_parent->d_name.name, dentry->d_name.name, | 442 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 443 | (int) filp->f_pos); | 443 | (int) ctx->pos); |
| 444 | 444 | ||
| 445 | result = -EIO; | 445 | result = -EIO; |
| 446 | /* Do not generate '.' and '..' when server is dead. */ | 446 | /* Do not generate '.' and '..' when server is dead. */ |
| @@ -448,16 +448,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 448 | goto out; | 448 | goto out; |
| 449 | 449 | ||
| 450 | result = 0; | 450 | result = 0; |
| 451 | if (filp->f_pos == 0) { | 451 | if (!dir_emit_dots(file, ctx)) |
| 452 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) | 452 | goto out; |
| 453 | goto out; | ||
| 454 | filp->f_pos = 1; | ||
| 455 | } | ||
| 456 | if (filp->f_pos == 1) { | ||
| 457 | if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR)) | ||
| 458 | goto out; | ||
| 459 | filp->f_pos = 2; | ||
| 460 | } | ||
| 461 | 453 | ||
| 462 | page = grab_cache_page(&inode->i_data, 0); | 454 | page = grab_cache_page(&inode->i_data, 0); |
| 463 | if (!page) | 455 | if (!page) |
| @@ -469,7 +461,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 469 | if (!PageUptodate(page) || !ctl.head.eof) | 461 | if (!PageUptodate(page) || !ctl.head.eof) |
| 470 | goto init_cache; | 462 | goto init_cache; |
| 471 | 463 | ||
| 472 | if (filp->f_pos == 2) { | 464 | if (ctx->pos == 2) { |
| 473 | if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) | 465 | if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) |
| 474 | goto init_cache; | 466 | goto init_cache; |
| 475 | 467 | ||
| @@ -479,10 +471,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 479 | goto init_cache; | 471 | goto init_cache; |
| 480 | } | 472 | } |
| 481 | 473 | ||
| 482 | if (filp->f_pos > ctl.head.end) | 474 | if (ctx->pos > ctl.head.end) |
| 483 | goto finished; | 475 | goto finished; |
| 484 | 476 | ||
| 485 | ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); | 477 | ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2); |
| 486 | ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; | 478 | ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; |
| 487 | ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; | 479 | ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; |
| 488 | 480 | ||
| @@ -497,21 +489,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 497 | } | 489 | } |
| 498 | while (ctl.idx < NCP_DIRCACHE_SIZE) { | 490 | while (ctl.idx < NCP_DIRCACHE_SIZE) { |
| 499 | struct dentry *dent; | 491 | struct dentry *dent; |
| 500 | int res; | 492 | bool over; |
| 501 | 493 | ||
| 502 | dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], | 494 | dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], |
| 503 | dentry, filp->f_pos); | 495 | dentry, ctx->pos); |
| 504 | if (!dent) | 496 | if (!dent) |
| 505 | goto invalid_cache; | 497 | goto invalid_cache; |
| 506 | res = filldir(dirent, dent->d_name.name, | 498 | over = !dir_emit(ctx, dent->d_name.name, |
| 507 | dent->d_name.len, filp->f_pos, | 499 | dent->d_name.len, |
| 508 | dent->d_inode->i_ino, DT_UNKNOWN); | 500 | dent->d_inode->i_ino, DT_UNKNOWN); |
| 509 | dput(dent); | 501 | dput(dent); |
| 510 | if (res) | 502 | if (over) |
| 511 | goto finished; | 503 | goto finished; |
| 512 | filp->f_pos += 1; | 504 | ctx->pos += 1; |
| 513 | ctl.idx += 1; | 505 | ctl.idx += 1; |
| 514 | if (filp->f_pos > ctl.head.end) | 506 | if (ctx->pos > ctl.head.end) |
| 515 | goto finished; | 507 | goto finished; |
| 516 | } | 508 | } |
| 517 | if (ctl.page) { | 509 | if (ctl.page) { |
| @@ -548,9 +540,9 @@ init_cache: | |||
| 548 | ctl.valid = 1; | 540 | ctl.valid = 1; |
| 549 | read_really: | 541 | read_really: |
| 550 | if (ncp_is_server_root(inode)) { | 542 | if (ncp_is_server_root(inode)) { |
| 551 | ncp_read_volume_list(filp, dirent, filldir, &ctl); | 543 | ncp_read_volume_list(file, ctx, &ctl); |
| 552 | } else { | 544 | } else { |
| 553 | ncp_do_readdir(filp, dirent, filldir, &ctl); | 545 | ncp_do_readdir(file, ctx, &ctl); |
| 554 | } | 546 | } |
| 555 | ctl.head.end = ctl.fpos - 1; | 547 | ctl.head.end = ctl.fpos - 1; |
| 556 | ctl.head.eof = ctl.valid; | 548 | ctl.head.eof = ctl.valid; |
| @@ -573,11 +565,11 @@ out: | |||
| 573 | } | 565 | } |
| 574 | 566 | ||
| 575 | static int | 567 | static int |
| 576 | ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, | 568 | ncp_fill_cache(struct file *file, struct dir_context *ctx, |
| 577 | struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, | 569 | struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, |
| 578 | int inval_childs) | 570 | int inval_childs) |
| 579 | { | 571 | { |
| 580 | struct dentry *newdent, *dentry = filp->f_path.dentry; | 572 | struct dentry *newdent, *dentry = file->f_path.dentry; |
| 581 | struct inode *dir = dentry->d_inode; | 573 | struct inode *dir = dentry->d_inode; |
| 582 | struct ncp_cache_control ctl = *ctrl; | 574 | struct ncp_cache_control ctl = *ctrl; |
| 583 | struct qstr qname; | 575 | struct qstr qname; |
| @@ -666,15 +658,15 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, | |||
| 666 | end_advance: | 658 | end_advance: |
| 667 | if (!valid) | 659 | if (!valid) |
| 668 | ctl.valid = 0; | 660 | ctl.valid = 0; |
| 669 | if (!ctl.filled && (ctl.fpos == filp->f_pos)) { | 661 | if (!ctl.filled && (ctl.fpos == ctx->pos)) { |
| 670 | if (!ino) | 662 | if (!ino) |
| 671 | ino = find_inode_number(dentry, &qname); | 663 | ino = find_inode_number(dentry, &qname); |
| 672 | if (!ino) | 664 | if (!ino) |
| 673 | ino = iunique(dir->i_sb, 2); | 665 | ino = iunique(dir->i_sb, 2); |
| 674 | ctl.filled = filldir(dirent, qname.name, qname.len, | 666 | ctl.filled = !dir_emit(ctx, qname.name, qname.len, |
| 675 | filp->f_pos, ino, DT_UNKNOWN); | 667 | ino, DT_UNKNOWN); |
| 676 | if (!ctl.filled) | 668 | if (!ctl.filled) |
| 677 | filp->f_pos += 1; | 669 | ctx->pos += 1; |
| 678 | } | 670 | } |
| 679 | ctl.fpos += 1; | 671 | ctl.fpos += 1; |
| 680 | ctl.idx += 1; | 672 | ctl.idx += 1; |
| @@ -683,10 +675,10 @@ end_advance: | |||
| 683 | } | 675 | } |
| 684 | 676 | ||
| 685 | static void | 677 | static void |
| 686 | ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, | 678 | ncp_read_volume_list(struct file *file, struct dir_context *ctx, |
| 687 | struct ncp_cache_control *ctl) | 679 | struct ncp_cache_control *ctl) |
| 688 | { | 680 | { |
| 689 | struct dentry *dentry = filp->f_path.dentry; | 681 | struct dentry *dentry = file->f_path.dentry; |
| 690 | struct inode *inode = dentry->d_inode; | 682 | struct inode *inode = dentry->d_inode; |
| 691 | struct ncp_server *server = NCP_SERVER(inode); | 683 | struct ncp_server *server = NCP_SERVER(inode); |
| 692 | struct ncp_volume_info info; | 684 | struct ncp_volume_info info; |
| @@ -694,7 +686,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, | |||
| 694 | int i; | 686 | int i; |
| 695 | 687 | ||
| 696 | DPRINTK("ncp_read_volume_list: pos=%ld\n", | 688 | DPRINTK("ncp_read_volume_list: pos=%ld\n", |
| 697 | (unsigned long) filp->f_pos); | 689 | (unsigned long) ctx->pos); |
| 698 | 690 | ||
| 699 | for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { | 691 | for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { |
| 700 | int inval_dentry; | 692 | int inval_dentry; |
| @@ -715,16 +707,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, | |||
| 715 | } | 707 | } |
| 716 | inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); | 708 | inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); |
| 717 | entry.volume = entry.i.volNumber; | 709 | entry.volume = entry.i.volNumber; |
| 718 | if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) | 710 | if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry)) |
| 719 | return; | 711 | return; |
| 720 | } | 712 | } |
| 721 | } | 713 | } |
| 722 | 714 | ||
| 723 | static void | 715 | static void |
| 724 | ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, | 716 | ncp_do_readdir(struct file *file, struct dir_context *ctx, |
| 725 | struct ncp_cache_control *ctl) | 717 | struct ncp_cache_control *ctl) |
| 726 | { | 718 | { |
| 727 | struct dentry *dentry = filp->f_path.dentry; | 719 | struct dentry *dentry = file->f_path.dentry; |
| 728 | struct inode *dir = dentry->d_inode; | 720 | struct inode *dir = dentry->d_inode; |
| 729 | struct ncp_server *server = NCP_SERVER(dir); | 721 | struct ncp_server *server = NCP_SERVER(dir); |
| 730 | struct nw_search_sequence seq; | 722 | struct nw_search_sequence seq; |
| @@ -736,7 +728,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, | |||
| 736 | 728 | ||
| 737 | DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", | 729 | DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", |
| 738 | dentry->d_parent->d_name.name, dentry->d_name.name, | 730 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 739 | (unsigned long) filp->f_pos); | 731 | (unsigned long) ctx->pos); |
| 740 | PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", | 732 | PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", |
| 741 | dentry->d_name.name, NCP_FINFO(dir)->volNumber, | 733 | dentry->d_name.name, NCP_FINFO(dir)->volNumber, |
| 742 | NCP_FINFO(dir)->dirEntNum); | 734 | NCP_FINFO(dir)->dirEntNum); |
| @@ -778,7 +770,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, | |||
| 778 | rpl += onerpl; | 770 | rpl += onerpl; |
| 779 | rpls -= onerpl; | 771 | rpls -= onerpl; |
| 780 | entry.volume = entry.i.volNumber; | 772 | entry.volume = entry.i.volNumber; |
| 781 | if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) | 773 | if (!ncp_fill_cache(file, ctx, ctl, &entry, 0)) |
| 782 | break; | 774 | break; |
| 783 | } | 775 | } |
| 784 | } while (more); | 776 | } while (more); |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e093e73178b7..5d051419527b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
| @@ -46,7 +46,7 @@ | |||
| 46 | 46 | ||
| 47 | static int nfs_opendir(struct inode *, struct file *); | 47 | static int nfs_opendir(struct inode *, struct file *); |
| 48 | static int nfs_closedir(struct inode *, struct file *); | 48 | static int nfs_closedir(struct inode *, struct file *); |
| 49 | static int nfs_readdir(struct file *, void *, filldir_t); | 49 | static int nfs_readdir(struct file *, struct dir_context *); |
| 50 | static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); | 50 | static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); |
| 51 | static loff_t nfs_llseek_dir(struct file *, loff_t, int); | 51 | static loff_t nfs_llseek_dir(struct file *, loff_t, int); |
| 52 | static void nfs_readdir_clear_array(struct page*); | 52 | static void nfs_readdir_clear_array(struct page*); |
| @@ -54,7 +54,7 @@ static void nfs_readdir_clear_array(struct page*); | |||
| 54 | const struct file_operations nfs_dir_operations = { | 54 | const struct file_operations nfs_dir_operations = { |
| 55 | .llseek = nfs_llseek_dir, | 55 | .llseek = nfs_llseek_dir, |
| 56 | .read = generic_read_dir, | 56 | .read = generic_read_dir, |
| 57 | .readdir = nfs_readdir, | 57 | .iterate = nfs_readdir, |
| 58 | .open = nfs_opendir, | 58 | .open = nfs_opendir, |
| 59 | .release = nfs_closedir, | 59 | .release = nfs_closedir, |
| 60 | .fsync = nfs_fsync_dir, | 60 | .fsync = nfs_fsync_dir, |
| @@ -147,6 +147,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); | |||
| 147 | typedef struct { | 147 | typedef struct { |
| 148 | struct file *file; | 148 | struct file *file; |
| 149 | struct page *page; | 149 | struct page *page; |
| 150 | struct dir_context *ctx; | ||
| 150 | unsigned long page_index; | 151 | unsigned long page_index; |
| 151 | u64 *dir_cookie; | 152 | u64 *dir_cookie; |
| 152 | u64 last_cookie; | 153 | u64 last_cookie; |
| @@ -252,7 +253,7 @@ out: | |||
| 252 | static | 253 | static |
| 253 | int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) | 254 | int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) |
| 254 | { | 255 | { |
| 255 | loff_t diff = desc->file->f_pos - desc->current_index; | 256 | loff_t diff = desc->ctx->pos - desc->current_index; |
| 256 | unsigned int index; | 257 | unsigned int index; |
| 257 | 258 | ||
| 258 | if (diff < 0) | 259 | if (diff < 0) |
| @@ -289,7 +290,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des | |||
| 289 | || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { | 290 | || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { |
| 290 | ctx->duped = 0; | 291 | ctx->duped = 0; |
| 291 | ctx->attr_gencount = nfsi->attr_gencount; | 292 | ctx->attr_gencount = nfsi->attr_gencount; |
| 292 | } else if (new_pos < desc->file->f_pos) { | 293 | } else if (new_pos < desc->ctx->pos) { |
| 293 | if (ctx->duped > 0 | 294 | if (ctx->duped > 0 |
| 294 | && ctx->dup_cookie == *desc->dir_cookie) { | 295 | && ctx->dup_cookie == *desc->dir_cookie) { |
| 295 | if (printk_ratelimit()) { | 296 | if (printk_ratelimit()) { |
| @@ -307,7 +308,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des | |||
| 307 | ctx->dup_cookie = *desc->dir_cookie; | 308 | ctx->dup_cookie = *desc->dir_cookie; |
| 308 | ctx->duped = -1; | 309 | ctx->duped = -1; |
| 309 | } | 310 | } |
| 310 | desc->file->f_pos = new_pos; | 311 | desc->ctx->pos = new_pos; |
| 311 | desc->cache_entry_index = i; | 312 | desc->cache_entry_index = i; |
| 312 | return 0; | 313 | return 0; |
| 313 | } | 314 | } |
| @@ -405,13 +406,13 @@ different: | |||
| 405 | } | 406 | } |
| 406 | 407 | ||
| 407 | static | 408 | static |
| 408 | bool nfs_use_readdirplus(struct inode *dir, struct file *filp) | 409 | bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) |
| 409 | { | 410 | { |
| 410 | if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) | 411 | if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) |
| 411 | return false; | 412 | return false; |
| 412 | if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) | 413 | if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) |
| 413 | return true; | 414 | return true; |
| 414 | if (filp->f_pos == 0) | 415 | if (ctx->pos == 0) |
| 415 | return true; | 416 | return true; |
| 416 | return false; | 417 | return false; |
| 417 | } | 418 | } |
| @@ -702,8 +703,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) | |||
| 702 | * Once we've found the start of the dirent within a page: fill 'er up... | 703 | * Once we've found the start of the dirent within a page: fill 'er up... |
| 703 | */ | 704 | */ |
| 704 | static | 705 | static |
| 705 | int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, | 706 | int nfs_do_filldir(nfs_readdir_descriptor_t *desc) |
| 706 | filldir_t filldir) | ||
| 707 | { | 707 | { |
| 708 | struct file *file = desc->file; | 708 | struct file *file = desc->file; |
| 709 | int i = 0; | 709 | int i = 0; |
| @@ -721,13 +721,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
| 721 | struct nfs_cache_array_entry *ent; | 721 | struct nfs_cache_array_entry *ent; |
| 722 | 722 | ||
| 723 | ent = &array->array[i]; | 723 | ent = &array->array[i]; |
| 724 | if (filldir(dirent, ent->string.name, ent->string.len, | 724 | if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, |
| 725 | file->f_pos, nfs_compat_user_ino64(ent->ino), | 725 | nfs_compat_user_ino64(ent->ino), ent->d_type)) { |
| 726 | ent->d_type) < 0) { | ||
| 727 | desc->eof = 1; | 726 | desc->eof = 1; |
| 728 | break; | 727 | break; |
| 729 | } | 728 | } |
| 730 | file->f_pos++; | 729 | desc->ctx->pos++; |
| 731 | if (i < (array->size-1)) | 730 | if (i < (array->size-1)) |
| 732 | *desc->dir_cookie = array->array[i+1].cookie; | 731 | *desc->dir_cookie = array->array[i+1].cookie; |
| 733 | else | 732 | else |
| @@ -759,8 +758,7 @@ out: | |||
| 759 | * directory in the page cache by the time we get here. | 758 | * directory in the page cache by the time we get here. |
| 760 | */ | 759 | */ |
| 761 | static inline | 760 | static inline |
| 762 | int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, | 761 | int uncached_readdir(nfs_readdir_descriptor_t *desc) |
| 763 | filldir_t filldir) | ||
| 764 | { | 762 | { |
| 765 | struct page *page = NULL; | 763 | struct page *page = NULL; |
| 766 | int status; | 764 | int status; |
| @@ -785,7 +783,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
| 785 | if (status < 0) | 783 | if (status < 0) |
| 786 | goto out_release; | 784 | goto out_release; |
| 787 | 785 | ||
| 788 | status = nfs_do_filldir(desc, dirent, filldir); | 786 | status = nfs_do_filldir(desc); |
| 789 | 787 | ||
| 790 | out: | 788 | out: |
| 791 | dfprintk(DIRCACHE, "NFS: %s: returns %d\n", | 789 | dfprintk(DIRCACHE, "NFS: %s: returns %d\n", |
| @@ -800,35 +798,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
| 800 | last cookie cache takes care of the common case of reading the | 798 | last cookie cache takes care of the common case of reading the |
| 801 | whole directory. | 799 | whole directory. |
| 802 | */ | 800 | */ |
| 803 | static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 801 | static int nfs_readdir(struct file *file, struct dir_context *ctx) |
| 804 | { | 802 | { |
| 805 | struct dentry *dentry = filp->f_path.dentry; | 803 | struct dentry *dentry = file->f_path.dentry; |
| 806 | struct inode *inode = dentry->d_inode; | 804 | struct inode *inode = dentry->d_inode; |
| 807 | nfs_readdir_descriptor_t my_desc, | 805 | nfs_readdir_descriptor_t my_desc, |
| 808 | *desc = &my_desc; | 806 | *desc = &my_desc; |
| 809 | struct nfs_open_dir_context *dir_ctx = filp->private_data; | 807 | struct nfs_open_dir_context *dir_ctx = file->private_data; |
| 810 | int res; | 808 | int res; |
| 811 | 809 | ||
| 812 | dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", | 810 | dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", |
| 813 | dentry->d_parent->d_name.name, dentry->d_name.name, | 811 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 814 | (long long)filp->f_pos); | 812 | (long long)ctx->pos); |
| 815 | nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); | 813 | nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); |
| 816 | 814 | ||
| 817 | /* | 815 | /* |
| 818 | * filp->f_pos points to the dirent entry number. | 816 | * ctx->pos points to the dirent entry number. |
| 819 | * *desc->dir_cookie has the cookie for the next entry. We have | 817 | * *desc->dir_cookie has the cookie for the next entry. We have |
| 820 | * to either find the entry with the appropriate number or | 818 | * to either find the entry with the appropriate number or |
| 821 | * revalidate the cookie. | 819 | * revalidate the cookie. |
| 822 | */ | 820 | */ |
| 823 | memset(desc, 0, sizeof(*desc)); | 821 | memset(desc, 0, sizeof(*desc)); |
| 824 | 822 | ||
| 825 | desc->file = filp; | 823 | desc->file = file; |
| 824 | desc->ctx = ctx; | ||
| 826 | desc->dir_cookie = &dir_ctx->dir_cookie; | 825 | desc->dir_cookie = &dir_ctx->dir_cookie; |
| 827 | desc->decode = NFS_PROTO(inode)->decode_dirent; | 826 | desc->decode = NFS_PROTO(inode)->decode_dirent; |
| 828 | desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; | 827 | desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; |
| 829 | 828 | ||
| 830 | nfs_block_sillyrename(dentry); | 829 | nfs_block_sillyrename(dentry); |
| 831 | res = nfs_revalidate_mapping(inode, filp->f_mapping); | 830 | res = nfs_revalidate_mapping(inode, file->f_mapping); |
| 832 | if (res < 0) | 831 | if (res < 0) |
| 833 | goto out; | 832 | goto out; |
| 834 | 833 | ||
| @@ -840,7 +839,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 840 | /* This means either end of directory */ | 839 | /* This means either end of directory */ |
| 841 | if (*desc->dir_cookie && desc->eof == 0) { | 840 | if (*desc->dir_cookie && desc->eof == 0) { |
| 842 | /* Or that the server has 'lost' a cookie */ | 841 | /* Or that the server has 'lost' a cookie */ |
| 843 | res = uncached_readdir(desc, dirent, filldir); | 842 | res = uncached_readdir(desc); |
| 844 | if (res == 0) | 843 | if (res == 0) |
| 845 | continue; | 844 | continue; |
| 846 | } | 845 | } |
| @@ -857,7 +856,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 857 | if (res < 0) | 856 | if (res < 0) |
| 858 | break; | 857 | break; |
| 859 | 858 | ||
| 860 | res = nfs_do_filldir(desc, dirent, filldir); | 859 | res = nfs_do_filldir(desc); |
| 861 | if (res < 0) | 860 | if (res < 0) |
| 862 | break; | 861 | break; |
| 863 | } while (!desc->eof); | 862 | } while (!desc->eof); |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a87a44f84113..6b4a79f4ad1d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
| @@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, | |||
| 451 | * - Called if either PG_private or PG_fscache is set on the page | 451 | * - Called if either PG_private or PG_fscache is set on the page |
| 452 | * - Caller holds page lock | 452 | * - Caller holds page lock |
| 453 | */ | 453 | */ |
| 454 | static void nfs_invalidate_page(struct page *page, unsigned long offset) | 454 | static void nfs_invalidate_page(struct page *page, unsigned int offset, |
| 455 | unsigned int length) | ||
| 455 | { | 456 | { |
| 456 | dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); | 457 | dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", |
| 458 | page, offset, length); | ||
| 457 | 459 | ||
| 458 | if (offset != 0) | 460 | if (offset != 0 || length < PAGE_CACHE_SIZE) |
| 459 | return; | 461 | return; |
| 460 | /* Cancel any unstarted writes on this page */ | 462 | /* Cancel any unstarted writes on this page */ |
| 461 | nfs_wb_page_cancel(page_file_mapping(page)->host, page); | 463 | nfs_wb_page_cancel(page_file_mapping(page)->host, page); |
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4e9a21db867a..105a3b080d12 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c | |||
| @@ -240,11 +240,16 @@ struct name_list { | |||
| 240 | struct list_head list; | 240 | struct list_head list; |
| 241 | }; | 241 | }; |
| 242 | 242 | ||
| 243 | struct nfs4_dir_ctx { | ||
| 244 | struct dir_context ctx; | ||
| 245 | struct list_head names; | ||
| 246 | }; | ||
| 247 | |||
| 243 | static int | 248 | static int |
| 244 | nfsd4_build_namelist(void *arg, const char *name, int namlen, | 249 | nfsd4_build_namelist(void *arg, const char *name, int namlen, |
| 245 | loff_t offset, u64 ino, unsigned int d_type) | 250 | loff_t offset, u64 ino, unsigned int d_type) |
| 246 | { | 251 | { |
| 247 | struct list_head *names = arg; | 252 | struct nfs4_dir_ctx *ctx = arg; |
| 248 | struct name_list *entry; | 253 | struct name_list *entry; |
| 249 | 254 | ||
| 250 | if (namlen != HEXDIR_LEN - 1) | 255 | if (namlen != HEXDIR_LEN - 1) |
| @@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen, | |||
| 254 | return -ENOMEM; | 259 | return -ENOMEM; |
| 255 | memcpy(entry->name, name, HEXDIR_LEN - 1); | 260 | memcpy(entry->name, name, HEXDIR_LEN - 1); |
| 256 | entry->name[HEXDIR_LEN - 1] = '\0'; | 261 | entry->name[HEXDIR_LEN - 1] = '\0'; |
| 257 | list_add(&entry->list, names); | 262 | list_add(&entry->list, &ctx->names); |
| 258 | return 0; | 263 | return 0; |
| 259 | } | 264 | } |
| 260 | 265 | ||
| @@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) | |||
| 263 | { | 268 | { |
| 264 | const struct cred *original_cred; | 269 | const struct cred *original_cred; |
| 265 | struct dentry *dir = nn->rec_file->f_path.dentry; | 270 | struct dentry *dir = nn->rec_file->f_path.dentry; |
| 266 | LIST_HEAD(names); | 271 | struct nfs4_dir_ctx ctx = { |
| 272 | .ctx.actor = nfsd4_build_namelist, | ||
| 273 | .names = LIST_HEAD_INIT(ctx.names) | ||
| 274 | }; | ||
| 267 | int status; | 275 | int status; |
| 268 | 276 | ||
| 269 | status = nfs4_save_creds(&original_cred); | 277 | status = nfs4_save_creds(&original_cred); |
| @@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) | |||
| 276 | return status; | 284 | return status; |
| 277 | } | 285 | } |
| 278 | 286 | ||
| 279 | status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); | 287 | status = iterate_dir(nn->rec_file, &ctx.ctx); |
| 280 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); | 288 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); |
| 281 | while (!list_empty(&names)) { | 289 | while (!list_empty(&ctx.names)) { |
| 282 | struct name_list *entry; | 290 | struct name_list *entry; |
| 283 | entry = list_entry(names.next, struct name_list, list); | 291 | entry = list_entry(ctx.names.next, struct name_list, list); |
| 284 | if (!status) { | 292 | if (!status) { |
| 285 | struct dentry *dentry; | 293 | struct dentry *dentry; |
| 286 | dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); | 294 | dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); |
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 84ce601d8063..a6bc8a7423db 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
| @@ -1912,6 +1912,7 @@ struct buffered_dirent { | |||
| 1912 | }; | 1912 | }; |
| 1913 | 1913 | ||
| 1914 | struct readdir_data { | 1914 | struct readdir_data { |
| 1915 | struct dir_context ctx; | ||
| 1915 | char *dirent; | 1916 | char *dirent; |
| 1916 | size_t used; | 1917 | size_t used; |
| 1917 | int full; | 1918 | int full; |
| @@ -1943,13 +1944,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, | |||
| 1943 | static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, | 1944 | static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, |
| 1944 | struct readdir_cd *cdp, loff_t *offsetp) | 1945 | struct readdir_cd *cdp, loff_t *offsetp) |
| 1945 | { | 1946 | { |
| 1946 | struct readdir_data buf; | ||
| 1947 | struct buffered_dirent *de; | 1947 | struct buffered_dirent *de; |
| 1948 | int host_err; | 1948 | int host_err; |
| 1949 | int size; | 1949 | int size; |
| 1950 | loff_t offset; | 1950 | loff_t offset; |
| 1951 | struct readdir_data buf = { | ||
| 1952 | .ctx.actor = nfsd_buffered_filldir, | ||
| 1953 | .dirent = (void *)__get_free_page(GFP_KERNEL) | ||
| 1954 | }; | ||
| 1951 | 1955 | ||
| 1952 | buf.dirent = (void *)__get_free_page(GFP_KERNEL); | ||
| 1953 | if (!buf.dirent) | 1956 | if (!buf.dirent) |
| 1954 | return nfserrno(-ENOMEM); | 1957 | return nfserrno(-ENOMEM); |
| 1955 | 1958 | ||
| @@ -1963,7 +1966,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, | |||
| 1963 | buf.used = 0; | 1966 | buf.used = 0; |
| 1964 | buf.full = 0; | 1967 | buf.full = 0; |
| 1965 | 1968 | ||
| 1966 | host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); | 1969 | host_err = iterate_dir(file, &buf.ctx); |
| 1967 | if (buf.full) | 1970 | if (buf.full) |
| 1968 | host_err = 0; | 1971 | host_err = 0; |
| 1969 | 1972 | ||
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index f30b017740a7..197a63e9d102 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c | |||
| @@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) | |||
| 256 | de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | 256 | de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; |
| 257 | } | 257 | } |
| 258 | 258 | ||
| 259 | static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 259 | static int nilfs_readdir(struct file *file, struct dir_context *ctx) |
| 260 | { | 260 | { |
| 261 | loff_t pos = filp->f_pos; | 261 | loff_t pos = ctx->pos; |
| 262 | struct inode *inode = file_inode(filp); | 262 | struct inode *inode = file_inode(file); |
| 263 | struct super_block *sb = inode->i_sb; | 263 | struct super_block *sb = inode->i_sb; |
| 264 | unsigned int offset = pos & ~PAGE_CACHE_MASK; | 264 | unsigned int offset = pos & ~PAGE_CACHE_MASK; |
| 265 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | 265 | unsigned long n = pos >> PAGE_CACHE_SHIFT; |
| 266 | unsigned long npages = dir_pages(inode); | 266 | unsigned long npages = dir_pages(inode); |
| 267 | /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ | 267 | /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ |
| 268 | unsigned char *types = NULL; | ||
| 269 | int ret; | ||
| 270 | 268 | ||
| 271 | if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) | 269 | if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) |
| 272 | goto success; | 270 | return 0; |
| 273 | |||
| 274 | types = nilfs_filetype_table; | ||
| 275 | 271 | ||
| 276 | for ( ; n < npages; n++, offset = 0) { | 272 | for ( ; n < npages; n++, offset = 0) { |
| 277 | char *kaddr, *limit; | 273 | char *kaddr, *limit; |
| @@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 281 | if (IS_ERR(page)) { | 277 | if (IS_ERR(page)) { |
| 282 | nilfs_error(sb, __func__, "bad page in #%lu", | 278 | nilfs_error(sb, __func__, "bad page in #%lu", |
| 283 | inode->i_ino); | 279 | inode->i_ino); |
| 284 | filp->f_pos += PAGE_CACHE_SIZE - offset; | 280 | ctx->pos += PAGE_CACHE_SIZE - offset; |
| 285 | ret = -EIO; | 281 | return -EIO; |
| 286 | goto done; | ||
| 287 | } | 282 | } |
| 288 | kaddr = page_address(page); | 283 | kaddr = page_address(page); |
| 289 | de = (struct nilfs_dir_entry *)(kaddr + offset); | 284 | de = (struct nilfs_dir_entry *)(kaddr + offset); |
| @@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 293 | if (de->rec_len == 0) { | 288 | if (de->rec_len == 0) { |
| 294 | nilfs_error(sb, __func__, | 289 | nilfs_error(sb, __func__, |
| 295 | "zero-length directory entry"); | 290 | "zero-length directory entry"); |
| 296 | ret = -EIO; | ||
| 297 | nilfs_put_page(page); | 291 | nilfs_put_page(page); |
| 298 | goto done; | 292 | return -EIO; |
| 299 | } | 293 | } |
| 300 | if (de->inode) { | 294 | if (de->inode) { |
| 301 | int over; | 295 | unsigned char t; |
| 302 | unsigned char d_type = DT_UNKNOWN; | ||
| 303 | 296 | ||
| 304 | if (types && de->file_type < NILFS_FT_MAX) | 297 | if (de->file_type < NILFS_FT_MAX) |
| 305 | d_type = types[de->file_type]; | 298 | t = nilfs_filetype_table[de->file_type]; |
| 299 | else | ||
| 300 | t = DT_UNKNOWN; | ||
| 306 | 301 | ||
| 307 | offset = (char *)de - kaddr; | 302 | if (!dir_emit(ctx, de->name, de->name_len, |
| 308 | over = filldir(dirent, de->name, de->name_len, | 303 | le64_to_cpu(de->inode), t)) { |
| 309 | (n<<PAGE_CACHE_SHIFT) | offset, | ||
| 310 | le64_to_cpu(de->inode), d_type); | ||
| 311 | if (over) { | ||
| 312 | nilfs_put_page(page); | 304 | nilfs_put_page(page); |
| 313 | goto success; | 305 | return 0; |
| 314 | } | 306 | } |
| 315 | } | 307 | } |
| 316 | filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); | 308 | ctx->pos += nilfs_rec_len_from_disk(de->rec_len); |
| 317 | } | 309 | } |
| 318 | nilfs_put_page(page); | 310 | nilfs_put_page(page); |
| 319 | } | 311 | } |
| 320 | 312 | return 0; | |
| 321 | success: | ||
| 322 | ret = 0; | ||
| 323 | done: | ||
| 324 | return ret; | ||
| 325 | } | 313 | } |
| 326 | 314 | ||
| 327 | /* | 315 | /* |
| @@ -678,7 +666,7 @@ not_empty: | |||
| 678 | const struct file_operations nilfs_dir_operations = { | 666 | const struct file_operations nilfs_dir_operations = { |
| 679 | .llseek = generic_file_llseek, | 667 | .llseek = generic_file_llseek, |
| 680 | .read = generic_read_dir, | 668 | .read = generic_read_dir, |
| 681 | .readdir = nilfs_readdir, | 669 | .iterate = nilfs_readdir, |
| 682 | .unlocked_ioctl = nilfs_ioctl, | 670 | .unlocked_ioctl = nilfs_ioctl, |
| 683 | #ifdef CONFIG_COMPAT | 671 | #ifdef CONFIG_COMPAT |
| 684 | .compat_ioctl = nilfs_compat_ioctl, | 672 | .compat_ioctl = nilfs_compat_ioctl, |
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f97af4..d267ea6aa1a0 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c | |||
| @@ -1372,7 +1372,7 @@ retry_writepage: | |||
| 1372 | * The page may have dirty, unmapped buffers. Make them | 1372 | * The page may have dirty, unmapped buffers. Make them |
| 1373 | * freeable here, so the page does not leak. | 1373 | * freeable here, so the page does not leak. |
| 1374 | */ | 1374 | */ |
| 1375 | block_invalidatepage(page, 0); | 1375 | block_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 1376 | unlock_page(page); | 1376 | unlock_page(page); |
| 1377 | ntfs_debug("Write outside i_size - truncated?"); | 1377 | ntfs_debug("Write outside i_size - truncated?"); |
| 1378 | return 0; | 1378 | return 0; |
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index aa411c3f20e9..9e38dafa3bc7 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c | |||
| @@ -1004,13 +1004,11 @@ dir_err_out: | |||
| 1004 | /** | 1004 | /** |
| 1005 | * ntfs_filldir - ntfs specific filldir method | 1005 | * ntfs_filldir - ntfs specific filldir method |
| 1006 | * @vol: current ntfs volume | 1006 | * @vol: current ntfs volume |
| 1007 | * @fpos: position in the directory | ||
| 1008 | * @ndir: ntfs inode of current directory | 1007 | * @ndir: ntfs inode of current directory |
| 1009 | * @ia_page: page in which the index allocation buffer @ie is in resides | 1008 | * @ia_page: page in which the index allocation buffer @ie is in resides |
| 1010 | * @ie: current index entry | 1009 | * @ie: current index entry |
| 1011 | * @name: buffer to use for the converted name | 1010 | * @name: buffer to use for the converted name |
| 1012 | * @dirent: vfs filldir callback context | 1011 | * @actor: what to feed the entries to |
| 1013 | * @filldir: vfs filldir callback | ||
| 1014 | * | 1012 | * |
| 1015 | * Convert the Unicode @name to the loaded NLS and pass it to the @filldir | 1013 | * Convert the Unicode @name to the loaded NLS and pass it to the @filldir |
| 1016 | * callback. | 1014 | * callback. |
| @@ -1024,12 +1022,12 @@ dir_err_out: | |||
| 1024 | * retake the lock if we are returning a non-zero value as ntfs_readdir() | 1022 | * retake the lock if we are returning a non-zero value as ntfs_readdir() |
| 1025 | * would need to drop the lock immediately anyway. | 1023 | * would need to drop the lock immediately anyway. |
| 1026 | */ | 1024 | */ |
| 1027 | static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, | 1025 | static inline int ntfs_filldir(ntfs_volume *vol, |
| 1028 | ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, | 1026 | ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, |
| 1029 | u8 *name, void *dirent, filldir_t filldir) | 1027 | u8 *name, struct dir_context *actor) |
| 1030 | { | 1028 | { |
| 1031 | unsigned long mref; | 1029 | unsigned long mref; |
| 1032 | int name_len, rc; | 1030 | int name_len; |
| 1033 | unsigned dt_type; | 1031 | unsigned dt_type; |
| 1034 | FILE_NAME_TYPE_FLAGS name_type; | 1032 | FILE_NAME_TYPE_FLAGS name_type; |
| 1035 | 1033 | ||
| @@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, | |||
| 1068 | if (ia_page) | 1066 | if (ia_page) |
| 1069 | unlock_page(ia_page); | 1067 | unlock_page(ia_page); |
| 1070 | ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " | 1068 | ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " |
| 1071 | "0x%lx, DT_%s.", name, name_len, fpos, mref, | 1069 | "0x%lx, DT_%s.", name, name_len, actor->pos, mref, |
| 1072 | dt_type == DT_DIR ? "DIR" : "REG"); | 1070 | dt_type == DT_DIR ? "DIR" : "REG"); |
| 1073 | rc = filldir(dirent, name, name_len, fpos, mref, dt_type); | 1071 | if (!dir_emit(actor, name, name_len, mref, dt_type)) |
| 1072 | return 1; | ||
| 1074 | /* Relock the page but not if we are aborting ->readdir. */ | 1073 | /* Relock the page but not if we are aborting ->readdir. */ |
| 1075 | if (!rc && ia_page) | 1074 | if (ia_page) |
| 1076 | lock_page(ia_page); | 1075 | lock_page(ia_page); |
| 1077 | return rc; | 1076 | return 0; |
| 1078 | } | 1077 | } |
| 1079 | 1078 | ||
| 1080 | /* | 1079 | /* |
| @@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, | |||
| 1097 | * removes them again after the write is complete after which it | 1096 | * removes them again after the write is complete after which it |
| 1098 | * unlocks the page. | 1097 | * unlocks the page. |
| 1099 | */ | 1098 | */ |
| 1100 | static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 1099 | static int ntfs_readdir(struct file *file, struct dir_context *actor) |
| 1101 | { | 1100 | { |
| 1102 | s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; | 1101 | s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; |
| 1103 | loff_t fpos, i_size; | 1102 | loff_t i_size; |
| 1104 | struct inode *bmp_vi, *vdir = file_inode(filp); | 1103 | struct inode *bmp_vi, *vdir = file_inode(file); |
| 1105 | struct super_block *sb = vdir->i_sb; | 1104 | struct super_block *sb = vdir->i_sb; |
| 1106 | ntfs_inode *ndir = NTFS_I(vdir); | 1105 | ntfs_inode *ndir = NTFS_I(vdir); |
| 1107 | ntfs_volume *vol = NTFS_SB(sb); | 1106 | ntfs_volume *vol = NTFS_SB(sb); |
| @@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1116 | u8 *kaddr, *bmp, *index_end; | 1115 | u8 *kaddr, *bmp, *index_end; |
| 1117 | ntfs_attr_search_ctx *ctx; | 1116 | ntfs_attr_search_ctx *ctx; |
| 1118 | 1117 | ||
| 1119 | fpos = filp->f_pos; | ||
| 1120 | ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", | 1118 | ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", |
| 1121 | vdir->i_ino, fpos); | 1119 | vdir->i_ino, actor->pos); |
| 1122 | rc = err = 0; | 1120 | rc = err = 0; |
| 1123 | /* Are we at end of dir yet? */ | 1121 | /* Are we at end of dir yet? */ |
| 1124 | i_size = i_size_read(vdir); | 1122 | i_size = i_size_read(vdir); |
| 1125 | if (fpos >= i_size + vol->mft_record_size) | 1123 | if (actor->pos >= i_size + vol->mft_record_size) |
| 1126 | goto done; | 1124 | return 0; |
| 1127 | /* Emulate . and .. for all directories. */ | 1125 | /* Emulate . and .. for all directories. */ |
| 1128 | if (!fpos) { | 1126 | if (!dir_emit_dots(file, actor)) |
| 1129 | ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " | 1127 | return 0; |
| 1130 | "inode 0x%lx, DT_DIR.", vdir->i_ino); | ||
| 1131 | rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR); | ||
| 1132 | if (rc) | ||
| 1133 | goto done; | ||
| 1134 | fpos++; | ||
| 1135 | } | ||
| 1136 | if (fpos == 1) { | ||
| 1137 | ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " | ||
| 1138 | "inode 0x%lx, DT_DIR.", | ||
| 1139 | (unsigned long)parent_ino(filp->f_path.dentry)); | ||
| 1140 | rc = filldir(dirent, "..", 2, fpos, | ||
| 1141 | parent_ino(filp->f_path.dentry), DT_DIR); | ||
| 1142 | if (rc) | ||
| 1143 | goto done; | ||
| 1144 | fpos++; | ||
| 1145 | } | ||
| 1146 | m = NULL; | 1128 | m = NULL; |
| 1147 | ctx = NULL; | 1129 | ctx = NULL; |
| 1148 | /* | 1130 | /* |
| @@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1155 | goto err_out; | 1137 | goto err_out; |
| 1156 | } | 1138 | } |
| 1157 | /* Are we jumping straight into the index allocation attribute? */ | 1139 | /* Are we jumping straight into the index allocation attribute? */ |
| 1158 | if (fpos >= vol->mft_record_size) | 1140 | if (actor->pos >= vol->mft_record_size) |
| 1159 | goto skip_index_root; | 1141 | goto skip_index_root; |
| 1160 | /* Get hold of the mft record for the directory. */ | 1142 | /* Get hold of the mft record for the directory. */ |
| 1161 | m = map_mft_record(ndir); | 1143 | m = map_mft_record(ndir); |
| @@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1170 | goto err_out; | 1152 | goto err_out; |
| 1171 | } | 1153 | } |
| 1172 | /* Get the offset into the index root attribute. */ | 1154 | /* Get the offset into the index root attribute. */ |
| 1173 | ir_pos = (s64)fpos; | 1155 | ir_pos = (s64)actor->pos; |
| 1174 | /* Find the index root attribute in the mft record. */ | 1156 | /* Find the index root attribute in the mft record. */ |
| 1175 | err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, | 1157 | err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, |
| 1176 | 0, ctx); | 1158 | 0, ctx); |
| @@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1226 | if (ir_pos > (u8*)ie - (u8*)ir) | 1208 | if (ir_pos > (u8*)ie - (u8*)ir) |
| 1227 | continue; | 1209 | continue; |
| 1228 | /* Advance the position even if going to skip the entry. */ | 1210 | /* Advance the position even if going to skip the entry. */ |
| 1229 | fpos = (u8*)ie - (u8*)ir; | 1211 | actor->pos = (u8*)ie - (u8*)ir; |
| 1230 | /* Submit the name to the filldir callback. */ | 1212 | /* Submit the name to the filldir callback. */ |
| 1231 | rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, | 1213 | rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); |
| 1232 | filldir); | ||
| 1233 | if (rc) { | 1214 | if (rc) { |
| 1234 | kfree(ir); | 1215 | kfree(ir); |
| 1235 | goto abort; | 1216 | goto abort; |
| @@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1242 | if (!NInoIndexAllocPresent(ndir)) | 1223 | if (!NInoIndexAllocPresent(ndir)) |
| 1243 | goto EOD; | 1224 | goto EOD; |
| 1244 | /* Advance fpos to the beginning of the index allocation. */ | 1225 | /* Advance fpos to the beginning of the index allocation. */ |
| 1245 | fpos = vol->mft_record_size; | 1226 | actor->pos = vol->mft_record_size; |
| 1246 | skip_index_root: | 1227 | skip_index_root: |
| 1247 | kaddr = NULL; | 1228 | kaddr = NULL; |
| 1248 | prev_ia_pos = -1LL; | 1229 | prev_ia_pos = -1LL; |
| 1249 | /* Get the offset into the index allocation attribute. */ | 1230 | /* Get the offset into the index allocation attribute. */ |
| 1250 | ia_pos = (s64)fpos - vol->mft_record_size; | 1231 | ia_pos = (s64)actor->pos - vol->mft_record_size; |
| 1251 | ia_mapping = vdir->i_mapping; | 1232 | ia_mapping = vdir->i_mapping; |
| 1252 | ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); | 1233 | ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); |
| 1253 | bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); | 1234 | bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); |
| @@ -1409,7 +1390,7 @@ find_next_index_buffer: | |||
| 1409 | if (ia_pos - ia_start > (u8*)ie - (u8*)ia) | 1390 | if (ia_pos - ia_start > (u8*)ie - (u8*)ia) |
| 1410 | continue; | 1391 | continue; |
| 1411 | /* Advance the position even if going to skip the entry. */ | 1392 | /* Advance the position even if going to skip the entry. */ |
| 1412 | fpos = (u8*)ie - (u8*)ia + | 1393 | actor->pos = (u8*)ie - (u8*)ia + |
| 1413 | (sle64_to_cpu(ia->index_block_vcn) << | 1394 | (sle64_to_cpu(ia->index_block_vcn) << |
| 1414 | ndir->itype.index.vcn_size_bits) + | 1395 | ndir->itype.index.vcn_size_bits) + |
| 1415 | vol->mft_record_size; | 1396 | vol->mft_record_size; |
| @@ -1419,8 +1400,7 @@ find_next_index_buffer: | |||
| 1419 | * before returning, unless a non-zero value is returned in | 1400 | * before returning, unless a non-zero value is returned in |
| 1420 | * which case the page is left unlocked. | 1401 | * which case the page is left unlocked. |
| 1421 | */ | 1402 | */ |
| 1422 | rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, | 1403 | rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); |
| 1423 | filldir); | ||
| 1424 | if (rc) { | 1404 | if (rc) { |
| 1425 | /* @ia_page is already unlocked in this case. */ | 1405 | /* @ia_page is already unlocked in this case. */ |
| 1426 | ntfs_unmap_page(ia_page); | 1406 | ntfs_unmap_page(ia_page); |
| @@ -1439,18 +1419,9 @@ unm_EOD: | |||
| 1439 | iput(bmp_vi); | 1419 | iput(bmp_vi); |
| 1440 | EOD: | 1420 | EOD: |
| 1441 | /* We are finished, set fpos to EOD. */ | 1421 | /* We are finished, set fpos to EOD. */ |
| 1442 | fpos = i_size + vol->mft_record_size; | 1422 | actor->pos = i_size + vol->mft_record_size; |
| 1443 | abort: | 1423 | abort: |
| 1444 | kfree(name); | 1424 | kfree(name); |
| 1445 | done: | ||
| 1446 | #ifdef DEBUG | ||
| 1447 | if (!rc) | ||
| 1448 | ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos); | ||
| 1449 | else | ||
| 1450 | ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.", | ||
| 1451 | rc, fpos); | ||
| 1452 | #endif | ||
| 1453 | filp->f_pos = fpos; | ||
| 1454 | return 0; | 1425 | return 0; |
| 1455 | err_out: | 1426 | err_out: |
| 1456 | if (bmp_page) { | 1427 | if (bmp_page) { |
| @@ -1471,7 +1442,6 @@ iput_err_out: | |||
| 1471 | if (!err) | 1442 | if (!err) |
| 1472 | err = -EIO; | 1443 | err = -EIO; |
| 1473 | ntfs_debug("Failed. Returning error code %i.", -err); | 1444 | ntfs_debug("Failed. Returning error code %i.", -err); |
| 1474 | filp->f_pos = fpos; | ||
| 1475 | return err; | 1445 | return err; |
| 1476 | } | 1446 | } |
| 1477 | 1447 | ||
| @@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, | |||
| 1571 | const struct file_operations ntfs_dir_ops = { | 1541 | const struct file_operations ntfs_dir_ops = { |
| 1572 | .llseek = generic_file_llseek, /* Seek inside directory. */ | 1542 | .llseek = generic_file_llseek, /* Seek inside directory. */ |
| 1573 | .read = generic_read_dir, /* Return -EISDIR. */ | 1543 | .read = generic_read_dir, /* Return -EISDIR. */ |
| 1574 | .readdir = ntfs_readdir, /* Read directory contents. */ | 1544 | .iterate = ntfs_readdir, /* Read directory contents. */ |
| 1575 | #ifdef NTFS_RW | 1545 | #ifdef NTFS_RW |
| 1576 | .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ | 1546 | .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ |
| 1577 | /*.aio_fsync = ,*/ /* Sync all outstanding async | 1547 | /*.aio_fsync = ,*/ /* Sync all outstanding async |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 20dfec72e903..79736a28d84f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
| 603 | * from ext3. PageChecked() bits have been removed as OCFS2 does not | 603 | * from ext3. PageChecked() bits have been removed as OCFS2 does not |
| 604 | * do journalled data. | 604 | * do journalled data. |
| 605 | */ | 605 | */ |
| 606 | static void ocfs2_invalidatepage(struct page *page, unsigned long offset) | 606 | static void ocfs2_invalidatepage(struct page *page, unsigned int offset, |
| 607 | unsigned int length) | ||
| 607 | { | 608 | { |
| 608 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; | 609 | journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; |
| 609 | 610 | ||
| 610 | jbd2_journal_invalidatepage(journal, page, offset); | 611 | jbd2_journal_invalidatepage(journal, page, offset, length); |
| 611 | } | 612 | } |
| 612 | 613 | ||
| 613 | static int ocfs2_releasepage(struct page *page, gfp_t wait) | 614 | static int ocfs2_releasepage(struct page *page, gfp_t wait) |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f1e1aed8f638..eb760d8acd50 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
| @@ -1761,11 +1761,10 @@ bail: | |||
| 1761 | 1761 | ||
| 1762 | static int ocfs2_dir_foreach_blk_id(struct inode *inode, | 1762 | static int ocfs2_dir_foreach_blk_id(struct inode *inode, |
| 1763 | u64 *f_version, | 1763 | u64 *f_version, |
| 1764 | loff_t *f_pos, void *priv, | 1764 | struct dir_context *ctx) |
| 1765 | filldir_t filldir, int *filldir_err) | ||
| 1766 | { | 1765 | { |
| 1767 | int ret, i, filldir_ret; | 1766 | int ret, i; |
| 1768 | unsigned long offset = *f_pos; | 1767 | unsigned long offset = ctx->pos; |
| 1769 | struct buffer_head *di_bh = NULL; | 1768 | struct buffer_head *di_bh = NULL; |
| 1770 | struct ocfs2_dinode *di; | 1769 | struct ocfs2_dinode *di; |
| 1771 | struct ocfs2_inline_data *data; | 1770 | struct ocfs2_inline_data *data; |
| @@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, | |||
| 1781 | di = (struct ocfs2_dinode *)di_bh->b_data; | 1780 | di = (struct ocfs2_dinode *)di_bh->b_data; |
| 1782 | data = &di->id2.i_data; | 1781 | data = &di->id2.i_data; |
| 1783 | 1782 | ||
| 1784 | while (*f_pos < i_size_read(inode)) { | 1783 | while (ctx->pos < i_size_read(inode)) { |
| 1785 | revalidate: | ||
| 1786 | /* If the dir block has changed since the last call to | 1784 | /* If the dir block has changed since the last call to |
| 1787 | * readdir(2), then we might be pointing to an invalid | 1785 | * readdir(2), then we might be pointing to an invalid |
| 1788 | * dirent right now. Scan from the start of the block | 1786 | * dirent right now. Scan from the start of the block |
| @@ -1802,50 +1800,31 @@ revalidate: | |||
| 1802 | break; | 1800 | break; |
| 1803 | i += le16_to_cpu(de->rec_len); | 1801 | i += le16_to_cpu(de->rec_len); |
| 1804 | } | 1802 | } |
| 1805 | *f_pos = offset = i; | 1803 | ctx->pos = offset = i; |
| 1806 | *f_version = inode->i_version; | 1804 | *f_version = inode->i_version; |
| 1807 | } | 1805 | } |
| 1808 | 1806 | ||
| 1809 | de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); | 1807 | de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); |
| 1810 | if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { | 1808 | if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { |
| 1811 | /* On error, skip the f_pos to the end. */ | 1809 | /* On error, skip the f_pos to the end. */ |
| 1812 | *f_pos = i_size_read(inode); | 1810 | ctx->pos = i_size_read(inode); |
| 1813 | goto out; | 1811 | break; |
| 1814 | } | 1812 | } |
| 1815 | offset += le16_to_cpu(de->rec_len); | 1813 | offset += le16_to_cpu(de->rec_len); |
| 1816 | if (le64_to_cpu(de->inode)) { | 1814 | if (le64_to_cpu(de->inode)) { |
| 1817 | /* We might block in the next section | ||
| 1818 | * if the data destination is | ||
| 1819 | * currently swapped out. So, use a | ||
| 1820 | * version stamp to detect whether or | ||
| 1821 | * not the directory has been modified | ||
| 1822 | * during the copy operation. | ||
| 1823 | */ | ||
| 1824 | u64 version = *f_version; | ||
| 1825 | unsigned char d_type = DT_UNKNOWN; | 1815 | unsigned char d_type = DT_UNKNOWN; |
| 1826 | 1816 | ||
| 1827 | if (de->file_type < OCFS2_FT_MAX) | 1817 | if (de->file_type < OCFS2_FT_MAX) |
| 1828 | d_type = ocfs2_filetype_table[de->file_type]; | 1818 | d_type = ocfs2_filetype_table[de->file_type]; |
| 1829 | 1819 | ||
| 1830 | filldir_ret = filldir(priv, de->name, | 1820 | if (!dir_emit(ctx, de->name, de->name_len, |
| 1831 | de->name_len, | 1821 | le64_to_cpu(de->inode), d_type)) |
| 1832 | *f_pos, | 1822 | goto out; |
| 1833 | le64_to_cpu(de->inode), | ||
| 1834 | d_type); | ||
| 1835 | if (filldir_ret) { | ||
| 1836 | if (filldir_err) | ||
| 1837 | *filldir_err = filldir_ret; | ||
| 1838 | break; | ||
| 1839 | } | ||
| 1840 | if (version != *f_version) | ||
| 1841 | goto revalidate; | ||
| 1842 | } | 1823 | } |
| 1843 | *f_pos += le16_to_cpu(de->rec_len); | 1824 | ctx->pos += le16_to_cpu(de->rec_len); |
| 1844 | } | 1825 | } |
| 1845 | |||
| 1846 | out: | 1826 | out: |
| 1847 | brelse(di_bh); | 1827 | brelse(di_bh); |
| 1848 | |||
| 1849 | return 0; | 1828 | return 0; |
| 1850 | } | 1829 | } |
| 1851 | 1830 | ||
| @@ -1855,27 +1834,26 @@ out: | |||
| 1855 | */ | 1834 | */ |
| 1856 | static int ocfs2_dir_foreach_blk_el(struct inode *inode, | 1835 | static int ocfs2_dir_foreach_blk_el(struct inode *inode, |
| 1857 | u64 *f_version, | 1836 | u64 *f_version, |
| 1858 | loff_t *f_pos, void *priv, | 1837 | struct dir_context *ctx, |
| 1859 | filldir_t filldir, int *filldir_err) | 1838 | bool persist) |
| 1860 | { | 1839 | { |
| 1861 | int error = 0; | ||
| 1862 | unsigned long offset, blk, last_ra_blk = 0; | 1840 | unsigned long offset, blk, last_ra_blk = 0; |
| 1863 | int i, stored; | 1841 | int i; |
| 1864 | struct buffer_head * bh, * tmp; | 1842 | struct buffer_head * bh, * tmp; |
| 1865 | struct ocfs2_dir_entry * de; | 1843 | struct ocfs2_dir_entry * de; |
| 1866 | struct super_block * sb = inode->i_sb; | 1844 | struct super_block * sb = inode->i_sb; |
| 1867 | unsigned int ra_sectors = 16; | 1845 | unsigned int ra_sectors = 16; |
| 1846 | int stored = 0; | ||
| 1868 | 1847 | ||
| 1869 | stored = 0; | ||
| 1870 | bh = NULL; | 1848 | bh = NULL; |
| 1871 | 1849 | ||
| 1872 | offset = (*f_pos) & (sb->s_blocksize - 1); | 1850 | offset = ctx->pos & (sb->s_blocksize - 1); |
| 1873 | 1851 | ||
| 1874 | while (!error && !stored && *f_pos < i_size_read(inode)) { | 1852 | while (ctx->pos < i_size_read(inode)) { |
| 1875 | blk = (*f_pos) >> sb->s_blocksize_bits; | 1853 | blk = ctx->pos >> sb->s_blocksize_bits; |
| 1876 | if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { | 1854 | if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { |
| 1877 | /* Skip the corrupt dirblock and keep trying */ | 1855 | /* Skip the corrupt dirblock and keep trying */ |
| 1878 | *f_pos += sb->s_blocksize - offset; | 1856 | ctx->pos += sb->s_blocksize - offset; |
| 1879 | continue; | 1857 | continue; |
| 1880 | } | 1858 | } |
| 1881 | 1859 | ||
| @@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, | |||
| 1897 | ra_sectors = 8; | 1875 | ra_sectors = 8; |
| 1898 | } | 1876 | } |
| 1899 | 1877 | ||
| 1900 | revalidate: | ||
| 1901 | /* If the dir block has changed since the last call to | 1878 | /* If the dir block has changed since the last call to |
| 1902 | * readdir(2), then we might be pointing to an invalid | 1879 | * readdir(2), then we might be pointing to an invalid |
| 1903 | * dirent right now. Scan from the start of the block | 1880 | * dirent right now. Scan from the start of the block |
| @@ -1917,93 +1894,64 @@ revalidate: | |||
| 1917 | i += le16_to_cpu(de->rec_len); | 1894 | i += le16_to_cpu(de->rec_len); |
| 1918 | } | 1895 | } |
| 1919 | offset = i; | 1896 | offset = i; |
| 1920 | *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) | 1897 | ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) |
| 1921 | | offset; | 1898 | | offset; |
| 1922 | *f_version = inode->i_version; | 1899 | *f_version = inode->i_version; |
| 1923 | } | 1900 | } |
| 1924 | 1901 | ||
| 1925 | while (!error && *f_pos < i_size_read(inode) | 1902 | while (ctx->pos < i_size_read(inode) |
| 1926 | && offset < sb->s_blocksize) { | 1903 | && offset < sb->s_blocksize) { |
| 1927 | de = (struct ocfs2_dir_entry *) (bh->b_data + offset); | 1904 | de = (struct ocfs2_dir_entry *) (bh->b_data + offset); |
| 1928 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { | 1905 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { |
| 1929 | /* On error, skip the f_pos to the | 1906 | /* On error, skip the f_pos to the |
| 1930 | next block. */ | 1907 | next block. */ |
| 1931 | *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; | 1908 | ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; |
| 1932 | brelse(bh); | 1909 | brelse(bh); |
| 1933 | goto out; | 1910 | continue; |
| 1934 | } | 1911 | } |
| 1935 | offset += le16_to_cpu(de->rec_len); | ||
| 1936 | if (le64_to_cpu(de->inode)) { | 1912 | if (le64_to_cpu(de->inode)) { |
| 1937 | /* We might block in the next section | ||
| 1938 | * if the data destination is | ||
| 1939 | * currently swapped out. So, use a | ||
| 1940 | * version stamp to detect whether or | ||
| 1941 | * not the directory has been modified | ||
| 1942 | * during the copy operation. | ||
| 1943 | */ | ||
| 1944 | unsigned long version = *f_version; | ||
| 1945 | unsigned char d_type = DT_UNKNOWN; | 1913 | unsigned char d_type = DT_UNKNOWN; |
| 1946 | 1914 | ||
| 1947 | if (de->file_type < OCFS2_FT_MAX) | 1915 | if (de->file_type < OCFS2_FT_MAX) |
| 1948 | d_type = ocfs2_filetype_table[de->file_type]; | 1916 | d_type = ocfs2_filetype_table[de->file_type]; |
| 1949 | error = filldir(priv, de->name, | 1917 | if (!dir_emit(ctx, de->name, |
| 1950 | de->name_len, | 1918 | de->name_len, |
| 1951 | *f_pos, | ||
| 1952 | le64_to_cpu(de->inode), | 1919 | le64_to_cpu(de->inode), |
| 1953 | d_type); | 1920 | d_type)) { |
| 1954 | if (error) { | 1921 | brelse(bh); |
| 1955 | if (filldir_err) | 1922 | return 0; |
| 1956 | *filldir_err = error; | ||
| 1957 | break; | ||
| 1958 | } | 1923 | } |
| 1959 | if (version != *f_version) | 1924 | stored++; |
| 1960 | goto revalidate; | ||
| 1961 | stored ++; | ||
| 1962 | } | 1925 | } |
| 1963 | *f_pos += le16_to_cpu(de->rec_len); | 1926 | offset += le16_to_cpu(de->rec_len); |
| 1927 | ctx->pos += le16_to_cpu(de->rec_len); | ||
| 1964 | } | 1928 | } |
| 1965 | offset = 0; | 1929 | offset = 0; |
| 1966 | brelse(bh); | 1930 | brelse(bh); |
| 1967 | bh = NULL; | 1931 | bh = NULL; |
| 1932 | if (!persist && stored) | ||
| 1933 | break; | ||
| 1968 | } | 1934 | } |
| 1969 | 1935 | return 0; | |
| 1970 | stored = 0; | ||
| 1971 | out: | ||
| 1972 | return stored; | ||
| 1973 | } | 1936 | } |
| 1974 | 1937 | ||
| 1975 | static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, | 1938 | static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, |
| 1976 | loff_t *f_pos, void *priv, filldir_t filldir, | 1939 | struct dir_context *ctx, |
| 1977 | int *filldir_err) | 1940 | bool persist) |
| 1978 | { | 1941 | { |
| 1979 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | 1942 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) |
| 1980 | return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, | 1943 | return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); |
| 1981 | filldir, filldir_err); | 1944 | return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); |
| 1982 | |||
| 1983 | return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir, | ||
| 1984 | filldir_err); | ||
| 1985 | } | 1945 | } |
| 1986 | 1946 | ||
| 1987 | /* | 1947 | /* |
| 1988 | * This is intended to be called from inside other kernel functions, | 1948 | * This is intended to be called from inside other kernel functions, |
| 1989 | * so we fake some arguments. | 1949 | * so we fake some arguments. |
| 1990 | */ | 1950 | */ |
| 1991 | int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, | 1951 | int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) |
| 1992 | filldir_t filldir) | ||
| 1993 | { | 1952 | { |
| 1994 | int ret = 0, filldir_err = 0; | ||
| 1995 | u64 version = inode->i_version; | 1953 | u64 version = inode->i_version; |
| 1996 | 1954 | ocfs2_dir_foreach_blk(inode, &version, ctx, true); | |
| 1997 | while (*f_pos < i_size_read(inode)) { | ||
| 1998 | ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv, | ||
| 1999 | filldir, &filldir_err); | ||
| 2000 | if (ret || filldir_err) | ||
| 2001 | break; | ||
| 2002 | } | ||
| 2003 | |||
| 2004 | if (ret > 0) | ||
| 2005 | ret = -EIO; | ||
| 2006 | |||
| 2007 | return 0; | 1955 | return 0; |
| 2008 | } | 1956 | } |
| 2009 | 1957 | ||
| @@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, | |||
| 2011 | * ocfs2_readdir() | 1959 | * ocfs2_readdir() |
| 2012 | * | 1960 | * |
| 2013 | */ | 1961 | */ |
| 2014 | int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | 1962 | int ocfs2_readdir(struct file *file, struct dir_context *ctx) |
| 2015 | { | 1963 | { |
| 2016 | int error = 0; | 1964 | int error = 0; |
| 2017 | struct inode *inode = file_inode(filp); | 1965 | struct inode *inode = file_inode(file); |
| 2018 | int lock_level = 0; | 1966 | int lock_level = 0; |
| 2019 | 1967 | ||
| 2020 | trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); | 1968 | trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); |
| 2021 | 1969 | ||
| 2022 | error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); | 1970 | error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); |
| 2023 | if (lock_level && error >= 0) { | 1971 | if (lock_level && error >= 0) { |
| 2024 | /* We release EX lock which used to update atime | 1972 | /* We release EX lock which used to update atime |
| 2025 | * and get PR lock again to reduce contention | 1973 | * and get PR lock again to reduce contention |
| @@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
| 2035 | goto bail_nolock; | 1983 | goto bail_nolock; |
| 2036 | } | 1984 | } |
| 2037 | 1985 | ||
| 2038 | error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, | 1986 | error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); |
| 2039 | dirent, filldir, NULL); | ||
| 2040 | 1987 | ||
| 2041 | ocfs2_inode_unlock(inode, lock_level); | 1988 | ocfs2_inode_unlock(inode, lock_level); |
| 2042 | if (error) | 1989 | if (error) |
| @@ -2120,6 +2067,7 @@ bail: | |||
| 2120 | } | 2067 | } |
| 2121 | 2068 | ||
| 2122 | struct ocfs2_empty_dir_priv { | 2069 | struct ocfs2_empty_dir_priv { |
| 2070 | struct dir_context ctx; | ||
| 2123 | unsigned seen_dot; | 2071 | unsigned seen_dot; |
| 2124 | unsigned seen_dot_dot; | 2072 | unsigned seen_dot_dot; |
| 2125 | unsigned seen_other; | 2073 | unsigned seen_other; |
| @@ -2204,8 +2152,9 @@ out: | |||
| 2204 | int ocfs2_empty_dir(struct inode *inode) | 2152 | int ocfs2_empty_dir(struct inode *inode) |
| 2205 | { | 2153 | { |
| 2206 | int ret; | 2154 | int ret; |
| 2207 | loff_t start = 0; | 2155 | struct ocfs2_empty_dir_priv priv = { |
| 2208 | struct ocfs2_empty_dir_priv priv; | 2156 | .ctx.actor = ocfs2_empty_dir_filldir |
| 2157 | }; | ||
| 2209 | 2158 | ||
| 2210 | memset(&priv, 0, sizeof(priv)); | 2159 | memset(&priv, 0, sizeof(priv)); |
| 2211 | 2160 | ||
| @@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode) | |||
| 2219 | */ | 2168 | */ |
| 2220 | } | 2169 | } |
| 2221 | 2170 | ||
| 2222 | ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); | 2171 | ret = ocfs2_dir_foreach(inode, &priv.ctx); |
| 2223 | if (ret) | 2172 | if (ret) |
| 2224 | mlog_errno(ret); | 2173 | mlog_errno(ret); |
| 2225 | 2174 | ||
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index e683f3deb645..f0344b75b14d 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h | |||
| @@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name, | |||
| 92 | struct ocfs2_dir_lookup_result *res); | 92 | struct ocfs2_dir_lookup_result *res); |
| 93 | int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, | 93 | int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, |
| 94 | int namelen, u64 *blkno); | 94 | int namelen, u64 *blkno); |
| 95 | int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); | 95 | int ocfs2_readdir(struct file *file, struct dir_context *ctx); |
| 96 | int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, | 96 | int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx); |
| 97 | filldir_t filldir); | ||
| 98 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, | 97 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, |
| 99 | struct inode *dir, | 98 | struct inode *dir, |
| 100 | struct buffer_head *parent_fe_bh, | 99 | struct buffer_head *parent_fe_bh, |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ff54014a24ec..8a38714f1d92 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -2712,7 +2712,7 @@ const struct file_operations ocfs2_fops = { | |||
| 2712 | const struct file_operations ocfs2_dops = { | 2712 | const struct file_operations ocfs2_dops = { |
| 2713 | .llseek = generic_file_llseek, | 2713 | .llseek = generic_file_llseek, |
| 2714 | .read = generic_read_dir, | 2714 | .read = generic_read_dir, |
| 2715 | .readdir = ocfs2_readdir, | 2715 | .iterate = ocfs2_readdir, |
| 2716 | .fsync = ocfs2_sync_file, | 2716 | .fsync = ocfs2_sync_file, |
| 2717 | .release = ocfs2_dir_release, | 2717 | .release = ocfs2_dir_release, |
| 2718 | .open = ocfs2_dir_open, | 2718 | .open = ocfs2_dir_open, |
| @@ -2759,7 +2759,7 @@ const struct file_operations ocfs2_fops_no_plocks = { | |||
| 2759 | const struct file_operations ocfs2_dops_no_plocks = { | 2759 | const struct file_operations ocfs2_dops_no_plocks = { |
| 2760 | .llseek = generic_file_llseek, | 2760 | .llseek = generic_file_llseek, |
| 2761 | .read = generic_read_dir, | 2761 | .read = generic_read_dir, |
| 2762 | .readdir = ocfs2_readdir, | 2762 | .iterate = ocfs2_readdir, |
| 2763 | .fsync = ocfs2_sync_file, | 2763 | .fsync = ocfs2_sync_file, |
| 2764 | .release = ocfs2_dir_release, | 2764 | .release = ocfs2_dir_release, |
| 2765 | .open = ocfs2_dir_open, | 2765 | .open = ocfs2_dir_open, |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 8eccfabcd12e..242170d83971 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) | |||
| 1941 | } | 1941 | } |
| 1942 | 1942 | ||
| 1943 | struct ocfs2_orphan_filldir_priv { | 1943 | struct ocfs2_orphan_filldir_priv { |
| 1944 | struct dir_context ctx; | ||
| 1944 | struct inode *head; | 1945 | struct inode *head; |
| 1945 | struct ocfs2_super *osb; | 1946 | struct ocfs2_super *osb; |
| 1946 | }; | 1947 | }; |
| @@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, | |||
| 1977 | { | 1978 | { |
| 1978 | int status; | 1979 | int status; |
| 1979 | struct inode *orphan_dir_inode = NULL; | 1980 | struct inode *orphan_dir_inode = NULL; |
| 1980 | struct ocfs2_orphan_filldir_priv priv; | 1981 | struct ocfs2_orphan_filldir_priv priv = { |
| 1981 | loff_t pos = 0; | 1982 | .ctx.actor = ocfs2_orphan_filldir, |
| 1982 | 1983 | .osb = osb, | |
| 1983 | priv.osb = osb; | 1984 | .head = *head |
| 1984 | priv.head = *head; | 1985 | }; |
| 1985 | 1986 | ||
| 1986 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | 1987 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, |
| 1987 | ORPHAN_DIR_SYSTEM_INODE, | 1988 | ORPHAN_DIR_SYSTEM_INODE, |
| @@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, | |||
| 1999 | goto out; | 2000 | goto out; |
| 2000 | } | 2001 | } |
| 2001 | 2002 | ||
| 2002 | status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, | 2003 | status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx); |
| 2003 | ocfs2_orphan_filldir); | ||
| 2004 | if (status) { | 2004 | if (status) { |
| 2005 | mlog_errno(status); | 2005 | mlog_errno(status); |
| 2006 | goto out_cluster; | 2006 | goto out_cluster; |
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index acbaebcad3a8..1b8e9e8405b2 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c | |||
| @@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, | |||
| 327 | return is_bad; | 327 | return is_bad; |
| 328 | } | 328 | } |
| 329 | 329 | ||
| 330 | static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, | 330 | static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, |
| 331 | u64 fsblock, int hindex) | 331 | u64 fsblock, int hindex) |
| 332 | { | 332 | { |
| 333 | struct inode *dir = file_inode(filp); | ||
| 334 | struct buffer_head *bh; | ||
| 335 | struct omfs_inode *oi; | ||
| 336 | u64 self; | ||
| 337 | int res = 0; | ||
| 338 | unsigned char d_type; | ||
| 339 | |||
| 340 | /* follow chain in this bucket */ | 333 | /* follow chain in this bucket */ |
| 341 | while (fsblock != ~0) { | 334 | while (fsblock != ~0) { |
| 342 | bh = omfs_bread(dir->i_sb, fsblock); | 335 | struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock); |
| 336 | struct omfs_inode *oi; | ||
| 337 | u64 self; | ||
| 338 | unsigned char d_type; | ||
| 339 | |||
| 343 | if (!bh) | 340 | if (!bh) |
| 344 | goto out; | 341 | return true; |
| 345 | 342 | ||
| 346 | oi = (struct omfs_inode *) bh->b_data; | 343 | oi = (struct omfs_inode *) bh->b_data; |
| 347 | if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { | 344 | if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { |
| 348 | brelse(bh); | 345 | brelse(bh); |
| 349 | goto out; | 346 | return true; |
| 350 | } | 347 | } |
| 351 | 348 | ||
| 352 | self = fsblock; | 349 | self = fsblock; |
| @@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, | |||
| 361 | 358 | ||
| 362 | d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; | 359 | d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; |
| 363 | 360 | ||
| 364 | res = filldir(dirent, oi->i_name, strnlen(oi->i_name, | 361 | if (!dir_emit(ctx, oi->i_name, |
| 365 | OMFS_NAMELEN), filp->f_pos, self, d_type); | 362 | strnlen(oi->i_name, OMFS_NAMELEN), |
| 363 | self, d_type)) { | ||
| 364 | brelse(bh); | ||
| 365 | return false; | ||
| 366 | } | ||
| 366 | brelse(bh); | 367 | brelse(bh); |
| 367 | if (res < 0) | 368 | ctx->pos++; |
| 368 | break; | ||
| 369 | filp->f_pos++; | ||
| 370 | } | 369 | } |
| 371 | out: | 370 | return true; |
| 372 | return res; | ||
| 373 | } | 371 | } |
| 374 | 372 | ||
| 375 | static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, | 373 | static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, |
| @@ -403,60 +401,44 @@ out: | |||
| 403 | return err; | 401 | return err; |
| 404 | } | 402 | } |
| 405 | 403 | ||
| 406 | static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 404 | static int omfs_readdir(struct file *file, struct dir_context *ctx) |
| 407 | { | 405 | { |
| 408 | struct inode *dir = file_inode(filp); | 406 | struct inode *dir = file_inode(file); |
| 409 | struct buffer_head *bh; | 407 | struct buffer_head *bh; |
| 410 | loff_t offset, res; | 408 | __be64 *p; |
| 411 | unsigned int hchain, hindex; | 409 | unsigned int hchain, hindex; |
| 412 | int nbuckets; | 410 | int nbuckets; |
| 413 | u64 fsblock; | 411 | |
| 414 | int ret = -EINVAL; | 412 | if (ctx->pos >> 32) |
| 415 | 413 | return -EINVAL; | |
| 416 | if (filp->f_pos >> 32) | 414 | |
| 417 | goto success; | 415 | if (ctx->pos < 1 << 20) { |
| 418 | 416 | if (!dir_emit_dots(file, ctx)) | |
| 419 | switch ((unsigned long) filp->f_pos) { | 417 | return 0; |
| 420 | case 0: | 418 | ctx->pos = 1 << 20; |
| 421 | if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) | ||
| 422 | goto success; | ||
| 423 | filp->f_pos++; | ||
| 424 | /* fall through */ | ||
| 425 | case 1: | ||
| 426 | if (filldir(dirent, "..", 2, 1, | ||
| 427 | parent_ino(filp->f_dentry), DT_DIR) < 0) | ||
| 428 | goto success; | ||
| 429 | filp->f_pos = 1 << 20; | ||
| 430 | /* fall through */ | ||
| 431 | } | 419 | } |
| 432 | 420 | ||
| 433 | nbuckets = (dir->i_size - OMFS_DIR_START) / 8; | 421 | nbuckets = (dir->i_size - OMFS_DIR_START) / 8; |
| 434 | 422 | ||
| 435 | /* high 12 bits store bucket + 1 and low 20 bits store hash index */ | 423 | /* high 12 bits store bucket + 1 and low 20 bits store hash index */ |
| 436 | hchain = (filp->f_pos >> 20) - 1; | 424 | hchain = (ctx->pos >> 20) - 1; |
| 437 | hindex = filp->f_pos & 0xfffff; | 425 | hindex = ctx->pos & 0xfffff; |
| 438 | 426 | ||
| 439 | bh = omfs_bread(dir->i_sb, dir->i_ino); | 427 | bh = omfs_bread(dir->i_sb, dir->i_ino); |
| 440 | if (!bh) | 428 | if (!bh) |
| 441 | goto out; | 429 | return -EINVAL; |
| 442 | 430 | ||
| 443 | offset = OMFS_DIR_START + hchain * 8; | 431 | p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain; |
| 444 | 432 | ||
| 445 | for (; hchain < nbuckets; hchain++, offset += 8) { | 433 | for (; hchain < nbuckets; hchain++) { |
| 446 | fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); | 434 | __u64 fsblock = be64_to_cpu(*p++); |
| 447 | 435 | if (!omfs_fill_chain(dir, ctx, fsblock, hindex)) | |
| 448 | res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); | ||
| 449 | hindex = 0; | ||
| 450 | if (res < 0) | ||
| 451 | break; | 436 | break; |
| 452 | 437 | hindex = 0; | |
| 453 | filp->f_pos = (hchain+2) << 20; | 438 | ctx->pos = (hchain+2) << 20; |
| 454 | } | 439 | } |
| 455 | brelse(bh); | 440 | brelse(bh); |
| 456 | success: | 441 | return 0; |
| 457 | ret = 0; | ||
| 458 | out: | ||
| 459 | return ret; | ||
| 460 | } | 442 | } |
| 461 | 443 | ||
| 462 | const struct inode_operations omfs_dir_inops = { | 444 | const struct inode_operations omfs_dir_inops = { |
| @@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = { | |||
| 470 | 452 | ||
| 471 | const struct file_operations omfs_dir_operations = { | 453 | const struct file_operations omfs_dir_operations = { |
| 472 | .read = generic_read_dir, | 454 | .read = generic_read_dir, |
| 473 | .readdir = omfs_readdir, | 455 | .iterate = omfs_readdir, |
| 474 | .llseek = generic_file_llseek, | 456 | .llseek = generic_file_llseek, |
| 475 | }; | 457 | }; |
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 75885ffde44e..8c0ceb8dd1f7 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c | |||
| @@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = { | |||
| 162 | .release = seq_release, | 162 | .release = seq_release, |
| 163 | }; | 163 | }; |
| 164 | 164 | ||
| 165 | static int openpromfs_readdir(struct file *, void *, filldir_t); | 165 | static int openpromfs_readdir(struct file *, struct dir_context *); |
| 166 | 166 | ||
| 167 | static const struct file_operations openprom_operations = { | 167 | static const struct file_operations openprom_operations = { |
| 168 | .read = generic_read_dir, | 168 | .read = generic_read_dir, |
| 169 | .readdir = openpromfs_readdir, | 169 | .iterate = openpromfs_readdir, |
| 170 | .llseek = generic_file_llseek, | 170 | .llseek = generic_file_llseek, |
| 171 | }; | 171 | }; |
| 172 | 172 | ||
| @@ -260,71 +260,64 @@ found: | |||
| 260 | return NULL; | 260 | return NULL; |
| 261 | } | 261 | } |
| 262 | 262 | ||
| 263 | static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) | 263 | static int openpromfs_readdir(struct file *file, struct dir_context *ctx) |
| 264 | { | 264 | { |
| 265 | struct inode *inode = file_inode(filp); | 265 | struct inode *inode = file_inode(file); |
| 266 | struct op_inode_info *oi = OP_I(inode); | 266 | struct op_inode_info *oi = OP_I(inode); |
| 267 | struct device_node *dp = oi->u.node; | 267 | struct device_node *dp = oi->u.node; |
| 268 | struct device_node *child; | 268 | struct device_node *child; |
| 269 | struct property *prop; | 269 | struct property *prop; |
| 270 | unsigned int ino; | ||
| 271 | int i; | 270 | int i; |
| 272 | 271 | ||
| 273 | mutex_lock(&op_mutex); | 272 | mutex_lock(&op_mutex); |
| 274 | 273 | ||
| 275 | ino = inode->i_ino; | 274 | if (ctx->pos == 0) { |
| 276 | i = filp->f_pos; | 275 | if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) |
| 277 | switch (i) { | ||
| 278 | case 0: | ||
| 279 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | ||
| 280 | goto out; | 276 | goto out; |
| 281 | i++; | 277 | ctx->pos = 1; |
| 282 | filp->f_pos++; | 278 | } |
| 283 | /* fall thru */ | 279 | if (ctx->pos == 1) { |
| 284 | case 1: | 280 | if (!dir_emit(ctx, "..", 2, |
| 285 | if (filldir(dirent, "..", 2, i, | ||
| 286 | (dp->parent == NULL ? | 281 | (dp->parent == NULL ? |
| 287 | OPENPROM_ROOT_INO : | 282 | OPENPROM_ROOT_INO : |
| 288 | dp->parent->unique_id), DT_DIR) < 0) | 283 | dp->parent->unique_id), DT_DIR)) |
| 289 | goto out; | 284 | goto out; |
| 290 | i++; | 285 | ctx->pos = 2; |
| 291 | filp->f_pos++; | 286 | } |
| 292 | /* fall thru */ | 287 | i = ctx->pos - 2; |
| 293 | default: | ||
| 294 | i -= 2; | ||
| 295 | |||
| 296 | /* First, the children nodes as directories. */ | ||
| 297 | child = dp->child; | ||
| 298 | while (i && child) { | ||
| 299 | child = child->sibling; | ||
| 300 | i--; | ||
| 301 | } | ||
| 302 | while (child) { | ||
| 303 | if (filldir(dirent, | ||
| 304 | child->path_component_name, | ||
| 305 | strlen(child->path_component_name), | ||
| 306 | filp->f_pos, child->unique_id, DT_DIR) < 0) | ||
| 307 | goto out; | ||
| 308 | |||
| 309 | filp->f_pos++; | ||
| 310 | child = child->sibling; | ||
| 311 | } | ||
| 312 | 288 | ||
| 313 | /* Next, the properties as files. */ | 289 | /* First, the children nodes as directories. */ |
| 314 | prop = dp->properties; | 290 | child = dp->child; |
| 315 | while (i && prop) { | 291 | while (i && child) { |
| 316 | prop = prop->next; | 292 | child = child->sibling; |
| 317 | i--; | 293 | i--; |
| 318 | } | 294 | } |
| 319 | while (prop) { | 295 | while (child) { |
| 320 | if (filldir(dirent, prop->name, strlen(prop->name), | 296 | if (!dir_emit(ctx, |
| 321 | filp->f_pos, prop->unique_id, DT_REG) < 0) | 297 | child->path_component_name, |
| 322 | goto out; | 298 | strlen(child->path_component_name), |
| 299 | child->unique_id, DT_DIR)) | ||
| 300 | goto out; | ||
| 323 | 301 | ||
| 324 | filp->f_pos++; | 302 | ctx->pos++; |
| 325 | prop = prop->next; | 303 | child = child->sibling; |
| 326 | } | 304 | } |
| 305 | |||
| 306 | /* Next, the properties as files. */ | ||
| 307 | prop = dp->properties; | ||
| 308 | while (i && prop) { | ||
| 309 | prop = prop->next; | ||
| 310 | i--; | ||
| 327 | } | 311 | } |
| 312 | while (prop) { | ||
| 313 | if (!dir_emit(ctx, prop->name, strlen(prop->name), | ||
| 314 | prop->unique_id, DT_REG)) | ||
| 315 | goto out; | ||
| 316 | |||
| 317 | ctx->pos++; | ||
| 318 | prop = prop->next; | ||
| 319 | } | ||
| 320 | |||
| 328 | out: | 321 | out: |
| 329 | mutex_unlock(&op_mutex); | 322 | mutex_unlock(&op_mutex); |
| 330 | return 0; | 323 | return 0; |
diff --git a/fs/proc/base.c b/fs/proc/base.c index c3834dad09b3..0016350ad95e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -1681,11 +1681,11 @@ const struct dentry_operations pid_dentry_operations = | |||
| 1681 | * reported by readdir in sync with the inode numbers reported | 1681 | * reported by readdir in sync with the inode numbers reported |
| 1682 | * by stat. | 1682 | * by stat. |
| 1683 | */ | 1683 | */ |
| 1684 | int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, | 1684 | bool proc_fill_cache(struct file *file, struct dir_context *ctx, |
| 1685 | const char *name, int len, | 1685 | const char *name, int len, |
| 1686 | instantiate_t instantiate, struct task_struct *task, const void *ptr) | 1686 | instantiate_t instantiate, struct task_struct *task, const void *ptr) |
| 1687 | { | 1687 | { |
| 1688 | struct dentry *child, *dir = filp->f_path.dentry; | 1688 | struct dentry *child, *dir = file->f_path.dentry; |
| 1689 | struct inode *inode; | 1689 | struct inode *inode; |
| 1690 | struct qstr qname; | 1690 | struct qstr qname; |
| 1691 | ino_t ino = 0; | 1691 | ino_t ino = 0; |
| @@ -1720,7 +1720,7 @@ end_instantiate: | |||
| 1720 | ino = find_inode_number(dir, &qname); | 1720 | ino = find_inode_number(dir, &qname); |
| 1721 | if (!ino) | 1721 | if (!ino) |
| 1722 | ino = 1; | 1722 | ino = 1; |
| 1723 | return filldir(dirent, name, len, filp->f_pos, ino, type); | 1723 | return dir_emit(ctx, name, len, ino, type); |
| 1724 | } | 1724 | } |
| 1725 | 1725 | ||
| 1726 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1726 | #ifdef CONFIG_CHECKPOINT_RESTORE |
| @@ -1931,14 +1931,15 @@ static const struct inode_operations proc_map_files_inode_operations = { | |||
| 1931 | }; | 1931 | }; |
| 1932 | 1932 | ||
| 1933 | static int | 1933 | static int |
| 1934 | proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) | 1934 | proc_map_files_readdir(struct file *file, struct dir_context *ctx) |
| 1935 | { | 1935 | { |
| 1936 | struct dentry *dentry = filp->f_path.dentry; | ||
| 1937 | struct inode *inode = dentry->d_inode; | ||
| 1938 | struct vm_area_struct *vma; | 1936 | struct vm_area_struct *vma; |
| 1939 | struct task_struct *task; | 1937 | struct task_struct *task; |
| 1940 | struct mm_struct *mm; | 1938 | struct mm_struct *mm; |
| 1941 | ino_t ino; | 1939 | unsigned long nr_files, pos, i; |
| 1940 | struct flex_array *fa = NULL; | ||
| 1941 | struct map_files_info info; | ||
| 1942 | struct map_files_info *p; | ||
| 1942 | int ret; | 1943 | int ret; |
| 1943 | 1944 | ||
| 1944 | ret = -EPERM; | 1945 | ret = -EPERM; |
| @@ -1946,7 +1947,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1946 | goto out; | 1947 | goto out; |
| 1947 | 1948 | ||
| 1948 | ret = -ENOENT; | 1949 | ret = -ENOENT; |
| 1949 | task = get_proc_task(inode); | 1950 | task = get_proc_task(file_inode(file)); |
| 1950 | if (!task) | 1951 | if (!task) |
| 1951 | goto out; | 1952 | goto out; |
| 1952 | 1953 | ||
| @@ -1955,91 +1956,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 1955 | goto out_put_task; | 1956 | goto out_put_task; |
| 1956 | 1957 | ||
| 1957 | ret = 0; | 1958 | ret = 0; |
| 1958 | switch (filp->f_pos) { | 1959 | if (!dir_emit_dots(file, ctx)) |
| 1959 | case 0: | 1960 | goto out_put_task; |
| 1960 | ino = inode->i_ino; | ||
| 1961 | if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) | ||
| 1962 | goto out_put_task; | ||
| 1963 | filp->f_pos++; | ||
| 1964 | case 1: | ||
| 1965 | ino = parent_ino(dentry); | ||
| 1966 | if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) | ||
| 1967 | goto out_put_task; | ||
| 1968 | filp->f_pos++; | ||
| 1969 | default: | ||
| 1970 | { | ||
| 1971 | unsigned long nr_files, pos, i; | ||
| 1972 | struct flex_array *fa = NULL; | ||
| 1973 | struct map_files_info info; | ||
| 1974 | struct map_files_info *p; | ||
| 1975 | |||
| 1976 | mm = get_task_mm(task); | ||
| 1977 | if (!mm) | ||
| 1978 | goto out_put_task; | ||
| 1979 | down_read(&mm->mmap_sem); | ||
| 1980 | 1961 | ||
| 1981 | nr_files = 0; | 1962 | mm = get_task_mm(task); |
| 1963 | if (!mm) | ||
| 1964 | goto out_put_task; | ||
| 1965 | down_read(&mm->mmap_sem); | ||
| 1982 | 1966 | ||
| 1983 | /* | 1967 | nr_files = 0; |
| 1984 | * We need two passes here: | ||
| 1985 | * | ||
| 1986 | * 1) Collect vmas of mapped files with mmap_sem taken | ||
| 1987 | * 2) Release mmap_sem and instantiate entries | ||
| 1988 | * | ||
| 1989 | * otherwise we get lockdep complained, since filldir() | ||
| 1990 | * routine might require mmap_sem taken in might_fault(). | ||
| 1991 | */ | ||
| 1992 | 1968 | ||
| 1993 | for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { | 1969 | /* |
| 1994 | if (vma->vm_file && ++pos > filp->f_pos) | 1970 | * We need two passes here: |
| 1995 | nr_files++; | 1971 | * |
| 1996 | } | 1972 | * 1) Collect vmas of mapped files with mmap_sem taken |
| 1973 | * 2) Release mmap_sem and instantiate entries | ||
| 1974 | * | ||
| 1975 | * otherwise we get lockdep complained, since filldir() | ||
| 1976 | * routine might require mmap_sem taken in might_fault(). | ||
| 1977 | */ | ||
| 1997 | 1978 | ||
| 1998 | if (nr_files) { | 1979 | for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { |
| 1999 | fa = flex_array_alloc(sizeof(info), nr_files, | 1980 | if (vma->vm_file && ++pos > ctx->pos) |
| 2000 | GFP_KERNEL); | 1981 | nr_files++; |
| 2001 | if (!fa || flex_array_prealloc(fa, 0, nr_files, | 1982 | } |
| 2002 | GFP_KERNEL)) { | 1983 | |
| 2003 | ret = -ENOMEM; | 1984 | if (nr_files) { |
| 2004 | if (fa) | 1985 | fa = flex_array_alloc(sizeof(info), nr_files, |
| 2005 | flex_array_free(fa); | 1986 | GFP_KERNEL); |
| 2006 | up_read(&mm->mmap_sem); | 1987 | if (!fa || flex_array_prealloc(fa, 0, nr_files, |
| 2007 | mmput(mm); | 1988 | GFP_KERNEL)) { |
| 2008 | goto out_put_task; | 1989 | ret = -ENOMEM; |
| 2009 | } | 1990 | if (fa) |
| 2010 | for (i = 0, vma = mm->mmap, pos = 2; vma; | 1991 | flex_array_free(fa); |
| 2011 | vma = vma->vm_next) { | 1992 | up_read(&mm->mmap_sem); |
| 2012 | if (!vma->vm_file) | 1993 | mmput(mm); |
| 2013 | continue; | 1994 | goto out_put_task; |
| 2014 | if (++pos <= filp->f_pos) | ||
| 2015 | continue; | ||
| 2016 | |||
| 2017 | info.mode = vma->vm_file->f_mode; | ||
| 2018 | info.len = snprintf(info.name, | ||
| 2019 | sizeof(info.name), "%lx-%lx", | ||
| 2020 | vma->vm_start, vma->vm_end); | ||
| 2021 | if (flex_array_put(fa, i++, &info, GFP_KERNEL)) | ||
| 2022 | BUG(); | ||
| 2023 | } | ||
| 2024 | } | 1995 | } |
| 2025 | up_read(&mm->mmap_sem); | 1996 | for (i = 0, vma = mm->mmap, pos = 2; vma; |
| 2026 | 1997 | vma = vma->vm_next) { | |
| 2027 | for (i = 0; i < nr_files; i++) { | 1998 | if (!vma->vm_file) |
| 2028 | p = flex_array_get(fa, i); | 1999 | continue; |
| 2029 | ret = proc_fill_cache(filp, dirent, filldir, | 2000 | if (++pos <= ctx->pos) |
| 2030 | p->name, p->len, | 2001 | continue; |
| 2031 | proc_map_files_instantiate, | 2002 | |
| 2032 | task, | 2003 | info.mode = vma->vm_file->f_mode; |
| 2033 | (void *)(unsigned long)p->mode); | 2004 | info.len = snprintf(info.name, |
| 2034 | if (ret) | 2005 | sizeof(info.name), "%lx-%lx", |
| 2035 | break; | 2006 | vma->vm_start, vma->vm_end); |
| 2036 | filp->f_pos++; | 2007 | if (flex_array_put(fa, i++, &info, GFP_KERNEL)) |
| 2008 | BUG(); | ||
| 2037 | } | 2009 | } |
| 2038 | if (fa) | ||
| 2039 | flex_array_free(fa); | ||
| 2040 | mmput(mm); | ||
| 2041 | } | 2010 | } |
| 2011 | up_read(&mm->mmap_sem); | ||
| 2012 | |||
| 2013 | for (i = 0; i < nr_files; i++) { | ||
| 2014 | p = flex_array_get(fa, i); | ||
| 2015 | if (!proc_fill_cache(file, ctx, | ||
| 2016 | p->name, p->len, | ||
| 2017 | proc_map_files_instantiate, | ||
| 2018 | task, | ||
| 2019 | (void *)(unsigned long)p->mode)) | ||
| 2020 | break; | ||
| 2021 | ctx->pos++; | ||
| 2042 | } | 2022 | } |
| 2023 | if (fa) | ||
| 2024 | flex_array_free(fa); | ||
| 2025 | mmput(mm); | ||
| 2043 | 2026 | ||
| 2044 | out_put_task: | 2027 | out_put_task: |
| 2045 | put_task_struct(task); | 2028 | put_task_struct(task); |
| @@ -2049,7 +2032,7 @@ out: | |||
| 2049 | 2032 | ||
| 2050 | static const struct file_operations proc_map_files_operations = { | 2033 | static const struct file_operations proc_map_files_operations = { |
| 2051 | .read = generic_read_dir, | 2034 | .read = generic_read_dir, |
| 2052 | .readdir = proc_map_files_readdir, | 2035 | .iterate = proc_map_files_readdir, |
| 2053 | .llseek = default_llseek, | 2036 | .llseek = default_llseek, |
| 2054 | }; | 2037 | }; |
| 2055 | 2038 | ||
| @@ -2217,67 +2200,30 @@ out_no_task: | |||
| 2217 | return error; | 2200 | return error; |
| 2218 | } | 2201 | } |
| 2219 | 2202 | ||
| 2220 | static int proc_pident_fill_cache(struct file *filp, void *dirent, | 2203 | static int proc_pident_readdir(struct file *file, struct dir_context *ctx, |
| 2221 | filldir_t filldir, struct task_struct *task, const struct pid_entry *p) | ||
| 2222 | { | ||
| 2223 | return proc_fill_cache(filp, dirent, filldir, p->name, p->len, | ||
| 2224 | proc_pident_instantiate, task, p); | ||
| 2225 | } | ||
| 2226 | |||
| 2227 | static int proc_pident_readdir(struct file *filp, | ||
| 2228 | void *dirent, filldir_t filldir, | ||
| 2229 | const struct pid_entry *ents, unsigned int nents) | 2204 | const struct pid_entry *ents, unsigned int nents) |
| 2230 | { | 2205 | { |
| 2231 | int i; | 2206 | struct task_struct *task = get_proc_task(file_inode(file)); |
| 2232 | struct dentry *dentry = filp->f_path.dentry; | 2207 | const struct pid_entry *p; |
| 2233 | struct inode *inode = dentry->d_inode; | ||
| 2234 | struct task_struct *task = get_proc_task(inode); | ||
| 2235 | const struct pid_entry *p, *last; | ||
| 2236 | ino_t ino; | ||
| 2237 | int ret; | ||
| 2238 | 2208 | ||
| 2239 | ret = -ENOENT; | ||
| 2240 | if (!task) | 2209 | if (!task) |
| 2241 | goto out_no_task; | 2210 | return -ENOENT; |
| 2242 | 2211 | ||
| 2243 | ret = 0; | 2212 | if (!dir_emit_dots(file, ctx)) |
| 2244 | i = filp->f_pos; | 2213 | goto out; |
| 2245 | switch (i) { | 2214 | |
| 2246 | case 0: | 2215 | if (ctx->pos >= nents + 2) |
| 2247 | ino = inode->i_ino; | 2216 | goto out; |
| 2248 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | ||
| 2249 | goto out; | ||
| 2250 | i++; | ||
| 2251 | filp->f_pos++; | ||
| 2252 | /* fall through */ | ||
| 2253 | case 1: | ||
| 2254 | ino = parent_ino(dentry); | ||
| 2255 | if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) | ||
| 2256 | goto out; | ||
| 2257 | i++; | ||
| 2258 | filp->f_pos++; | ||
| 2259 | /* fall through */ | ||
| 2260 | default: | ||
| 2261 | i -= 2; | ||
| 2262 | if (i >= nents) { | ||
| 2263 | ret = 1; | ||
| 2264 | goto out; | ||
| 2265 | } | ||
| 2266 | p = ents + i; | ||
| 2267 | last = &ents[nents - 1]; | ||
| 2268 | while (p <= last) { | ||
| 2269 | if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) | ||
| 2270 | goto out; | ||
| 2271 | filp->f_pos++; | ||
| 2272 | p++; | ||
| 2273 | } | ||
| 2274 | } | ||
| 2275 | 2217 | ||
| 2276 | ret = 1; | 2218 | for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { |
| 2219 | if (!proc_fill_cache(file, ctx, p->name, p->len, | ||
| 2220 | proc_pident_instantiate, task, p)) | ||
| 2221 | break; | ||
| 2222 | ctx->pos++; | ||
| 2223 | } | ||
| 2277 | out: | 2224 | out: |
| 2278 | put_task_struct(task); | 2225 | put_task_struct(task); |
| 2279 | out_no_task: | 2226 | return 0; |
| 2280 | return ret; | ||
| 2281 | } | 2227 | } |
| 2282 | 2228 | ||
| 2283 | #ifdef CONFIG_SECURITY | 2229 | #ifdef CONFIG_SECURITY |
| @@ -2362,16 +2308,15 @@ static const struct pid_entry attr_dir_stuff[] = { | |||
| 2362 | REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), | 2308 | REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
| 2363 | }; | 2309 | }; |
| 2364 | 2310 | ||
| 2365 | static int proc_attr_dir_readdir(struct file * filp, | 2311 | static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) |
| 2366 | void * dirent, filldir_t filldir) | ||
| 2367 | { | 2312 | { |
| 2368 | return proc_pident_readdir(filp,dirent,filldir, | 2313 | return proc_pident_readdir(file, ctx, |
| 2369 | attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); | 2314 | attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); |
| 2370 | } | 2315 | } |
| 2371 | 2316 | ||
| 2372 | static const struct file_operations proc_attr_dir_operations = { | 2317 | static const struct file_operations proc_attr_dir_operations = { |
| 2373 | .read = generic_read_dir, | 2318 | .read = generic_read_dir, |
| 2374 | .readdir = proc_attr_dir_readdir, | 2319 | .iterate = proc_attr_dir_readdir, |
| 2375 | .llseek = default_llseek, | 2320 | .llseek = default_llseek, |
| 2376 | }; | 2321 | }; |
| 2377 | 2322 | ||
| @@ -2725,16 +2670,15 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
| 2725 | #endif | 2670 | #endif |
| 2726 | }; | 2671 | }; |
| 2727 | 2672 | ||
| 2728 | static int proc_tgid_base_readdir(struct file * filp, | 2673 | static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) |
| 2729 | void * dirent, filldir_t filldir) | ||
| 2730 | { | 2674 | { |
| 2731 | return proc_pident_readdir(filp,dirent,filldir, | 2675 | return proc_pident_readdir(file, ctx, |
| 2732 | tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); | 2676 | tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); |
| 2733 | } | 2677 | } |
| 2734 | 2678 | ||
| 2735 | static const struct file_operations proc_tgid_base_operations = { | 2679 | static const struct file_operations proc_tgid_base_operations = { |
| 2736 | .read = generic_read_dir, | 2680 | .read = generic_read_dir, |
| 2737 | .readdir = proc_tgid_base_readdir, | 2681 | .iterate = proc_tgid_base_readdir, |
| 2738 | .llseek = default_llseek, | 2682 | .llseek = default_llseek, |
| 2739 | }; | 2683 | }; |
| 2740 | 2684 | ||
| @@ -2936,58 +2880,42 @@ retry: | |||
| 2936 | 2880 | ||
| 2937 | #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) | 2881 | #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) |
| 2938 | 2882 | ||
| 2939 | static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, | ||
| 2940 | struct tgid_iter iter) | ||
| 2941 | { | ||
| 2942 | char name[PROC_NUMBUF]; | ||
| 2943 | int len = snprintf(name, sizeof(name), "%d", iter.tgid); | ||
| 2944 | return proc_fill_cache(filp, dirent, filldir, name, len, | ||
| 2945 | proc_pid_instantiate, iter.task, NULL); | ||
| 2946 | } | ||
| 2947 | |||
| 2948 | static int fake_filldir(void *buf, const char *name, int namelen, | ||
| 2949 | loff_t offset, u64 ino, unsigned d_type) | ||
| 2950 | { | ||
| 2951 | return 0; | ||
| 2952 | } | ||
| 2953 | |||
| 2954 | /* for the /proc/ directory itself, after non-process stuff has been done */ | 2883 | /* for the /proc/ directory itself, after non-process stuff has been done */ |
| 2955 | int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) | 2884 | int proc_pid_readdir(struct file *file, struct dir_context *ctx) |
| 2956 | { | 2885 | { |
| 2957 | struct tgid_iter iter; | 2886 | struct tgid_iter iter; |
| 2958 | struct pid_namespace *ns; | 2887 | struct pid_namespace *ns; |
| 2959 | filldir_t __filldir; | 2888 | loff_t pos = ctx->pos; |
| 2960 | loff_t pos = filp->f_pos; | ||
| 2961 | 2889 | ||
| 2962 | if (pos >= PID_MAX_LIMIT + TGID_OFFSET) | 2890 | if (pos >= PID_MAX_LIMIT + TGID_OFFSET) |
| 2963 | goto out; | 2891 | return 0; |
| 2964 | 2892 | ||
| 2965 | if (pos == TGID_OFFSET - 1) { | 2893 | if (pos == TGID_OFFSET - 1) { |
| 2966 | if (proc_fill_cache(filp, dirent, filldir, "self", 4, | 2894 | if (!proc_fill_cache(file, ctx, "self", 4, NULL, NULL, NULL)) |
| 2967 | NULL, NULL, NULL) < 0) | 2895 | return 0; |
| 2968 | goto out; | ||
| 2969 | iter.tgid = 0; | 2896 | iter.tgid = 0; |
| 2970 | } else { | 2897 | } else { |
| 2971 | iter.tgid = pos - TGID_OFFSET; | 2898 | iter.tgid = pos - TGID_OFFSET; |
| 2972 | } | 2899 | } |
| 2973 | iter.task = NULL; | 2900 | iter.task = NULL; |
| 2974 | ns = filp->f_dentry->d_sb->s_fs_info; | 2901 | ns = file->f_dentry->d_sb->s_fs_info; |
| 2975 | for (iter = next_tgid(ns, iter); | 2902 | for (iter = next_tgid(ns, iter); |
| 2976 | iter.task; | 2903 | iter.task; |
| 2977 | iter.tgid += 1, iter = next_tgid(ns, iter)) { | 2904 | iter.tgid += 1, iter = next_tgid(ns, iter)) { |
| 2978 | if (has_pid_permissions(ns, iter.task, 2)) | 2905 | char name[PROC_NUMBUF]; |
| 2979 | __filldir = filldir; | 2906 | int len; |
| 2980 | else | 2907 | if (!has_pid_permissions(ns, iter.task, 2)) |
| 2981 | __filldir = fake_filldir; | 2908 | continue; |
| 2982 | 2909 | ||
| 2983 | filp->f_pos = iter.tgid + TGID_OFFSET; | 2910 | len = snprintf(name, sizeof(name), "%d", iter.tgid); |
| 2984 | if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { | 2911 | ctx->pos = iter.tgid + TGID_OFFSET; |
| 2912 | if (!proc_fill_cache(file, ctx, name, len, | ||
| 2913 | proc_pid_instantiate, iter.task, NULL)) { | ||
| 2985 | put_task_struct(iter.task); | 2914 | put_task_struct(iter.task); |
| 2986 | goto out; | 2915 | return 0; |
| 2987 | } | 2916 | } |
| 2988 | } | 2917 | } |
| 2989 | filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; | 2918 | ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; |
| 2990 | out: | ||
| 2991 | return 0; | 2919 | return 0; |
| 2992 | } | 2920 | } |
| 2993 | 2921 | ||
| @@ -3075,11 +3003,10 @@ static const struct pid_entry tid_base_stuff[] = { | |||
| 3075 | #endif | 3003 | #endif |
| 3076 | }; | 3004 | }; |
| 3077 | 3005 | ||
| 3078 | static int proc_tid_base_readdir(struct file * filp, | 3006 | static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) |
| 3079 | void * dirent, filldir_t filldir) | ||
| 3080 | { | 3007 | { |
| 3081 | return proc_pident_readdir(filp,dirent,filldir, | 3008 | return proc_pident_readdir(file, ctx, |
| 3082 | tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); | 3009 | tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); |
| 3083 | } | 3010 | } |
| 3084 | 3011 | ||
| 3085 | static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) | 3012 | static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) |
| @@ -3090,7 +3017,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den | |||
| 3090 | 3017 | ||
| 3091 | static const struct file_operations proc_tid_base_operations = { | 3018 | static const struct file_operations proc_tid_base_operations = { |
| 3092 | .read = generic_read_dir, | 3019 | .read = generic_read_dir, |
| 3093 | .readdir = proc_tid_base_readdir, | 3020 | .iterate = proc_tid_base_readdir, |
| 3094 | .llseek = default_llseek, | 3021 | .llseek = default_llseek, |
| 3095 | }; | 3022 | }; |
| 3096 | 3023 | ||
| @@ -3231,30 +3158,16 @@ static struct task_struct *next_tid(struct task_struct *start) | |||
| 3231 | return pos; | 3158 | return pos; |
| 3232 | } | 3159 | } |
| 3233 | 3160 | ||
| 3234 | static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, | ||
| 3235 | struct task_struct *task, int tid) | ||
| 3236 | { | ||
| 3237 | char name[PROC_NUMBUF]; | ||
| 3238 | int len = snprintf(name, sizeof(name), "%d", tid); | ||
| 3239 | return proc_fill_cache(filp, dirent, filldir, name, len, | ||
| 3240 | proc_task_instantiate, task, NULL); | ||
| 3241 | } | ||
| 3242 | |||
| 3243 | /* for the /proc/TGID/task/ directories */ | 3161 | /* for the /proc/TGID/task/ directories */ |
| 3244 | static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) | 3162 | static int proc_task_readdir(struct file *file, struct dir_context *ctx) |
| 3245 | { | 3163 | { |
| 3246 | struct dentry *dentry = filp->f_path.dentry; | ||
| 3247 | struct inode *inode = dentry->d_inode; | ||
| 3248 | struct task_struct *leader = NULL; | 3164 | struct task_struct *leader = NULL; |
| 3249 | struct task_struct *task; | 3165 | struct task_struct *task = get_proc_task(file_inode(file)); |
| 3250 | int retval = -ENOENT; | ||
| 3251 | ino_t ino; | ||
| 3252 | int tid; | ||
| 3253 | struct pid_namespace *ns; | 3166 | struct pid_namespace *ns; |
| 3167 | int tid; | ||
| 3254 | 3168 | ||
| 3255 | task = get_proc_task(inode); | ||
| 3256 | if (!task) | 3169 | if (!task) |
| 3257 | goto out_no_task; | 3170 | return -ENOENT; |
| 3258 | rcu_read_lock(); | 3171 | rcu_read_lock(); |
| 3259 | if (pid_alive(task)) { | 3172 | if (pid_alive(task)) { |
| 3260 | leader = task->group_leader; | 3173 | leader = task->group_leader; |
| @@ -3263,46 +3176,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi | |||
| 3263 | rcu_read_unlock(); | 3176 | rcu_read_unlock(); |
| 3264 | put_task_struct(task); | 3177 | put_task_struct(task); |
| 3265 | if (!leader) | 3178 | if (!leader) |
| 3266 | goto out_no_task; | 3179 | return -ENOENT; |
| 3267 | retval = 0; | ||
| 3268 | 3180 | ||
| 3269 | switch ((unsigned long)filp->f_pos) { | 3181 | if (!dir_emit_dots(file, ctx)) |
| 3270 | case 0: | 3182 | goto out; |
| 3271 | ino = inode->i_ino; | ||
| 3272 | if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) | ||
| 3273 | goto out; | ||
| 3274 | filp->f_pos++; | ||
| 3275 | /* fall through */ | ||
| 3276 | case 1: | ||
| 3277 | ino = parent_ino(dentry); | ||
| 3278 | if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) | ||
| 3279 | goto out; | ||
| 3280 | filp->f_pos++; | ||
| 3281 | /* fall through */ | ||
| 3282 | } | ||
| 3283 | 3183 | ||
| 3284 | /* f_version caches the tgid value that the last readdir call couldn't | 3184 | /* f_version caches the tgid value that the last readdir call couldn't |
| 3285 | * return. lseek aka telldir automagically resets f_version to 0. | 3185 | * return. lseek aka telldir automagically resets f_version to 0. |
| 3286 | */ | 3186 | */ |
| 3287 | ns = filp->f_dentry->d_sb->s_fs_info; | 3187 | ns = file->f_dentry->d_sb->s_fs_info; |
| 3288 | tid = (int)filp->f_version; | 3188 | tid = (int)file->f_version; |
| 3289 | filp->f_version = 0; | 3189 | file->f_version = 0; |
| 3290 | for (task = first_tid(leader, tid, filp->f_pos - 2, ns); | 3190 | for (task = first_tid(leader, tid, ctx->pos - 2, ns); |
| 3291 | task; | 3191 | task; |
| 3292 | task = next_tid(task), filp->f_pos++) { | 3192 | task = next_tid(task), ctx->pos++) { |
| 3193 | char name[PROC_NUMBUF]; | ||
| 3194 | int len; | ||
| 3293 | tid = task_pid_nr_ns(task, ns); | 3195 | tid = task_pid_nr_ns(task, ns); |
| 3294 | if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { | 3196 | len = snprintf(name, sizeof(name), "%d", tid); |
| 3197 | if (!proc_fill_cache(file, ctx, name, len, | ||
| 3198 | proc_task_instantiate, task, NULL)) { | ||
| 3295 | /* returning this tgid failed, save it as the first | 3199 | /* returning this tgid failed, save it as the first |
| 3296 | * pid for the next readir call */ | 3200 | * pid for the next readir call */ |
| 3297 | filp->f_version = (u64)tid; | 3201 | file->f_version = (u64)tid; |
| 3298 | put_task_struct(task); | 3202 | put_task_struct(task); |
| 3299 | break; | 3203 | break; |
| 3300 | } | 3204 | } |
| 3301 | } | 3205 | } |
| 3302 | out: | 3206 | out: |
| 3303 | put_task_struct(leader); | 3207 | put_task_struct(leader); |
| 3304 | out_no_task: | 3208 | return 0; |
| 3305 | return retval; | ||
| 3306 | } | 3209 | } |
| 3307 | 3210 | ||
| 3308 | static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | 3211 | static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) |
| @@ -3328,6 +3231,6 @@ static const struct inode_operations proc_task_inode_operations = { | |||
| 3328 | 3231 | ||
| 3329 | static const struct file_operations proc_task_operations = { | 3232 | static const struct file_operations proc_task_operations = { |
| 3330 | .read = generic_read_dir, | 3233 | .read = generic_read_dir, |
| 3331 | .readdir = proc_task_readdir, | 3234 | .iterate = proc_task_readdir, |
| 3332 | .llseek = default_llseek, | 3235 | .llseek = default_llseek, |
| 3333 | }; | 3236 | }; |
diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d7a4a28ef630..1441f143c43b 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c | |||
| @@ -219,74 +219,58 @@ out_no_task: | |||
| 219 | return result; | 219 | return result; |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | static int proc_readfd_common(struct file * filp, void * dirent, | 222 | static int proc_readfd_common(struct file *file, struct dir_context *ctx, |
| 223 | filldir_t filldir, instantiate_t instantiate) | 223 | instantiate_t instantiate) |
| 224 | { | 224 | { |
| 225 | struct dentry *dentry = filp->f_path.dentry; | 225 | struct task_struct *p = get_proc_task(file_inode(file)); |
| 226 | struct inode *inode = dentry->d_inode; | ||
| 227 | struct task_struct *p = get_proc_task(inode); | ||
| 228 | struct files_struct *files; | 226 | struct files_struct *files; |
| 229 | unsigned int fd, ino; | 227 | unsigned int fd; |
| 230 | int retval; | ||
| 231 | 228 | ||
| 232 | retval = -ENOENT; | ||
| 233 | if (!p) | 229 | if (!p) |
| 234 | goto out_no_task; | 230 | return -ENOENT; |
| 235 | retval = 0; | ||
| 236 | |||
| 237 | fd = filp->f_pos; | ||
| 238 | switch (fd) { | ||
| 239 | case 0: | ||
| 240 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) | ||
| 241 | goto out; | ||
| 242 | filp->f_pos++; | ||
| 243 | case 1: | ||
| 244 | ino = parent_ino(dentry); | ||
| 245 | if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) | ||
| 246 | goto out; | ||
| 247 | filp->f_pos++; | ||
| 248 | default: | ||
| 249 | files = get_files_struct(p); | ||
| 250 | if (!files) | ||
| 251 | goto out; | ||
| 252 | rcu_read_lock(); | ||
| 253 | for (fd = filp->f_pos - 2; | ||
| 254 | fd < files_fdtable(files)->max_fds; | ||
| 255 | fd++, filp->f_pos++) { | ||
| 256 | char name[PROC_NUMBUF]; | ||
| 257 | int len; | ||
| 258 | int rv; | ||
| 259 | |||
| 260 | if (!fcheck_files(files, fd)) | ||
| 261 | continue; | ||
| 262 | rcu_read_unlock(); | ||
| 263 | 231 | ||
| 264 | len = snprintf(name, sizeof(name), "%d", fd); | 232 | if (!dir_emit_dots(file, ctx)) |
| 265 | rv = proc_fill_cache(filp, dirent, filldir, | 233 | goto out; |
| 266 | name, len, instantiate, p, | 234 | if (!dir_emit_dots(file, ctx)) |
| 267 | (void *)(unsigned long)fd); | 235 | goto out; |
| 268 | if (rv < 0) | 236 | files = get_files_struct(p); |
| 269 | goto out_fd_loop; | 237 | if (!files) |
| 270 | rcu_read_lock(); | 238 | goto out; |
| 271 | } | 239 | |
| 272 | rcu_read_unlock(); | 240 | rcu_read_lock(); |
| 273 | out_fd_loop: | 241 | for (fd = ctx->pos - 2; |
| 274 | put_files_struct(files); | 242 | fd < files_fdtable(files)->max_fds; |
| 243 | fd++, ctx->pos++) { | ||
| 244 | char name[PROC_NUMBUF]; | ||
| 245 | int len; | ||
| 246 | |||
| 247 | if (!fcheck_files(files, fd)) | ||
| 248 | continue; | ||
| 249 | rcu_read_unlock(); | ||
| 250 | |||
| 251 | len = snprintf(name, sizeof(name), "%d", fd); | ||
| 252 | if (!proc_fill_cache(file, ctx, | ||
| 253 | name, len, instantiate, p, | ||
| 254 | (void *)(unsigned long)fd)) | ||
| 255 | goto out_fd_loop; | ||
| 256 | rcu_read_lock(); | ||
| 275 | } | 257 | } |
| 258 | rcu_read_unlock(); | ||
| 259 | out_fd_loop: | ||
| 260 | put_files_struct(files); | ||
| 276 | out: | 261 | out: |
| 277 | put_task_struct(p); | 262 | put_task_struct(p); |
| 278 | out_no_task: | 263 | return 0; |
| 279 | return retval; | ||
| 280 | } | 264 | } |
| 281 | 265 | ||
| 282 | static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) | 266 | static int proc_readfd(struct file *file, struct dir_context *ctx) |
| 283 | { | 267 | { |
| 284 | return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); | 268 | return proc_readfd_common(file, ctx, proc_fd_instantiate); |
| 285 | } | 269 | } |
| 286 | 270 | ||
| 287 | const struct file_operations proc_fd_operations = { | 271 | const struct file_operations proc_fd_operations = { |
| 288 | .read = generic_read_dir, | 272 | .read = generic_read_dir, |
| 289 | .readdir = proc_readfd, | 273 | .iterate = proc_readfd, |
| 290 | .llseek = default_llseek, | 274 | .llseek = default_llseek, |
| 291 | }; | 275 | }; |
| 292 | 276 | ||
| @@ -351,9 +335,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) | |||
| 351 | return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); | 335 | return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); |
| 352 | } | 336 | } |
| 353 | 337 | ||
| 354 | static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) | 338 | static int proc_readfdinfo(struct file *file, struct dir_context *ctx) |
| 355 | { | 339 | { |
| 356 | return proc_readfd_common(filp, dirent, filldir, | 340 | return proc_readfd_common(file, ctx, |
| 357 | proc_fdinfo_instantiate); | 341 | proc_fdinfo_instantiate); |
| 358 | } | 342 | } |
| 359 | 343 | ||
| @@ -364,6 +348,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { | |||
| 364 | 348 | ||
| 365 | const struct file_operations proc_fdinfo_operations = { | 349 | const struct file_operations proc_fdinfo_operations = { |
| 366 | .read = generic_read_dir, | 350 | .read = generic_read_dir, |
| 367 | .readdir = proc_readfdinfo, | 351 | .iterate = proc_readfdinfo, |
| 368 | .llseek = default_llseek, | 352 | .llseek = default_llseek, |
| 369 | }; | 353 | }; |
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index a2596afffae6..94441a407337 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c | |||
| @@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, | |||
| 233 | * value of the readdir() call, as long as it's non-negative | 233 | * value of the readdir() call, as long as it's non-negative |
| 234 | * for success.. | 234 | * for success.. |
| 235 | */ | 235 | */ |
| 236 | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | 236 | int proc_readdir_de(struct proc_dir_entry *de, struct file *file, |
| 237 | filldir_t filldir) | 237 | struct dir_context *ctx) |
| 238 | { | 238 | { |
| 239 | unsigned int ino; | ||
| 240 | int i; | 239 | int i; |
| 241 | struct inode *inode = file_inode(filp); | ||
| 242 | int ret = 0; | ||
| 243 | |||
| 244 | ino = inode->i_ino; | ||
| 245 | i = filp->f_pos; | ||
| 246 | switch (i) { | ||
| 247 | case 0: | ||
| 248 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | ||
| 249 | goto out; | ||
| 250 | i++; | ||
| 251 | filp->f_pos++; | ||
| 252 | /* fall through */ | ||
| 253 | case 1: | ||
| 254 | if (filldir(dirent, "..", 2, i, | ||
| 255 | parent_ino(filp->f_path.dentry), | ||
| 256 | DT_DIR) < 0) | ||
| 257 | goto out; | ||
| 258 | i++; | ||
| 259 | filp->f_pos++; | ||
| 260 | /* fall through */ | ||
| 261 | default: | ||
| 262 | spin_lock(&proc_subdir_lock); | ||
| 263 | de = de->subdir; | ||
| 264 | i -= 2; | ||
| 265 | for (;;) { | ||
| 266 | if (!de) { | ||
| 267 | ret = 1; | ||
| 268 | spin_unlock(&proc_subdir_lock); | ||
| 269 | goto out; | ||
| 270 | } | ||
| 271 | if (!i) | ||
| 272 | break; | ||
| 273 | de = de->next; | ||
| 274 | i--; | ||
| 275 | } | ||
| 276 | 240 | ||
| 277 | do { | 241 | if (!dir_emit_dots(file, ctx)) |
| 278 | struct proc_dir_entry *next; | 242 | return 0; |
| 279 | 243 | ||
| 280 | /* filldir passes info to user space */ | 244 | spin_lock(&proc_subdir_lock); |
| 281 | pde_get(de); | 245 | de = de->subdir; |
| 282 | spin_unlock(&proc_subdir_lock); | 246 | i = ctx->pos - 2; |
| 283 | if (filldir(dirent, de->name, de->namelen, filp->f_pos, | 247 | for (;;) { |
| 284 | de->low_ino, de->mode >> 12) < 0) { | 248 | if (!de) { |
| 285 | pde_put(de); | ||
| 286 | goto out; | ||
| 287 | } | ||
| 288 | spin_lock(&proc_subdir_lock); | ||
| 289 | filp->f_pos++; | ||
| 290 | next = de->next; | ||
| 291 | pde_put(de); | ||
| 292 | de = next; | ||
| 293 | } while (de); | ||
| 294 | spin_unlock(&proc_subdir_lock); | 249 | spin_unlock(&proc_subdir_lock); |
| 250 | return 0; | ||
| 251 | } | ||
| 252 | if (!i) | ||
| 253 | break; | ||
| 254 | de = de->next; | ||
| 255 | i--; | ||
| 295 | } | 256 | } |
| 296 | ret = 1; | 257 | |
| 297 | out: | 258 | do { |
| 298 | return ret; | 259 | struct proc_dir_entry *next; |
| 260 | pde_get(de); | ||
| 261 | spin_unlock(&proc_subdir_lock); | ||
| 262 | if (!dir_emit(ctx, de->name, de->namelen, | ||
| 263 | de->low_ino, de->mode >> 12)) { | ||
| 264 | pde_put(de); | ||
| 265 | return 0; | ||
| 266 | } | ||
| 267 | spin_lock(&proc_subdir_lock); | ||
| 268 | ctx->pos++; | ||
| 269 | next = de->next; | ||
| 270 | pde_put(de); | ||
| 271 | de = next; | ||
| 272 | } while (de); | ||
| 273 | spin_unlock(&proc_subdir_lock); | ||
| 274 | return 0; | ||
| 299 | } | 275 | } |
| 300 | 276 | ||
| 301 | int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) | 277 | int proc_readdir(struct file *file, struct dir_context *ctx) |
| 302 | { | 278 | { |
| 303 | struct inode *inode = file_inode(filp); | 279 | struct inode *inode = file_inode(file); |
| 304 | 280 | ||
| 305 | return proc_readdir_de(PDE(inode), filp, dirent, filldir); | 281 | return proc_readdir_de(PDE(inode), file, ctx); |
| 306 | } | 282 | } |
| 307 | 283 | ||
| 308 | /* | 284 | /* |
| @@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 313 | static const struct file_operations proc_dir_operations = { | 289 | static const struct file_operations proc_dir_operations = { |
| 314 | .llseek = generic_file_llseek, | 290 | .llseek = generic_file_llseek, |
| 315 | .read = generic_read_dir, | 291 | .read = generic_read_dir, |
| 316 | .readdir = proc_readdir, | 292 | .iterate = proc_readdir, |
| 317 | }; | 293 | }; |
| 318 | 294 | ||
| 319 | /* | 295 | /* |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d600fb098b6a..4eae2e149f31 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
| @@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *); | |||
| 165 | extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); | 165 | extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); |
| 166 | extern int pid_revalidate(struct dentry *, unsigned int); | 166 | extern int pid_revalidate(struct dentry *, unsigned int); |
| 167 | extern int pid_delete_dentry(const struct dentry *); | 167 | extern int pid_delete_dentry(const struct dentry *); |
| 168 | extern int proc_pid_readdir(struct file *, void *, filldir_t); | 168 | extern int proc_pid_readdir(struct file *, struct dir_context *); |
| 169 | extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); | 169 | extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); |
| 170 | extern loff_t mem_lseek(struct file *, loff_t, int); | 170 | extern loff_t mem_lseek(struct file *, loff_t, int); |
| 171 | 171 | ||
| 172 | /* Lookups */ | 172 | /* Lookups */ |
| 173 | typedef struct dentry *instantiate_t(struct inode *, struct dentry *, | 173 | typedef struct dentry *instantiate_t(struct inode *, struct dentry *, |
| 174 | struct task_struct *, const void *); | 174 | struct task_struct *, const void *); |
| 175 | extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, | 175 | extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, |
| 176 | instantiate_t, struct task_struct *, const void *); | 176 | instantiate_t, struct task_struct *, const void *); |
| 177 | 177 | ||
| 178 | /* | 178 | /* |
| @@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock; | |||
| 183 | extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); | 183 | extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); |
| 184 | extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, | 184 | extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, |
| 185 | struct dentry *); | 185 | struct dentry *); |
| 186 | extern int proc_readdir(struct file *, void *, filldir_t); | 186 | extern int proc_readdir(struct file *, struct dir_context *); |
| 187 | extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); | 187 | extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); |
| 188 | 188 | ||
| 189 | static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) | 189 | static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) |
| 190 | { | 190 | { |
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 54bdc6701e9f..f6abbbbfad8a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c | |||
| @@ -213,74 +213,36 @@ out: | |||
| 213 | return error; | 213 | return error; |
| 214 | } | 214 | } |
| 215 | 215 | ||
| 216 | static int proc_ns_fill_cache(struct file *filp, void *dirent, | 216 | static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) |
| 217 | filldir_t filldir, struct task_struct *task, | ||
| 218 | const struct proc_ns_operations *ops) | ||
| 219 | { | 217 | { |
| 220 | return proc_fill_cache(filp, dirent, filldir, | 218 | struct task_struct *task = get_proc_task(file_inode(file)); |
| 221 | ops->name, strlen(ops->name), | ||
| 222 | proc_ns_instantiate, task, ops); | ||
| 223 | } | ||
| 224 | |||
| 225 | static int proc_ns_dir_readdir(struct file *filp, void *dirent, | ||
| 226 | filldir_t filldir) | ||
| 227 | { | ||
| 228 | int i; | ||
| 229 | struct dentry *dentry = filp->f_path.dentry; | ||
| 230 | struct inode *inode = dentry->d_inode; | ||
| 231 | struct task_struct *task = get_proc_task(inode); | ||
| 232 | const struct proc_ns_operations **entry, **last; | 219 | const struct proc_ns_operations **entry, **last; |
| 233 | ino_t ino; | ||
| 234 | int ret; | ||
| 235 | 220 | ||
| 236 | ret = -ENOENT; | ||
| 237 | if (!task) | 221 | if (!task) |
| 238 | goto out_no_task; | 222 | return -ENOENT; |
| 239 | 223 | ||
| 240 | ret = 0; | 224 | if (!dir_emit_dots(file, ctx)) |
| 241 | i = filp->f_pos; | 225 | goto out; |
| 242 | switch (i) { | 226 | if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) |
| 243 | case 0: | 227 | goto out; |
| 244 | ino = inode->i_ino; | 228 | entry = ns_entries + (ctx->pos - 2); |
| 245 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | 229 | last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; |
| 246 | goto out; | 230 | while (entry <= last) { |
| 247 | i++; | 231 | const struct proc_ns_operations *ops = *entry; |
| 248 | filp->f_pos++; | 232 | if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), |
| 249 | /* fall through */ | 233 | proc_ns_instantiate, task, ops)) |
| 250 | case 1: | 234 | break; |
| 251 | ino = parent_ino(dentry); | 235 | ctx->pos++; |
| 252 | if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) | 236 | entry++; |
| 253 | goto out; | ||
| 254 | i++; | ||
| 255 | filp->f_pos++; | ||
| 256 | /* fall through */ | ||
| 257 | default: | ||
| 258 | i -= 2; | ||
| 259 | if (i >= ARRAY_SIZE(ns_entries)) { | ||
| 260 | ret = 1; | ||
| 261 | goto out; | ||
| 262 | } | ||
| 263 | entry = ns_entries + i; | ||
| 264 | last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; | ||
| 265 | while (entry <= last) { | ||
| 266 | if (proc_ns_fill_cache(filp, dirent, filldir, | ||
| 267 | task, *entry) < 0) | ||
| 268 | goto out; | ||
| 269 | filp->f_pos++; | ||
| 270 | entry++; | ||
| 271 | } | ||
| 272 | } | 237 | } |
| 273 | |||
| 274 | ret = 1; | ||
| 275 | out: | 238 | out: |
| 276 | put_task_struct(task); | 239 | put_task_struct(task); |
| 277 | out_no_task: | 240 | return 0; |
| 278 | return ret; | ||
| 279 | } | 241 | } |
| 280 | 242 | ||
| 281 | const struct file_operations proc_ns_dir_operations = { | 243 | const struct file_operations proc_ns_dir_operations = { |
| 282 | .read = generic_read_dir, | 244 | .read = generic_read_dir, |
| 283 | .readdir = proc_ns_dir_readdir, | 245 | .iterate = proc_ns_dir_readdir, |
| 284 | }; | 246 | }; |
| 285 | 247 | ||
| 286 | static struct dentry *proc_ns_dir_lookup(struct inode *dir, | 248 | static struct dentry *proc_ns_dir_lookup(struct inode *dir, |
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 986e83220d56..4677bb7dc7c2 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
| @@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = { | |||
| 160 | .getattr = proc_tgid_net_getattr, | 160 | .getattr = proc_tgid_net_getattr, |
| 161 | }; | 161 | }; |
| 162 | 162 | ||
| 163 | static int proc_tgid_net_readdir(struct file *filp, void *dirent, | 163 | static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) |
| 164 | filldir_t filldir) | ||
| 165 | { | 164 | { |
| 166 | int ret; | 165 | int ret; |
| 167 | struct net *net; | 166 | struct net *net; |
| 168 | 167 | ||
| 169 | ret = -EINVAL; | 168 | ret = -EINVAL; |
| 170 | net = get_proc_task_net(file_inode(filp)); | 169 | net = get_proc_task_net(file_inode(file)); |
| 171 | if (net != NULL) { | 170 | if (net != NULL) { |
| 172 | ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); | 171 | ret = proc_readdir_de(net->proc_net, file, ctx); |
| 173 | put_net(net); | 172 | put_net(net); |
| 174 | } | 173 | } |
| 175 | return ret; | 174 | return ret; |
| @@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, | |||
| 178 | const struct file_operations proc_net_operations = { | 177 | const struct file_operations proc_net_operations = { |
| 179 | .llseek = generic_file_llseek, | 178 | .llseek = generic_file_llseek, |
| 180 | .read = generic_read_dir, | 179 | .read = generic_read_dir, |
| 181 | .readdir = proc_tgid_net_readdir, | 180 | .iterate = proc_tgid_net_readdir, |
| 182 | }; | 181 | }; |
| 183 | 182 | ||
| 184 | static __net_init int proc_net_ns_init(struct net *net) | 183 | static __net_init int proc_net_ns_init(struct net *net) |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ac05f33a0dde..f3a570e7c257 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
| @@ -573,12 +573,12 @@ out: | |||
| 573 | return ret; | 573 | return ret; |
| 574 | } | 574 | } |
| 575 | 575 | ||
| 576 | static int proc_sys_fill_cache(struct file *filp, void *dirent, | 576 | static bool proc_sys_fill_cache(struct file *file, |
| 577 | filldir_t filldir, | 577 | struct dir_context *ctx, |
| 578 | struct ctl_table_header *head, | 578 | struct ctl_table_header *head, |
| 579 | struct ctl_table *table) | 579 | struct ctl_table *table) |
| 580 | { | 580 | { |
| 581 | struct dentry *child, *dir = filp->f_path.dentry; | 581 | struct dentry *child, *dir = file->f_path.dentry; |
| 582 | struct inode *inode; | 582 | struct inode *inode; |
| 583 | struct qstr qname; | 583 | struct qstr qname; |
| 584 | ino_t ino = 0; | 584 | ino_t ino = 0; |
| @@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, | |||
| 595 | inode = proc_sys_make_inode(dir->d_sb, head, table); | 595 | inode = proc_sys_make_inode(dir->d_sb, head, table); |
| 596 | if (!inode) { | 596 | if (!inode) { |
| 597 | dput(child); | 597 | dput(child); |
| 598 | return -ENOMEM; | 598 | return false; |
| 599 | } else { | 599 | } else { |
| 600 | d_set_d_op(child, &proc_sys_dentry_operations); | 600 | d_set_d_op(child, &proc_sys_dentry_operations); |
| 601 | d_add(child, inode); | 601 | d_add(child, inode); |
| 602 | } | 602 | } |
| 603 | } else { | 603 | } else { |
| 604 | return -ENOMEM; | 604 | return false; |
| 605 | } | 605 | } |
| 606 | } | 606 | } |
| 607 | inode = child->d_inode; | 607 | inode = child->d_inode; |
| 608 | ino = inode->i_ino; | 608 | ino = inode->i_ino; |
| 609 | type = inode->i_mode >> 12; | 609 | type = inode->i_mode >> 12; |
| 610 | dput(child); | 610 | dput(child); |
| 611 | return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); | 611 | return dir_emit(ctx, qname.name, qname.len, ino, type); |
| 612 | } | 612 | } |
| 613 | 613 | ||
| 614 | static int proc_sys_link_fill_cache(struct file *filp, void *dirent, | 614 | static bool proc_sys_link_fill_cache(struct file *file, |
| 615 | filldir_t filldir, | 615 | struct dir_context *ctx, |
| 616 | struct ctl_table_header *head, | 616 | struct ctl_table_header *head, |
| 617 | struct ctl_table *table) | 617 | struct ctl_table *table) |
| 618 | { | 618 | { |
| 619 | int err, ret = 0; | 619 | bool ret = true; |
| 620 | head = sysctl_head_grab(head); | 620 | head = sysctl_head_grab(head); |
| 621 | 621 | ||
| 622 | if (S_ISLNK(table->mode)) { | 622 | if (S_ISLNK(table->mode)) { |
| 623 | /* It is not an error if we can not follow the link ignore it */ | 623 | /* It is not an error if we can not follow the link ignore it */ |
| 624 | err = sysctl_follow_link(&head, &table, current->nsproxy); | 624 | int err = sysctl_follow_link(&head, &table, current->nsproxy); |
| 625 | if (err) | 625 | if (err) |
| 626 | goto out; | 626 | goto out; |
| 627 | } | 627 | } |
| 628 | 628 | ||
| 629 | ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); | 629 | ret = proc_sys_fill_cache(file, ctx, head, table); |
| 630 | out: | 630 | out: |
| 631 | sysctl_head_finish(head); | 631 | sysctl_head_finish(head); |
| 632 | return ret; | 632 | return ret; |
| @@ -634,67 +634,50 @@ out: | |||
| 634 | 634 | ||
| 635 | static int scan(struct ctl_table_header *head, ctl_table *table, | 635 | static int scan(struct ctl_table_header *head, ctl_table *table, |
| 636 | unsigned long *pos, struct file *file, | 636 | unsigned long *pos, struct file *file, |
| 637 | void *dirent, filldir_t filldir) | 637 | struct dir_context *ctx) |
| 638 | { | 638 | { |
| 639 | int res; | 639 | bool res; |
| 640 | 640 | ||
| 641 | if ((*pos)++ < file->f_pos) | 641 | if ((*pos)++ < ctx->pos) |
| 642 | return 0; | 642 | return true; |
| 643 | 643 | ||
| 644 | if (unlikely(S_ISLNK(table->mode))) | 644 | if (unlikely(S_ISLNK(table->mode))) |
| 645 | res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); | 645 | res = proc_sys_link_fill_cache(file, ctx, head, table); |
| 646 | else | 646 | else |
| 647 | res = proc_sys_fill_cache(file, dirent, filldir, head, table); | 647 | res = proc_sys_fill_cache(file, ctx, head, table); |
| 648 | 648 | ||
| 649 | if (res == 0) | 649 | if (res) |
| 650 | file->f_pos = *pos; | 650 | ctx->pos = *pos; |
| 651 | 651 | ||
| 652 | return res; | 652 | return res; |
| 653 | } | 653 | } |
| 654 | 654 | ||
| 655 | static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) | 655 | static int proc_sys_readdir(struct file *file, struct dir_context *ctx) |
| 656 | { | 656 | { |
| 657 | struct dentry *dentry = filp->f_path.dentry; | 657 | struct ctl_table_header *head = grab_header(file_inode(file)); |
| 658 | struct inode *inode = dentry->d_inode; | ||
| 659 | struct ctl_table_header *head = grab_header(inode); | ||
| 660 | struct ctl_table_header *h = NULL; | 658 | struct ctl_table_header *h = NULL; |
| 661 | struct ctl_table *entry; | 659 | struct ctl_table *entry; |
| 662 | struct ctl_dir *ctl_dir; | 660 | struct ctl_dir *ctl_dir; |
| 663 | unsigned long pos; | 661 | unsigned long pos; |
| 664 | int ret = -EINVAL; | ||
| 665 | 662 | ||
| 666 | if (IS_ERR(head)) | 663 | if (IS_ERR(head)) |
| 667 | return PTR_ERR(head); | 664 | return PTR_ERR(head); |
| 668 | 665 | ||
| 669 | ctl_dir = container_of(head, struct ctl_dir, header); | 666 | ctl_dir = container_of(head, struct ctl_dir, header); |
| 670 | 667 | ||
| 671 | ret = 0; | 668 | if (!dir_emit_dots(file, ctx)) |
| 672 | /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ | 669 | return 0; |
| 673 | if (filp->f_pos == 0) { | 670 | |
| 674 | if (filldir(dirent, ".", 1, filp->f_pos, | ||
| 675 | inode->i_ino, DT_DIR) < 0) | ||
| 676 | goto out; | ||
| 677 | filp->f_pos++; | ||
| 678 | } | ||
| 679 | if (filp->f_pos == 1) { | ||
| 680 | if (filldir(dirent, "..", 2, filp->f_pos, | ||
| 681 | parent_ino(dentry), DT_DIR) < 0) | ||
| 682 | goto out; | ||
| 683 | filp->f_pos++; | ||
| 684 | } | ||
| 685 | pos = 2; | 671 | pos = 2; |
| 686 | 672 | ||
| 687 | for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { | 673 | for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { |
| 688 | ret = scan(h, entry, &pos, filp, dirent, filldir); | 674 | if (!scan(h, entry, &pos, file, ctx)) { |
| 689 | if (ret) { | ||
| 690 | sysctl_head_finish(h); | 675 | sysctl_head_finish(h); |
| 691 | break; | 676 | break; |
| 692 | } | 677 | } |
| 693 | } | 678 | } |
| 694 | ret = 1; | ||
| 695 | out: | ||
| 696 | sysctl_head_finish(head); | 679 | sysctl_head_finish(head); |
| 697 | return ret; | 680 | return 0; |
| 698 | } | 681 | } |
| 699 | 682 | ||
| 700 | static int proc_sys_permission(struct inode *inode, int mask) | 683 | static int proc_sys_permission(struct inode *inode, int mask) |
| @@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = { | |||
| 769 | 752 | ||
| 770 | static const struct file_operations proc_sys_dir_file_operations = { | 753 | static const struct file_operations proc_sys_dir_file_operations = { |
| 771 | .read = generic_read_dir, | 754 | .read = generic_read_dir, |
| 772 | .readdir = proc_sys_readdir, | 755 | .iterate = proc_sys_readdir, |
| 773 | .llseek = generic_file_llseek, | 756 | .llseek = generic_file_llseek, |
| 774 | }; | 757 | }; |
| 775 | 758 | ||
diff --git a/fs/proc/root.c b/fs/proc/root.c index 41a6ea93f486..229e366598da 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
| @@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr | |||
| 202 | return proc_pid_lookup(dir, dentry, flags); | 202 | return proc_pid_lookup(dir, dentry, flags); |
| 203 | } | 203 | } |
| 204 | 204 | ||
| 205 | static int proc_root_readdir(struct file * filp, | 205 | static int proc_root_readdir(struct file *file, struct dir_context *ctx) |
| 206 | void * dirent, filldir_t filldir) | ||
| 207 | { | 206 | { |
| 208 | unsigned int nr = filp->f_pos; | 207 | if (ctx->pos < FIRST_PROCESS_ENTRY) { |
| 209 | int ret; | 208 | proc_readdir(file, ctx); |
| 210 | 209 | ctx->pos = FIRST_PROCESS_ENTRY; | |
| 211 | if (nr < FIRST_PROCESS_ENTRY) { | ||
| 212 | int error = proc_readdir(filp, dirent, filldir); | ||
| 213 | if (error <= 0) | ||
| 214 | return error; | ||
| 215 | filp->f_pos = FIRST_PROCESS_ENTRY; | ||
| 216 | } | 210 | } |
| 217 | 211 | ||
| 218 | ret = proc_pid_readdir(filp, dirent, filldir); | 212 | return proc_pid_readdir(file, ctx); |
| 219 | return ret; | ||
| 220 | } | 213 | } |
| 221 | 214 | ||
| 222 | /* | 215 | /* |
| @@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp, | |||
| 226 | */ | 219 | */ |
| 227 | static const struct file_operations proc_root_operations = { | 220 | static const struct file_operations proc_root_operations = { |
| 228 | .read = generic_read_dir, | 221 | .read = generic_read_dir, |
| 229 | .readdir = proc_root_readdir, | 222 | .iterate = proc_root_readdir, |
| 230 | .llseek = default_llseek, | 223 | .llseek = default_llseek, |
| 231 | }; | 224 | }; |
| 232 | 225 | ||
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 28ce014b3cef..b218f965817b 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c | |||
| @@ -14,9 +14,9 @@ | |||
| 14 | #include <linux/buffer_head.h> | 14 | #include <linux/buffer_head.h> |
| 15 | #include "qnx4.h" | 15 | #include "qnx4.h" |
| 16 | 16 | ||
| 17 | static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) | 17 | static int qnx4_readdir(struct file *file, struct dir_context *ctx) |
| 18 | { | 18 | { |
| 19 | struct inode *inode = file_inode(filp); | 19 | struct inode *inode = file_inode(file); |
| 20 | unsigned int offset; | 20 | unsigned int offset; |
| 21 | struct buffer_head *bh; | 21 | struct buffer_head *bh; |
| 22 | struct qnx4_inode_entry *de; | 22 | struct qnx4_inode_entry *de; |
| @@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 26 | int size; | 26 | int size; |
| 27 | 27 | ||
| 28 | QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); | 28 | QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); |
| 29 | QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); | 29 | QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos)); |
| 30 | 30 | ||
| 31 | while (filp->f_pos < inode->i_size) { | 31 | while (ctx->pos < inode->i_size) { |
| 32 | blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); | 32 | blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS); |
| 33 | bh = sb_bread(inode->i_sb, blknum); | 33 | bh = sb_bread(inode->i_sb, blknum); |
| 34 | if(bh==NULL) { | 34 | if (bh == NULL) { |
| 35 | printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); | 35 | printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); |
| 36 | break; | 36 | return 0; |
| 37 | } | 37 | } |
| 38 | ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; | 38 | ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; |
| 39 | while (ix < QNX4_INODES_PER_BLOCK) { | 39 | for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { |
| 40 | offset = ix * QNX4_DIR_ENTRY_SIZE; | 40 | offset = ix * QNX4_DIR_ENTRY_SIZE; |
| 41 | de = (struct qnx4_inode_entry *) (bh->b_data + offset); | 41 | de = (struct qnx4_inode_entry *) (bh->b_data + offset); |
| 42 | size = strlen(de->di_fname); | 42 | if (!de->di_fname[0]) |
| 43 | if (size) { | 43 | continue; |
| 44 | if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) | 44 | if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) |
| 45 | size = QNX4_SHORT_NAME_MAX; | 45 | continue; |
| 46 | else if ( size > QNX4_NAME_MAX ) | 46 | if (!(de->di_status & QNX4_FILE_LINK)) |
| 47 | size = QNX4_NAME_MAX; | 47 | size = QNX4_SHORT_NAME_MAX; |
| 48 | 48 | else | |
| 49 | if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { | 49 | size = QNX4_NAME_MAX; |
| 50 | QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); | 50 | size = strnlen(de->di_fname, size); |
| 51 | if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) | 51 | QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); |
| 52 | ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; | 52 | if (!(de->di_status & QNX4_FILE_LINK)) |
| 53 | else { | 53 | ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; |
| 54 | le = (struct qnx4_link_info*)de; | 54 | else { |
| 55 | ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * | 55 | le = (struct qnx4_link_info*)de; |
| 56 | QNX4_INODES_PER_BLOCK + | 56 | ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * |
| 57 | le->dl_inode_ndx; | 57 | QNX4_INODES_PER_BLOCK + |
| 58 | } | 58 | le->dl_inode_ndx; |
| 59 | if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { | 59 | } |
| 60 | brelse(bh); | 60 | if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { |
| 61 | goto out; | 61 | brelse(bh); |
| 62 | } | 62 | return 0; |
| 63 | } | ||
| 64 | } | 63 | } |
| 65 | ix++; | ||
| 66 | filp->f_pos += QNX4_DIR_ENTRY_SIZE; | ||
| 67 | } | 64 | } |
| 68 | brelse(bh); | 65 | brelse(bh); |
| 69 | } | 66 | } |
| 70 | out: | ||
| 71 | return 0; | 67 | return 0; |
| 72 | } | 68 | } |
| 73 | 69 | ||
| @@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations = | |||
| 75 | { | 71 | { |
| 76 | .llseek = generic_file_llseek, | 72 | .llseek = generic_file_llseek, |
| 77 | .read = generic_read_dir, | 73 | .read = generic_read_dir, |
| 78 | .readdir = qnx4_readdir, | 74 | .iterate = qnx4_readdir, |
| 79 | .fsync = generic_file_fsync, | 75 | .fsync = generic_file_fsync, |
| 80 | }; | 76 | }; |
| 81 | 77 | ||
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index afa6be6fc397..15b7d92ed60d 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c | |||
| @@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, | |||
| 65 | 65 | ||
| 66 | static int qnx6_dir_longfilename(struct inode *inode, | 66 | static int qnx6_dir_longfilename(struct inode *inode, |
| 67 | struct qnx6_long_dir_entry *de, | 67 | struct qnx6_long_dir_entry *de, |
| 68 | void *dirent, loff_t pos, | 68 | struct dir_context *ctx, |
| 69 | unsigned de_inode, filldir_t filldir) | 69 | unsigned de_inode) |
| 70 | { | 70 | { |
| 71 | struct qnx6_long_filename *lf; | 71 | struct qnx6_long_filename *lf; |
| 72 | struct super_block *s = inode->i_sb; | 72 | struct super_block *s = inode->i_sb; |
| @@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode, | |||
| 104 | 104 | ||
| 105 | QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", | 105 | QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", |
| 106 | lf_size, lf->lf_fname, de_inode)); | 106 | lf_size, lf->lf_fname, de_inode)); |
| 107 | if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, | 107 | if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { |
| 108 | DT_UNKNOWN) < 0) { | ||
| 109 | qnx6_put_page(page); | 108 | qnx6_put_page(page); |
| 110 | return 0; | 109 | return 0; |
| 111 | } | 110 | } |
| @@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode, | |||
| 115 | return 1; | 114 | return 1; |
| 116 | } | 115 | } |
| 117 | 116 | ||
| 118 | static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) | 117 | static int qnx6_readdir(struct file *file, struct dir_context *ctx) |
| 119 | { | 118 | { |
| 120 | struct inode *inode = file_inode(filp); | 119 | struct inode *inode = file_inode(file); |
| 121 | struct super_block *s = inode->i_sb; | 120 | struct super_block *s = inode->i_sb; |
| 122 | struct qnx6_sb_info *sbi = QNX6_SB(s); | 121 | struct qnx6_sb_info *sbi = QNX6_SB(s); |
| 123 | loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); | 122 | loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); |
| 124 | unsigned long npages = dir_pages(inode); | 123 | unsigned long npages = dir_pages(inode); |
| 125 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | 124 | unsigned long n = pos >> PAGE_CACHE_SHIFT; |
| 126 | unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; | 125 | unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; |
| 127 | bool done = false; | 126 | bool done = false; |
| 128 | 127 | ||
| 129 | if (filp->f_pos >= inode->i_size) | 128 | ctx->pos = pos; |
| 129 | if (ctx->pos >= inode->i_size) | ||
| 130 | return 0; | 130 | return 0; |
| 131 | 131 | ||
| 132 | for ( ; !done && n < npages; n++, start = 0) { | 132 | for ( ; !done && n < npages; n++, start = 0) { |
| @@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 137 | 137 | ||
| 138 | if (IS_ERR(page)) { | 138 | if (IS_ERR(page)) { |
| 139 | printk(KERN_ERR "qnx6_readdir: read failed\n"); | 139 | printk(KERN_ERR "qnx6_readdir: read failed\n"); |
| 140 | filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; | 140 | ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; |
| 141 | return PTR_ERR(page); | 141 | return PTR_ERR(page); |
| 142 | } | 142 | } |
| 143 | de = ((struct qnx6_dir_entry *)page_address(page)) + start; | 143 | de = ((struct qnx6_dir_entry *)page_address(page)) + start; |
| 144 | for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { | 144 | for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { |
| 145 | int size = de->de_size; | 145 | int size = de->de_size; |
| 146 | u32 no_inode = fs32_to_cpu(sbi, de->de_inode); | 146 | u32 no_inode = fs32_to_cpu(sbi, de->de_inode); |
| 147 | 147 | ||
| @@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 154 | structure / block */ | 154 | structure / block */ |
| 155 | if (!qnx6_dir_longfilename(inode, | 155 | if (!qnx6_dir_longfilename(inode, |
| 156 | (struct qnx6_long_dir_entry *)de, | 156 | (struct qnx6_long_dir_entry *)de, |
| 157 | dirent, pos, no_inode, | 157 | ctx, no_inode)) { |
| 158 | filldir)) { | ||
| 159 | done = true; | 158 | done = true; |
| 160 | break; | 159 | break; |
| 161 | } | 160 | } |
| @@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 163 | QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" | 162 | QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" |
| 164 | " inode:%u\n", size, de->de_fname, | 163 | " inode:%u\n", size, de->de_fname, |
| 165 | no_inode)); | 164 | no_inode)); |
| 166 | if (filldir(dirent, de->de_fname, size, | 165 | if (!dir_emit(ctx, de->de_fname, size, |
| 167 | pos, no_inode, DT_UNKNOWN) | 166 | no_inode, DT_UNKNOWN)) { |
| 168 | < 0) { | ||
| 169 | done = true; | 167 | done = true; |
| 170 | break; | 168 | break; |
| 171 | } | 169 | } |
| @@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 173 | } | 171 | } |
| 174 | qnx6_put_page(page); | 172 | qnx6_put_page(page); |
| 175 | } | 173 | } |
| 176 | filp->f_pos = pos; | ||
| 177 | return 0; | 174 | return 0; |
| 178 | } | 175 | } |
| 179 | 176 | ||
| @@ -282,7 +279,7 @@ found: | |||
| 282 | const struct file_operations qnx6_dir_operations = { | 279 | const struct file_operations qnx6_dir_operations = { |
| 283 | .llseek = generic_file_llseek, | 280 | .llseek = generic_file_llseek, |
| 284 | .read = generic_read_dir, | 281 | .read = generic_read_dir, |
| 285 | .readdir = qnx6_readdir, | 282 | .iterate = qnx6_readdir, |
| 286 | .fsync = generic_file_fsync, | 283 | .fsync = generic_file_fsync, |
| 287 | }; | 284 | }; |
| 288 | 285 | ||
diff --git a/fs/read_write.c b/fs/read_write.c index 03430008704e..2cefa417be34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -1064,6 +1064,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 1064 | struct fd in, out; | 1064 | struct fd in, out; |
| 1065 | struct inode *in_inode, *out_inode; | 1065 | struct inode *in_inode, *out_inode; |
| 1066 | loff_t pos; | 1066 | loff_t pos; |
| 1067 | loff_t out_pos; | ||
| 1067 | ssize_t retval; | 1068 | ssize_t retval; |
| 1068 | int fl; | 1069 | int fl; |
| 1069 | 1070 | ||
| @@ -1077,12 +1078,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 1077 | if (!(in.file->f_mode & FMODE_READ)) | 1078 | if (!(in.file->f_mode & FMODE_READ)) |
| 1078 | goto fput_in; | 1079 | goto fput_in; |
| 1079 | retval = -ESPIPE; | 1080 | retval = -ESPIPE; |
| 1080 | if (!ppos) | 1081 | if (!ppos) { |
| 1081 | ppos = &in.file->f_pos; | 1082 | pos = in.file->f_pos; |
| 1082 | else | 1083 | } else { |
| 1084 | pos = *ppos; | ||
| 1083 | if (!(in.file->f_mode & FMODE_PREAD)) | 1085 | if (!(in.file->f_mode & FMODE_PREAD)) |
| 1084 | goto fput_in; | 1086 | goto fput_in; |
| 1085 | retval = rw_verify_area(READ, in.file, ppos, count); | 1087 | } |
| 1088 | retval = rw_verify_area(READ, in.file, &pos, count); | ||
| 1086 | if (retval < 0) | 1089 | if (retval < 0) |
| 1087 | goto fput_in; | 1090 | goto fput_in; |
| 1088 | count = retval; | 1091 | count = retval; |
| @@ -1099,7 +1102,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 1099 | retval = -EINVAL; | 1102 | retval = -EINVAL; |
| 1100 | in_inode = file_inode(in.file); | 1103 | in_inode = file_inode(in.file); |
| 1101 | out_inode = file_inode(out.file); | 1104 | out_inode = file_inode(out.file); |
| 1102 | retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); | 1105 | out_pos = out.file->f_pos; |
| 1106 | retval = rw_verify_area(WRITE, out.file, &out_pos, count); | ||
| 1103 | if (retval < 0) | 1107 | if (retval < 0) |
| 1104 | goto fput_out; | 1108 | goto fput_out; |
| 1105 | count = retval; | 1109 | count = retval; |
| @@ -1107,7 +1111,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 1107 | if (!max) | 1111 | if (!max) |
| 1108 | max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); | 1112 | max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); |
| 1109 | 1113 | ||
| 1110 | pos = *ppos; | ||
| 1111 | if (unlikely(pos + count > max)) { | 1114 | if (unlikely(pos + count > max)) { |
| 1112 | retval = -EOVERFLOW; | 1115 | retval = -EOVERFLOW; |
| 1113 | if (pos >= max) | 1116 | if (pos >= max) |
| @@ -1126,18 +1129,23 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, | |||
| 1126 | if (in.file->f_flags & O_NONBLOCK) | 1129 | if (in.file->f_flags & O_NONBLOCK) |
| 1127 | fl = SPLICE_F_NONBLOCK; | 1130 | fl = SPLICE_F_NONBLOCK; |
| 1128 | #endif | 1131 | #endif |
| 1129 | retval = do_splice_direct(in.file, ppos, out.file, count, fl); | 1132 | retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); |
| 1130 | 1133 | ||
| 1131 | if (retval > 0) { | 1134 | if (retval > 0) { |
| 1132 | add_rchar(current, retval); | 1135 | add_rchar(current, retval); |
| 1133 | add_wchar(current, retval); | 1136 | add_wchar(current, retval); |
| 1134 | fsnotify_access(in.file); | 1137 | fsnotify_access(in.file); |
| 1135 | fsnotify_modify(out.file); | 1138 | fsnotify_modify(out.file); |
| 1139 | out.file->f_pos = out_pos; | ||
| 1140 | if (ppos) | ||
| 1141 | *ppos = pos; | ||
| 1142 | else | ||
| 1143 | in.file->f_pos = pos; | ||
| 1136 | } | 1144 | } |
| 1137 | 1145 | ||
| 1138 | inc_syscr(current); | 1146 | inc_syscr(current); |
| 1139 | inc_syscw(current); | 1147 | inc_syscw(current); |
| 1140 | if (*ppos > max) | 1148 | if (pos > max) |
| 1141 | retval = -EOVERFLOW; | 1149 | retval = -EOVERFLOW; |
| 1142 | 1150 | ||
| 1143 | fput_out: | 1151 | fput_out: |
diff --git a/fs/readdir.c b/fs/readdir.c index fee38e04fae4..93d71e574310 100644 --- a/fs/readdir.c +++ b/fs/readdir.c | |||
| @@ -20,11 +20,11 @@ | |||
| 20 | 20 | ||
| 21 | #include <asm/uaccess.h> | 21 | #include <asm/uaccess.h> |
| 22 | 22 | ||
| 23 | int vfs_readdir(struct file *file, filldir_t filler, void *buf) | 23 | int iterate_dir(struct file *file, struct dir_context *ctx) |
| 24 | { | 24 | { |
| 25 | struct inode *inode = file_inode(file); | 25 | struct inode *inode = file_inode(file); |
| 26 | int res = -ENOTDIR; | 26 | int res = -ENOTDIR; |
| 27 | if (!file->f_op || !file->f_op->readdir) | 27 | if (!file->f_op || !file->f_op->iterate) |
| 28 | goto out; | 28 | goto out; |
| 29 | 29 | ||
| 30 | res = security_file_permission(file, MAY_READ); | 30 | res = security_file_permission(file, MAY_READ); |
| @@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) | |||
| 37 | 37 | ||
| 38 | res = -ENOENT; | 38 | res = -ENOENT; |
| 39 | if (!IS_DEADDIR(inode)) { | 39 | if (!IS_DEADDIR(inode)) { |
| 40 | res = file->f_op->readdir(file, buf, filler); | 40 | ctx->pos = file->f_pos; |
| 41 | res = file->f_op->iterate(file, ctx); | ||
| 42 | file->f_pos = ctx->pos; | ||
| 41 | file_accessed(file); | 43 | file_accessed(file); |
| 42 | } | 44 | } |
| 43 | mutex_unlock(&inode->i_mutex); | 45 | mutex_unlock(&inode->i_mutex); |
| 44 | out: | 46 | out: |
| 45 | return res; | 47 | return res; |
| 46 | } | 48 | } |
| 47 | 49 | EXPORT_SYMBOL(iterate_dir); | |
| 48 | EXPORT_SYMBOL(vfs_readdir); | ||
| 49 | 50 | ||
| 50 | /* | 51 | /* |
| 51 | * Traditional linux readdir() handling.. | 52 | * Traditional linux readdir() handling.. |
| @@ -66,6 +67,7 @@ struct old_linux_dirent { | |||
| 66 | }; | 67 | }; |
| 67 | 68 | ||
| 68 | struct readdir_callback { | 69 | struct readdir_callback { |
| 70 | struct dir_context ctx; | ||
| 69 | struct old_linux_dirent __user * dirent; | 71 | struct old_linux_dirent __user * dirent; |
| 70 | int result; | 72 | int result; |
| 71 | }; | 73 | }; |
| @@ -73,7 +75,7 @@ struct readdir_callback { | |||
| 73 | static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, | 75 | static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, |
| 74 | u64 ino, unsigned int d_type) | 76 | u64 ino, unsigned int d_type) |
| 75 | { | 77 | { |
| 76 | struct readdir_callback * buf = (struct readdir_callback *) __buf; | 78 | struct readdir_callback *buf = (struct readdir_callback *) __buf; |
| 77 | struct old_linux_dirent __user * dirent; | 79 | struct old_linux_dirent __user * dirent; |
| 78 | unsigned long d_ino; | 80 | unsigned long d_ino; |
| 79 | 81 | ||
| @@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, | |||
| 107 | { | 109 | { |
| 108 | int error; | 110 | int error; |
| 109 | struct fd f = fdget(fd); | 111 | struct fd f = fdget(fd); |
| 110 | struct readdir_callback buf; | 112 | struct readdir_callback buf = { |
| 113 | .ctx.actor = fillonedir, | ||
| 114 | .dirent = dirent | ||
| 115 | }; | ||
| 111 | 116 | ||
| 112 | if (!f.file) | 117 | if (!f.file) |
| 113 | return -EBADF; | 118 | return -EBADF; |
| 114 | 119 | ||
| 115 | buf.result = 0; | 120 | error = iterate_dir(f.file, &buf.ctx); |
| 116 | buf.dirent = dirent; | ||
| 117 | |||
| 118 | error = vfs_readdir(f.file, fillonedir, &buf); | ||
| 119 | if (buf.result) | 121 | if (buf.result) |
| 120 | error = buf.result; | 122 | error = buf.result; |
| 121 | 123 | ||
| @@ -137,6 +139,7 @@ struct linux_dirent { | |||
| 137 | }; | 139 | }; |
| 138 | 140 | ||
| 139 | struct getdents_callback { | 141 | struct getdents_callback { |
| 142 | struct dir_context ctx; | ||
| 140 | struct linux_dirent __user * current_dir; | 143 | struct linux_dirent __user * current_dir; |
| 141 | struct linux_dirent __user * previous; | 144 | struct linux_dirent __user * previous; |
| 142 | int count; | 145 | int count; |
| @@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, | |||
| 191 | { | 194 | { |
| 192 | struct fd f; | 195 | struct fd f; |
| 193 | struct linux_dirent __user * lastdirent; | 196 | struct linux_dirent __user * lastdirent; |
| 194 | struct getdents_callback buf; | 197 | struct getdents_callback buf = { |
| 198 | .ctx.actor = filldir, | ||
| 199 | .count = count, | ||
| 200 | .current_dir = dirent | ||
| 201 | }; | ||
| 195 | int error; | 202 | int error; |
| 196 | 203 | ||
| 197 | if (!access_ok(VERIFY_WRITE, dirent, count)) | 204 | if (!access_ok(VERIFY_WRITE, dirent, count)) |
| @@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, | |||
| 201 | if (!f.file) | 208 | if (!f.file) |
| 202 | return -EBADF; | 209 | return -EBADF; |
| 203 | 210 | ||
| 204 | buf.current_dir = dirent; | 211 | error = iterate_dir(f.file, &buf.ctx); |
| 205 | buf.previous = NULL; | ||
| 206 | buf.count = count; | ||
| 207 | buf.error = 0; | ||
| 208 | |||
| 209 | error = vfs_readdir(f.file, filldir, &buf); | ||
| 210 | if (error >= 0) | 212 | if (error >= 0) |
| 211 | error = buf.error; | 213 | error = buf.error; |
| 212 | lastdirent = buf.previous; | 214 | lastdirent = buf.previous; |
| 213 | if (lastdirent) { | 215 | if (lastdirent) { |
| 214 | if (put_user(f.file->f_pos, &lastdirent->d_off)) | 216 | if (put_user(buf.ctx.pos, &lastdirent->d_off)) |
| 215 | error = -EFAULT; | 217 | error = -EFAULT; |
| 216 | else | 218 | else |
| 217 | error = count - buf.count; | 219 | error = count - buf.count; |
| @@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, | |||
| 221 | } | 223 | } |
| 222 | 224 | ||
| 223 | struct getdents_callback64 { | 225 | struct getdents_callback64 { |
| 226 | struct dir_context ctx; | ||
| 224 | struct linux_dirent64 __user * current_dir; | 227 | struct linux_dirent64 __user * current_dir; |
| 225 | struct linux_dirent64 __user * previous; | 228 | struct linux_dirent64 __user * previous; |
| 226 | int count; | 229 | int count; |
| @@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, | |||
| 271 | { | 274 | { |
| 272 | struct fd f; | 275 | struct fd f; |
| 273 | struct linux_dirent64 __user * lastdirent; | 276 | struct linux_dirent64 __user * lastdirent; |
| 274 | struct getdents_callback64 buf; | 277 | struct getdents_callback64 buf = { |
| 278 | .ctx.actor = filldir64, | ||
| 279 | .count = count, | ||
| 280 | .current_dir = dirent | ||
| 281 | }; | ||
| 275 | int error; | 282 | int error; |
| 276 | 283 | ||
| 277 | if (!access_ok(VERIFY_WRITE, dirent, count)) | 284 | if (!access_ok(VERIFY_WRITE, dirent, count)) |
| @@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, | |||
| 281 | if (!f.file) | 288 | if (!f.file) |
| 282 | return -EBADF; | 289 | return -EBADF; |
| 283 | 290 | ||
| 284 | buf.current_dir = dirent; | 291 | error = iterate_dir(f.file, &buf.ctx); |
| 285 | buf.previous = NULL; | ||
| 286 | buf.count = count; | ||
| 287 | buf.error = 0; | ||
| 288 | |||
| 289 | error = vfs_readdir(f.file, filldir64, &buf); | ||
| 290 | if (error >= 0) | 292 | if (error >= 0) |
| 291 | error = buf.error; | 293 | error = buf.error; |
| 292 | lastdirent = buf.previous; | 294 | lastdirent = buf.previous; |
| 293 | if (lastdirent) { | 295 | if (lastdirent) { |
| 294 | typeof(lastdirent->d_off) d_off = f.file->f_pos; | 296 | typeof(lastdirent->d_off) d_off = buf.ctx.pos; |
| 295 | if (__put_user(d_off, &lastdirent->d_off)) | 297 | if (__put_user(d_off, &lastdirent->d_off)) |
| 296 | error = -EFAULT; | 298 | error = -EFAULT; |
| 297 | else | 299 | else |
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 6c2d136561cb..03e4ca5624d6 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c | |||
| @@ -13,14 +13,14 @@ | |||
| 13 | 13 | ||
| 14 | extern const struct reiserfs_key MIN_KEY; | 14 | extern const struct reiserfs_key MIN_KEY; |
| 15 | 15 | ||
| 16 | static int reiserfs_readdir(struct file *, void *, filldir_t); | 16 | static int reiserfs_readdir(struct file *, struct dir_context *); |
| 17 | static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, | 17 | static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, |
| 18 | int datasync); | 18 | int datasync); |
| 19 | 19 | ||
| 20 | const struct file_operations reiserfs_dir_operations = { | 20 | const struct file_operations reiserfs_dir_operations = { |
| 21 | .llseek = generic_file_llseek, | 21 | .llseek = generic_file_llseek, |
| 22 | .read = generic_read_dir, | 22 | .read = generic_read_dir, |
| 23 | .readdir = reiserfs_readdir, | 23 | .iterate = reiserfs_readdir, |
| 24 | .fsync = reiserfs_dir_fsync, | 24 | .fsync = reiserfs_dir_fsync, |
| 25 | .unlocked_ioctl = reiserfs_ioctl, | 25 | .unlocked_ioctl = reiserfs_ioctl, |
| 26 | #ifdef CONFIG_COMPAT | 26 | #ifdef CONFIG_COMPAT |
| @@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, | |||
| 50 | 50 | ||
| 51 | #define store_ih(where,what) copy_item_head (where, what) | 51 | #define store_ih(where,what) copy_item_head (where, what) |
| 52 | 52 | ||
| 53 | static inline bool is_privroot_deh(struct dentry *dir, | 53 | static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) |
| 54 | struct reiserfs_de_head *deh) | ||
| 55 | { | 54 | { |
| 56 | struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; | 55 | struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; |
| 57 | return (dir == dir->d_parent && privroot->d_inode && | 56 | return (privroot->d_inode && |
| 58 | deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); | 57 | deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); |
| 59 | } | 58 | } |
| 60 | 59 | ||
| 61 | int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | 60 | int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) |
| 62 | filldir_t filldir, loff_t *pos) | ||
| 63 | { | 61 | { |
| 64 | struct inode *inode = dentry->d_inode; | ||
| 65 | struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ | 62 | struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ |
| 66 | INITIALIZE_PATH(path_to_entry); | 63 | INITIALIZE_PATH(path_to_entry); |
| 67 | struct buffer_head *bh; | 64 | struct buffer_head *bh; |
| @@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | |||
| 81 | 78 | ||
| 82 | /* form key for search the next directory entry using f_pos field of | 79 | /* form key for search the next directory entry using f_pos field of |
| 83 | file structure */ | 80 | file structure */ |
| 84 | make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); | 81 | make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); |
| 85 | next_pos = cpu_key_k_offset(&pos_key); | 82 | next_pos = cpu_key_k_offset(&pos_key); |
| 86 | 83 | ||
| 87 | path_to_entry.reada = PATH_READA; | 84 | path_to_entry.reada = PATH_READA; |
| @@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | |||
| 126 | entry_num++, deh++) { | 123 | entry_num++, deh++) { |
| 127 | int d_reclen; | 124 | int d_reclen; |
| 128 | char *d_name; | 125 | char *d_name; |
| 129 | off_t d_off; | ||
| 130 | ino_t d_ino; | 126 | ino_t d_ino; |
| 131 | 127 | ||
| 132 | if (!de_visible(deh)) | 128 | if (!de_visible(deh)) |
| @@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | |||
| 155 | } | 151 | } |
| 156 | 152 | ||
| 157 | /* Ignore the .reiserfs_priv entry */ | 153 | /* Ignore the .reiserfs_priv entry */ |
| 158 | if (is_privroot_deh(dentry, deh)) | 154 | if (is_privroot_deh(inode, deh)) |
| 159 | continue; | 155 | continue; |
| 160 | 156 | ||
| 161 | d_off = deh_offset(deh); | 157 | ctx->pos = deh_offset(deh); |
| 162 | *pos = d_off; | ||
| 163 | d_ino = deh_objectid(deh); | 158 | d_ino = deh_objectid(deh); |
| 164 | if (d_reclen <= 32) { | 159 | if (d_reclen <= 32) { |
| 165 | local_buf = small_buf; | 160 | local_buf = small_buf; |
| @@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | |||
| 187 | * the write lock here for other waiters | 182 | * the write lock here for other waiters |
| 188 | */ | 183 | */ |
| 189 | reiserfs_write_unlock(inode->i_sb); | 184 | reiserfs_write_unlock(inode->i_sb); |
| 190 | if (filldir | 185 | if (!dir_emit |
| 191 | (dirent, local_buf, d_reclen, d_off, d_ino, | 186 | (ctx, local_buf, d_reclen, d_ino, |
| 192 | DT_UNKNOWN) < 0) { | 187 | DT_UNKNOWN)) { |
| 193 | reiserfs_write_lock(inode->i_sb); | 188 | reiserfs_write_lock(inode->i_sb); |
| 194 | if (local_buf != small_buf) { | 189 | if (local_buf != small_buf) { |
| 195 | kfree(local_buf); | 190 | kfree(local_buf); |
| @@ -237,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | |||
| 237 | } /* while */ | 232 | } /* while */ |
| 238 | 233 | ||
| 239 | end: | 234 | end: |
| 240 | *pos = next_pos; | 235 | ctx->pos = next_pos; |
| 241 | pathrelse(&path_to_entry); | 236 | pathrelse(&path_to_entry); |
| 242 | reiserfs_check_path(&path_to_entry); | 237 | reiserfs_check_path(&path_to_entry); |
| 243 | out: | 238 | out: |
| @@ -245,10 +240,9 @@ out: | |||
| 245 | return ret; | 240 | return ret; |
| 246 | } | 241 | } |
| 247 | 242 | ||
| 248 | static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) | 243 | static int reiserfs_readdir(struct file *file, struct dir_context *ctx) |
| 249 | { | 244 | { |
| 250 | struct dentry *dentry = file->f_path.dentry; | 245 | return reiserfs_readdir_inode(file_inode(file), ctx); |
| 251 | return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); | ||
| 252 | } | 246 | } |
| 253 | 247 | ||
| 254 | /* compose directory item containing "." and ".." entries (entries are | 248 | /* compose directory item containing "." and ".." entries (entries are |
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f844533792ee..0048cc16a6a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
| @@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) | |||
| 2975 | } | 2975 | } |
| 2976 | 2976 | ||
| 2977 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ | 2977 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ |
| 2978 | static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | 2978 | static void reiserfs_invalidatepage(struct page *page, unsigned int offset, |
| 2979 | unsigned int length) | ||
| 2979 | { | 2980 | { |
| 2980 | struct buffer_head *head, *bh, *next; | 2981 | struct buffer_head *head, *bh, *next; |
| 2981 | struct inode *inode = page->mapping->host; | 2982 | struct inode *inode = page->mapping->host; |
| 2982 | unsigned int curr_off = 0; | 2983 | unsigned int curr_off = 0; |
| 2984 | unsigned int stop = offset + length; | ||
| 2985 | int partial_page = (offset || length < PAGE_CACHE_SIZE); | ||
| 2983 | int ret = 1; | 2986 | int ret = 1; |
| 2984 | 2987 | ||
| 2985 | BUG_ON(!PageLocked(page)); | 2988 | BUG_ON(!PageLocked(page)); |
| 2986 | 2989 | ||
| 2987 | if (offset == 0) | 2990 | if (!partial_page) |
| 2988 | ClearPageChecked(page); | 2991 | ClearPageChecked(page); |
| 2989 | 2992 | ||
| 2990 | if (!page_has_buffers(page)) | 2993 | if (!page_has_buffers(page)) |
| @@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | |||
| 2996 | unsigned int next_off = curr_off + bh->b_size; | 2999 | unsigned int next_off = curr_off + bh->b_size; |
| 2997 | next = bh->b_this_page; | 3000 | next = bh->b_this_page; |
| 2998 | 3001 | ||
| 3002 | if (next_off > stop) | ||
| 3003 | goto out; | ||
| 3004 | |||
| 2999 | /* | 3005 | /* |
| 3000 | * is this block fully invalidated? | 3006 | * is this block fully invalidated? |
| 3001 | */ | 3007 | */ |
| @@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) | |||
| 3014 | * The get_block cached value has been unconditionally invalidated, | 3020 | * The get_block cached value has been unconditionally invalidated, |
| 3015 | * so real IO is not possible anymore. | 3021 | * so real IO is not possible anymore. |
| 3016 | */ | 3022 | */ |
| 3017 | if (!offset && ret) { | 3023 | if (!partial_page && ret) { |
| 3018 | ret = try_to_release_page(page, 0); | 3024 | ret = try_to_release_page(page, 0); |
| 3019 | /* maybe should BUG_ON(!ret); - neilb */ | 3025 | /* maybe should BUG_ON(!ret); - neilb */ |
| 3020 | } | 3026 | } |
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 157e474ab303..3df5ce6c724d 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h | |||
| @@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations; | |||
| 2709 | extern const struct inode_operations reiserfs_symlink_inode_operations; | 2709 | extern const struct inode_operations reiserfs_symlink_inode_operations; |
| 2710 | extern const struct inode_operations reiserfs_special_inode_operations; | 2710 | extern const struct inode_operations reiserfs_special_inode_operations; |
| 2711 | extern const struct file_operations reiserfs_dir_operations; | 2711 | extern const struct file_operations reiserfs_dir_operations; |
| 2712 | int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); | 2712 | int reiserfs_readdir_inode(struct inode *, struct dir_context *); |
| 2713 | 2713 | ||
| 2714 | /* tail_conversion.c */ | 2714 | /* tail_conversion.c */ |
| 2715 | int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, | 2715 | int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, |
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 821bcf70e467..c69cdd749f09 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c | |||
| @@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) | |||
| 171 | * modifying extended attributes. This includes operations such as permissions | 171 | * modifying extended attributes. This includes operations such as permissions |
| 172 | * or ownership changes, object deletions, etc. */ | 172 | * or ownership changes, object deletions, etc. */ |
| 173 | struct reiserfs_dentry_buf { | 173 | struct reiserfs_dentry_buf { |
| 174 | struct dir_context ctx; | ||
| 174 | struct dentry *xadir; | 175 | struct dentry *xadir; |
| 175 | int count; | 176 | int count; |
| 176 | struct dentry *dentries[8]; | 177 | struct dentry *dentries[8]; |
| @@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode, | |||
| 223 | { | 224 | { |
| 224 | struct dentry *dir; | 225 | struct dentry *dir; |
| 225 | int i, err = 0; | 226 | int i, err = 0; |
| 226 | loff_t pos = 0; | ||
| 227 | struct reiserfs_dentry_buf buf = { | 227 | struct reiserfs_dentry_buf buf = { |
| 228 | .count = 0, | 228 | .ctx.actor = fill_with_dentries, |
| 229 | }; | 229 | }; |
| 230 | 230 | ||
| 231 | /* Skip out, an xattr has no xattrs associated with it */ | 231 | /* Skip out, an xattr has no xattrs associated with it */ |
| @@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode, | |||
| 249 | reiserfs_write_lock(inode->i_sb); | 249 | reiserfs_write_lock(inode->i_sb); |
| 250 | 250 | ||
| 251 | buf.xadir = dir; | 251 | buf.xadir = dir; |
| 252 | err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); | 252 | while (1) { |
| 253 | while ((err == 0 || err == -ENOSPC) && buf.count) { | 253 | err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); |
| 254 | err = 0; | 254 | if (err) |
| 255 | 255 | break; | |
| 256 | for (i = 0; i < buf.count && buf.dentries[i]; i++) { | 256 | if (!buf.count) |
| 257 | int lerr = 0; | 257 | break; |
| 258 | for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { | ||
| 258 | struct dentry *dentry = buf.dentries[i]; | 259 | struct dentry *dentry = buf.dentries[i]; |
| 259 | 260 | ||
| 260 | if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) | 261 | if (!S_ISDIR(dentry->d_inode->i_mode)) |
| 261 | lerr = action(dentry, data); | 262 | err = action(dentry, data); |
| 262 | 263 | ||
| 263 | dput(dentry); | 264 | dput(dentry); |
| 264 | buf.dentries[i] = NULL; | 265 | buf.dentries[i] = NULL; |
| 265 | err = lerr ?: err; | ||
| 266 | } | 266 | } |
| 267 | if (err) | ||
| 268 | break; | ||
| 267 | buf.count = 0; | 269 | buf.count = 0; |
| 268 | if (!err) | ||
| 269 | err = reiserfs_readdir_dentry(dir, &buf, | ||
| 270 | fill_with_dentries, &pos); | ||
| 271 | } | 270 | } |
| 272 | mutex_unlock(&dir->d_inode->i_mutex); | 271 | mutex_unlock(&dir->d_inode->i_mutex); |
| 273 | 272 | ||
| 274 | /* Clean up after a failed readdir */ | ||
| 275 | cleanup_dentry_buf(&buf); | 273 | cleanup_dentry_buf(&buf); |
| 276 | 274 | ||
| 277 | if (!err) { | 275 | if (!err) { |
| @@ -800,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) | |||
| 800 | } | 798 | } |
| 801 | 799 | ||
| 802 | struct listxattr_buf { | 800 | struct listxattr_buf { |
| 801 | struct dir_context ctx; | ||
| 803 | size_t size; | 802 | size_t size; |
| 804 | size_t pos; | 803 | size_t pos; |
| 805 | char *buf; | 804 | char *buf; |
| @@ -845,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) | |||
| 845 | { | 844 | { |
| 846 | struct dentry *dir; | 845 | struct dentry *dir; |
| 847 | int err = 0; | 846 | int err = 0; |
| 848 | loff_t pos = 0; | ||
| 849 | struct listxattr_buf buf = { | 847 | struct listxattr_buf buf = { |
| 848 | .ctx.actor = listxattr_filler, | ||
| 850 | .dentry = dentry, | 849 | .dentry = dentry, |
| 851 | .buf = buffer, | 850 | .buf = buffer, |
| 852 | .size = buffer ? size : 0, | 851 | .size = buffer ? size : 0, |
| @@ -868,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) | |||
| 868 | } | 867 | } |
| 869 | 868 | ||
| 870 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); | 869 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); |
| 871 | err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); | 870 | err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); |
| 872 | mutex_unlock(&dir->d_inode->i_mutex); | 871 | mutex_unlock(&dir->d_inode->i_mutex); |
| 873 | 872 | ||
| 874 | if (!err) | 873 | if (!err) |
diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 15cbc41ee365..ff1d3d42e72a 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c | |||
| @@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = { | |||
| 145 | /* | 145 | /* |
| 146 | * read the entries from a directory | 146 | * read the entries from a directory |
| 147 | */ | 147 | */ |
| 148 | static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 148 | static int romfs_readdir(struct file *file, struct dir_context *ctx) |
| 149 | { | 149 | { |
| 150 | struct inode *i = file_inode(filp); | 150 | struct inode *i = file_inode(file); |
| 151 | struct romfs_inode ri; | 151 | struct romfs_inode ri; |
| 152 | unsigned long offset, maxoff; | 152 | unsigned long offset, maxoff; |
| 153 | int j, ino, nextfh; | 153 | int j, ino, nextfh; |
| 154 | int stored = 0; | ||
| 155 | char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ | 154 | char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ |
| 156 | int ret; | 155 | int ret; |
| 157 | 156 | ||
| 158 | maxoff = romfs_maxsize(i->i_sb); | 157 | maxoff = romfs_maxsize(i->i_sb); |
| 159 | 158 | ||
| 160 | offset = filp->f_pos; | 159 | offset = ctx->pos; |
| 161 | if (!offset) { | 160 | if (!offset) { |
| 162 | offset = i->i_ino & ROMFH_MASK; | 161 | offset = i->i_ino & ROMFH_MASK; |
| 163 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); | 162 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); |
| @@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 170 | for (;;) { | 169 | for (;;) { |
| 171 | if (!offset || offset >= maxoff) { | 170 | if (!offset || offset >= maxoff) { |
| 172 | offset = maxoff; | 171 | offset = maxoff; |
| 173 | filp->f_pos = offset; | 172 | ctx->pos = offset; |
| 174 | goto out; | 173 | goto out; |
| 175 | } | 174 | } |
| 176 | filp->f_pos = offset; | 175 | ctx->pos = offset; |
| 177 | 176 | ||
| 178 | /* Fetch inode info */ | 177 | /* Fetch inode info */ |
| 179 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); | 178 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); |
| @@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 194 | nextfh = be32_to_cpu(ri.next); | 193 | nextfh = be32_to_cpu(ri.next); |
| 195 | if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) | 194 | if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) |
| 196 | ino = be32_to_cpu(ri.spec); | 195 | ino = be32_to_cpu(ri.spec); |
| 197 | if (filldir(dirent, fsname, j, offset, ino, | 196 | if (!dir_emit(ctx, fsname, j, ino, |
| 198 | romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) | 197 | romfs_dtype_table[nextfh & ROMFH_TYPE])) |
| 199 | goto out; | 198 | goto out; |
| 200 | 199 | ||
| 201 | stored++; | ||
| 202 | offset = nextfh & ROMFH_MASK; | 200 | offset = nextfh & ROMFH_MASK; |
| 203 | } | 201 | } |
| 204 | |||
| 205 | out: | 202 | out: |
| 206 | return stored; | 203 | return 0; |
| 207 | } | 204 | } |
| 208 | 205 | ||
| 209 | /* | 206 | /* |
| @@ -281,7 +278,7 @@ error: | |||
| 281 | 278 | ||
| 282 | static const struct file_operations romfs_dir_operations = { | 279 | static const struct file_operations romfs_dir_operations = { |
| 283 | .read = generic_read_dir, | 280 | .read = generic_read_dir, |
| 284 | .readdir = romfs_readdir, | 281 | .iterate = romfs_readdir, |
| 285 | .llseek = default_llseek, | 282 | .llseek = default_llseek, |
| 286 | }; | 283 | }; |
| 287 | 284 | ||
diff --git a/fs/splice.c b/fs/splice.c index e6b25598c8c4..d37431dd60a1 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -1274,7 +1274,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, | |||
| 1274 | { | 1274 | { |
| 1275 | struct file *file = sd->u.file; | 1275 | struct file *file = sd->u.file; |
| 1276 | 1276 | ||
| 1277 | return do_splice_from(pipe, file, &file->f_pos, sd->total_len, | 1277 | return do_splice_from(pipe, file, sd->opos, sd->total_len, |
| 1278 | sd->flags); | 1278 | sd->flags); |
| 1279 | } | 1279 | } |
| 1280 | 1280 | ||
| @@ -1283,6 +1283,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, | |||
| 1283 | * @in: file to splice from | 1283 | * @in: file to splice from |
| 1284 | * @ppos: input file offset | 1284 | * @ppos: input file offset |
| 1285 | * @out: file to splice to | 1285 | * @out: file to splice to |
| 1286 | * @opos: output file offset | ||
| 1286 | * @len: number of bytes to splice | 1287 | * @len: number of bytes to splice |
| 1287 | * @flags: splice modifier flags | 1288 | * @flags: splice modifier flags |
| 1288 | * | 1289 | * |
| @@ -1294,7 +1295,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, | |||
| 1294 | * | 1295 | * |
| 1295 | */ | 1296 | */ |
| 1296 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 1297 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
| 1297 | size_t len, unsigned int flags) | 1298 | loff_t *opos, size_t len, unsigned int flags) |
| 1298 | { | 1299 | { |
| 1299 | struct splice_desc sd = { | 1300 | struct splice_desc sd = { |
| 1300 | .len = len, | 1301 | .len = len, |
| @@ -1302,6 +1303,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | |||
| 1302 | .flags = flags, | 1303 | .flags = flags, |
| 1303 | .pos = *ppos, | 1304 | .pos = *ppos, |
| 1304 | .u.file = out, | 1305 | .u.file = out, |
| 1306 | .opos = opos, | ||
| 1305 | }; | 1307 | }; |
| 1306 | long ret; | 1308 | long ret; |
| 1307 | 1309 | ||
| @@ -1325,7 +1327,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1325 | { | 1327 | { |
| 1326 | struct pipe_inode_info *ipipe; | 1328 | struct pipe_inode_info *ipipe; |
| 1327 | struct pipe_inode_info *opipe; | 1329 | struct pipe_inode_info *opipe; |
| 1328 | loff_t offset, *off; | 1330 | loff_t offset; |
| 1329 | long ret; | 1331 | long ret; |
| 1330 | 1332 | ||
| 1331 | ipipe = get_pipe_info(in); | 1333 | ipipe = get_pipe_info(in); |
| @@ -1356,13 +1358,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1356 | return -EINVAL; | 1358 | return -EINVAL; |
| 1357 | if (copy_from_user(&offset, off_out, sizeof(loff_t))) | 1359 | if (copy_from_user(&offset, off_out, sizeof(loff_t))) |
| 1358 | return -EFAULT; | 1360 | return -EFAULT; |
| 1359 | off = &offset; | 1361 | } else { |
| 1360 | } else | 1362 | offset = out->f_pos; |
| 1361 | off = &out->f_pos; | 1363 | } |
| 1362 | 1364 | ||
| 1363 | ret = do_splice_from(ipipe, out, off, len, flags); | 1365 | ret = do_splice_from(ipipe, out, &offset, len, flags); |
| 1364 | 1366 | ||
| 1365 | if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) | 1367 | if (!off_out) |
| 1368 | out->f_pos = offset; | ||
| 1369 | else if (copy_to_user(off_out, &offset, sizeof(loff_t))) | ||
| 1366 | ret = -EFAULT; | 1370 | ret = -EFAULT; |
| 1367 | 1371 | ||
| 1368 | return ret; | 1372 | return ret; |
| @@ -1376,13 +1380,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1376 | return -EINVAL; | 1380 | return -EINVAL; |
| 1377 | if (copy_from_user(&offset, off_in, sizeof(loff_t))) | 1381 | if (copy_from_user(&offset, off_in, sizeof(loff_t))) |
| 1378 | return -EFAULT; | 1382 | return -EFAULT; |
| 1379 | off = &offset; | 1383 | } else { |
| 1380 | } else | 1384 | offset = in->f_pos; |
| 1381 | off = &in->f_pos; | 1385 | } |
| 1382 | 1386 | ||
| 1383 | ret = do_splice_to(in, off, opipe, len, flags); | 1387 | ret = do_splice_to(in, &offset, opipe, len, flags); |
| 1384 | 1388 | ||
| 1385 | if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) | 1389 | if (!off_in) |
| 1390 | in->f_pos = offset; | ||
| 1391 | else if (copy_to_user(off_in, &offset, sizeof(loff_t))) | ||
| 1386 | ret = -EFAULT; | 1392 | ret = -EFAULT; |
| 1387 | 1393 | ||
| 1388 | return ret; | 1394 | return ret; |
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 57dc70ebbb19..f7f527bf8c10 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c | |||
| @@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb, | |||
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | 102 | ||
| 103 | static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | 103 | static int squashfs_readdir(struct file *file, struct dir_context *ctx) |
| 104 | { | 104 | { |
| 105 | struct inode *inode = file_inode(file); | 105 | struct inode *inode = file_inode(file); |
| 106 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | 106 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; |
| @@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 127 | * It also means that the external f_pos is offset by 3 from the | 127 | * It also means that the external f_pos is offset by 3 from the |
| 128 | * on-disk directory f_pos. | 128 | * on-disk directory f_pos. |
| 129 | */ | 129 | */ |
| 130 | while (file->f_pos < 3) { | 130 | while (ctx->pos < 3) { |
| 131 | char *name; | 131 | char *name; |
| 132 | int i_ino; | 132 | int i_ino; |
| 133 | 133 | ||
| 134 | if (file->f_pos == 0) { | 134 | if (ctx->pos == 0) { |
| 135 | name = "."; | 135 | name = "."; |
| 136 | size = 1; | 136 | size = 1; |
| 137 | i_ino = inode->i_ino; | 137 | i_ino = inode->i_ino; |
| @@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 141 | i_ino = squashfs_i(inode)->parent; | 141 | i_ino = squashfs_i(inode)->parent; |
| 142 | } | 142 | } |
| 143 | 143 | ||
| 144 | TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", | 144 | if (!dir_emit(ctx, name, size, i_ino, |
| 145 | dirent, name, size, file->f_pos, i_ino, | 145 | squashfs_filetype_table[1])) |
| 146 | squashfs_filetype_table[1]); | ||
| 147 | |||
| 148 | if (filldir(dirent, name, size, file->f_pos, i_ino, | ||
| 149 | squashfs_filetype_table[1]) < 0) { | ||
| 150 | TRACE("Filldir returned less than 0\n"); | ||
| 151 | goto finish; | 146 | goto finish; |
| 152 | } | ||
| 153 | 147 | ||
| 154 | file->f_pos += size; | 148 | ctx->pos += size; |
| 155 | } | 149 | } |
| 156 | 150 | ||
| 157 | length = get_dir_index_using_offset(inode->i_sb, &block, &offset, | 151 | length = get_dir_index_using_offset(inode->i_sb, &block, &offset, |
| 158 | squashfs_i(inode)->dir_idx_start, | 152 | squashfs_i(inode)->dir_idx_start, |
| 159 | squashfs_i(inode)->dir_idx_offset, | 153 | squashfs_i(inode)->dir_idx_offset, |
| 160 | squashfs_i(inode)->dir_idx_cnt, | 154 | squashfs_i(inode)->dir_idx_cnt, |
| 161 | file->f_pos); | 155 | ctx->pos); |
| 162 | 156 | ||
| 163 | while (length < i_size_read(inode)) { | 157 | while (length < i_size_read(inode)) { |
| 164 | /* | 158 | /* |
| @@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 198 | 192 | ||
| 199 | length += sizeof(*dire) + size; | 193 | length += sizeof(*dire) + size; |
| 200 | 194 | ||
| 201 | if (file->f_pos >= length) | 195 | if (ctx->pos >= length) |
| 202 | continue; | 196 | continue; |
| 203 | 197 | ||
| 204 | dire->name[size] = '\0'; | 198 | dire->name[size] = '\0'; |
| @@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 206 | ((short) le16_to_cpu(dire->inode_number)); | 200 | ((short) le16_to_cpu(dire->inode_number)); |
| 207 | type = le16_to_cpu(dire->type); | 201 | type = le16_to_cpu(dire->type); |
| 208 | 202 | ||
| 209 | TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" | 203 | if (!dir_emit(ctx, dire->name, size, |
| 210 | "\n", dirent, dire->name, size, | ||
| 211 | file->f_pos, | ||
| 212 | le32_to_cpu(dirh.start_block), | ||
| 213 | le16_to_cpu(dire->offset), | ||
| 214 | inode_number, | ||
| 215 | squashfs_filetype_table[type]); | ||
| 216 | |||
| 217 | if (filldir(dirent, dire->name, size, file->f_pos, | ||
| 218 | inode_number, | 204 | inode_number, |
| 219 | squashfs_filetype_table[type]) < 0) { | 205 | squashfs_filetype_table[type])) |
| 220 | TRACE("Filldir returned less than 0\n"); | ||
| 221 | goto finish; | 206 | goto finish; |
| 222 | } | ||
| 223 | 207 | ||
| 224 | file->f_pos = length; | 208 | ctx->pos = length; |
| 225 | } | 209 | } |
| 226 | } | 210 | } |
| 227 | 211 | ||
| @@ -238,6 +222,6 @@ failed_read: | |||
| 238 | 222 | ||
| 239 | const struct file_operations squashfs_dir_ops = { | 223 | const struct file_operations squashfs_dir_ops = { |
| 240 | .read = generic_read_dir, | 224 | .read = generic_read_dir, |
| 241 | .readdir = squashfs_readdir, | 225 | .iterate = squashfs_readdir, |
| 242 | .llseek = default_llseek, | 226 | .llseek = default_llseek, |
| 243 | }; | 227 | }; |
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e8e0e71b29d5..e068e744dbdd 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c | |||
| @@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left, | |||
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | /** | 76 | /** |
| 77 | * sysfs_link_subling - link sysfs_dirent into sibling rbtree | 77 | * sysfs_link_sibling - link sysfs_dirent into sibling rbtree |
| 78 | * @sd: sysfs_dirent of interest | 78 | * @sd: sysfs_dirent of interest |
| 79 | * | 79 | * |
| 80 | * Link @sd into its sibling rbtree which starts from | 80 | * Link @sd into its sibling rbtree which starts from |
| @@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, | |||
| 998 | return pos; | 998 | return pos; |
| 999 | } | 999 | } |
| 1000 | 1000 | ||
| 1001 | static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) | 1001 | static int sysfs_readdir(struct file *file, struct dir_context *ctx) |
| 1002 | { | 1002 | { |
| 1003 | struct dentry *dentry = filp->f_path.dentry; | 1003 | struct dentry *dentry = file->f_path.dentry; |
| 1004 | struct sysfs_dirent * parent_sd = dentry->d_fsdata; | 1004 | struct sysfs_dirent * parent_sd = dentry->d_fsdata; |
| 1005 | struct sysfs_dirent *pos = filp->private_data; | 1005 | struct sysfs_dirent *pos = file->private_data; |
| 1006 | enum kobj_ns_type type; | 1006 | enum kobj_ns_type type; |
| 1007 | const void *ns; | 1007 | const void *ns; |
| 1008 | ino_t ino; | ||
| 1009 | loff_t off; | ||
| 1010 | 1008 | ||
| 1011 | type = sysfs_ns_type(parent_sd); | 1009 | type = sysfs_ns_type(parent_sd); |
| 1012 | ns = sysfs_info(dentry->d_sb)->ns[type]; | 1010 | ns = sysfs_info(dentry->d_sb)->ns[type]; |
| 1013 | 1011 | ||
| 1014 | if (filp->f_pos == 0) { | 1012 | if (!dir_emit_dots(file, ctx)) |
| 1015 | ino = parent_sd->s_ino; | 1013 | return 0; |
| 1016 | if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) | ||
| 1017 | filp->f_pos++; | ||
| 1018 | else | ||
| 1019 | return 0; | ||
| 1020 | } | ||
| 1021 | if (filp->f_pos == 1) { | ||
| 1022 | if (parent_sd->s_parent) | ||
| 1023 | ino = parent_sd->s_parent->s_ino; | ||
| 1024 | else | ||
| 1025 | ino = parent_sd->s_ino; | ||
| 1026 | if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) | ||
| 1027 | filp->f_pos++; | ||
| 1028 | else | ||
| 1029 | return 0; | ||
| 1030 | } | ||
| 1031 | mutex_lock(&sysfs_mutex); | 1014 | mutex_lock(&sysfs_mutex); |
| 1032 | off = filp->f_pos; | 1015 | for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); |
| 1033 | for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); | ||
| 1034 | pos; | 1016 | pos; |
| 1035 | pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { | 1017 | pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { |
| 1036 | const char * name; | 1018 | const char *name = pos->s_name; |
| 1037 | unsigned int type; | 1019 | unsigned int type = dt_type(pos); |
| 1038 | int len, ret; | 1020 | int len = strlen(name); |
| 1039 | 1021 | ino_t ino = pos->s_ino; | |
| 1040 | name = pos->s_name; | 1022 | ctx->pos = pos->s_hash; |
| 1041 | len = strlen(name); | 1023 | file->private_data = sysfs_get(pos); |
| 1042 | ino = pos->s_ino; | ||
| 1043 | type = dt_type(pos); | ||
| 1044 | off = filp->f_pos = pos->s_hash; | ||
| 1045 | filp->private_data = sysfs_get(pos); | ||
| 1046 | 1024 | ||
| 1047 | mutex_unlock(&sysfs_mutex); | 1025 | mutex_unlock(&sysfs_mutex); |
| 1048 | ret = filldir(dirent, name, len, off, ino, type); | 1026 | if (!dir_emit(ctx, name, len, ino, type)) |
| 1027 | return 0; | ||
| 1049 | mutex_lock(&sysfs_mutex); | 1028 | mutex_lock(&sysfs_mutex); |
| 1050 | if (ret < 0) | ||
| 1051 | break; | ||
| 1052 | } | 1029 | } |
| 1053 | mutex_unlock(&sysfs_mutex); | 1030 | mutex_unlock(&sysfs_mutex); |
| 1054 | 1031 | file->private_data = NULL; | |
| 1055 | /* don't reference last entry if its refcount is dropped */ | 1032 | ctx->pos = INT_MAX; |
| 1056 | if (!pos) { | ||
| 1057 | filp->private_data = NULL; | ||
| 1058 | |||
| 1059 | /* EOF and not changed as 0 or 1 in read/write path */ | ||
| 1060 | if (off == filp->f_pos && off > 1) | ||
| 1061 | filp->f_pos = INT_MAX; | ||
| 1062 | } | ||
| 1063 | return 0; | 1033 | return 0; |
| 1064 | } | 1034 | } |
| 1065 | 1035 | ||
| @@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) | |||
| 1077 | 1047 | ||
| 1078 | const struct file_operations sysfs_dir_operations = { | 1048 | const struct file_operations sysfs_dir_operations = { |
| 1079 | .read = generic_read_dir, | 1049 | .read = generic_read_dir, |
| 1080 | .readdir = sysfs_readdir, | 1050 | .iterate = sysfs_readdir, |
| 1081 | .release = sysfs_dir_release, | 1051 | .release = sysfs_dir_release, |
| 1082 | .llseek = sysfs_dir_llseek, | 1052 | .llseek = sysfs_dir_llseek, |
| 1083 | }; | 1053 | }; |
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 602f56db0442..d2bb7ed8fa74 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c | |||
| @@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) | |||
| 449 | 449 | ||
| 450 | spin_lock_irqsave(&sysfs_open_dirent_lock, flags); | 450 | spin_lock_irqsave(&sysfs_open_dirent_lock, flags); |
| 451 | 451 | ||
| 452 | od = sd->s_attr.open; | 452 | if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { |
| 453 | if (od) { | 453 | od = sd->s_attr.open; |
| 454 | atomic_inc(&od->event); | 454 | if (od) { |
| 455 | wake_up_interruptible(&od->poll); | 455 | atomic_inc(&od->event); |
| 456 | wake_up_interruptible(&od->poll); | ||
| 457 | } | ||
| 456 | } | 458 | } |
| 457 | 459 | ||
| 458 | spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); | 460 | spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); |
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 0ce3ccf7f401..3e2837a633ed 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c | |||
| @@ -24,8 +24,6 @@ | |||
| 24 | #include <linux/security.h> | 24 | #include <linux/security.h> |
| 25 | #include "sysfs.h" | 25 | #include "sysfs.h" |
| 26 | 26 | ||
| 27 | extern struct super_block * sysfs_sb; | ||
| 28 | |||
| 29 | static const struct address_space_operations sysfs_aops = { | 27 | static const struct address_space_operations sysfs_aops = { |
| 30 | .readpage = simple_readpage, | 28 | .readpage = simple_readpage, |
| 31 | .write_begin = simple_write_begin, | 29 | .write_begin = simple_write_begin, |
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 3799e8dac3eb..d42291d08215 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c | |||
| @@ -18,12 +18,12 @@ | |||
| 18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
| 19 | #include "sysv.h" | 19 | #include "sysv.h" |
| 20 | 20 | ||
| 21 | static int sysv_readdir(struct file *, void *, filldir_t); | 21 | static int sysv_readdir(struct file *, struct dir_context *); |
| 22 | 22 | ||
| 23 | const struct file_operations sysv_dir_operations = { | 23 | const struct file_operations sysv_dir_operations = { |
| 24 | .llseek = generic_file_llseek, | 24 | .llseek = generic_file_llseek, |
| 25 | .read = generic_read_dir, | 25 | .read = generic_read_dir, |
| 26 | .readdir = sysv_readdir, | 26 | .iterate = sysv_readdir, |
| 27 | .fsync = generic_file_fsync, | 27 | .fsync = generic_file_fsync, |
| 28 | }; | 28 | }; |
| 29 | 29 | ||
| @@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) | |||
| 65 | return page; | 65 | return page; |
| 66 | } | 66 | } |
| 67 | 67 | ||
| 68 | static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) | 68 | static int sysv_readdir(struct file *file, struct dir_context *ctx) |
| 69 | { | 69 | { |
| 70 | unsigned long pos = filp->f_pos; | 70 | unsigned long pos = ctx->pos; |
| 71 | struct inode *inode = file_inode(filp); | 71 | struct inode *inode = file_inode(file); |
| 72 | struct super_block *sb = inode->i_sb; | 72 | struct super_block *sb = inode->i_sb; |
| 73 | unsigned offset = pos & ~PAGE_CACHE_MASK; | ||
| 74 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | ||
| 75 | unsigned long npages = dir_pages(inode); | 73 | unsigned long npages = dir_pages(inode); |
| 74 | unsigned offset; | ||
| 75 | unsigned long n; | ||
| 76 | 76 | ||
| 77 | pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); | 77 | ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); |
| 78 | if (pos >= inode->i_size) | 78 | if (pos >= inode->i_size) |
| 79 | goto done; | 79 | return 0; |
| 80 | |||
| 81 | offset = pos & ~PAGE_CACHE_MASK; | ||
| 82 | n = pos >> PAGE_CACHE_SHIFT; | ||
| 80 | 83 | ||
| 81 | for ( ; n < npages; n++, offset = 0) { | 84 | for ( ; n < npages; n++, offset = 0) { |
| 82 | char *kaddr, *limit; | 85 | char *kaddr, *limit; |
| @@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) | |||
| 88 | kaddr = (char *)page_address(page); | 91 | kaddr = (char *)page_address(page); |
| 89 | de = (struct sysv_dir_entry *)(kaddr+offset); | 92 | de = (struct sysv_dir_entry *)(kaddr+offset); |
| 90 | limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; | 93 | limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; |
| 91 | for ( ;(char*)de <= limit; de++) { | 94 | for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { |
| 92 | char *name = de->name; | 95 | char *name = de->name; |
| 93 | int over; | ||
| 94 | 96 | ||
| 95 | if (!de->inode) | 97 | if (!de->inode) |
| 96 | continue; | 98 | continue; |
| 97 | 99 | ||
| 98 | offset = (char *)de - kaddr; | 100 | if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), |
| 99 | |||
| 100 | over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), | ||
| 101 | ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, | ||
| 102 | fs16_to_cpu(SYSV_SB(sb), de->inode), | 101 | fs16_to_cpu(SYSV_SB(sb), de->inode), |
| 103 | DT_UNKNOWN); | 102 | DT_UNKNOWN)) { |
| 104 | if (over) { | ||
| 105 | dir_put_page(page); | 103 | dir_put_page(page); |
| 106 | goto done; | 104 | return 0; |
| 107 | } | 105 | } |
| 108 | } | 106 | } |
| 109 | dir_put_page(page); | 107 | dir_put_page(page); |
| 110 | } | 108 | } |
| 111 | |||
| 112 | done: | ||
| 113 | filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; | ||
| 114 | return 0; | 109 | return 0; |
| 115 | } | 110 | } |
| 116 | 111 | ||
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index de08c92f2e23..6b4947f75af7 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c | |||
| @@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type) | |||
| 346 | * This means that UBIFS cannot support NFS which requires full | 346 | * This means that UBIFS cannot support NFS which requires full |
| 347 | * 'seekdir()'/'telldir()' support. | 347 | * 'seekdir()'/'telldir()' support. |
| 348 | */ | 348 | */ |
| 349 | static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) | 349 | static int ubifs_readdir(struct file *file, struct dir_context *ctx) |
| 350 | { | 350 | { |
| 351 | int err, over = 0; | 351 | int err; |
| 352 | struct qstr nm; | 352 | struct qstr nm; |
| 353 | union ubifs_key key; | 353 | union ubifs_key key; |
| 354 | struct ubifs_dent_node *dent; | 354 | struct ubifs_dent_node *dent; |
| 355 | struct inode *dir = file_inode(file); | 355 | struct inode *dir = file_inode(file); |
| 356 | struct ubifs_info *c = dir->i_sb->s_fs_info; | 356 | struct ubifs_info *c = dir->i_sb->s_fs_info; |
| 357 | 357 | ||
| 358 | dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); | 358 | dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); |
| 359 | 359 | ||
| 360 | if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) | 360 | if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) |
| 361 | /* | 361 | /* |
| 362 | * The directory was seek'ed to a senseless position or there | 362 | * The directory was seek'ed to a senseless position or there |
| 363 | * are no more entries. | 363 | * are no more entries. |
| 364 | */ | 364 | */ |
| 365 | return 0; | 365 | return 0; |
| 366 | 366 | ||
| 367 | /* File positions 0 and 1 correspond to "." and ".." */ | 367 | if (file->f_version == 0) { |
| 368 | if (file->f_pos == 0) { | 368 | /* |
| 369 | ubifs_assert(!file->private_data); | 369 | * The file was seek'ed, which means that @file->private_data |
| 370 | over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); | 370 | * is now invalid. This may also be just the first |
| 371 | if (over) | 371 | * 'ubifs_readdir()' invocation, in which case |
| 372 | return 0; | 372 | * @file->private_data is NULL, and the below code is |
| 373 | file->f_pos = 1; | 373 | * basically a no-op. |
| 374 | */ | ||
| 375 | kfree(file->private_data); | ||
| 376 | file->private_data = NULL; | ||
| 374 | } | 377 | } |
| 375 | 378 | ||
| 376 | if (file->f_pos == 1) { | 379 | /* |
| 380 | * 'generic_file_llseek()' unconditionally sets @file->f_version to | ||
| 381 | * zero, and we use this for detecting whether the file was seek'ed. | ||
| 382 | */ | ||
| 383 | file->f_version = 1; | ||
| 384 | |||
| 385 | /* File positions 0 and 1 correspond to "." and ".." */ | ||
| 386 | if (ctx->pos < 2) { | ||
| 377 | ubifs_assert(!file->private_data); | 387 | ubifs_assert(!file->private_data); |
| 378 | over = filldir(dirent, "..", 2, 1, | 388 | if (!dir_emit_dots(file, ctx)) |
| 379 | parent_ino(file->f_path.dentry), DT_DIR); | ||
| 380 | if (over) | ||
| 381 | return 0; | 389 | return 0; |
| 382 | 390 | ||
| 383 | /* Find the first entry in TNC and save it */ | 391 | /* Find the first entry in TNC and save it */ |
| @@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 389 | goto out; | 397 | goto out; |
| 390 | } | 398 | } |
| 391 | 399 | ||
| 392 | file->f_pos = key_hash_flash(c, &dent->key); | 400 | ctx->pos = key_hash_flash(c, &dent->key); |
| 393 | file->private_data = dent; | 401 | file->private_data = dent; |
| 394 | } | 402 | } |
| 395 | 403 | ||
| @@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 397 | if (!dent) { | 405 | if (!dent) { |
| 398 | /* | 406 | /* |
| 399 | * The directory was seek'ed to and is now readdir'ed. | 407 | * The directory was seek'ed to and is now readdir'ed. |
| 400 | * Find the entry corresponding to @file->f_pos or the | 408 | * Find the entry corresponding to @ctx->pos or the closest one. |
| 401 | * closest one. | ||
| 402 | */ | 409 | */ |
| 403 | dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); | 410 | dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); |
| 404 | nm.name = NULL; | 411 | nm.name = NULL; |
| 405 | dent = ubifs_tnc_next_ent(c, &key, &nm); | 412 | dent = ubifs_tnc_next_ent(c, &key, &nm); |
| 406 | if (IS_ERR(dent)) { | 413 | if (IS_ERR(dent)) { |
| 407 | err = PTR_ERR(dent); | 414 | err = PTR_ERR(dent); |
| 408 | goto out; | 415 | goto out; |
| 409 | } | 416 | } |
| 410 | file->f_pos = key_hash_flash(c, &dent->key); | 417 | ctx->pos = key_hash_flash(c, &dent->key); |
| 411 | file->private_data = dent; | 418 | file->private_data = dent; |
| 412 | } | 419 | } |
| 413 | 420 | ||
| @@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 419 | ubifs_inode(dir)->creat_sqnum); | 426 | ubifs_inode(dir)->creat_sqnum); |
| 420 | 427 | ||
| 421 | nm.len = le16_to_cpu(dent->nlen); | 428 | nm.len = le16_to_cpu(dent->nlen); |
| 422 | over = filldir(dirent, dent->name, nm.len, file->f_pos, | 429 | if (!dir_emit(ctx, dent->name, nm.len, |
| 423 | le64_to_cpu(dent->inum), | 430 | le64_to_cpu(dent->inum), |
| 424 | vfs_dent_type(dent->type)); | 431 | vfs_dent_type(dent->type))) |
| 425 | if (over) | ||
| 426 | return 0; | 432 | return 0; |
| 427 | 433 | ||
| 428 | /* Switch to the next entry */ | 434 | /* Switch to the next entry */ |
| @@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) | |||
| 435 | } | 441 | } |
| 436 | 442 | ||
| 437 | kfree(file->private_data); | 443 | kfree(file->private_data); |
| 438 | file->f_pos = key_hash_flash(c, &dent->key); | 444 | ctx->pos = key_hash_flash(c, &dent->key); |
| 439 | file->private_data = dent; | 445 | file->private_data = dent; |
| 440 | cond_resched(); | 446 | cond_resched(); |
| 441 | } | 447 | } |
| @@ -448,18 +454,11 @@ out: | |||
| 448 | 454 | ||
| 449 | kfree(file->private_data); | 455 | kfree(file->private_data); |
| 450 | file->private_data = NULL; | 456 | file->private_data = NULL; |
| 451 | file->f_pos = 2; | 457 | /* 2 is a special value indicating that there are no more direntries */ |
| 458 | ctx->pos = 2; | ||
| 452 | return 0; | 459 | return 0; |
| 453 | } | 460 | } |
| 454 | 461 | ||
| 455 | /* If a directory is seeked, we have to free saved readdir() state */ | ||
| 456 | static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) | ||
| 457 | { | ||
| 458 | kfree(file->private_data); | ||
| 459 | file->private_data = NULL; | ||
| 460 | return generic_file_llseek(file, offset, whence); | ||
| 461 | } | ||
| 462 | |||
| 463 | /* Free saved readdir() state when the directory is closed */ | 462 | /* Free saved readdir() state when the directory is closed */ |
| 464 | static int ubifs_dir_release(struct inode *dir, struct file *file) | 463 | static int ubifs_dir_release(struct inode *dir, struct file *file) |
| 465 | { | 464 | { |
| @@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = { | |||
| 1177 | }; | 1176 | }; |
| 1178 | 1177 | ||
| 1179 | const struct file_operations ubifs_dir_operations = { | 1178 | const struct file_operations ubifs_dir_operations = { |
| 1180 | .llseek = ubifs_dir_llseek, | 1179 | .llseek = generic_file_llseek, |
| 1181 | .release = ubifs_dir_release, | 1180 | .release = ubifs_dir_release, |
| 1182 | .read = generic_read_dir, | 1181 | .read = generic_read_dir, |
| 1183 | .readdir = ubifs_readdir, | 1182 | .iterate = ubifs_readdir, |
| 1184 | .fsync = ubifs_fsync, | 1183 | .fsync = ubifs_fsync, |
| 1185 | .unlocked_ioctl = ubifs_ioctl, | 1184 | .unlocked_ioctl = ubifs_ioctl, |
| 1186 | #ifdef CONFIG_COMPAT | 1185 | #ifdef CONFIG_COMPAT |
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 14374530784c..123c79b7261e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
| @@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1277 | return err; | 1277 | return err; |
| 1278 | } | 1278 | } |
| 1279 | 1279 | ||
| 1280 | static void ubifs_invalidatepage(struct page *page, unsigned long offset) | 1280 | static void ubifs_invalidatepage(struct page *page, unsigned int offset, |
| 1281 | unsigned int length) | ||
| 1281 | { | 1282 | { |
| 1282 | struct inode *inode = page->mapping->host; | 1283 | struct inode *inode = page->mapping->host; |
| 1283 | struct ubifs_info *c = inode->i_sb->s_fs_info; | 1284 | struct ubifs_info *c = inode->i_sb->s_fs_info; |
| 1284 | 1285 | ||
| 1285 | ubifs_assert(PagePrivate(page)); | 1286 | ubifs_assert(PagePrivate(page)); |
| 1286 | if (offset) | 1287 | if (offset || length < PAGE_CACHE_SIZE) |
| 1287 | /* Partial page remains dirty */ | 1288 | /* Partial page remains dirty */ |
| 1288 | return; | 1289 | return; |
| 1289 | 1290 | ||
diff --git a/fs/udf/dir.c b/fs/udf/dir.c index b3e93f5e17c3..a012c51caffd 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c | |||
| @@ -35,14 +35,16 @@ | |||
| 35 | #include "udf_i.h" | 35 | #include "udf_i.h" |
| 36 | #include "udf_sb.h" | 36 | #include "udf_sb.h" |
| 37 | 37 | ||
| 38 | static int do_udf_readdir(struct inode *dir, struct file *filp, | 38 | |
| 39 | filldir_t filldir, void *dirent) | 39 | static int udf_readdir(struct file *file, struct dir_context *ctx) |
| 40 | { | 40 | { |
| 41 | struct inode *dir = file_inode(file); | ||
| 42 | struct udf_inode_info *iinfo = UDF_I(dir); | ||
| 41 | struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; | 43 | struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; |
| 42 | struct fileIdentDesc *fi = NULL; | 44 | struct fileIdentDesc *fi = NULL; |
| 43 | struct fileIdentDesc cfi; | 45 | struct fileIdentDesc cfi; |
| 44 | int block, iblock; | 46 | int block, iblock; |
| 45 | loff_t nf_pos = (filp->f_pos - 1) << 2; | 47 | loff_t nf_pos; |
| 46 | int flen; | 48 | int flen; |
| 47 | unsigned char *fname = NULL; | 49 | unsigned char *fname = NULL; |
| 48 | unsigned char *nameptr; | 50 | unsigned char *nameptr; |
| @@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, | |||
| 54 | uint32_t elen; | 56 | uint32_t elen; |
| 55 | sector_t offset; | 57 | sector_t offset; |
| 56 | int i, num, ret = 0; | 58 | int i, num, ret = 0; |
| 57 | unsigned int dt_type; | ||
| 58 | struct extent_position epos = { NULL, 0, {0, 0} }; | 59 | struct extent_position epos = { NULL, 0, {0, 0} }; |
| 59 | struct udf_inode_info *iinfo; | ||
| 60 | 60 | ||
| 61 | if (ctx->pos == 0) { | ||
| 62 | if (!dir_emit_dot(file, ctx)) | ||
| 63 | return 0; | ||
| 64 | ctx->pos = 1; | ||
| 65 | } | ||
| 66 | nf_pos = (ctx->pos - 1) << 2; | ||
| 61 | if (nf_pos >= size) | 67 | if (nf_pos >= size) |
| 62 | goto out; | 68 | goto out; |
| 63 | 69 | ||
| @@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, | |||
| 71 | nf_pos = udf_ext0_offset(dir); | 77 | nf_pos = udf_ext0_offset(dir); |
| 72 | 78 | ||
| 73 | fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); | 79 | fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); |
| 74 | iinfo = UDF_I(dir); | ||
| 75 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { | 80 | if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { |
| 76 | if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, | 81 | if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, |
| 77 | &epos, &eloc, &elen, &offset) | 82 | &epos, &eloc, &elen, &offset) |
| @@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, | |||
| 116 | } | 121 | } |
| 117 | 122 | ||
| 118 | while (nf_pos < size) { | 123 | while (nf_pos < size) { |
| 119 | filp->f_pos = (nf_pos >> 2) + 1; | 124 | struct kernel_lb_addr tloc; |
| 125 | |||
| 126 | ctx->pos = (nf_pos >> 2) + 1; | ||
| 120 | 127 | ||
| 121 | fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, | 128 | fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, |
| 122 | &elen, &offset); | 129 | &elen, &offset); |
| @@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, | |||
| 155 | } | 162 | } |
| 156 | 163 | ||
| 157 | if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { | 164 | if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { |
| 158 | iblock = parent_ino(filp->f_path.dentry); | 165 | if (!dir_emit_dotdot(file, ctx)) |
| 159 | flen = 2; | 166 | goto out; |
| 160 | memcpy(fname, "..", flen); | 167 | continue; |
| 161 | dt_type = DT_DIR; | ||
| 162 | } else { | ||
| 163 | struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); | ||
| 164 | |||
| 165 | iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); | ||
| 166 | flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); | ||
| 167 | dt_type = DT_UNKNOWN; | ||
| 168 | } | 168 | } |
| 169 | 169 | ||
| 170 | if (flen && filldir(dirent, fname, flen, filp->f_pos, | 170 | flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); |
| 171 | iblock, dt_type) < 0) | 171 | if (!flen) |
| 172 | continue; | ||
| 173 | |||
| 174 | tloc = lelb_to_cpu(cfi.icb.extLocation); | ||
| 175 | iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); | ||
| 176 | if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) | ||
| 172 | goto out; | 177 | goto out; |
| 173 | } /* end while */ | 178 | } /* end while */ |
| 174 | 179 | ||
| 175 | filp->f_pos = (nf_pos >> 2) + 1; | 180 | ctx->pos = (nf_pos >> 2) + 1; |
| 176 | 181 | ||
| 177 | out: | 182 | out: |
| 178 | if (fibh.sbh != fibh.ebh) | 183 | if (fibh.sbh != fibh.ebh) |
| @@ -184,27 +189,11 @@ out: | |||
| 184 | return ret; | 189 | return ret; |
| 185 | } | 190 | } |
| 186 | 191 | ||
| 187 | static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) | ||
| 188 | { | ||
| 189 | struct inode *dir = file_inode(filp); | ||
| 190 | int result; | ||
| 191 | |||
| 192 | if (filp->f_pos == 0) { | ||
| 193 | if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { | ||
| 194 | return 0; | ||
| 195 | } | ||
| 196 | filp->f_pos++; | ||
| 197 | } | ||
| 198 | |||
| 199 | result = do_udf_readdir(dir, filp, filldir, dirent); | ||
| 200 | return result; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* readdir and lookup functions */ | 192 | /* readdir and lookup functions */ |
| 204 | const struct file_operations udf_dir_operations = { | 193 | const struct file_operations udf_dir_operations = { |
| 205 | .llseek = generic_file_llseek, | 194 | .llseek = generic_file_llseek, |
| 206 | .read = generic_read_dir, | 195 | .read = generic_read_dir, |
| 207 | .readdir = udf_readdir, | 196 | .iterate = udf_readdir, |
| 208 | .unlocked_ioctl = udf_ioctl, | 197 | .unlocked_ioctl = udf_ioctl, |
| 209 | .fsync = generic_file_fsync, | 198 | .fsync = generic_file_fsync, |
| 210 | }; | 199 | }; |
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 3a75ca09c506..0ecc2cebed8f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c | |||
| @@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base, | |||
| 430 | * This is blatantly stolen from ext2fs | 430 | * This is blatantly stolen from ext2fs |
| 431 | */ | 431 | */ |
| 432 | static int | 432 | static int |
| 433 | ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) | 433 | ufs_readdir(struct file *file, struct dir_context *ctx) |
| 434 | { | 434 | { |
| 435 | loff_t pos = filp->f_pos; | 435 | loff_t pos = ctx->pos; |
| 436 | struct inode *inode = file_inode(filp); | 436 | struct inode *inode = file_inode(file); |
| 437 | struct super_block *sb = inode->i_sb; | 437 | struct super_block *sb = inode->i_sb; |
| 438 | unsigned int offset = pos & ~PAGE_CACHE_MASK; | 438 | unsigned int offset = pos & ~PAGE_CACHE_MASK; |
| 439 | unsigned long n = pos >> PAGE_CACHE_SHIFT; | 439 | unsigned long n = pos >> PAGE_CACHE_SHIFT; |
| 440 | unsigned long npages = ufs_dir_pages(inode); | 440 | unsigned long npages = ufs_dir_pages(inode); |
| 441 | unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); | 441 | unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); |
| 442 | int need_revalidate = filp->f_version != inode->i_version; | 442 | int need_revalidate = file->f_version != inode->i_version; |
| 443 | unsigned flags = UFS_SB(sb)->s_flags; | 443 | unsigned flags = UFS_SB(sb)->s_flags; |
| 444 | 444 | ||
| 445 | UFSD("BEGIN\n"); | 445 | UFSD("BEGIN\n"); |
| @@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 457 | ufs_error(sb, __func__, | 457 | ufs_error(sb, __func__, |
| 458 | "bad page in #%lu", | 458 | "bad page in #%lu", |
| 459 | inode->i_ino); | 459 | inode->i_ino); |
| 460 | filp->f_pos += PAGE_CACHE_SIZE - offset; | 460 | ctx->pos += PAGE_CACHE_SIZE - offset; |
| 461 | return -EIO; | 461 | return -EIO; |
| 462 | } | 462 | } |
| 463 | kaddr = page_address(page); | 463 | kaddr = page_address(page); |
| 464 | if (unlikely(need_revalidate)) { | 464 | if (unlikely(need_revalidate)) { |
| 465 | if (offset) { | 465 | if (offset) { |
| 466 | offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); | 466 | offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); |
| 467 | filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; | 467 | ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; |
| 468 | } | 468 | } |
| 469 | filp->f_version = inode->i_version; | 469 | file->f_version = inode->i_version; |
| 470 | need_revalidate = 0; | 470 | need_revalidate = 0; |
| 471 | } | 471 | } |
| 472 | de = (struct ufs_dir_entry *)(kaddr+offset); | 472 | de = (struct ufs_dir_entry *)(kaddr+offset); |
| @@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 479 | return -EIO; | 479 | return -EIO; |
| 480 | } | 480 | } |
| 481 | if (de->d_ino) { | 481 | if (de->d_ino) { |
| 482 | int over; | ||
| 483 | unsigned char d_type = DT_UNKNOWN; | 482 | unsigned char d_type = DT_UNKNOWN; |
| 484 | 483 | ||
| 485 | offset = (char *)de - kaddr; | ||
| 486 | |||
| 487 | UFSD("filldir(%s,%u)\n", de->d_name, | 484 | UFSD("filldir(%s,%u)\n", de->d_name, |
| 488 | fs32_to_cpu(sb, de->d_ino)); | 485 | fs32_to_cpu(sb, de->d_ino)); |
| 489 | UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); | 486 | UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); |
| @@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 491 | if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) | 488 | if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) |
| 492 | d_type = de->d_u.d_44.d_type; | 489 | d_type = de->d_u.d_44.d_type; |
| 493 | 490 | ||
| 494 | over = filldir(dirent, de->d_name, | 491 | if (!dir_emit(ctx, de->d_name, |
| 495 | ufs_get_de_namlen(sb, de), | 492 | ufs_get_de_namlen(sb, de), |
| 496 | (n<<PAGE_CACHE_SHIFT) | offset, | 493 | fs32_to_cpu(sb, de->d_ino), |
| 497 | fs32_to_cpu(sb, de->d_ino), d_type); | 494 | d_type)) { |
| 498 | if (over) { | ||
| 499 | ufs_put_page(page); | 495 | ufs_put_page(page); |
| 500 | return 0; | 496 | return 0; |
| 501 | } | 497 | } |
| 502 | } | 498 | } |
| 503 | filp->f_pos += fs16_to_cpu(sb, de->d_reclen); | 499 | ctx->pos += fs16_to_cpu(sb, de->d_reclen); |
| 504 | } | 500 | } |
| 505 | ufs_put_page(page); | 501 | ufs_put_page(page); |
| 506 | } | 502 | } |
| @@ -660,7 +656,7 @@ not_empty: | |||
| 660 | 656 | ||
| 661 | const struct file_operations ufs_dir_operations = { | 657 | const struct file_operations ufs_dir_operations = { |
| 662 | .read = generic_read_dir, | 658 | .read = generic_read_dir, |
| 663 | .readdir = ufs_readdir, | 659 | .iterate = ufs_readdir, |
| 664 | .fsync = generic_file_fsync, | 660 | .fsync = generic_file_fsync, |
| 665 | .llseek = generic_file_llseek, | 661 | .llseek = generic_file_llseek, |
| 666 | }; | 662 | }; |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 41a695048be7..596ec71da00e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
| @@ -843,10 +843,12 @@ xfs_cluster_write( | |||
| 843 | STATIC void | 843 | STATIC void |
| 844 | xfs_vm_invalidatepage( | 844 | xfs_vm_invalidatepage( |
| 845 | struct page *page, | 845 | struct page *page, |
| 846 | unsigned long offset) | 846 | unsigned int offset, |
| 847 | unsigned int length) | ||
| 847 | { | 848 | { |
| 848 | trace_xfs_invalidatepage(page->mapping->host, page, offset); | 849 | trace_xfs_invalidatepage(page->mapping->host, page, offset, |
| 849 | block_invalidatepage(page, offset); | 850 | length); |
| 851 | block_invalidatepage(page, offset, length); | ||
| 850 | } | 852 | } |
| 851 | 853 | ||
| 852 | /* | 854 | /* |
| @@ -910,7 +912,7 @@ next_buffer: | |||
| 910 | 912 | ||
| 911 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 913 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 912 | out_invalidate: | 914 | out_invalidate: |
| 913 | xfs_vm_invalidatepage(page, 0); | 915 | xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 914 | return; | 916 | return; |
| 915 | } | 917 | } |
| 916 | 918 | ||
| @@ -940,7 +942,7 @@ xfs_vm_writepage( | |||
| 940 | int count = 0; | 942 | int count = 0; |
| 941 | int nonblocking = 0; | 943 | int nonblocking = 0; |
| 942 | 944 | ||
| 943 | trace_xfs_writepage(inode, page, 0); | 945 | trace_xfs_writepage(inode, page, 0, 0); |
| 944 | 946 | ||
| 945 | ASSERT(page_has_buffers(page)); | 947 | ASSERT(page_has_buffers(page)); |
| 946 | 948 | ||
| @@ -1171,7 +1173,7 @@ xfs_vm_releasepage( | |||
| 1171 | { | 1173 | { |
| 1172 | int delalloc, unwritten; | 1174 | int delalloc, unwritten; |
| 1173 | 1175 | ||
| 1174 | trace_xfs_releasepage(page->mapping->host, page, 0); | 1176 | trace_xfs_releasepage(page->mapping->host, page, 0, 0); |
| 1175 | 1177 | ||
| 1176 | xfs_count_page_state(page, &delalloc, &unwritten); | 1178 | xfs_count_page_state(page, &delalloc, &unwritten); |
| 1177 | 1179 | ||
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index b26a50f9921d..8f023dee404d 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
| @@ -368,10 +368,8 @@ xfs_dir_removename( | |||
| 368 | int | 368 | int |
| 369 | xfs_readdir( | 369 | xfs_readdir( |
| 370 | xfs_inode_t *dp, | 370 | xfs_inode_t *dp, |
| 371 | void *dirent, | 371 | struct dir_context *ctx, |
| 372 | size_t bufsize, | 372 | size_t bufsize) |
| 373 | xfs_off_t *offset, | ||
| 374 | filldir_t filldir) | ||
| 375 | { | 373 | { |
| 376 | int rval; /* return value */ | 374 | int rval; /* return value */ |
| 377 | int v; /* type-checking value */ | 375 | int v; /* type-checking value */ |
| @@ -385,14 +383,13 @@ xfs_readdir( | |||
| 385 | XFS_STATS_INC(xs_dir_getdents); | 383 | XFS_STATS_INC(xs_dir_getdents); |
| 386 | 384 | ||
| 387 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 385 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 388 | rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); | 386 | rval = xfs_dir2_sf_getdents(dp, ctx); |
| 389 | else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) | 387 | else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) |
| 390 | ; | 388 | ; |
| 391 | else if (v) | 389 | else if (v) |
| 392 | rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); | 390 | rval = xfs_dir2_block_getdents(dp, ctx); |
| 393 | else | 391 | else |
| 394 | rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, | 392 | rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); |
| 395 | filldir); | ||
| 396 | return rval; | 393 | return rval; |
| 397 | } | 394 | } |
| 398 | 395 | ||
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e59f5fc816fe..09aea0247d96 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c | |||
| @@ -569,9 +569,7 @@ xfs_dir2_block_addname( | |||
| 569 | int /* error */ | 569 | int /* error */ |
| 570 | xfs_dir2_block_getdents( | 570 | xfs_dir2_block_getdents( |
| 571 | xfs_inode_t *dp, /* incore inode */ | 571 | xfs_inode_t *dp, /* incore inode */ |
| 572 | void *dirent, | 572 | struct dir_context *ctx) |
| 573 | xfs_off_t *offset, | ||
| 574 | filldir_t filldir) | ||
| 575 | { | 573 | { |
| 576 | xfs_dir2_data_hdr_t *hdr; /* block header */ | 574 | xfs_dir2_data_hdr_t *hdr; /* block header */ |
| 577 | struct xfs_buf *bp; /* buffer for block */ | 575 | struct xfs_buf *bp; /* buffer for block */ |
| @@ -589,7 +587,7 @@ xfs_dir2_block_getdents( | |||
| 589 | /* | 587 | /* |
| 590 | * If the block number in the offset is out of range, we're done. | 588 | * If the block number in the offset is out of range, we're done. |
| 591 | */ | 589 | */ |
| 592 | if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) | 590 | if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) |
| 593 | return 0; | 591 | return 0; |
| 594 | 592 | ||
| 595 | error = xfs_dir3_block_read(NULL, dp, &bp); | 593 | error = xfs_dir3_block_read(NULL, dp, &bp); |
| @@ -600,7 +598,7 @@ xfs_dir2_block_getdents( | |||
| 600 | * Extract the byte offset we start at from the seek pointer. | 598 | * Extract the byte offset we start at from the seek pointer. |
| 601 | * We'll skip entries before this. | 599 | * We'll skip entries before this. |
| 602 | */ | 600 | */ |
| 603 | wantoff = xfs_dir2_dataptr_to_off(mp, *offset); | 601 | wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); |
| 604 | hdr = bp->b_addr; | 602 | hdr = bp->b_addr; |
| 605 | xfs_dir3_data_check(dp, bp); | 603 | xfs_dir3_data_check(dp, bp); |
| 606 | /* | 604 | /* |
| @@ -639,13 +637,12 @@ xfs_dir2_block_getdents( | |||
| 639 | cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, | 637 | cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, |
| 640 | (char *)dep - (char *)hdr); | 638 | (char *)dep - (char *)hdr); |
| 641 | 639 | ||
| 640 | ctx->pos = cook & 0x7fffffff; | ||
| 642 | /* | 641 | /* |
| 643 | * If it didn't fit, set the final offset to here & return. | 642 | * If it didn't fit, set the final offset to here & return. |
| 644 | */ | 643 | */ |
| 645 | if (filldir(dirent, (char *)dep->name, dep->namelen, | 644 | if (!dir_emit(ctx, (char *)dep->name, dep->namelen, |
| 646 | cook & 0x7fffffff, be64_to_cpu(dep->inumber), | 645 | be64_to_cpu(dep->inumber), DT_UNKNOWN)) { |
| 647 | DT_UNKNOWN)) { | ||
| 648 | *offset = cook & 0x7fffffff; | ||
| 649 | xfs_trans_brelse(NULL, bp); | 646 | xfs_trans_brelse(NULL, bp); |
| 650 | return 0; | 647 | return 0; |
| 651 | } | 648 | } |
| @@ -655,7 +652,7 @@ xfs_dir2_block_getdents( | |||
| 655 | * Reached the end of the block. | 652 | * Reached the end of the block. |
| 656 | * Set the offset to a non-existent block 1 and return. | 653 | * Set the offset to a non-existent block 1 and return. |
| 657 | */ | 654 | */ |
| 658 | *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & | 655 | ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & |
| 659 | 0x7fffffff; | 656 | 0x7fffffff; |
| 660 | xfs_trans_brelse(NULL, bp); | 657 | xfs_trans_brelse(NULL, bp); |
| 661 | return 0; | 658 | return 0; |
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index da71a1819d78..e0cc1243a8aa 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
| @@ -1300,10 +1300,8 @@ out: | |||
| 1300 | int /* error */ | 1300 | int /* error */ |
| 1301 | xfs_dir2_leaf_getdents( | 1301 | xfs_dir2_leaf_getdents( |
| 1302 | xfs_inode_t *dp, /* incore directory inode */ | 1302 | xfs_inode_t *dp, /* incore directory inode */ |
| 1303 | void *dirent, | 1303 | struct dir_context *ctx, |
| 1304 | size_t bufsize, | 1304 | size_t bufsize) |
| 1305 | xfs_off_t *offset, | ||
| 1306 | filldir_t filldir) | ||
| 1307 | { | 1305 | { |
| 1308 | struct xfs_buf *bp = NULL; /* data block buffer */ | 1306 | struct xfs_buf *bp = NULL; /* data block buffer */ |
| 1309 | xfs_dir2_data_hdr_t *hdr; /* data block header */ | 1307 | xfs_dir2_data_hdr_t *hdr; /* data block header */ |
| @@ -1322,7 +1320,7 @@ xfs_dir2_leaf_getdents( | |||
| 1322 | * If the offset is at or past the largest allowed value, | 1320 | * If the offset is at or past the largest allowed value, |
| 1323 | * give up right away. | 1321 | * give up right away. |
| 1324 | */ | 1322 | */ |
| 1325 | if (*offset >= XFS_DIR2_MAX_DATAPTR) | 1323 | if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) |
| 1326 | return 0; | 1324 | return 0; |
| 1327 | 1325 | ||
| 1328 | mp = dp->i_mount; | 1326 | mp = dp->i_mount; |
| @@ -1343,7 +1341,7 @@ xfs_dir2_leaf_getdents( | |||
| 1343 | * Inside the loop we keep the main offset value as a byte offset | 1341 | * Inside the loop we keep the main offset value as a byte offset |
| 1344 | * in the directory file. | 1342 | * in the directory file. |
| 1345 | */ | 1343 | */ |
| 1346 | curoff = xfs_dir2_dataptr_to_byte(mp, *offset); | 1344 | curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); |
| 1347 | 1345 | ||
| 1348 | /* | 1346 | /* |
| 1349 | * Force this conversion through db so we truncate the offset | 1347 | * Force this conversion through db so we truncate the offset |
| @@ -1444,8 +1442,8 @@ xfs_dir2_leaf_getdents( | |||
| 1444 | dep = (xfs_dir2_data_entry_t *)ptr; | 1442 | dep = (xfs_dir2_data_entry_t *)ptr; |
| 1445 | length = xfs_dir2_data_entsize(dep->namelen); | 1443 | length = xfs_dir2_data_entsize(dep->namelen); |
| 1446 | 1444 | ||
| 1447 | if (filldir(dirent, (char *)dep->name, dep->namelen, | 1445 | ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; |
| 1448 | xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, | 1446 | if (!dir_emit(ctx, (char *)dep->name, dep->namelen, |
| 1449 | be64_to_cpu(dep->inumber), DT_UNKNOWN)) | 1447 | be64_to_cpu(dep->inumber), DT_UNKNOWN)) |
| 1450 | break; | 1448 | break; |
| 1451 | 1449 | ||
| @@ -1462,9 +1460,9 @@ xfs_dir2_leaf_getdents( | |||
| 1462 | * All done. Set output offset value to current offset. | 1460 | * All done. Set output offset value to current offset. |
| 1463 | */ | 1461 | */ |
| 1464 | if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) | 1462 | if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) |
| 1465 | *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; | 1463 | ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; |
| 1466 | else | 1464 | else |
| 1467 | *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; | 1465 | ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; |
| 1468 | kmem_free(map_info); | 1466 | kmem_free(map_info); |
| 1469 | if (bp) | 1467 | if (bp) |
| 1470 | xfs_trans_brelse(NULL, bp); | 1468 | xfs_trans_brelse(NULL, bp); |
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7cf573c88aad..0511cda4a712 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h | |||
| @@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, | |||
| 33 | extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; | 33 | extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; |
| 34 | 34 | ||
| 35 | extern int xfs_dir2_block_addname(struct xfs_da_args *args); | 35 | extern int xfs_dir2_block_addname(struct xfs_da_args *args); |
| 36 | extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, | 36 | extern int xfs_dir2_block_getdents(struct xfs_inode *dp, |
| 37 | xfs_off_t *offset, filldir_t filldir); | 37 | struct dir_context *ctx); |
| 38 | extern int xfs_dir2_block_lookup(struct xfs_da_args *args); | 38 | extern int xfs_dir2_block_lookup(struct xfs_da_args *args); |
| 39 | extern int xfs_dir2_block_removename(struct xfs_da_args *args); | 39 | extern int xfs_dir2_block_removename(struct xfs_da_args *args); |
| 40 | extern int xfs_dir2_block_replace(struct xfs_da_args *args); | 40 | extern int xfs_dir2_block_replace(struct xfs_da_args *args); |
| @@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, | |||
| 91 | extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, | 91 | extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, |
| 92 | struct xfs_dir2_leaf_entry *ents, int *indexp, | 92 | struct xfs_dir2_leaf_entry *ents, int *indexp, |
| 93 | int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); | 93 | int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); |
| 94 | extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, | 94 | extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx, |
| 95 | size_t bufsize, xfs_off_t *offset, filldir_t filldir); | 95 | size_t bufsize); |
| 96 | extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, | 96 | extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, |
| 97 | struct xfs_buf **bpp, __uint16_t magic); | 97 | struct xfs_buf **bpp, __uint16_t magic); |
| 98 | extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, | 98 | extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, |
| @@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, | |||
| 153 | int size, xfs_dir2_sf_hdr_t *sfhp); | 153 | int size, xfs_dir2_sf_hdr_t *sfhp); |
| 154 | extern int xfs_dir2_sf_addname(struct xfs_da_args *args); | 154 | extern int xfs_dir2_sf_addname(struct xfs_da_args *args); |
| 155 | extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); | 155 | extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); |
| 156 | extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, | 156 | extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx); |
| 157 | xfs_off_t *offset, filldir_t filldir); | ||
| 158 | extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); | 157 | extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); |
| 159 | extern int xfs_dir2_sf_removename(struct xfs_da_args *args); | 158 | extern int xfs_dir2_sf_removename(struct xfs_da_args *args); |
| 160 | extern int xfs_dir2_sf_replace(struct xfs_da_args *args); | 159 | extern int xfs_dir2_sf_replace(struct xfs_da_args *args); |
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 6157424dbf8f..97676a347da1 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c | |||
| @@ -768,9 +768,7 @@ xfs_dir2_sf_create( | |||
| 768 | int /* error */ | 768 | int /* error */ |
| 769 | xfs_dir2_sf_getdents( | 769 | xfs_dir2_sf_getdents( |
| 770 | xfs_inode_t *dp, /* incore directory inode */ | 770 | xfs_inode_t *dp, /* incore directory inode */ |
| 771 | void *dirent, | 771 | struct dir_context *ctx) |
| 772 | xfs_off_t *offset, | ||
| 773 | filldir_t filldir) | ||
| 774 | { | 772 | { |
| 775 | int i; /* shortform entry number */ | 773 | int i; /* shortform entry number */ |
| 776 | xfs_mount_t *mp; /* filesystem mount point */ | 774 | xfs_mount_t *mp; /* filesystem mount point */ |
| @@ -802,7 +800,7 @@ xfs_dir2_sf_getdents( | |||
| 802 | /* | 800 | /* |
| 803 | * If the block number in the offset is out of range, we're done. | 801 | * If the block number in the offset is out of range, we're done. |
| 804 | */ | 802 | */ |
| 805 | if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) | 803 | if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) |
| 806 | return 0; | 804 | return 0; |
| 807 | 805 | ||
| 808 | /* | 806 | /* |
| @@ -819,22 +817,20 @@ xfs_dir2_sf_getdents( | |||
| 819 | /* | 817 | /* |
| 820 | * Put . entry unless we're starting past it. | 818 | * Put . entry unless we're starting past it. |
| 821 | */ | 819 | */ |
| 822 | if (*offset <= dot_offset) { | 820 | if (ctx->pos <= dot_offset) { |
| 823 | if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { | 821 | ctx->pos = dot_offset & 0x7fffffff; |
| 824 | *offset = dot_offset & 0x7fffffff; | 822 | if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) |
| 825 | return 0; | 823 | return 0; |
| 826 | } | ||
| 827 | } | 824 | } |
| 828 | 825 | ||
| 829 | /* | 826 | /* |
| 830 | * Put .. entry unless we're starting past it. | 827 | * Put .. entry unless we're starting past it. |
| 831 | */ | 828 | */ |
| 832 | if (*offset <= dotdot_offset) { | 829 | if (ctx->pos <= dotdot_offset) { |
| 833 | ino = xfs_dir2_sf_get_parent_ino(sfp); | 830 | ino = xfs_dir2_sf_get_parent_ino(sfp); |
| 834 | if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { | 831 | ctx->pos = dotdot_offset & 0x7fffffff; |
| 835 | *offset = dotdot_offset & 0x7fffffff; | 832 | if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) |
| 836 | return 0; | 833 | return 0; |
| 837 | } | ||
| 838 | } | 834 | } |
| 839 | 835 | ||
| 840 | /* | 836 | /* |
| @@ -845,21 +841,20 @@ xfs_dir2_sf_getdents( | |||
| 845 | off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, | 841 | off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, |
| 846 | xfs_dir2_sf_get_offset(sfep)); | 842 | xfs_dir2_sf_get_offset(sfep)); |
| 847 | 843 | ||
| 848 | if (*offset > off) { | 844 | if (ctx->pos > off) { |
| 849 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); | 845 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); |
| 850 | continue; | 846 | continue; |
| 851 | } | 847 | } |
| 852 | 848 | ||
| 853 | ino = xfs_dir2_sfe_get_ino(sfp, sfep); | 849 | ino = xfs_dir2_sfe_get_ino(sfp, sfep); |
| 854 | if (filldir(dirent, (char *)sfep->name, sfep->namelen, | 850 | ctx->pos = off & 0x7fffffff; |
| 855 | off & 0x7fffffff, ino, DT_UNKNOWN)) { | 851 | if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, |
| 856 | *offset = off & 0x7fffffff; | 852 | ino, DT_UNKNOWN)) |
| 857 | return 0; | 853 | return 0; |
| 858 | } | ||
| 859 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); | 854 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); |
| 860 | } | 855 | } |
| 861 | 856 | ||
| 862 | *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & | 857 | ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & |
| 863 | 0x7fffffff; | 858 | 0x7fffffff; |
| 864 | return 0; | 859 | return 0; |
| 865 | } | 860 | } |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a5f2042aec8b..0ad2b95fca12 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
| @@ -906,11 +906,10 @@ xfs_file_release( | |||
| 906 | 906 | ||
| 907 | STATIC int | 907 | STATIC int |
| 908 | xfs_file_readdir( | 908 | xfs_file_readdir( |
| 909 | struct file *filp, | 909 | struct file *file, |
| 910 | void *dirent, | 910 | struct dir_context *ctx) |
| 911 | filldir_t filldir) | ||
| 912 | { | 911 | { |
| 913 | struct inode *inode = file_inode(filp); | 912 | struct inode *inode = file_inode(file); |
| 914 | xfs_inode_t *ip = XFS_I(inode); | 913 | xfs_inode_t *ip = XFS_I(inode); |
| 915 | int error; | 914 | int error; |
| 916 | size_t bufsize; | 915 | size_t bufsize; |
| @@ -929,8 +928,7 @@ xfs_file_readdir( | |||
| 929 | */ | 928 | */ |
| 930 | bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); | 929 | bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); |
| 931 | 930 | ||
| 932 | error = xfs_readdir(ip, dirent, bufsize, | 931 | error = xfs_readdir(ip, ctx, bufsize); |
| 933 | (xfs_off_t *)&filp->f_pos, filldir); | ||
| 934 | if (error) | 932 | if (error) |
| 935 | return -error; | 933 | return -error; |
| 936 | return 0; | 934 | return 0; |
| @@ -1432,7 +1430,7 @@ const struct file_operations xfs_file_operations = { | |||
| 1432 | const struct file_operations xfs_dir_file_operations = { | 1430 | const struct file_operations xfs_dir_file_operations = { |
| 1433 | .open = xfs_dir_open, | 1431 | .open = xfs_dir_open, |
| 1434 | .read = generic_read_dir, | 1432 | .read = generic_read_dir, |
| 1435 | .readdir = xfs_file_readdir, | 1433 | .iterate = xfs_file_readdir, |
| 1436 | .llseek = generic_file_llseek, | 1434 | .llseek = generic_file_llseek, |
| 1437 | .unlocked_ioctl = xfs_file_ioctl, | 1435 | .unlocked_ioctl = xfs_file_ioctl, |
| 1438 | #ifdef CONFIG_COMPAT | 1436 | #ifdef CONFIG_COMPAT |
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index aa4db3307d36..a04701de6bbd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
| @@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read); | |||
| 974 | DEFINE_RW_EVENT(xfs_file_splice_write); | 974 | DEFINE_RW_EVENT(xfs_file_splice_write); |
| 975 | 975 | ||
| 976 | DECLARE_EVENT_CLASS(xfs_page_class, | 976 | DECLARE_EVENT_CLASS(xfs_page_class, |
| 977 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off), | 977 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off, |
| 978 | TP_ARGS(inode, page, off), | 978 | unsigned int len), |
| 979 | TP_ARGS(inode, page, off, len), | ||
| 979 | TP_STRUCT__entry( | 980 | TP_STRUCT__entry( |
| 980 | __field(dev_t, dev) | 981 | __field(dev_t, dev) |
| 981 | __field(xfs_ino_t, ino) | 982 | __field(xfs_ino_t, ino) |
| 982 | __field(pgoff_t, pgoff) | 983 | __field(pgoff_t, pgoff) |
| 983 | __field(loff_t, size) | 984 | __field(loff_t, size) |
| 984 | __field(unsigned long, offset) | 985 | __field(unsigned long, offset) |
| 986 | __field(unsigned int, length) | ||
| 985 | __field(int, delalloc) | 987 | __field(int, delalloc) |
| 986 | __field(int, unwritten) | 988 | __field(int, unwritten) |
| 987 | ), | 989 | ), |
| @@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class, | |||
| 995 | __entry->pgoff = page_offset(page); | 997 | __entry->pgoff = page_offset(page); |
| 996 | __entry->size = i_size_read(inode); | 998 | __entry->size = i_size_read(inode); |
| 997 | __entry->offset = off; | 999 | __entry->offset = off; |
| 1000 | __entry->length = len; | ||
| 998 | __entry->delalloc = delalloc; | 1001 | __entry->delalloc = delalloc; |
| 999 | __entry->unwritten = unwritten; | 1002 | __entry->unwritten = unwritten; |
| 1000 | ), | 1003 | ), |
| 1001 | TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " | 1004 | TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " |
| 1002 | "delalloc %d unwritten %d", | 1005 | "length %x delalloc %d unwritten %d", |
| 1003 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1006 | MAJOR(__entry->dev), MINOR(__entry->dev), |
| 1004 | __entry->ino, | 1007 | __entry->ino, |
| 1005 | __entry->pgoff, | 1008 | __entry->pgoff, |
| 1006 | __entry->size, | 1009 | __entry->size, |
| 1007 | __entry->offset, | 1010 | __entry->offset, |
| 1011 | __entry->length, | ||
| 1008 | __entry->delalloc, | 1012 | __entry->delalloc, |
| 1009 | __entry->unwritten) | 1013 | __entry->unwritten) |
| 1010 | ) | 1014 | ) |
| 1011 | 1015 | ||
| 1012 | #define DEFINE_PAGE_EVENT(name) \ | 1016 | #define DEFINE_PAGE_EVENT(name) \ |
| 1013 | DEFINE_EVENT(xfs_page_class, name, \ | 1017 | DEFINE_EVENT(xfs_page_class, name, \ |
| 1014 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ | 1018 | TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ |
| 1015 | TP_ARGS(inode, page, off)) | 1019 | unsigned int len), \ |
| 1020 | TP_ARGS(inode, page, off, len)) | ||
| 1016 | DEFINE_PAGE_EVENT(xfs_writepage); | 1021 | DEFINE_PAGE_EVENT(xfs_writepage); |
| 1017 | DEFINE_PAGE_EVENT(xfs_releasepage); | 1022 | DEFINE_PAGE_EVENT(xfs_releasepage); |
| 1018 | DEFINE_PAGE_EVENT(xfs_invalidatepage); | 1023 | DEFINE_PAGE_EVENT(xfs_invalidatepage); |
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 5163022d9808..38c67c34d73f 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h | |||
| @@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, | |||
| 31 | struct xfs_inode *ip); | 31 | struct xfs_inode *ip); |
| 32 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, | 32 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, |
| 33 | struct xfs_name *target_name); | 33 | struct xfs_name *target_name); |
| 34 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, | 34 | int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); |
| 35 | xfs_off_t *offset, filldir_t filldir); | ||
| 36 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, | 35 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, |
| 37 | const char *target_path, umode_t mode, struct xfs_inode **ipp); | 36 | const char *target_path, umode_t mode, struct xfs_inode **ipp); |
| 38 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); | 37 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); |
