aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/adfs/inode.c2
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c8
-rw-r--r--fs/affs/namei.c6
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/aio.c140
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c11
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/bio.c1
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/ceph/caps.c10
-rw-r--r--fs/ceph/dir.c87
-rw-r--r--fs/ceph/inode.c10
-rw-r--r--fs/ceph/mds_client.c10
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h23
-rw-r--r--fs/cifs/cifsencrypt.c8
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsproto.h8
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/file.c11
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smbencrypt.c63
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/compat.c7
-rw-r--r--fs/dcache.c40
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c12
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/eventpoll.c25
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/Kconfig2
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext3/balloc.c17
-rw-r--r--fs/ext3/fsync.c10
-rw-r--r--fs/ext3/ialloc.c47
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext3/ioctl.c24
-rw-r--r--fs/ext3/namei.c6
-rw-r--r--fs/ext3/super.c12
-rw-r--r--fs/ext4/balloc.c345
-rw-r--r--fs/ext4/ext4.h185
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/extents.c1168
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/ext4/fsync.c10
-rw-r--r--fs/ext4/ialloc.c206
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inode.c522
-rw-r--r--fs/ext4/ioctl.c65
-rw-r--r--fs/ext4/mballoc.c331
-rw-r--r--fs/ext4/mballoc.h11
-rw-r--r--fs/ext4/migrate.c111
-rw-r--r--fs/ext4/mmp.c10
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c29
-rw-r--r--fs/ext4/page-io.c66
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c263
-rw-r--r--fs/ext4/xattr.c12
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h9
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c84
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/inode.c10
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c10
-rw-r--r--fs/hpfs/namei.c8
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c2
-rw-r--r--fs/isofs/inode.c14
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/journal.c8
-rw-r--r--fs/jbd2/commit.c26
-rw-r--r--fs/jbd2/journal.c44
-rw-r--r--fs/jbd2/recovery.c28
-rw-r--r--fs/jbd2/transaction.c68
-rw-r--r--fs/jffs2/dir.c6
-rw-r--r--fs/jffs2/fs.c6
-rw-r--r--fs/jfs/jfs_imap.c6
-rw-r--r--fs/jfs/jfs_inode.c2
-rw-r--r--fs/jfs/namei.c12
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/logfs/dir.c8
-rw-r--r--fs/logfs/inode.c3
-rw-r--r--fs/logfs/logfs.h1
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/logfs/super.c22
-rw-r--r--fs/minix/inode.c4
-rw-r--r--fs/namei.c18
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/callback_xdr.c12
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/inode.c6
-rw-r--r--fs/nfs/nfs4filelayout.c7
-rw-r--r--fs/nfs/nfs4proc.c6
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c872
-rw-r--r--fs/nfs/objlayout/objlayout.c209
-rw-r--r--fs/nfs/objlayout/objlayout.h48
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c25
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h8
-rw-r--r--fs/ntfs/debug.h15
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/namei.c18
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/partitions/ldm.c16
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/base.c171
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/proc_sysctl.c48
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/pstore/inode.c40
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c93
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/quota/quota.c9
-rw-r--r--fs/ramfs/inode.c10
-rw-r--r--fs/read_write.c8
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/namei.c16
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/squashfs/Kconfig22
-rw-r--r--fs/squashfs/inode.c18
-rw-r--r--fs/squashfs/squashfs_fs.h7
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/stack.c2
-rw-r--r--fs/stat.c5
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c4
-rw-r--r--fs/sysfs/dir.c14
-rw-r--r--fs/sysfs/inode.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c14
-rw-r--r--fs/udf/directory.c8
-rw-r--r--fs/udf/inode.c56
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/misc.c19
-rw-r--r--fs/udf/namei.c20
-rw-r--r--fs/udf/partition.c19
-rw-r--r--fs/udf/super.c280
-rw-r--r--fs/udf/truncate.c22
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h35
-rw-r--r--fs/udf/udftime.c3
-rw-r--r--fs/udf/unicode.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/xfs/xfs_aops.c8
-rw-r--r--fs/xfs/xfs_iops.c2
-rw-r--r--fs/xfs/xfs_message.h42
207 files changed, 3991 insertions, 2974 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b5a1076aaa6c..879ed8851737 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1138,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1138 struct v9fs_session_info *v9ses = sb->s_fs_info; 1138 struct v9fs_session_info *v9ses = sb->s_fs_info;
1139 struct v9fs_inode *v9inode = V9FS_I(inode); 1139 struct v9fs_inode *v9inode = V9FS_I(inode);
1140 1140
1141 inode->i_nlink = 1; 1141 set_nlink(inode, 1);
1142 1142
1143 inode->i_atime.tv_sec = stat->atime; 1143 inode->i_atime.tv_sec = stat->atime;
1144 inode->i_mtime.tv_sec = stat->mtime; 1144 inode->i_mtime.tv_sec = stat->mtime;
@@ -1164,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1164 /* HARDLINKCOUNT %u */ 1164 /* HARDLINKCOUNT %u */
1165 sscanf(ext, "%13s %u", tag_name, &i_nlink); 1165 sscanf(ext, "%13s %u", tag_name, &i_nlink);
1166 if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) 1166 if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
1167 inode->i_nlink = i_nlink; 1167 set_nlink(inode, i_nlink);
1168 } 1168 }
1169 } 1169 }
1170 mode = stat->mode & S_IALLUGO; 1170 mode = stat->mode & S_IALLUGO;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index aded79fcd5cf..0b5745e21946 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
606 inode->i_ctime.tv_nsec = stat->st_ctime_nsec; 606 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
607 inode->i_uid = stat->st_uid; 607 inode->i_uid = stat->st_uid;
608 inode->i_gid = stat->st_gid; 608 inode->i_gid = stat->st_gid;
609 inode->i_nlink = stat->st_nlink; 609 set_nlink(inode, stat->st_nlink);
610 610
611 mode = stat->st_mode & S_IALLUGO; 611 mode = stat->st_mode & S_IALLUGO;
612 mode |= inode->i_mode & ~S_IALLUGO; 612 mode |= inode->i_mode & ~S_IALLUGO;
@@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
632 if (stat->st_result_mask & P9_STATS_GID) 632 if (stat->st_result_mask & P9_STATS_GID)
633 inode->i_gid = stat->st_gid; 633 inode->i_gid = stat->st_gid;
634 if (stat->st_result_mask & P9_STATS_NLINK) 634 if (stat->st_result_mask & P9_STATS_NLINK)
635 inode->i_nlink = stat->st_nlink; 635 set_nlink(inode, stat->st_nlink);
636 if (stat->st_result_mask & P9_STATS_MODE) { 636 if (stat->st_result_mask & P9_STATS_MODE) {
637 inode->i_mode = stat->st_mode; 637 inode->i_mode = stat->st_mode;
638 if ((S_ISBLK(inode->i_mode)) || 638 if ((S_ISBLK(inode->i_mode)) ||
diff --git a/fs/Kconfig b/fs/Kconfig
index 9fe0b349f4cd..5f4c45d4aa10 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,7 +109,7 @@ source "fs/proc/Kconfig"
109source "fs/sysfs/Kconfig" 109source "fs/sysfs/Kconfig"
110 110
111config TMPFS 111config TMPFS
112 bool "Virtual memory file system support (former shm fs)" 112 bool "Tmpfs virtual memory file system support (former shm fs)"
113 depends on SHMEM 113 depends on SHMEM
114 help 114 help
115 Tmpfs is a file system which keeps all files in virtual memory. 115 Tmpfs is a file system which keeps all files in virtual memory.
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index d5250c5aae21..1dab6a174d6a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
247 inode->i_gid = ADFS_SB(sb)->s_gid; 247 inode->i_gid = ADFS_SB(sb)->s_gid;
248 inode->i_ino = obj->file_id; 248 inode->i_ino = obj->file_id;
249 inode->i_size = obj->size; 249 inode->i_size = obj->size;
250 inode->i_nlink = 2; 250 set_nlink(inode, 2);
251 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> 251 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >>
252 sb->s_blocksize_bits; 252 sb->s_blocksize_bits;
253 253
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 3a4557e8325c..de37ec842340 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry)
215 break; 215 break;
216 default: 216 default:
217 if (!AFFS_TAIL(sb, bh)->link_chain) 217 if (!AFFS_TAIL(sb, bh)->link_chain)
218 inode->i_nlink = 1; 218 set_nlink(inode, 1);
219 } 219 }
220 affs_free_block(sb, link_ino); 220 affs_free_block(sb, link_ino);
221 goto done; 221 goto done;
@@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry)
316 if (inode->i_nlink > 1) 316 if (inode->i_nlink > 1)
317 retval = affs_remove_link(dentry); 317 retval = affs_remove_link(dentry);
318 else 318 else
319 inode->i_nlink = 0; 319 clear_nlink(inode);
320 affs_unlock_link(inode); 320 affs_unlock_link(inode);
321 inode->i_ctime = CURRENT_TIME_SEC; 321 inode->i_ctime = CURRENT_TIME_SEC;
322 mark_inode_dirty(inode); 322 mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5d828903ac69..88a4b0b50058 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
54 prot = be32_to_cpu(tail->protect); 54 prot = be32_to_cpu(tail->protect);
55 55
56 inode->i_size = 0; 56 inode->i_size = 0;
57 inode->i_nlink = 1; 57 set_nlink(inode, 1);
58 inode->i_mode = 0; 58 inode->i_mode = 0;
59 AFFS_I(inode)->i_extcnt = 1; 59 AFFS_I(inode)->i_extcnt = 1;
60 AFFS_I(inode)->i_ext_last = ~1; 60 AFFS_I(inode)->i_ext_last = ~1;
@@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
137 sbi->s_hashsize + 1; 137 sbi->s_hashsize + 1;
138 } 138 }
139 if (tail->link_chain) 139 if (tail->link_chain)
140 inode->i_nlink = 2; 140 set_nlink(inode, 2);
141 inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; 141 inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
142 inode->i_op = &affs_file_inode_operations; 142 inode->i_op = &affs_file_inode_operations;
143 inode->i_fop = &affs_file_operations; 143 inode->i_fop = &affs_file_operations;
@@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir)
304 inode->i_uid = current_fsuid(); 304 inode->i_uid = current_fsuid();
305 inode->i_gid = current_fsgid(); 305 inode->i_gid = current_fsgid();
306 inode->i_ino = block; 306 inode->i_ino = block;
307 inode->i_nlink = 1; 307 set_nlink(inode, 1);
308 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 308 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
309 atomic_set(&AFFS_I(inode)->i_opencnt, 0); 309 atomic_set(&AFFS_I(inode)->i_opencnt, 0);
310 AFFS_I(inode)->i_blkcnt = 0; 310 AFFS_I(inode)->i_blkcnt = 0;
@@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
387 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); 387 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
389 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
390 inode->i_nlink = 2; 390 set_nlink(inode, 2);
391 ihold(inode); 391 ihold(inode);
392 } 392 }
393 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..780a11dc6318 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
277 inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; 277 inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
278 error = affs_add_entry(dir, inode, dentry, ST_FILE); 278 error = affs_add_entry(dir, inode, dentry, ST_FILE);
279 if (error) { 279 if (error) {
280 inode->i_nlink = 0; 280 clear_nlink(inode);
281 iput(inode); 281 iput(inode);
282 return error; 282 return error;
283 } 283 }
@@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
305 305
306 error = affs_add_entry(dir, inode, dentry, ST_USERDIR); 306 error = affs_add_entry(dir, inode, dentry, ST_USERDIR);
307 if (error) { 307 if (error) {
308 inode->i_nlink = 0; 308 clear_nlink(inode);
309 mark_inode_dirty(inode); 309 mark_inode_dirty(inode);
310 iput(inode); 310 iput(inode);
311 return error; 311 return error;
@@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
392 return 0; 392 return 0;
393 393
394err: 394err:
395 inode->i_nlink = 0; 395 clear_nlink(inode);
396 mark_inode_dirty(inode); 396 mark_inode_dirty(inode);
397 iput(inode); 397 iput(inode);
398 return error; 398 return error;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 346e3289abd7..2f213d109c21 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
90 vnode->vfs_inode.i_uid = status->owner; 90 vnode->vfs_inode.i_uid = status->owner;
91 vnode->vfs_inode.i_gid = status->group; 91 vnode->vfs_inode.i_gid = status->group;
92 vnode->vfs_inode.i_generation = vnode->fid.unique; 92 vnode->vfs_inode.i_generation = vnode->fid.unique;
93 vnode->vfs_inode.i_nlink = status->nlink; 93 set_nlink(&vnode->vfs_inode, status->nlink);
94 94
95 mode = vnode->vfs_inode.i_mode; 95 mode = vnode->vfs_inode.i_mode;
96 mode &= ~S_IALLUGO; 96 mode &= ~S_IALLUGO;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fdab6e03d87..d890ae3b2ce6 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
67 fscache_attr_changed(vnode->cache); 67 fscache_attr_changed(vnode->cache);
68#endif 68#endif
69 69
70 inode->i_nlink = vnode->status.nlink; 70 set_nlink(inode, vnode->status.nlink);
71 inode->i_uid = vnode->status.owner; 71 inode->i_uid = vnode->status.owner;
72 inode->i_gid = 0; 72 inode->i_gid = 0;
73 inode->i_size = vnode->status.size; 73 inode->i_size = vnode->status.size;
@@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
174 inode->i_size = 0; 174 inode->i_size = 0;
175 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 175 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
176 inode->i_op = &afs_autocell_inode_operations; 176 inode->i_op = &afs_autocell_inode_operations;
177 inode->i_nlink = 2; 177 set_nlink(inode, 2);
178 inode->i_uid = 0; 178 inode->i_uid = 0;
179 inode->i_gid = 0; 179 inode->i_gid = 0;
180 inode->i_ctime.tv_sec = get_seconds(); 180 inode->i_ctime.tv_sec = get_seconds();
diff --git a/fs/aio.c b/fs/aio.c
index e29ec485af25..78c514cfd212 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm)
440static struct kiocb *__aio_get_req(struct kioctx *ctx) 440static struct kiocb *__aio_get_req(struct kioctx *ctx)
441{ 441{
442 struct kiocb *req = NULL; 442 struct kiocb *req = NULL;
443 struct aio_ring *ring;
444 int okay = 0;
445 443
446 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); 444 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
447 if (unlikely(!req)) 445 if (unlikely(!req))
@@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
459 INIT_LIST_HEAD(&req->ki_run_list); 457 INIT_LIST_HEAD(&req->ki_run_list);
460 req->ki_eventfd = NULL; 458 req->ki_eventfd = NULL;
461 459
462 /* Check if the completion queue has enough free space to 460 return req;
463 * accept an event from this io. 461}
464 */ 462
463/*
464 * struct kiocb's are allocated in batches to reduce the number of
465 * times the ctx lock is acquired and released.
466 */
467#define KIOCB_BATCH_SIZE 32L
468struct kiocb_batch {
469 struct list_head head;
470 long count; /* number of requests left to allocate */
471};
472
473static void kiocb_batch_init(struct kiocb_batch *batch, long total)
474{
475 INIT_LIST_HEAD(&batch->head);
476 batch->count = total;
477}
478
479static void kiocb_batch_free(struct kiocb_batch *batch)
480{
481 struct kiocb *req, *n;
482
483 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
484 list_del(&req->ki_batch);
485 kmem_cache_free(kiocb_cachep, req);
486 }
487}
488
489/*
490 * Allocate a batch of kiocbs. This avoids taking and dropping the
491 * context lock a lot during setup.
492 */
493static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
494{
495 unsigned short allocated, to_alloc;
496 long avail;
497 bool called_fput = false;
498 struct kiocb *req, *n;
499 struct aio_ring *ring;
500
501 to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
502 for (allocated = 0; allocated < to_alloc; allocated++) {
503 req = __aio_get_req(ctx);
504 if (!req)
505 /* allocation failed, go with what we've got */
506 break;
507 list_add(&req->ki_batch, &batch->head);
508 }
509
510 if (allocated == 0)
511 goto out;
512
513retry:
465 spin_lock_irq(&ctx->ctx_lock); 514 spin_lock_irq(&ctx->ctx_lock);
466 ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); 515 ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
467 if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { 516
517 avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
518 BUG_ON(avail < 0);
519 if (avail == 0 && !called_fput) {
520 /*
521 * Handle a potential starvation case. It is possible that
522 * we hold the last reference on a struct file, causing us
523 * to delay the final fput to non-irq context. In this case,
524 * ctx->reqs_active is artificially high. Calling the fput
525 * routine here may free up a slot in the event completion
526 * ring, allowing this allocation to succeed.
527 */
528 kunmap_atomic(ring);
529 spin_unlock_irq(&ctx->ctx_lock);
530 aio_fput_routine(NULL);
531 called_fput = true;
532 goto retry;
533 }
534
535 if (avail < allocated) {
536 /* Trim back the number of requests. */
537 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
538 list_del(&req->ki_batch);
539 kmem_cache_free(kiocb_cachep, req);
540 if (--allocated <= avail)
541 break;
542 }
543 }
544
545 batch->count -= allocated;
546 list_for_each_entry(req, &batch->head, ki_batch) {
468 list_add(&req->ki_list, &ctx->active_reqs); 547 list_add(&req->ki_list, &ctx->active_reqs);
469 ctx->reqs_active++; 548 ctx->reqs_active++;
470 okay = 1;
471 } 549 }
472 kunmap_atomic(ring, KM_USER0);
473 spin_unlock_irq(&ctx->ctx_lock);
474 550
475 if (!okay) { 551 kunmap_atomic(ring);
476 kmem_cache_free(kiocb_cachep, req); 552 spin_unlock_irq(&ctx->ctx_lock);
477 req = NULL;
478 }
479 553
480 return req; 554out:
555 return allocated;
481} 556}
482 557
483static inline struct kiocb *aio_get_req(struct kioctx *ctx) 558static inline struct kiocb *aio_get_req(struct kioctx *ctx,
559 struct kiocb_batch *batch)
484{ 560{
485 struct kiocb *req; 561 struct kiocb *req;
486 /* Handle a potential starvation case -- should be exceedingly rare as 562
487 * requests will be stuck on fput_head only if the aio_fput_routine is 563 if (list_empty(&batch->head))
488 * delayed and the requests were the last user of the struct file. 564 if (kiocb_batch_refill(ctx, batch) == 0)
489 */ 565 return NULL;
490 req = __aio_get_req(ctx); 566 req = list_first_entry(&batch->head, struct kiocb, ki_batch);
491 if (unlikely(NULL == req)) { 567 list_del(&req->ki_batch);
492 aio_fput_routine(NULL);
493 req = __aio_get_req(ctx);
494 }
495 return req; 568 return req;
496} 569}
497 570
@@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1387 ret = compat_rw_copy_check_uvector(type, 1460 ret = compat_rw_copy_check_uvector(type,
1388 (struct compat_iovec __user *)kiocb->ki_buf, 1461 (struct compat_iovec __user *)kiocb->ki_buf,
1389 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1462 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1390 &kiocb->ki_iovec); 1463 &kiocb->ki_iovec, 1);
1391 else 1464 else
1392#endif 1465#endif
1393 ret = rw_copy_check_uvector(type, 1466 ret = rw_copy_check_uvector(type,
1394 (struct iovec __user *)kiocb->ki_buf, 1467 (struct iovec __user *)kiocb->ki_buf,
1395 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1468 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1396 &kiocb->ki_iovec); 1469 &kiocb->ki_iovec, 1);
1397 if (ret < 0) 1470 if (ret < 0)
1398 goto out; 1471 goto out;
1399 1472
@@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1515} 1588}
1516 1589
1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1590static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1518 struct iocb *iocb, bool compat) 1591 struct iocb *iocb, struct kiocb_batch *batch,
1592 bool compat)
1519{ 1593{
1520 struct kiocb *req; 1594 struct kiocb *req;
1521 struct file *file; 1595 struct file *file;
@@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1541 if (unlikely(!file)) 1615 if (unlikely(!file))
1542 return -EBADF; 1616 return -EBADF;
1543 1617
1544 req = aio_get_req(ctx); /* returns with 2 references to req */ 1618 req = aio_get_req(ctx, batch); /* returns with 2 references to req */
1545 if (unlikely(!req)) { 1619 if (unlikely(!req)) {
1546 fput(file); 1620 fput(file);
1547 return -EAGAIN; 1621 return -EAGAIN;
@@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1621{ 1695{
1622 struct kioctx *ctx; 1696 struct kioctx *ctx;
1623 long ret = 0; 1697 long ret = 0;
1624 int i; 1698 int i = 0;
1625 struct blk_plug plug; 1699 struct blk_plug plug;
1700 struct kiocb_batch batch;
1626 1701
1627 if (unlikely(nr < 0)) 1702 if (unlikely(nr < 0))
1628 return -EINVAL; 1703 return -EINVAL;
@@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1639 return -EINVAL; 1714 return -EINVAL;
1640 } 1715 }
1641 1716
1717 kiocb_batch_init(&batch, nr);
1718
1642 blk_start_plug(&plug); 1719 blk_start_plug(&plug);
1643 1720
1644 /* 1721 /*
@@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1659 break; 1736 break;
1660 } 1737 }
1661 1738
1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat); 1739 ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
1663 if (ret) 1740 if (ret)
1664 break; 1741 break;
1665 } 1742 }
1666 blk_finish_plug(&plug); 1743 blk_finish_plug(&plug);
1667 1744
1745 kiocb_batch_free(&batch);
1668 put_ioctx(ctx); 1746 put_ioctx(ctx);
1669 return i ? i : ret; 1747 return i ? i : ret;
1670} 1748}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 180fa2425e49..8179f1ab8175 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
342 inode->i_ino = get_next_ino(); 342 inode->i_ino = get_next_ino();
343 343
344 if (S_ISDIR(mode)) { 344 if (S_ISDIR(mode)) {
345 inode->i_nlink = 2; 345 set_nlink(inode, 2);
346 inode->i_op = &autofs4_dir_inode_operations; 346 inode->i_op = &autofs4_dir_inode_operations;
347 inode->i_fop = &autofs4_dir_operations; 347 inode->i_fop = &autofs4_dir_operations;
348 } else if (S_ISLNK(mode)) { 348 } else if (S_ISLNK(mode)) {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 720d885e8dca..8342ca67abcd 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
357 inode->i_gid = befs_sb->mount_opts.use_gid ? 357 inode->i_gid = befs_sb->mount_opts.use_gid ?
358 befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); 358 befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid);
359 359
360 inode->i_nlink = 1; 360 set_nlink(inode, 1);
361 361
362 /* 362 /*
363 * BEFS's time is 64 bits, but current VFS is 32 bits... 363 * BEFS's time is 64 bits, but current VFS is 32 bits...
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..9cc074019479 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
199 printf("unlinking non-existent file %s:%lu (nlink=%d)\n", 199 printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
200 inode->i_sb->s_id, inode->i_ino, 200 inode->i_sb->s_id, inode->i_ino,
201 inode->i_nlink); 201 inode->i_nlink);
202 inode->i_nlink = 1; 202 set_nlink(inode, 1);
203 } 203 }
204 de->ino = 0; 204 de->ino = 0;
205 mark_buffer_dirty_inode(bh, dir); 205 mark_buffer_dirty_inode(bh, dir);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index a8e37f81d097..697af5bf70b3 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); 78 BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
79 inode->i_uid = le32_to_cpu(di->i_uid); 79 inode->i_uid = le32_to_cpu(di->i_uid);
80 inode->i_gid = le32_to_cpu(di->i_gid); 80 inode->i_gid = le32_to_cpu(di->i_gid);
81 inode->i_nlink = le32_to_cpu(di->i_nlink); 81 set_nlink(inode, le32_to_cpu(di->i_nlink));
82 inode->i_size = BFS_FILESIZE(di); 82 inode->i_size = BFS_FILESIZE(di);
83 inode->i_blocks = BFS_FILEBLOCKS(di); 83 inode->i_blocks = BFS_FILEBLOCKS(di);
84 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); 84 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc56d38..21ac5ee4b43f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
795 * might try to exec. This is because the brk will 795 * might try to exec. This is because the brk will
796 * follow the loader, and is not movable. */ 796 * follow the loader, and is not movable. */
797#if defined(CONFIG_X86) || defined(CONFIG_ARM) 797#if defined(CONFIG_X86) || defined(CONFIG_ARM)
798 load_bias = 0; 798 /* Memory randomization might have been switched off
799 * in runtime via sysctl.
800 * If that is the case, retain the original non-zero
801 * load_bias value in order to establish proper
802 * non-randomized mappings.
803 */
804 if (current->flags & PF_RANDOMIZE)
805 load_bias = 0;
806 else
807 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
799#else 808#else
800 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 809 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
801#endif 810#endif
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index ba1a1ae4a18a..1e9edbdeda7e 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -521,7 +521,7 @@ static void kill_node(Node *e)
521 write_unlock(&entries_lock); 521 write_unlock(&entries_lock);
522 522
523 if (dentry) { 523 if (dentry) {
524 dentry->d_inode->i_nlink--; 524 drop_nlink(dentry->d_inode);
525 d_drop(dentry); 525 d_drop(dentry);
526 dput(dentry); 526 dput(dentry);
527 simple_release_fs(&bm_mnt, &entry_count); 527 simple_release_fs(&bm_mnt, &entry_count);
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609b..41c93c722244 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
255{ 255{
256 memset(bio, 0, sizeof(*bio)); 256 memset(bio, 0, sizeof(*bio));
257 bio->bi_flags = 1 << BIO_UPTODATE; 257 bio->bi_flags = 1 << BIO_UPTODATE;
258 bio->bi_comp_cpu = -1;
259 atomic_set(&bio->bi_cnt, 1); 258 atomic_set(&bio->bi_cnt, 1);
260} 259}
261EXPORT_SYMBOL(bio_init); 260EXPORT_SYMBOL(bio_init);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f08..b07f1da1de4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
971 971
972 if (!bdev->bd_disk) 972 if (!bdev->bd_disk)
973 return; 973 return;
974 if (disk_partitionable(bdev->bd_disk)) 974 if (disk_part_scan_enabled(bdev->bd_disk))
975 bdev->bd_invalidated = 1; 975 bdev->bd_invalidated = 1;
976} 976}
977 977
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1085static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1085static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1086{ 1086{
1087 struct gendisk *disk; 1087 struct gendisk *disk;
1088 struct module *owner;
1088 int ret; 1089 int ret;
1089 int partno; 1090 int partno;
1090 int perm = 0; 1091 int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1110 disk = get_gendisk(bdev->bd_dev, &partno); 1111 disk = get_gendisk(bdev->bd_dev, &partno);
1111 if (!disk) 1112 if (!disk)
1112 goto out; 1113 goto out;
1114 owner = disk->fops->owner;
1113 1115
1114 disk_block_events(disk); 1116 disk_block_events(disk);
1115 mutex_lock_nested(&bdev->bd_mutex, for_part); 1117 mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1137 bdev->bd_disk = NULL; 1139 bdev->bd_disk = NULL;
1138 mutex_unlock(&bdev->bd_mutex); 1140 mutex_unlock(&bdev->bd_mutex);
1139 disk_unblock_events(disk); 1141 disk_unblock_events(disk);
1140 module_put(disk->fops->owner);
1141 put_disk(disk); 1142 put_disk(disk);
1143 module_put(owner);
1142 goto restart; 1144 goto restart;
1143 } 1145 }
1144 } 1146 }
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1194 goto out_unlock_bdev; 1196 goto out_unlock_bdev;
1195 } 1197 }
1196 /* only one opener holds refs to the module and disk */ 1198 /* only one opener holds refs to the module and disk */
1197 module_put(disk->fops->owner);
1198 put_disk(disk); 1199 put_disk(disk);
1200 module_put(owner);
1199 } 1201 }
1200 bdev->bd_openers++; 1202 bdev->bd_openers++;
1201 if (for_part) 1203 if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1215 out_unlock_bdev: 1217 out_unlock_bdev:
1216 mutex_unlock(&bdev->bd_mutex); 1218 mutex_unlock(&bdev->bd_mutex);
1217 disk_unblock_events(disk); 1219 disk_unblock_events(disk);
1218 module_put(disk->fops->owner);
1219 put_disk(disk); 1220 put_disk(disk);
1221 module_put(owner);
1220 out: 1222 out:
1221 bdput(bdev); 1223 bdput(bdev);
1222 1224
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1442 if (!bdev->bd_openers) { 1444 if (!bdev->bd_openers) {
1443 struct module *owner = disk->fops->owner; 1445 struct module *owner = disk->fops->owner;
1444 1446
1445 put_disk(disk);
1446 module_put(owner);
1447 disk_put_part(bdev->bd_part); 1447 disk_put_part(bdev->bd_part);
1448 bdev->bd_part = NULL; 1448 bdev->bd_part = NULL;
1449 bdev->bd_disk = NULL; 1449 bdev->bd_disk = NULL;
1450 if (bdev != bdev->bd_contains) 1450 if (bdev != bdev->bd_contains)
1451 victim = bdev->bd_contains; 1451 victim = bdev->bd_contains;
1452 bdev->bd_contains = NULL; 1452 bdev->bd_contains = NULL;
1453
1454 put_disk(disk);
1455 module_put(owner);
1453 } 1456 }
1454 mutex_unlock(&bdev->bd_mutex); 1457 mutex_unlock(&bdev->bd_mutex);
1455 bdput(bdev); 1458 bdput(bdev);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..ae4d9cd10961 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1641,7 +1641,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1641 inode->i_gid = btrfs_stack_inode_gid(inode_item); 1641 inode->i_gid = btrfs_stack_inode_gid(inode_item);
1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); 1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1643 inode->i_mode = btrfs_stack_inode_mode(inode_item); 1643 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1644 inode->i_nlink = btrfs_stack_inode_nlink(inode_item); 1644 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..07ea91879a91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1705,7 +1705,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1705 sb->s_bdi = &fs_info->bdi; 1705 sb->s_bdi = &fs_info->bdi;
1706 1706
1707 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 1707 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1708 fs_info->btree_inode->i_nlink = 1; 1708 set_nlink(fs_info->btree_inode, 1);
1709 /* 1709 /*
1710 * we set the i_size on the btree inode to the max possible int. 1710 * we set the i_size on the btree inode to the max possible int.
1711 * the real end of the address space is determined by all of 1711 * the real end of the address space is determined by all of
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..c9ee0e18bbdc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3340,7 +3340,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3340 smp_mb(); 3340 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3341 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3344 WB_REASON_FS_FREE_SPACE);
3344 3345
3345 spin_lock(&space_info->lock); 3346 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3347 if (reserved > space_info->bytes_reserved)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..75686a61bd45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2534,7 +2534,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2534 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2534 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2535 struct btrfs_inode_item); 2535 struct btrfs_inode_item);
2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2537 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
2539 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2539 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2540 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2540 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -6728,7 +6728,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6728 inode->i_op = &btrfs_dir_inode_operations; 6728 inode->i_op = &btrfs_dir_inode_operations;
6729 inode->i_fop = &btrfs_dir_file_operations; 6729 inode->i_fop = &btrfs_dir_file_operations;
6730 6730
6731 inode->i_nlink = 1; 6731 set_nlink(inode, 1);
6732 btrfs_i_size_write(inode, 0); 6732 btrfs_i_size_write(inode, 0);
6733 6733
6734 err = btrfs_update_inode(trans, new_root, inode); 6734 err = btrfs_update_inode(trans, new_root, inode);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..0618aa39740b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1030,7 +1030,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1030 } 1030 }
1031 btrfs_release_path(path); 1031 btrfs_release_path(path);
1032 if (nlink != inode->i_nlink) { 1032 if (nlink != inode->i_nlink) {
1033 inode->i_nlink = nlink; 1033 set_nlink(inode, nlink);
1034 btrfs_update_inode(trans, root, inode); 1034 btrfs_update_inode(trans, root, inode);
1035 } 1035 }
1036 BTRFS_I(inode)->index_cnt = (u64)-1; 1036 BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/buffer.c b/fs/buffer.c
index 936d6035f6e2..19d8eb7fdc81 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
213 * elsewhere, don't buffer_error if we had some unmapped buffers 213 * elsewhere, don't buffer_error if we had some unmapped buffers
214 */ 214 */
215 if (all_mapped) { 215 if (all_mapped) {
216 char b[BDEVNAME_SIZE];
217
216 printk("__find_get_block_slow() failed. " 218 printk("__find_get_block_slow() failed. "
217 "block=%llu, b_blocknr=%llu\n", 219 "block=%llu, b_blocknr=%llu\n",
218 (unsigned long long)block, 220 (unsigned long long)block,
219 (unsigned long long)bh->b_blocknr); 221 (unsigned long long)bh->b_blocknr);
220 printk("b_state=0x%08lx, b_size=%zu\n", 222 printk("b_state=0x%08lx, b_size=%zu\n",
221 bh->b_state, bh->b_size); 223 bh->b_state, bh->b_size);
222 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 224 printk("device %s blocksize: %d\n", bdevname(bdev, b),
225 1 << bd_inode->i_blkbits);
223 } 226 }
224out_unlock: 227out_unlock:
225 spin_unlock(&bd_mapping->private_lock); 228 spin_unlock(&bd_mapping->private_lock);
@@ -285,7 +288,7 @@ static void free_more_memory(void)
285 struct zone *zone; 288 struct zone *zone;
286 int nid; 289 int nid;
287 290
288 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
289 yield(); 292 yield();
290 293
291 for_each_online_node(nid) { 294 for_each_online_node(nid) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b8731bf3ef1f..0f327c6c9679 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
487 ci->i_rdcache_gen++; 487 ci->i_rdcache_gen++;
488 488
489 /* 489 /*
490 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we 490 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
491 * don't know what happened to this directory while we didn't 491 * don't know what happened to this directory while we didn't
492 * have the cap. 492 * have the cap.
493 */ 493 */
494 if ((issued & CEPH_CAP_FILE_SHARED) && 494 if ((issued & CEPH_CAP_FILE_SHARED) &&
495 (had & CEPH_CAP_FILE_SHARED) == 0) { 495 (had & CEPH_CAP_FILE_SHARED) == 0) {
496 ci->i_shared_gen++; 496 ci->i_shared_gen++;
497 if (S_ISDIR(ci->vfs_inode.i_mode)) { 497 if (S_ISDIR(ci->vfs_inode.i_mode))
498 dout(" marking %p NOT complete\n", &ci->vfs_inode); 498 ceph_dir_clear_complete(&ci->vfs_inode);
499 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
500 }
501 } 499 }
502} 500}
503 501
@@ -2363,7 +2361,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2363 } 2361 }
2364 2362
2365 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 2363 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2366 inode->i_nlink = le32_to_cpu(grant->nlink); 2364 set_nlink(inode, le32_to_cpu(grant->nlink));
2367 2365
2368 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2366 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2369 int len = le32_to_cpu(grant->xattr_len); 2367 int len = le32_to_cpu(grant->xattr_len);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 382abc9a6a54..2abd0dfad7f8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p)
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
111 * I_COMPLETE tells indicates we have all dentries in the dir. It is 111 * D_COMPLETE tells indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
@@ -199,8 +199,8 @@ more:
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
@@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&inode->i_lock); 290 spin_unlock(&inode->i_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
@@ -351,7 +351,7 @@ more:
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude I_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
@@ -430,8 +430,7 @@ more:
430 */ 430 */
431 spin_lock(&inode->i_lock); 431 spin_lock(&inode->i_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 433 ceph_dir_set_complete(inode);
434 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
435 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
436 } 435 }
437 spin_unlock(&inode->i_lock); 436 spin_unlock(&inode->i_lock);
@@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
614 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
615 dentry->d_name.len) && 614 dentry->d_name.len) &&
616 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
617 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 616 ceph_dir_test_complete(dir) &&
618 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
619 spin_unlock(&dir->i_lock); 618 spin_unlock(&dir->i_lock);
620 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
@@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
934 */ 933 */
935 934
936 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
937 ceph_i_clear(new_dir, CEPH_I_COMPLETE); 936 ceph_dir_clear_complete(new_dir);
938 937
939 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
940 939
@@ -1092,7 +1091,75 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1092 return 1; 1091 return 1;
1093} 1092}
1094 1093
1094/*
1095 * Set/clear/test dir complete flag on the dir's dentry.
1096 */
1097static struct dentry * __d_find_any_alias(struct inode *inode)
1098{
1099 struct dentry *alias;
1100
1101 if (list_empty(&inode->i_dentry))
1102 return NULL;
1103 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1104 return alias;
1105}
1106
1107void ceph_dir_set_complete(struct inode *inode)
1108{
1109 struct dentry *dentry = __d_find_any_alias(inode);
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115}
1116
1117void ceph_dir_clear_complete(struct inode *inode)
1118{
1119 struct dentry *dentry = __d_find_any_alias(inode);
1120
1121 if (dentry && ceph_dentry(dentry)) {
1122 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1123 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1124 }
1125}
1126
1127bool ceph_dir_test_complete(struct inode *inode)
1128{
1129 struct dentry *dentry = __d_find_any_alias(inode);
1130
1131 if (dentry && ceph_dentry(dentry))
1132 return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1133 return false;
1134}
1135
1136/*
1137 * When the VFS prunes a dentry from the cache, we need to clear the
1138 * complete flag on the parent directory.
1139 *
1140 * Called under dentry->d_lock.
1141 */
1142static void ceph_d_prune(struct dentry *dentry)
1143{
1144 struct ceph_dentry_info *di;
1145
1146 dout("d_release %p\n", dentry);
1147
1148 /* do we have a valid parent? */
1149 if (!dentry->d_parent || IS_ROOT(dentry))
1150 return;
1095 1151
1152 /* if we are not hashed, we don't affect D_COMPLETE */
1153 if (d_unhashed(dentry))
1154 return;
1155
1156 /*
1157 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1158 * cleared until d_release
1159 */
1160 di = ceph_dentry(dentry->d_parent);
1161 clear_bit(CEPH_D_COMPLETE, &di->flags);
1162}
1096 1163
1097/* 1164/*
1098 * read() on a dir. This weird interface hack only works if mounted 1165 * read() on a dir. This weird interface hack only works if mounted
@@ -1306,6 +1373,7 @@ const struct inode_operations ceph_dir_iops = {
1306const struct dentry_operations ceph_dentry_ops = { 1373const struct dentry_operations ceph_dentry_ops = {
1307 .d_revalidate = ceph_d_revalidate, 1374 .d_revalidate = ceph_d_revalidate,
1308 .d_release = ceph_d_release, 1375 .d_release = ceph_d_release,
1376 .d_prune = ceph_d_prune,
1309}; 1377};
1310 1378
1311const struct dentry_operations ceph_snapdir_dentry_ops = { 1379const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1315,4 +1383,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
1315 1383
1316const struct dentry_operations ceph_snap_dentry_ops = { 1384const struct dentry_operations ceph_snap_dentry_ops = {
1317 .d_release = ceph_d_release, 1385 .d_release = ceph_d_release,
1386 .d_prune = ceph_d_prune,
1318}; 1387};
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5dde7d51dc11..e392bfce84a3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -618,7 +618,7 @@ static int fill_inode(struct inode *inode,
618 } 618 }
619 619
620 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 620 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
621 inode->i_nlink = le32_to_cpu(info->nlink); 621 set_nlink(inode, le32_to_cpu(info->nlink));
622 622
623 /* be careful with mtime, atime, size */ 623 /* be careful with mtime, atime, size */
624 ceph_decode_timespec(&atime, &info->atime); 624 ceph_decode_timespec(&atime, &info->atime);
@@ -771,9 +771,9 @@ no_change:
771 ceph_snap(inode) == CEPH_NOSNAP && 771 ceph_snap(inode) == CEPH_NOSNAP &&
772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
773 (issued & CEPH_CAP_FILE_EXCL) == 0 && 773 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
774 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 774 !ceph_dir_test_complete(inode)) {
775 dout(" marking %p complete (empty)\n", inode); 775 dout(" marking %p complete (empty)\n", inode);
776 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ 776 ceph_dir_set_complete(inode);
777 ci->i_max_offset = 2; 777 ci->i_max_offset = 2;
778 } 778 }
779 779
@@ -856,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
856 di = ceph_dentry(dn); 856 di = ceph_dentry(dn);
857 857
858 spin_lock(&inode->i_lock); 858 spin_lock(&inode->i_lock);
859 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 859 if (!ceph_dir_test_complete(inode)) {
860 spin_unlock(&inode->i_lock); 860 spin_unlock(&inode->i_lock);
861 return; 861 return;
862 } 862 }
@@ -1056,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1056 * d_move() puts the renamed dentry at the end of 1056 * d_move() puts the renamed dentry at the end of
1057 * d_subdirs. We need to assign it an appropriate 1057 * d_subdirs. We need to assign it an appropriate
1058 * directory offset so we can behave when holding 1058 * directory offset so we can behave when holding
1059 * I_COMPLETE. 1059 * D_COMPLETE.
1060 */ 1060 */
1061 ceph_set_dentry_offset(req->r_old_dentry); 1061 ceph_set_dentry_offset(req->r_old_dentry);
1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1d72f15fe9f4..264ab701154f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
619 * 619 *
620 * Called under mdsc->mutex. 620 * Called under mdsc->mutex.
621 */ 621 */
622struct dentry *get_nonsnap_parent(struct dentry *dentry) 622static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623{ 623{
624 /* 624 /*
625 * we don't need to worry about protecting the d_parent access 625 * we don't need to worry about protecting the d_parent access
@@ -2002,7 +2002,7 @@ out:
2002} 2002}
2003 2003
2004/* 2004/*
2005 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS 2005 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
2006 * namespace request. 2006 * namespace request.
2007 */ 2007 */
2008void ceph_invalidate_dir_request(struct ceph_mds_request *req) 2008void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -2010,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2010 struct inode *inode = req->r_locked_dir; 2010 struct inode *inode = req->r_locked_dir;
2011 struct ceph_inode_info *ci = ceph_inode(inode); 2011 struct ceph_inode_info *ci = ceph_inode(inode);
2012 2012
2013 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); 2013 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
2014 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 2015 ceph_dir_clear_complete(inode);
2016 ci->i_release_count++; 2016 ci->i_release_count++;
2017 spin_unlock(&inode->i_lock); 2017 spin_unlock(&inode->i_lock);
2018 2018
@@ -3154,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3154/* 3154/*
3155 * true if all sessions are closed, or we force unmount 3155 * true if all sessions are closed, or we force unmount
3156 */ 3156 */
3157bool done_closing_sessions(struct ceph_mds_client *mdsc) 3157static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3158{ 3158{
3159 int i, n = 0; 3159 int i, n = 0;
3160 3160
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 788f5ad8e66d..a90846fac759 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -426,7 +426,7 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
426/* 426/*
427 * create a new fs client 427 * create a new fs client
428 */ 428 */
429struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, 429static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
430 struct ceph_options *opt) 430 struct ceph_options *opt)
431{ 431{
432 struct ceph_fs_client *fsc; 432 struct ceph_fs_client *fsc;
@@ -502,7 +502,7 @@ fail:
502 return ERR_PTR(err); 502 return ERR_PTR(err);
503} 503}
504 504
505void destroy_fs_client(struct ceph_fs_client *fsc) 505static void destroy_fs_client(struct ceph_fs_client *fsc)
506{ 506{
507 dout("destroy_fs_client %p\n", fsc); 507 dout("destroy_fs_client %p\n", fsc);
508 508
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b01442aaf278..01bf189e08a9 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -203,6 +203,7 @@ struct ceph_inode_xattr {
203 * Ceph dentry state 203 * Ceph dentry state
204 */ 204 */
205struct ceph_dentry_info { 205struct ceph_dentry_info {
206 unsigned long flags;
206 struct ceph_mds_session *lease_session; 207 struct ceph_mds_session *lease_session;
207 u32 lease_gen, lease_shared_gen; 208 u32 lease_gen, lease_shared_gen;
208 u32 lease_seq; 209 u32 lease_seq;
@@ -213,6 +214,18 @@ struct ceph_dentry_info {
213 u64 offset; 214 u64 offset;
214}; 215};
215 216
217/*
218 * dentry flags
219 *
220 * The locking for D_COMPLETE is a bit odd:
221 * - we can clear it at almost any time (see ceph_d_prune)
222 * - it is only meaningful if:
223 * - we hold dir inode i_lock
224 * - we hold dir FILE_SHARED caps
225 * - the dentry D_COMPLETE is set
226 */
227#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
228
216struct ceph_inode_xattrs_info { 229struct ceph_inode_xattrs_info {
217 /* 230 /*
218 * (still encoded) xattr blob. we avoid the overhead of parsing 231 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -251,7 +264,7 @@ struct ceph_inode_info {
251 struct timespec i_rctime; 264 struct timespec i_rctime;
252 u64 i_rbytes, i_rfiles, i_rsubdirs; 265 u64 i_rbytes, i_rfiles, i_rsubdirs;
253 u64 i_files, i_subdirs; 266 u64 i_files, i_subdirs;
254 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ 267 u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
255 268
256 struct rb_root i_fragtree; 269 struct rb_root i_fragtree;
257 struct mutex i_fragtree_mutex; 270 struct mutex i_fragtree_mutex;
@@ -416,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
416/* 429/*
417 * Ceph inode. 430 * Ceph inode.
418 */ 431 */
419#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
420#define CEPH_I_NODELAY 4 /* do not delay cap release */ 432#define CEPH_I_NODELAY 4 /* do not delay cap release */
421#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 433#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
422#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 434#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -474,6 +486,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
474} 486}
475 487
476/* 488/*
489 * set/clear directory D_COMPLETE flag
490 */
491void ceph_dir_set_complete(struct inode *inode);
492void ceph_dir_clear_complete(struct inode *inode);
493bool ceph_dir_test_complete(struct inode *inode);
494
495/*
477 * caps helpers 496 * caps helpers
478 */ 497 */
479static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) 498static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 2cfb695d1f89..5d9b9acc5fce 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
204} 204}
205 205
206/* first calculate 24 bytes ntlm response and then 16 byte session key */ 206/* first calculate 24 bytes ntlm response and then 16 byte session key */
207int setup_ntlm_response(struct cifs_ses *ses) 207int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
208{ 208{
209 int rc = 0; 209 int rc = 0;
210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses)
221 ses->auth_key.len = temp_len; 221 ses->auth_key.len = temp_len;
222 222
223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey, 223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
224 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 224 ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
225 if (rc) { 225 if (rc) {
226 cFYI(1, "%s Can't generate NTLM response, error: %d", 226 cFYI(1, "%s Can't generate NTLM response, error: %d",
227 __func__, rc); 227 __func__, rc);
228 return rc; 228 return rc;
229 } 229 }
230 230
231 rc = E_md4hash(ses->password, temp_key); 231 rc = E_md4hash(ses->password, temp_key, nls_cp);
232 if (rc) { 232 if (rc) {
233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
234 return rc; 234 return rc;
@@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
404 } 404 }
405 405
406 /* calculate md4 hash of password */ 406 /* calculate md4 hash of password */
407 E_md4hash(ses->password, nt_hash); 407 E_md4hash(ses->password, nt_hash, nls_cp);
408 408
409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
410 CIFS_NTHASH_SIZE); 410 CIFS_NTHASH_SIZE);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d9dbaf869cd1..30ff56005d8f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
126#endif /* CONFIG_CIFS_NFSD_EXPORT */ 126#endif /* CONFIG_CIFS_NFSD_EXPORT */
127 127
128#define CIFS_VERSION "1.75" 128#define CIFS_VERSION "1.76"
129#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ef4f631e4c01..6f4e243e0f62 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, 395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
396 struct TCP_Server_Info *server, 396 struct TCP_Server_Info *server,
397 __u32 expected_sequence_number); 397 __u32 expected_sequence_number);
398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
399extern int setup_ntlm_response(struct cifs_ses *); 399 const struct nls_table *);
400extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
400extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); 401extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
401extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 402extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
402extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 403extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
@@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
448 const unsigned char *path, 449 const unsigned char *path,
449 struct cifs_sb_info *cifs_sb, int xid); 450 struct cifs_sb_info *cifs_sb, int xid);
450extern int mdfour(unsigned char *, unsigned char *, int); 451extern int mdfour(unsigned char *, unsigned char *, int);
451extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); 452extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
453 const struct nls_table *codepage);
452extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, 454extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
453 unsigned char *p24); 455 unsigned char *p24);
454 456
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ef4e2846658..d6a972df0338 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3453,7 +3453,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
3453 else 3453 else
3454#endif /* CIFS_WEAK_PW_HASH */ 3454#endif /* CIFS_WEAK_PW_HASH */
3455 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, 3455 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3456 bcc_ptr); 3456 bcc_ptr, nls_codepage);
3457 3457
3458 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3458 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3459 if (ses->capabilities & CAP_UNICODE) { 3459 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ea096ce5d4f7..c1f063cd1b0c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -778,7 +778,6 @@ try_again:
778 else { 778 else {
779 mutex_lock(&cinode->lock_mutex); 779 mutex_lock(&cinode->lock_mutex);
780 list_del_init(&lock->blist); 780 list_del_init(&lock->blist);
781 mutex_unlock(&cinode->lock_mutex);
782 } 781 }
783 } 782 }
784 783
@@ -794,6 +793,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
794 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 793 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
795 unsigned char saved_type = flock->fl_type; 794 unsigned char saved_type = flock->fl_type;
796 795
796 if ((flock->fl_flags & FL_POSIX) == 0)
797 return 1;
798
797 mutex_lock(&cinode->lock_mutex); 799 mutex_lock(&cinode->lock_mutex);
798 posix_test_lock(file, flock); 800 posix_test_lock(file, flock);
799 801
@@ -810,12 +812,15 @@ static int
810cifs_posix_lock_set(struct file *file, struct file_lock *flock) 812cifs_posix_lock_set(struct file *file, struct file_lock *flock)
811{ 813{
812 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 814 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
813 int rc; 815 int rc = 1;
816
817 if ((flock->fl_flags & FL_POSIX) == 0)
818 return rc;
814 819
815 mutex_lock(&cinode->lock_mutex); 820 mutex_lock(&cinode->lock_mutex);
816 if (!cinode->can_cache_brlcks) { 821 if (!cinode->can_cache_brlcks) {
817 mutex_unlock(&cinode->lock_mutex); 822 mutex_unlock(&cinode->lock_mutex);
818 return 1; 823 return rc;
819 } 824 }
820 rc = posix_lock_file_wait(file, flock); 825 rc = posix_lock_file_wait(file, flock);
821 mutex_unlock(&cinode->lock_mutex); 826 mutex_unlock(&cinode->lock_mutex);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2c50bd2f65d1..e851d5b8931e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
132 inode->i_mtime = fattr->cf_mtime; 132 inode->i_mtime = fattr->cf_mtime;
133 inode->i_ctime = fattr->cf_ctime; 133 inode->i_ctime = fattr->cf_ctime;
134 inode->i_rdev = fattr->cf_rdev; 134 inode->i_rdev = fattr->cf_rdev;
135 inode->i_nlink = fattr->cf_nlink; 135 set_nlink(inode, fattr->cf_nlink);
136 inode->i_uid = fattr->cf_uid; 136 inode->i_uid = fattr->cf_uid;
137 inode->i_gid = fattr->cf_gid; 137 inode->i_gid = fattr->cf_gid;
138 138
@@ -905,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
905 if (rc && tcon->ipc) { 905 if (rc && tcon->ipc) {
906 cFYI(1, "ipc connection - fake read inode"); 906 cFYI(1, "ipc connection - fake read inode");
907 inode->i_mode |= S_IFDIR; 907 inode->i_mode |= S_IFDIR;
908 inode->i_nlink = 2; 908 set_nlink(inode, 2);
909 inode->i_op = &cifs_ipc_inode_ops; 909 inode->i_op = &cifs_ipc_inode_ops;
910 inode->i_fop = &simple_dir_operations; 910 inode->i_fop = &simple_dir_operations;
911 inode->i_uid = cifs_sb->mnt_uid; 911 inode->i_uid = cifs_sb->mnt_uid;
@@ -1367,7 +1367,7 @@ mkdir_get_info:
1367 /* setting nlink not necessary except in cases where we 1367 /* setting nlink not necessary except in cases where we
1368 * failed to get it from the server or was set bogus */ 1368 * failed to get it from the server or was set bogus */
1369 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1369 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1370 direntry->d_inode->i_nlink = 2; 1370 set_nlink(direntry->d_inode, 2);
1371 1371
1372 mode &= ~current_umask(); 1372 mode &= ~current_umask();
1373 /* must turn on setgid bit if parent dir has it */ 1373 /* must turn on setgid bit if parent dir has it */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 8693b5d0e180..6b0e06434391 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -433,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
433 if (old_file->d_inode) { 433 if (old_file->d_inode) {
434 cifsInode = CIFS_I(old_file->d_inode); 434 cifsInode = CIFS_I(old_file->d_inode);
435 if (rc == 0) { 435 if (rc == 0) {
436 old_file->d_inode->i_nlink++; 436 inc_nlink(old_file->d_inode);
437/* BB should we make this contingent on superblock flag NOATIME? */ 437/* BB should we make this contingent on superblock flag NOATIME? */
438/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ 438/* old_file->d_inode->i_ctime = CURRENT_TIME;*/
439 /* parent dir timestamps will update from srv 439 /* parent dir timestamps will update from srv
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c7d80e24f24e..4ec3ee9d72cc 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate:
683 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 683 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 684
685 /* calculate ntlm response and session key */ 685 /* calculate ntlm response and session key */
686 rc = setup_ntlm_response(ses); 686 rc = setup_ntlm_response(ses, nls_cp);
687 if (rc) { 687 if (rc) {
688 cERROR(1, "Error %d during NTLM authentication", rc); 688 cERROR(1, "Error %d during NTLM authentication", rc);
689 goto ssetup_exit; 689 goto ssetup_exit;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index ac1221d969d6..7cacba12b8f1 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
199 return rc; 199 return rc;
200} 200}
201 201
202/* Routines for Windows NT MD4 Hash functions. */
203static int
204_my_wcslen(__u16 *str)
205{
206 int len = 0;
207 while (*str++ != 0)
208 len++;
209 return len;
210}
211
212/*
213 * Convert a string into an NT UNICODE string.
214 * Note that regardless of processor type
215 * this must be in intel (little-endian)
216 * format.
217 */
218
219static int
220_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
221{ /* BB not a very good conversion routine - change/fix */
222 int i;
223 __u16 val;
224
225 for (i = 0; i < len; i++) {
226 val = *src;
227 SSVAL(dst, 0, val);
228 dst++;
229 src++;
230 if (val == 0)
231 break;
232 }
233 return i;
234}
235
236/* 202/*
237 * Creates the MD4 Hash of the users password in NT UNICODE. 203 * Creates the MD4 Hash of the users password in NT UNICODE.
238 */ 204 */
239 205
240int 206int
241E_md4hash(const unsigned char *passwd, unsigned char *p16) 207E_md4hash(const unsigned char *passwd, unsigned char *p16,
208 const struct nls_table *codepage)
242{ 209{
243 int rc; 210 int rc;
244 int len; 211 int len;
245 __u16 wpwd[129]; 212 __u16 wpwd[129];
246 213
247 /* Password cannot be longer than 128 characters */ 214 /* Password cannot be longer than 128 characters */
248 if (passwd) { 215 if (passwd) /* Password must be converted to NT unicode */
249 len = strlen((char *) passwd); 216 len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
250 if (len > 128) 217 else {
251 len = 128;
252
253 /* Password must be converted to NT unicode */
254 _my_mbstowcs(wpwd, passwd, len);
255 } else
256 len = 0; 218 len = 0;
219 *wpwd = 0; /* Ensure string is null terminated */
220 }
257 221
258 wpwd[len] = 0; /* Ensure string is null terminated */ 222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16));
259 /* Calculate length in bytes */ 223 memset(wpwd, 0, 129 * sizeof(__u16));
260 len = _my_wcslen(wpwd) * sizeof(__u16);
261
262 rc = mdfour(p16, (unsigned char *) wpwd, len);
263 memset(wpwd, 0, 129 * 2);
264 224
265 return rc; 225 return rc;
266} 226}
267 227
268/* Does the NT MD4 hash then des encryption. */ 228/* Does the NT MD4 hash then des encryption. */
269int 229int
270SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 230SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
231 const struct nls_table *codepage)
271{ 232{
272 int rc; 233 int rc;
273 unsigned char p16[16], p21[21]; 234 unsigned char p16[16], p21[21];
@@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
275 memset(p16, '\0', 16); 236 memset(p16, '\0', 16);
276 memset(p21, '\0', 21); 237 memset(p21, '\0', 21);
277 238
278 rc = E_md4hash(passwd, p16); 239 rc = E_md4hash(passwd, p16, codepage);
279 if (rc) { 240 if (rc) {
280 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 241 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
281 return rc; 242 return rc;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2bdbcc11b373..854ace712685 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
104 if (attr->va_gid != -1) 104 if (attr->va_gid != -1)
105 inode->i_gid = (gid_t) attr->va_gid; 105 inode->i_gid = (gid_t) attr->va_gid;
106 if (attr->va_nlink != -1) 106 if (attr->va_nlink != -1)
107 inode->i_nlink = attr->va_nlink; 107 set_nlink(inode, attr->va_nlink);
108 if (attr->va_size != -1) 108 if (attr->va_size != -1)
109 inode->i_size = attr->va_size; 109 inode->i_size = attr->va_size;
110 if (attr->va_size != -1) 110 if (attr->va_size != -1)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0239433f50cb..28e7e135cfab 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
340 if (!error) { 340 if (!error) {
341 /* VFS may delete the child */ 341 /* VFS may delete the child */
342 if (de->d_inode) 342 if (de->d_inode)
343 de->d_inode->i_nlink = 0; 343 clear_nlink(de->d_inode);
344 344
345 /* fix the link count of the parent */ 345 /* fix the link count of the parent */
346 coda_dir_drop_nlink(dir); 346 coda_dir_drop_nlink(dir);
diff --git a/fs/compat.c b/fs/compat.c
index 302e761bd0aa..c98787536bb8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -546,7 +546,7 @@ out:
546ssize_t compat_rw_copy_check_uvector(int type, 546ssize_t compat_rw_copy_check_uvector(int type,
547 const struct compat_iovec __user *uvector, unsigned long nr_segs, 547 const struct compat_iovec __user *uvector, unsigned long nr_segs,
548 unsigned long fast_segs, struct iovec *fast_pointer, 548 unsigned long fast_segs, struct iovec *fast_pointer,
549 struct iovec **ret_pointer) 549 struct iovec **ret_pointer, int check_access)
550{ 550{
551 compat_ssize_t tot_len; 551 compat_ssize_t tot_len;
552 struct iovec *iov = *ret_pointer = fast_pointer; 552 struct iovec *iov = *ret_pointer = fast_pointer;
@@ -593,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
593 } 593 }
594 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 594 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
595 goto out; 595 goto out;
596 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 596 if (check_access &&
597 !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
597 ret = -EFAULT; 598 ret = -EFAULT;
598 goto out; 599 goto out;
599 } 600 }
@@ -1107,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1107 goto out; 1108 goto out;
1108 1109
1109 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, 1110 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1110 UIO_FASTIOV, iovstack, &iov); 1111 UIO_FASTIOV, iovstack, &iov, 1);
1111 if (tot_len == 0) { 1112 if (tot_len == 0) {
1112 ret = 0; 1113 ret = 0;
1113 goto out; 1114 goto out;
diff --git a/fs/dcache.c b/fs/dcache.c
index a88948b8bd17..274f13e2f094 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -225,7 +225,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
225} 225}
226 226
227/* 227/*
228 * dentry_lru_(add|del|move_tail) must be called with d_lock held. 228 * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
229 */ 229 */
230static void dentry_lru_add(struct dentry *dentry) 230static void dentry_lru_add(struct dentry *dentry)
231{ 231{
@@ -245,6 +245,9 @@ static void __dentry_lru_del(struct dentry *dentry)
245 dentry_stat.nr_unused--; 245 dentry_stat.nr_unused--;
246} 246}
247 247
248/*
249 * Remove a dentry with references from the LRU.
250 */
248static void dentry_lru_del(struct dentry *dentry) 251static void dentry_lru_del(struct dentry *dentry)
249{ 252{
250 if (!list_empty(&dentry->d_lru)) { 253 if (!list_empty(&dentry->d_lru)) {
@@ -254,6 +257,23 @@ static void dentry_lru_del(struct dentry *dentry)
254 } 257 }
255} 258}
256 259
260/*
261 * Remove a dentry that is unreferenced and about to be pruned
262 * (unhashed and destroyed) from the LRU, and inform the file system.
263 * This wrapper should be called _prior_ to unhashing a victim dentry.
264 */
265static void dentry_lru_prune(struct dentry *dentry)
266{
267 if (!list_empty(&dentry->d_lru)) {
268 if (dentry->d_flags & DCACHE_OP_PRUNE)
269 dentry->d_op->d_prune(dentry);
270
271 spin_lock(&dcache_lru_lock);
272 __dentry_lru_del(dentry);
273 spin_unlock(&dcache_lru_lock);
274 }
275}
276
257static void dentry_lru_move_tail(struct dentry *dentry) 277static void dentry_lru_move_tail(struct dentry *dentry)
258{ 278{
259 spin_lock(&dcache_lru_lock); 279 spin_lock(&dcache_lru_lock);
@@ -403,8 +423,12 @@ relock:
403 423
404 if (ref) 424 if (ref)
405 dentry->d_count--; 425 dentry->d_count--;
406 /* if dentry was on the d_lru list delete it from there */ 426 /*
407 dentry_lru_del(dentry); 427 * if dentry was on the d_lru list delete it from there.
428 * inform the fs via d_prune that this dentry is about to be
429 * unhashed and destroyed.
430 */
431 dentry_lru_prune(dentry);
408 /* if it was on the hash then remove it */ 432 /* if it was on the hash then remove it */
409 __d_drop(dentry); 433 __d_drop(dentry);
410 return d_kill(dentry, parent); 434 return d_kill(dentry, parent);
@@ -854,8 +878,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
854 do { 878 do {
855 struct inode *inode; 879 struct inode *inode;
856 880
857 /* detach from the system */ 881 /*
858 dentry_lru_del(dentry); 882 * remove the dentry from the lru, and inform
883 * the fs that this dentry is about to be
884 * unhashed and destroyed.
885 */
886 dentry_lru_prune(dentry);
859 __d_shrink(dentry); 887 __d_shrink(dentry);
860 888
861 if (dentry->d_count != 0) { 889 if (dentry->d_count != 0) {
@@ -1283,6 +1311,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1283 dentry->d_flags |= DCACHE_OP_REVALIDATE; 1311 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1284 if (op->d_delete) 1312 if (op->d_delete)
1285 dentry->d_flags |= DCACHE_OP_DELETE; 1313 dentry->d_flags |= DCACHE_OP_DELETE;
1314 if (op->d_prune)
1315 dentry->d_flags |= DCACHE_OP_PRUNE;
1286 1316
1287} 1317}
1288EXPORT_SYMBOL(d_set_d_op); 1318EXPORT_SYMBOL(d_set_d_op);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2f27e578d466..d5d5297efe97 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
308 inode->i_op = &simple_dir_inode_operations; 308 inode->i_op = &simple_dir_inode_operations;
309 inode->i_fop = &simple_dir_operations; 309 inode->i_fop = &simple_dir_operations;
310 inode->i_nlink = 2; 310 set_nlink(inode, 2);
311 311
312 s->s_root = d_alloc_root(inode); 312 s->s_root = d_alloc_root(inode);
313 if (s->s_root) 313 if (s->s_root)
@@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty)
549 549
550 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
551 551
552 inode->i_nlink--; 552 drop_nlink(inode);
553 d_delete(dentry); 553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */ 554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
555 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b36c5572b3f3..54481a3b2c79 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
514 514
515#define ecryptfs_printk(type, fmt, arg...) \ 515#define ecryptfs_printk(type, fmt, arg...) \
516 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); 516 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
517__attribute__ ((format(printf, 1, 2))) 517__printf(1, 2)
518void __ecryptfs_printk(const char *fmt, ...); 518void __ecryptfs_printk(const char *fmt, ...);
519 519
520extern const struct file_operations ecryptfs_main_fops; 520extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 11f8582d7218..a36d327f1521 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -474,8 +474,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
474 goto out_lock; 474 goto out_lock;
475 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 475 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
476 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); 476 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
477 old_dentry->d_inode->i_nlink = 477 set_nlink(old_dentry->d_inode,
478 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 478 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink);
479 i_size_write(new_dentry->d_inode, file_size_save); 479 i_size_write(new_dentry->d_inode, file_size_save);
480out_lock: 480out_lock:
481 unlock_dir(lower_dir_dentry); 481 unlock_dir(lower_dir_dentry);
@@ -499,8 +499,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
499 goto out_unlock; 499 goto out_unlock;
500 } 500 }
501 fsstack_copy_attr_times(dir, lower_dir_inode); 501 fsstack_copy_attr_times(dir, lower_dir_inode);
502 dentry->d_inode->i_nlink = 502 set_nlink(dentry->d_inode,
503 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; 503 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink);
504 dentry->d_inode->i_ctime = dir->i_ctime; 504 dentry->d_inode->i_ctime = dir->i_ctime;
505 d_drop(dentry); 505 d_drop(dentry);
506out_unlock: 506out_unlock:
@@ -565,7 +565,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
565 goto out; 565 goto out;
566 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 566 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
567 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); 567 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
568 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; 568 set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
569out: 569out:
570 unlock_dir(lower_dir_dentry); 570 unlock_dir(lower_dir_dentry);
571 if (!dentry->d_inode) 571 if (!dentry->d_inode)
@@ -588,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
588 if (!rc && dentry->d_inode) 588 if (!rc && dentry->d_inode)
589 clear_nlink(dentry->d_inode); 589 clear_nlink(dentry->d_inode);
590 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 590 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
591 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; 591 set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
592 unlock_dir(lower_dir_dentry); 592 unlock_dir(lower_dir_dentry);
593 if (!rc) 593 if (!rc)
594 d_drop(dentry); 594 d_drop(dentry);
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 9c13412e6c99..bc84f365d75c 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
96 efs_inode = (struct efs_dinode *) (bh->b_data + offset); 96 efs_inode = (struct efs_dinode *) (bh->b_data + offset);
97 97
98 inode->i_mode = be16_to_cpu(efs_inode->di_mode); 98 inode->i_mode = be16_to_cpu(efs_inode->di_mode);
99 inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); 99 set_nlink(inode, be16_to_cpu(efs_inode->di_nlink));
100 inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); 100 inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid);
101 inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); 101 inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid);
102 inode->i_size = be32_to_cpu(efs_inode->di_size); 102 inode->i_size = be32_to_cpu(efs_inode->di_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9026fc91fe3b..828e750af23a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
70 * simultaneous inserts (A into B and B into A) from racing and 70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is 71 * constructing a cycle without either insert observing that it is
72 * going to. 72 * going to.
73 * It is necessary to acquire multiple "ep->mtx"es at once in the
74 * case when one epoll fd is added to another. In this case, we
75 * always acquire the locks in the order of nesting (i.e. after
76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
77 * before e2->mtx). Since we disallow cycles of epoll file
78 * descriptors, this ensures that the mutexes are well-ordered. In
79 * order to communicate this nesting to lockdep, when walking a tree
80 * of epoll file descriptors, we use the current recursion depth as
81 * the lockdep subkey.
73 * It is possible to drop the "ep->mtx" and to use the global 82 * It is possible to drop the "ep->mtx" and to use the global
74 * mutex "epmutex" (together with "ep->lock") to have it working, 83 * mutex "epmutex" (together with "ep->lock") to have it working,
75 * but having "ep->mtx" will make the interface more scalable. 84 * but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
464 * @ep: Pointer to the epoll private data structure. 473 * @ep: Pointer to the epoll private data structure.
465 * @sproc: Pointer to the scan callback. 474 * @sproc: Pointer to the scan callback.
466 * @priv: Private opaque data passed to the @sproc callback. 475 * @priv: Private opaque data passed to the @sproc callback.
476 * @depth: The current depth of recursive f_op->poll calls.
467 * 477 *
468 * Returns: The same integer error code returned by the @sproc callback. 478 * Returns: The same integer error code returned by the @sproc callback.
469 */ 479 */
470static int ep_scan_ready_list(struct eventpoll *ep, 480static int ep_scan_ready_list(struct eventpoll *ep,
471 int (*sproc)(struct eventpoll *, 481 int (*sproc)(struct eventpoll *,
472 struct list_head *, void *), 482 struct list_head *, void *),
473 void *priv) 483 void *priv,
484 int depth)
474{ 485{
475 int error, pwake = 0; 486 int error, pwake = 0;
476 unsigned long flags; 487 unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
481 * We need to lock this because we could be hit by 492 * We need to lock this because we could be hit by
482 * eventpoll_release_file() and epoll_ctl(). 493 * eventpoll_release_file() and epoll_ctl().
483 */ 494 */
484 mutex_lock(&ep->mtx); 495 mutex_lock_nested(&ep->mtx, depth);
485 496
486 /* 497 /*
487 * Steal the ready list, and re-init the original one to the 498 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
670 681
671static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) 682static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
672{ 683{
673 return ep_scan_ready_list(priv, ep_read_events_proc, NULL); 684 return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
674} 685}
675 686
676static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 687static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
737 748
738 ep = epi->ep; 749 ep = epi->ep;
739 list_del_init(&epi->fllink); 750 list_del_init(&epi->fllink);
740 mutex_lock(&ep->mtx); 751 mutex_lock_nested(&ep->mtx, 0);
741 ep_remove(ep, epi); 752 ep_remove(ep, epi);
742 mutex_unlock(&ep->mtx); 753 mutex_unlock(&ep->mtx);
743 } 754 }
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
1134 esed.maxevents = maxevents; 1145 esed.maxevents = maxevents;
1135 esed.events = events; 1146 esed.events = events;
1136 1147
1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1148 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1138} 1149}
1139 1150
1140static inline struct timespec ep_set_mstimeout(long ms) 1151static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1267 struct rb_node *rbp; 1278 struct rb_node *rbp;
1268 struct epitem *epi; 1279 struct epitem *epi;
1269 1280
1270 mutex_lock(&ep->mtx); 1281 mutex_lock_nested(&ep->mtx, call_nests + 1);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn); 1283 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) { 1284 if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1409 } 1420 }
1410 1421
1411 1422
1412 mutex_lock(&ep->mtx); 1423 mutex_lock_nested(&ep->mtx, 0);
1413 1424
1414 /* 1425 /*
1415 * Try to lookup the file inside our RB tree, Since we grabbed "mtx" 1426 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/exec.c b/fs/exec.c
index 25dcbe5fc356..36254645b7cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm)
841 tsk->mm = mm; 841 tsk->mm = mm;
842 tsk->active_mm = mm; 842 tsk->active_mm = mm;
843 activate_mm(active_mm, mm); 843 activate_mm(active_mm, mm);
844 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
845 atomic_dec(&old_mm->oom_disable_count);
846 atomic_inc(&tsk->mm->oom_disable_count);
847 }
848 task_unlock(tsk); 844 task_unlock(tsk);
849 arch_pick_mmap_layout(mm); 845 arch_pick_mmap_layout(mm);
850 if (old_mm) { 846 if (old_mm) {
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index fa9a286c8771..da42f32c49be 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -5,7 +5,7 @@
5# selected by any of the users. 5# selected by any of the users.
6config ORE 6config ORE
7 tristate 7 tristate
8 depends on EXOFS_FS 8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR 9 select ASYNC_XOR
10 default SCSI_OSD_ULD 10 default SCSI_OSD_ULD
11 11
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3e5f3a6be90a..f6dbf7768ce6 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1165,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1165 inode->i_mode = le16_to_cpu(fcb.i_mode); 1165 inode->i_mode = le16_to_cpu(fcb.i_mode);
1166 inode->i_uid = le32_to_cpu(fcb.i_uid); 1166 inode->i_uid = le32_to_cpu(fcb.i_uid);
1167 inode->i_gid = le32_to_cpu(fcb.i_gid); 1167 inode->i_gid = le32_to_cpu(fcb.i_gid);
1168 inode->i_nlink = le16_to_cpu(fcb.i_links_count); 1168 set_nlink(inode, le16_to_cpu(fcb.i_links_count));
1169 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); 1169 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
1170 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); 1170 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
1171 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); 1171 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 8f44cef1b3ef..a8cbe1bc6ad4 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv)
421void ext2_init_block_alloc_info(struct inode *inode) 421void ext2_init_block_alloc_info(struct inode *inode)
422{ 422{
423 struct ext2_inode_info *ei = EXT2_I(inode); 423 struct ext2_inode_info *ei = EXT2_I(inode);
424 struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; 424 struct ext2_block_alloc_info *block_i;
425 struct super_block *sb = inode->i_sb; 425 struct super_block *sb = inode->i_sb;
426 426
427 block_i = kmalloc(sizeof(*block_i), GFP_NOFS); 427 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index af9fc89b1b2d..9a4e5e206d08 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
135struct dentry *ext2_get_parent(struct dentry *child); 135struct dentry *ext2_get_parent(struct dentry *child);
136 136
137/* super.c */ 137/* super.c */
138extern void ext2_error (struct super_block *, const char *, const char *, ...) 138extern __printf(3, 4)
139 __attribute__ ((format (printf, 3, 4))); 139void ext2_error(struct super_block *, const char *, const char *, ...);
140extern void ext2_msg(struct super_block *, const char *, const char *, ...) 140extern __printf(3, 4)
141 __attribute__ ((format (printf, 3, 4))); 141void ext2_msg(struct super_block *, const char *, const char *, ...);
142extern void ext2_update_dynamic_rev (struct super_block *sb); 142extern void ext2_update_dynamic_rev (struct super_block *sb);
143extern void ext2_write_super (struct super_block *); 143extern void ext2_write_super (struct super_block *);
144 144
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ee9ed31948e1..c4e81dfb74ba 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -601,7 +601,7 @@ fail_free_drop:
601fail_drop: 601fail_drop:
602 dquot_drop(inode); 602 dquot_drop(inode);
603 inode->i_flags |= S_NOQUOTA; 603 inode->i_flags |= S_NOQUOTA;
604 inode->i_nlink = 0; 604 clear_nlink(inode);
605 unlock_new_inode(inode); 605 unlock_new_inode(inode);
606 iput(inode); 606 iput(inode);
607 return ERR_PTR(err); 607 return ERR_PTR(err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a8a58f63f07c..91a6945af6d8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
1323 } 1323 }
1324 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 1324 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
1325 inode->i_size = le32_to_cpu(raw_inode->i_size); 1325 inode->i_size = le32_to_cpu(raw_inode->i_size);
1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1dd62ed35b85..bd8ac164a3bf 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb,
327 if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) 327 if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
328 return ERR_PTR(-ESTALE); 328 return ERR_PTR(-ESTALE);
329 329
330 /* iget isn't really right if the inode is currently unallocated!! 330 /*
331 * ext2_read_inode currently does appropriate checks, but 331 * ext2_iget isn't quite right if the inode is currently unallocated!
332 * it might be "neater" to call ext2_get_inode first and check 332 * However ext2_iget currently does appropriate checks to handle stale
333 * if the inode is valid..... 333 * inodes so everything is OK.
334 */ 334 */
335 inode = ext2_iget(sb, ino); 335 inode = ext2_iget(sb, ino);
336 if (IS_ERR(inode)) 336 if (IS_ERR(inode))
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 6386d76f44a7..a2038928f9a3 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
427void ext3_init_block_alloc_info(struct inode *inode) 427void ext3_init_block_alloc_info(struct inode *inode)
428{ 428{
429 struct ext3_inode_info *ei = EXT3_I(inode); 429 struct ext3_inode_info *ei = EXT3_I(inode);
430 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; 430 struct ext3_block_alloc_info *block_i;
431 struct super_block *sb = inode->i_sb; 431 struct super_block *sb = inode->i_sb;
432 432
433 block_i = kmalloc(sizeof(*block_i), GFP_NOFS); 433 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
@@ -1440,14 +1440,14 @@ out:
1440 * 1440 *
1441 * Check if filesystem has at least 1 free block available for allocation. 1441 * Check if filesystem has at least 1 free block available for allocation.
1442 */ 1442 */
1443static int ext3_has_free_blocks(struct ext3_sb_info *sbi) 1443static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
1444{ 1444{
1445 ext3_fsblk_t free_blocks, root_blocks; 1445 ext3_fsblk_t free_blocks, root_blocks;
1446 1446
1447 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1447 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1448 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); 1448 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1449 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1449 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1450 sbi->s_resuid != current_fsuid() && 1450 !use_reservation && sbi->s_resuid != current_fsuid() &&
1451 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1451 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1452 return 0; 1452 return 0;
1453 } 1453 }
@@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1468 */ 1468 */
1469int ext3_should_retry_alloc(struct super_block *sb, int *retries) 1469int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1470{ 1470{
1471 if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) 1471 if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
1472 return 0; 1472 return 0;
1473 1473
1474 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 1474 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1546 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) 1546 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1547 my_rsv = &block_i->rsv_window_node; 1547 my_rsv = &block_i->rsv_window_node;
1548 1548
1549 if (!ext3_has_free_blocks(sbi)) { 1549 if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
1550 *errp = -ENOSPC; 1550 *errp = -ENOSPC;
1551 goto out; 1551 goto out;
1552 } 1552 }
@@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1924 * reaches any used block. Then issue a TRIM command on this extent and free 1924 * reaches any used block. Then issue a TRIM command on this extent and free
1925 * the extent in the block bitmap. This is done until whole group is scanned. 1925 * the extent in the block bitmap. This is done until whole group is scanned.
1926 */ 1926 */
1927ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, 1927static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
1928 ext3_grpblk_t start, ext3_grpblk_t max, 1928 unsigned int group,
1929 ext3_grpblk_t minblocks) 1929 ext3_grpblk_t start, ext3_grpblk_t max,
1930 ext3_grpblk_t minblocks)
1930{ 1931{
1931 handle_t *handle; 1932 handle_t *handle;
1932 ext3_grpblk_t next, free_blocks, bit, freed, count = 0; 1933 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d494c554c6e6..1860ed356323 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
61 if (ret) 61 if (ret)
62 goto out; 62 goto out;
63 63
64 /*
65 * Taking the mutex here just to keep consistent with how fsync was
66 * called previously, however it looks like we don't need to take
67 * i_mutex at all.
68 */
69 mutex_lock(&inode->i_mutex);
70
71 J_ASSERT(ext3_journal_current_handle() == NULL); 64 J_ASSERT(ext3_journal_current_handle() == NULL);
72 65
73 /* 66 /*
@@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
85 * safe in-journal, which is all fsync() needs to ensure. 78 * safe in-journal, which is all fsync() needs to ensure.
86 */ 79 */
87 if (ext3_should_journal_data(inode)) { 80 if (ext3_should_journal_data(inode)) {
88 mutex_unlock(&inode->i_mutex);
89 ret = ext3_force_commit(inode->i_sb); 81 ret = ext3_force_commit(inode->i_sb);
90 goto out; 82 goto out;
91 } 83 }
@@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
108 */ 100 */
109 if (needs_barrier) 101 if (needs_barrier)
110 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 102 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
111
112 mutex_unlock(&inode->i_mutex);
113out: 103out:
114 trace_ext3_sync_file_exit(inode, ret); 104 trace_ext3_sync_file_exit(inode, ret);
115 return ret; 105 return ret;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index bf09cbf938cc..5c866e06e7ab 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -178,42 +178,6 @@ error_return:
178} 178}
179 179
180/* 180/*
181 * There are two policies for allocating an inode. If the new inode is
182 * a directory, then a forward search is made for a block group with both
183 * free space and a low directory-to-inode ratio; if that fails, then of
184 * the groups with above-average free space, that group with the fewest
185 * directories already is chosen.
186 *
187 * For other inodes, search forward from the parent directory\'s block
188 * group to find a free inode.
189 */
190static int find_group_dir(struct super_block *sb, struct inode *parent)
191{
192 int ngroups = EXT3_SB(sb)->s_groups_count;
193 unsigned int freei, avefreei;
194 struct ext3_group_desc *desc, *best_desc = NULL;
195 int group, best_group = -1;
196
197 freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
198 avefreei = freei / ngroups;
199
200 for (group = 0; group < ngroups; group++) {
201 desc = ext3_get_group_desc (sb, group, NULL);
202 if (!desc || !desc->bg_free_inodes_count)
203 continue;
204 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
205 continue;
206 if (!best_desc ||
207 (le16_to_cpu(desc->bg_free_blocks_count) >
208 le16_to_cpu(best_desc->bg_free_blocks_count))) {
209 best_group = group;
210 best_desc = desc;
211 }
212 }
213 return best_group;
214}
215
216/*
217 * Orlov's allocator for directories. 181 * Orlov's allocator for directories.
218 * 182 *
219 * We always try to spread first-level directories. 183 * We always try to spread first-level directories.
@@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
436 400
437 sbi = EXT3_SB(sb); 401 sbi = EXT3_SB(sb);
438 es = sbi->s_es; 402 es = sbi->s_es;
439 if (S_ISDIR(mode)) { 403 if (S_ISDIR(mode))
440 if (test_opt (sb, OLDALLOC)) 404 group = find_group_orlov(sb, dir);
441 group = find_group_dir(sb, dir); 405 else
442 else
443 group = find_group_orlov(sb, dir);
444 } else
445 group = find_group_other(sb, dir); 406 group = find_group_other(sb, dir);
446 407
447 err = -ENOSPC; 408 err = -ENOSPC;
@@ -621,7 +582,7 @@ fail_free_drop:
621fail_drop: 582fail_drop:
622 dquot_drop(inode); 583 dquot_drop(inode);
623 inode->i_flags |= S_NOQUOTA; 584 inode->i_flags |= S_NOQUOTA;
624 inode->i_nlink = 0; 585 clear_nlink(inode);
625 unlock_new_inode(inode); 586 unlock_new_inode(inode);
626 iput(inode); 587 iput(inode);
627 brelse(bitmap_bh); 588 brelse(bitmap_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 12661e1deedd..85fe655fe3e0 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2899 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2899 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2900 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2900 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2901 } 2901 }
2902 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2902 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
2903 inode->i_size = le32_to_cpu(raw_inode->i_size); 2903 inode->i_size = le32_to_cpu(raw_inode->i_size);
2904 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 2904 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2905 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 2905 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index c7f43944f160..ba1b54e23cae 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -150,30 +150,6 @@ setversion_out:
150 mnt_drop_write(filp->f_path.mnt); 150 mnt_drop_write(filp->f_path.mnt);
151 return err; 151 return err;
152 } 152 }
153#ifdef CONFIG_JBD_DEBUG
154 case EXT3_IOC_WAIT_FOR_READONLY:
155 /*
156 * This is racy - by the time we're woken up and running,
157 * the superblock could be released. And the module could
158 * have been unloaded. So sue me.
159 *
160 * Returns 1 if it slept, else zero.
161 */
162 {
163 struct super_block *sb = inode->i_sb;
164 DECLARE_WAITQUEUE(wait, current);
165 int ret = 0;
166
167 set_current_state(TASK_INTERRUPTIBLE);
168 add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
169 if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
170 schedule();
171 ret = 1;
172 }
173 remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
174 return ret;
175 }
176#endif
177 case EXT3_IOC_GETRSVSZ: 153 case EXT3_IOC_GETRSVSZ:
178 if (test_opt(inode->i_sb, RESERVATION) 154 if (test_opt(inode->i_sb, RESERVATION)
179 && S_ISREG(inode->i_mode) 155 && S_ISREG(inode->i_mode)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0629e09f6511..642dc6d66dfd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1821,7 +1821,7 @@ retry:
1821 de->name_len = 2; 1821 de->name_len = 2;
1822 strcpy (de->name, ".."); 1822 strcpy (de->name, "..");
1823 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1823 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1824 inode->i_nlink = 2; 1824 set_nlink(inode, 2);
1825 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1825 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1826 err = ext3_journal_dirty_metadata(handle, dir_block); 1826 err = ext3_journal_dirty_metadata(handle, dir_block);
1827 if (err) 1827 if (err)
@@ -1833,7 +1833,7 @@ retry:
1833 1833
1834 if (err) { 1834 if (err) {
1835out_clear_inode: 1835out_clear_inode:
1836 inode->i_nlink = 0; 1836 clear_nlink(inode);
1837 unlock_new_inode(inode); 1837 unlock_new_inode(inode);
1838 ext3_mark_inode_dirty(handle, inode); 1838 ext3_mark_inode_dirty(handle, inode);
1839 iput (inode); 1839 iput (inode);
@@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2170 ext3_warning (inode->i_sb, "ext3_unlink", 2170 ext3_warning (inode->i_sb, "ext3_unlink",
2171 "Deleting nonexistent file (%lu), %d", 2171 "Deleting nonexistent file (%lu), %d",
2172 inode->i_ino, inode->i_nlink); 2172 inode->i_ino, inode->i_nlink);
2173 inode->i_nlink = 1; 2173 set_nlink(inode, 1);
2174 } 2174 }
2175 retval = ext3_delete_entry(handle, dir, de, bh); 2175 retval = ext3_delete_entry(handle, dir, de, bh);
2176 if (retval) 2176 if (retval)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7beb69ae0015..922d289aeeb3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
652 seq_puts(seq, ",nouid32"); 652 seq_puts(seq, ",nouid32");
653 if (test_opt(sb, DEBUG)) 653 if (test_opt(sb, DEBUG))
654 seq_puts(seq, ",debug"); 654 seq_puts(seq, ",debug");
655 if (test_opt(sb, OLDALLOC))
656 seq_puts(seq, ",oldalloc");
657#ifdef CONFIG_EXT3_FS_XATTR 655#ifdef CONFIG_EXT3_FS_XATTR
658 if (test_opt(sb, XATTR_USER)) 656 if (test_opt(sb, XATTR_USER))
659 seq_puts(seq, ",user_xattr"); 657 seq_puts(seq, ",user_xattr");
@@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb,
1049 set_opt (sbi->s_mount_opt, DEBUG); 1047 set_opt (sbi->s_mount_opt, DEBUG);
1050 break; 1048 break;
1051 case Opt_oldalloc: 1049 case Opt_oldalloc:
1052 set_opt (sbi->s_mount_opt, OLDALLOC); 1050 ext3_msg(sb, KERN_WARNING,
1051 "Ignoring deprecated oldalloc option");
1053 break; 1052 break;
1054 case Opt_orlov: 1053 case Opt_orlov:
1055 clear_opt (sbi->s_mount_opt, OLDALLOC); 1054 ext3_msg(sb, KERN_WARNING,
1055 "Ignoring deprecated orlov option");
1056 break; 1056 break;
1057#ifdef CONFIG_EXT3_FS_XATTR 1057#ifdef CONFIG_EXT3_FS_XATTR
1058 case Opt_user_xattr: 1058 case Opt_user_xattr:
@@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2669 /* 2669 /*
2670 * If we have an unprocessed orphan list hanging 2670 * If we have an unprocessed orphan list hanging
2671 * around from a previously readonly bdev mount, 2671 * around from a previously readonly bdev mount,
2672 * require a full umount/remount for now. 2672 * require a full umount & mount for now.
2673 */ 2673 */
2674 if (es->s_last_orphan) { 2674 if (es->s_last_orphan) {
2675 ext3_msg(sb, KERN_WARNING, "warning: couldn't " 2675 ext3_msg(sb, KERN_WARNING, "warning: couldn't "
2676 "remount RDWR because of unprocessed " 2676 "remount RDWR because of unprocessed "
2677 "orphan inode list. Please " 2677 "orphan inode list. Please "
2678 "umount/remount instead."); 2678 "umount & mount instead.");
2679 err = -EINVAL; 2679 err = -EINVAL;
2680 goto restore_opts; 2680 goto restore_opts;
2681 } 2681 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f8224adf496e..f6dba4505f1c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -28,7 +28,8 @@
28 */ 28 */
29 29
30/* 30/*
31 * Calculate the block group number and offset, given a block number 31 * Calculate the block group number and offset into the block/cluster
32 * allocation bitmap, given a block number
32 */ 33 */
33void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 34void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
34 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) 35 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
37 ext4_grpblk_t offset; 38 ext4_grpblk_t offset;
38 39
39 blocknr = blocknr - le32_to_cpu(es->s_first_data_block); 40 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
40 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); 41 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
42 EXT4_SB(sb)->s_cluster_bits;
41 if (offsetp) 43 if (offsetp)
42 *offsetp = offset; 44 *offsetp = offset;
43 if (blockgrpp) 45 if (blockgrpp)
@@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
55 return 0; 57 return 0;
56} 58}
57 59
58static int ext4_group_used_meta_blocks(struct super_block *sb, 60/* Return the number of clusters used for file system metadata; this
59 ext4_group_t block_group, 61 * represents the overhead needed by the file system.
60 struct ext4_group_desc *gdp) 62 */
63unsigned ext4_num_overhead_clusters(struct super_block *sb,
64 ext4_group_t block_group,
65 struct ext4_group_desc *gdp)
61{ 66{
62 ext4_fsblk_t tmp; 67 unsigned num_clusters;
68 int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
69 ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
70 ext4_fsblk_t itbl_blk;
63 struct ext4_sb_info *sbi = EXT4_SB(sb); 71 struct ext4_sb_info *sbi = EXT4_SB(sb);
64 /* block bitmap, inode bitmap, and inode table blocks */
65 int used_blocks = sbi->s_itb_per_group + 2;
66 72
67 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 73 /* This is the number of clusters used by the superblock,
68 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), 74 * block group descriptors, and reserved block group
69 block_group)) 75 * descriptor blocks */
70 used_blocks--; 76 num_clusters = ext4_num_base_meta_clusters(sb, block_group);
71 77
72 if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), 78 /*
73 block_group)) 79 * For the allocation bitmaps and inode table, we first need
74 used_blocks--; 80 * to check to see if the block is in the block group. If it
75 81 * is, then check to see if the cluster is already accounted
76 tmp = ext4_inode_table(sb, gdp); 82 * for in the clusters used for the base metadata cluster, or
77 for (; tmp < ext4_inode_table(sb, gdp) + 83 * if we can increment the base metadata cluster to include
78 sbi->s_itb_per_group; tmp++) { 84 * that block. Otherwise, we will have to track the cluster
79 if (!ext4_block_in_group(sb, tmp, block_group)) 85 * used for the allocation bitmap or inode table explicitly.
80 used_blocks -= 1; 86 * Normally all of these blocks are contiguous, so the special
87 * case handling shouldn't be necessary except for *very*
88 * unusual file system layouts.
89 */
90 if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
91 block_cluster = EXT4_B2C(sbi, (start -
92 ext4_block_bitmap(sb, gdp)));
93 if (block_cluster < num_clusters)
94 block_cluster = -1;
95 else if (block_cluster == num_clusters) {
96 num_clusters++;
97 block_cluster = -1;
81 } 98 }
82 } 99 }
83 return used_blocks;
84}
85 100
86/* Initializes an uninitialized block bitmap if given, and returns the 101 if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
87 * number of blocks free in the group. */ 102 inode_cluster = EXT4_B2C(sbi,
88unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 103 start - ext4_inode_bitmap(sb, gdp));
89 ext4_group_t block_group, struct ext4_group_desc *gdp) 104 if (inode_cluster < num_clusters)
90{ 105 inode_cluster = -1;
91 int bit, bit_max; 106 else if (inode_cluster == num_clusters) {
92 ext4_group_t ngroups = ext4_get_groups_count(sb); 107 num_clusters++;
93 unsigned free_blocks, group_blocks; 108 inode_cluster = -1;
94 struct ext4_sb_info *sbi = EXT4_SB(sb);
95
96 if (bh) {
97 J_ASSERT_BH(bh, buffer_locked(bh));
98
99 /* If checksum is bad mark all blocks used to prevent allocation
100 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, "Checksum bad for group %u",
103 block_group);
104 ext4_free_blks_set(sb, gdp, 0);
105 ext4_free_inodes_set(sb, gdp, 0);
106 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0;
109 } 109 }
110 memset(bh->b_data, 0, sb->s_blocksize);
111 } 110 }
112 111
113 /* Check for superblock and gdt backups in this group */ 112 itbl_blk = ext4_inode_table(sb, gdp);
114 bit_max = ext4_bg_has_super(sb, block_group); 113 for (i = 0; i < sbi->s_itb_per_group; i++) {
115 114 if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
116 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || 115 c = EXT4_B2C(sbi, start - itbl_blk + i);
117 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * 116 if ((c < num_clusters) || (c == inode_cluster) ||
118 sbi->s_desc_per_block) { 117 (c == block_cluster) || (c == itbl_cluster))
119 if (bit_max) { 118 continue;
120 bit_max += ext4_bg_num_gdb(sb, block_group); 119 if (c == num_clusters) {
121 bit_max += 120 num_clusters++;
122 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); 121 continue;
122 }
123 num_clusters++;
124 itbl_cluster = c;
123 } 125 }
124 } else { /* For META_BG_BLOCK_GROUPS */
125 bit_max += ext4_bg_num_gdb(sb, block_group);
126 } 126 }
127 127
128 if (block_group == ngroups - 1) { 128 if (block_cluster != -1)
129 num_clusters++;
130 if (inode_cluster != -1)
131 num_clusters++;
132
133 return num_clusters;
134}
135
136static unsigned int num_clusters_in_group(struct super_block *sb,
137 ext4_group_t block_group)
138{
139 unsigned int blocks;
140
141 if (block_group == ext4_get_groups_count(sb) - 1) {
129 /* 142 /*
130 * Even though mke2fs always initialize first and last group 143 * Even though mke2fs always initializes the first and
131 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need 144 * last group, just in case some other tool was used,
132 * to make sure we calculate the right free blocks 145 * we need to make sure we calculate the right free
146 * blocks.
133 */ 147 */
134 group_blocks = ext4_blocks_count(sbi->s_es) - 148 blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
135 ext4_group_first_block_no(sb, ngroups - 1); 149 ext4_group_first_block_no(sb, block_group);
136 } else { 150 } else
137 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 151 blocks = EXT4_BLOCKS_PER_GROUP(sb);
138 } 152 return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
153}
139 154
140 free_blocks = group_blocks - bit_max; 155/* Initializes an uninitialized block bitmap */
156void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
157 ext4_group_t block_group,
158 struct ext4_group_desc *gdp)
159{
160 unsigned int bit, bit_max;
161 struct ext4_sb_info *sbi = EXT4_SB(sb);
162 ext4_fsblk_t start, tmp;
163 int flex_bg = 0;
164
165 J_ASSERT_BH(bh, buffer_locked(bh));
166
167 /* If checksum is bad mark all blocks used to prevent allocation
168 * essentially implementing a per-group read-only flag. */
169 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
170 ext4_error(sb, "Checksum bad for group %u", block_group);
171 ext4_free_group_clusters_set(sb, gdp, 0);
172 ext4_free_inodes_set(sb, gdp, 0);
173 ext4_itable_unused_set(sb, gdp, 0);
174 memset(bh->b_data, 0xff, sb->s_blocksize);
175 return;
176 }
177 memset(bh->b_data, 0, sb->s_blocksize);
141 178
142 if (bh) { 179 bit_max = ext4_num_base_meta_clusters(sb, block_group);
143 ext4_fsblk_t start, tmp; 180 for (bit = 0; bit < bit_max; bit++)
144 int flex_bg = 0; 181 ext4_set_bit(bit, bh->b_data);
145 182
146 for (bit = 0; bit < bit_max; bit++) 183 start = ext4_group_first_block_no(sb, block_group);
147 ext4_set_bit(bit, bh->b_data);
148 184
149 start = ext4_group_first_block_no(sb, block_group); 185 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
186 flex_bg = 1;
150 187
151 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 188 /* Set bits for block and inode bitmaps, and inode table */
152 EXT4_FEATURE_INCOMPAT_FLEX_BG)) 189 tmp = ext4_block_bitmap(sb, gdp);
153 flex_bg = 1; 190 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
191 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
154 192
155 /* Set bits for block and inode bitmaps, and inode table */ 193 tmp = ext4_inode_bitmap(sb, gdp);
156 tmp = ext4_block_bitmap(sb, gdp); 194 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
157 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) 195 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
158 ext4_set_bit(tmp - start, bh->b_data);
159 196
160 tmp = ext4_inode_bitmap(sb, gdp); 197 tmp = ext4_inode_table(sb, gdp);
198 for (; tmp < ext4_inode_table(sb, gdp) +
199 sbi->s_itb_per_group; tmp++) {
161 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) 200 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
162 ext4_set_bit(tmp - start, bh->b_data); 201 ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
163
164 tmp = ext4_inode_table(sb, gdp);
165 for (; tmp < ext4_inode_table(sb, gdp) +
166 sbi->s_itb_per_group; tmp++) {
167 if (!flex_bg ||
168 ext4_block_in_group(sb, tmp, block_group))
169 ext4_set_bit(tmp - start, bh->b_data);
170 }
171 /*
172 * Also if the number of blocks within the group is
173 * less than the blocksize * 8 ( which is the size
174 * of bitmap ), set rest of the block bitmap to 1
175 */
176 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
177 bh->b_data);
178 } 202 }
179 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 203
204 /*
205 * Also if the number of blocks within the group is less than
206 * the blocksize * 8 ( which is the size of bitmap ), set rest
207 * of the block bitmap to 1
208 */
209 ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
210 sb->s_blocksize * 8, bh->b_data);
180} 211}
181 212
213/* Return the number of free blocks in a block group. It is used when
214 * the block bitmap is uninitialized, so we can't just count the bits
215 * in the bitmap. */
216unsigned ext4_free_clusters_after_init(struct super_block *sb,
217 ext4_group_t block_group,
218 struct ext4_group_desc *gdp)
219{
220 return num_clusters_in_group(sb, block_group) -
221 ext4_num_overhead_clusters(sb, block_group, gdp);
222}
182 223
183/* 224/*
184 * The free blocks are managed by bitmaps. A file system contains several 225 * The free blocks are managed by bitmaps. A file system contains several
@@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
362} 403}
363 404
364/** 405/**
365 * ext4_has_free_blocks() 406 * ext4_has_free_clusters()
366 * @sbi: in-core super block structure. 407 * @sbi: in-core super block structure.
367 * @nblocks: number of needed blocks 408 * @nclusters: number of needed blocks
409 * @flags: flags from ext4_mb_new_blocks()
368 * 410 *
369 * Check if filesystem has nblocks free & available for allocation. 411 * Check if filesystem has nclusters free & available for allocation.
370 * On success return 1, return 0 on failure. 412 * On success return 1, return 0 on failure.
371 */ 413 */
372static int ext4_has_free_blocks(struct ext4_sb_info *sbi, 414static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
373 s64 nblocks, unsigned int flags) 415 s64 nclusters, unsigned int flags)
374{ 416{
375 s64 free_blocks, dirty_blocks, root_blocks; 417 s64 free_clusters, dirty_clusters, root_clusters;
376 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 418 struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
377 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; 419 struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
378 420
379 free_blocks = percpu_counter_read_positive(fbc); 421 free_clusters = percpu_counter_read_positive(fcc);
380 dirty_blocks = percpu_counter_read_positive(dbc); 422 dirty_clusters = percpu_counter_read_positive(dcc);
381 root_blocks = ext4_r_blocks_count(sbi->s_es); 423 root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
382 424
383 if (free_blocks - (nblocks + root_blocks + dirty_blocks) < 425 if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
384 EXT4_FREEBLOCKS_WATERMARK) { 426 EXT4_FREECLUSTERS_WATERMARK) {
385 free_blocks = percpu_counter_sum_positive(fbc); 427 free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
386 dirty_blocks = percpu_counter_sum_positive(dbc); 428 dirty_clusters = percpu_counter_sum_positive(dcc);
387 } 429 }
388 /* Check whether we have space after 430 /* Check whether we have space after accounting for current
389 * accounting for current dirty blocks & root reserved blocks. 431 * dirty clusters & root reserved clusters.
390 */ 432 */
391 if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) 433 if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
392 return 1; 434 return 1;
393 435
394 /* Hm, nope. Are (enough) root reserved blocks available? */ 436 /* Hm, nope. Are (enough) root reserved clusters available? */
395 if (sbi->s_resuid == current_fsuid() || 437 if (sbi->s_resuid == current_fsuid() ||
396 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || 438 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
397 capable(CAP_SYS_RESOURCE) || 439 capable(CAP_SYS_RESOURCE) ||
398 (flags & EXT4_MB_USE_ROOT_BLOCKS)) { 440 (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
399 441
400 if (free_blocks >= (nblocks + dirty_blocks)) 442 if (free_clusters >= (nclusters + dirty_clusters))
401 return 1; 443 return 1;
402 } 444 }
403 445
404 return 0; 446 return 0;
405} 447}
406 448
407int ext4_claim_free_blocks(struct ext4_sb_info *sbi, 449int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
408 s64 nblocks, unsigned int flags) 450 s64 nclusters, unsigned int flags)
409{ 451{
410 if (ext4_has_free_blocks(sbi, nblocks, flags)) { 452 if (ext4_has_free_clusters(sbi, nclusters, flags)) {
411 percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); 453 percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
412 return 0; 454 return 0;
413 } else 455 } else
414 return -ENOSPC; 456 return -ENOSPC;
@@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
428 */ 470 */
429int ext4_should_retry_alloc(struct super_block *sb, int *retries) 471int ext4_should_retry_alloc(struct super_block *sb, int *retries)
430{ 472{
431 if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || 473 if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
432 (*retries)++ > 3 || 474 (*retries)++ > 3 ||
433 !EXT4_SB(sb)->s_journal) 475 !EXT4_SB(sb)->s_journal)
434 return 0; 476 return 0;
@@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
444 * @handle: handle to this transaction 486 * @handle: handle to this transaction
445 * @inode: file inode 487 * @inode: file inode
446 * @goal: given target block(filesystem wide) 488 * @goal: given target block(filesystem wide)
447 * @count: pointer to total number of blocks needed 489 * @count: pointer to total number of clusters needed
448 * @errp: error code 490 * @errp: error code
449 * 491 *
450 * Return 1st allocated block number on success, *count stores total account 492 * Return 1st allocated block number on success, *count stores total account
@@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
476 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 518 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
477 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 519 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
478 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 520 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
479 dquot_alloc_block_nofail(inode, ar.len); 521 dquot_alloc_block_nofail(inode,
522 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
480 } 523 }
481 return ret; 524 return ret;
482} 525}
483 526
484/** 527/**
485 * ext4_count_free_blocks() -- count filesystem free blocks 528 * ext4_count_free_clusters() -- count filesystem free clusters
486 * @sb: superblock 529 * @sb: superblock
487 * 530 *
488 * Adds up the number of free blocks from each block group. 531 * Adds up the number of free clusters from each block group.
489 */ 532 */
490ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) 533ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
491{ 534{
492 ext4_fsblk_t desc_count; 535 ext4_fsblk_t desc_count;
493 struct ext4_group_desc *gdp; 536 struct ext4_group_desc *gdp;
@@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
508 gdp = ext4_get_group_desc(sb, i, NULL); 551 gdp = ext4_get_group_desc(sb, i, NULL);
509 if (!gdp) 552 if (!gdp)
510 continue; 553 continue;
511 desc_count += ext4_free_blks_count(sb, gdp); 554 desc_count += ext4_free_group_clusters(sb, gdp);
512 brelse(bitmap_bh); 555 brelse(bitmap_bh);
513 bitmap_bh = ext4_read_block_bitmap(sb, i); 556 bitmap_bh = ext4_read_block_bitmap(sb, i);
514 if (bitmap_bh == NULL) 557 if (bitmap_bh == NULL)
@@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
516 559
517 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 560 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
518 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", 561 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
519 i, ext4_free_blks_count(sb, gdp), x); 562 i, ext4_free_group_clusters(sb, gdp), x);
520 bitmap_count += x; 563 bitmap_count += x;
521 } 564 }
522 brelse(bitmap_bh); 565 brelse(bitmap_bh);
523 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
524 ", computed = %llu, %llu\n", ext4_free_blocks_count(es), 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)),
525 desc_count, bitmap_count); 569 desc_count, bitmap_count);
526 return bitmap_count; 570 return bitmap_count;
527#else 571#else
@@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
530 gdp = ext4_get_group_desc(sb, i, NULL); 574 gdp = ext4_get_group_desc(sb, i, NULL);
531 if (!gdp) 575 if (!gdp)
532 continue; 576 continue;
533 desc_count += ext4_free_blks_count(sb, gdp); 577 desc_count += ext4_free_group_clusters(sb, gdp);
534 } 578 }
535 579
536 return desc_count; 580 return desc_count;
@@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
620 664
621} 665}
622 666
667/*
668 * This function returns the number of file system metadata clusters at
669 * the beginning of a block group, including the reserved gdt blocks.
670 */
671unsigned ext4_num_base_meta_clusters(struct super_block *sb,
672 ext4_group_t block_group)
673{
674 struct ext4_sb_info *sbi = EXT4_SB(sb);
675 unsigned num;
676
677 /* Check for superblock and gdt backups in this group */
678 num = ext4_bg_has_super(sb, block_group);
679
680 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
681 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
682 sbi->s_desc_per_block) {
683 if (num) {
684 num += ext4_bg_num_gdb(sb, block_group);
685 num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
686 }
687 } else { /* For META_BG_BLOCK_GROUPS */
688 num += ext4_bg_num_gdb(sb, block_group);
689 }
690 return EXT4_NUM_B2C(sbi, num);
691}
623/** 692/**
624 * ext4_inode_to_goal_block - return a hint for block allocation 693 * ext4_inode_to_goal_block - return a hint for block allocation
625 * @inode: inode for block allocation 694 * @inode: inode for block allocation
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7d7bd0f066e..5b0e26a1272d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
144#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) 144#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
145#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) 145#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
146#define EXT4_MAP_UNINIT (1 << BH_Uninit) 146#define EXT4_MAP_UNINIT (1 << BH_Uninit)
147/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
148 * ext4_map_blocks wants to know whether or not the underlying cluster has
149 * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
150 * the requested mapping was from previously mapped (or delayed allocated)
151 * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
152 * should never appear on buffer_head's state flags.
153 */
154#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
147#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ 155#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
148 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ 156 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
149 EXT4_MAP_UNINIT) 157 EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
150 158
151struct ext4_map_blocks { 159struct ext4_map_blocks {
152 ext4_fsblk_t m_pblk; 160 ext4_fsblk_t m_pblk;
@@ -239,8 +247,11 @@ struct ext4_io_submit {
239# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 247# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
240#endif 248#endif
241#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) 249#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
250#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
251 EXT4_SB(s)->s_cluster_bits)
242#ifdef __KERNEL__ 252#ifdef __KERNEL__
243# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 253# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
254# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
244#else 255#else
245# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) 256# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
246#endif 257#endif
@@ -258,6 +269,14 @@ struct ext4_io_submit {
258#endif 269#endif
259#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) 270#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
260 271
272/* Translate a block number to a cluster number */
273#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
274/* Translate a cluster number to a block number */
275#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
276/* Translate # of blks to # of clusters */
277#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
278 (sbi)->s_cluster_bits)
279
261/* 280/*
262 * Structure of a blocks group descriptor 281 * Structure of a blocks group descriptor
263 */ 282 */
@@ -289,7 +308,7 @@ struct ext4_group_desc
289 308
290struct flex_groups { 309struct flex_groups {
291 atomic_t free_inodes; 310 atomic_t free_inodes;
292 atomic_t free_blocks; 311 atomic_t free_clusters;
293 atomic_t used_dirs; 312 atomic_t used_dirs;
294}; 313};
295 314
@@ -306,6 +325,7 @@ struct flex_groups {
306#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) 325#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
307#ifdef __KERNEL__ 326#ifdef __KERNEL__
308# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) 327# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
328# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
309# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) 329# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
310# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) 330# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
311# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) 331# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
@@ -358,8 +378,7 @@ struct flex_groups {
358 378
359/* Flags that should be inherited by new inodes from their parent. */ 379/* Flags that should be inherited by new inodes from their parent. */
360#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 380#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
361 EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ 381 EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
362 EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
363 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ 382 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
364 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) 383 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
365 384
@@ -520,6 +539,8 @@ struct ext4_new_group_data {
520#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 539#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
521 /* Don't normalize allocation size (used for fallocate) */ 540 /* Don't normalize allocation size (used for fallocate) */
522#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 541#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
542 /* Request will not result in inode size update (user for fallocate) */
543#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
523 544
524/* 545/*
525 * Flags used by ext4_free_blocks 546 * Flags used by ext4_free_blocks
@@ -528,6 +549,13 @@ struct ext4_new_group_data {
528#define EXT4_FREE_BLOCKS_FORGET 0x0002 549#define EXT4_FREE_BLOCKS_FORGET 0x0002
529#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 550#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
530#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 551#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
552#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
553#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
554
555/*
556 * Flags used by ext4_discard_partial_page_buffers
557 */
558#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
531 559
532/* 560/*
533 * ioctl commands 561 * ioctl commands
@@ -538,9 +566,6 @@ struct ext4_new_group_data {
538#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 566#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
539#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 567#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
540#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 568#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
541#ifdef CONFIG_JBD2_DEBUG
542#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
543#endif
544#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 569#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
545#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 570#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
546#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 571#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
@@ -563,9 +588,6 @@ struct ext4_new_group_data {
563#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 588#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
564#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 589#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
565#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) 590#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
566#ifdef CONFIG_JBD2_DEBUG
567#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
568#endif
569#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 591#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
570#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 592#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
571#endif 593#endif
@@ -837,6 +859,7 @@ struct ext4_inode_info {
837 ext4_group_t i_last_alloc_group; 859 ext4_group_t i_last_alloc_group;
838 860
839 /* allocation reservation info for delalloc */ 861 /* allocation reservation info for delalloc */
862 /* In case of bigalloc, these refer to clusters rather than blocks */
840 unsigned int i_reserved_data_blocks; 863 unsigned int i_reserved_data_blocks;
841 unsigned int i_reserved_meta_blocks; 864 unsigned int i_reserved_meta_blocks;
842 unsigned int i_allocated_meta_blocks; 865 unsigned int i_allocated_meta_blocks;
@@ -886,7 +909,6 @@ struct ext4_inode_info {
886/* 909/*
887 * Mount flags 910 * Mount flags
888 */ 911 */
889#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
890#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 912#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
891#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ 913#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
892#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ 914#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -918,6 +940,9 @@ struct ext4_inode_info {
918#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 940#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
919#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 941#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
920 942
943#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
944 specified delalloc */
945
921#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 946#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
922 ~EXT4_MOUNT_##opt 947 ~EXT4_MOUNT_##opt
923#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ 948#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -968,9 +993,9 @@ struct ext4_super_block {
968/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ 993/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
969 __le32 s_first_data_block; /* First Data Block */ 994 __le32 s_first_data_block; /* First Data Block */
970 __le32 s_log_block_size; /* Block size */ 995 __le32 s_log_block_size; /* Block size */
971 __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ 996 __le32 s_log_cluster_size; /* Allocation cluster size */
972/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ 997/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
973 __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ 998 __le32 s_clusters_per_group; /* # Clusters per group */
974 __le32 s_inodes_per_group; /* # Inodes per group */ 999 __le32 s_inodes_per_group; /* # Inodes per group */
975 __le32 s_mtime; /* Mount time */ 1000 __le32 s_mtime; /* Mount time */
976/*30*/ __le32 s_wtime; /* Write time */ 1001/*30*/ __le32 s_wtime; /* Write time */
@@ -1066,7 +1091,10 @@ struct ext4_super_block {
1066 __u8 s_last_error_func[32]; /* function where the error happened */ 1091 __u8 s_last_error_func[32]; /* function where the error happened */
1067#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) 1092#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1068 __u8 s_mount_opts[64]; 1093 __u8 s_mount_opts[64];
1069 __le32 s_reserved[112]; /* Padding to the end of the block */ 1094 __le32 s_usr_quota_inum; /* inode for tracking user quota */
1095 __le32 s_grp_quota_inum; /* inode for tracking group quota */
1096 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
1097 __le32 s_reserved[109]; /* Padding to the end of the block */
1070}; 1098};
1071 1099
1072#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) 1100#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1086,6 +1114,7 @@ struct ext4_sb_info {
1086 unsigned long s_desc_size; /* Size of a group descriptor in bytes */ 1114 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
1087 unsigned long s_inodes_per_block;/* Number of inodes per block */ 1115 unsigned long s_inodes_per_block;/* Number of inodes per block */
1088 unsigned long s_blocks_per_group;/* Number of blocks in a group */ 1116 unsigned long s_blocks_per_group;/* Number of blocks in a group */
1117 unsigned long s_clusters_per_group; /* Number of clusters in a group */
1089 unsigned long s_inodes_per_group;/* Number of inodes in a group */ 1118 unsigned long s_inodes_per_group;/* Number of inodes in a group */
1090 unsigned long s_itb_per_group; /* Number of inode table blocks per group */ 1119 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
1091 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 1120 unsigned long s_gdb_count; /* Number of group descriptor blocks */
@@ -1094,6 +1123,8 @@ struct ext4_sb_info {
1094 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ 1123 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
1095 unsigned long s_overhead_last; /* Last calculated overhead */ 1124 unsigned long s_overhead_last; /* Last calculated overhead */
1096 unsigned long s_blocks_last; /* Last seen block count */ 1125 unsigned long s_blocks_last; /* Last seen block count */
1126 unsigned int s_cluster_ratio; /* Number of blocks per cluster */
1127 unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
1097 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 1128 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
1098 struct buffer_head * s_sbh; /* Buffer containing the super block */ 1129 struct buffer_head * s_sbh; /* Buffer containing the super block */
1099 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1130 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
@@ -1117,10 +1148,10 @@ struct ext4_sb_info {
1117 u32 s_hash_seed[4]; 1148 u32 s_hash_seed[4];
1118 int s_def_hash_version; 1149 int s_def_hash_version;
1119 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ 1150 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
1120 struct percpu_counter s_freeblocks_counter; 1151 struct percpu_counter s_freeclusters_counter;
1121 struct percpu_counter s_freeinodes_counter; 1152 struct percpu_counter s_freeinodes_counter;
1122 struct percpu_counter s_dirs_counter; 1153 struct percpu_counter s_dirs_counter;
1123 struct percpu_counter s_dirtyblocks_counter; 1154 struct percpu_counter s_dirtyclusters_counter;
1124 struct blockgroup_lock *s_blockgroup_lock; 1155 struct blockgroup_lock *s_blockgroup_lock;
1125 struct proc_dir_entry *s_proc; 1156 struct proc_dir_entry *s_proc;
1126 struct kobject s_kobj; 1157 struct kobject s_kobj;
@@ -1136,10 +1167,6 @@ struct ext4_sb_info {
1136 u32 s_max_batch_time; 1167 u32 s_max_batch_time;
1137 u32 s_min_batch_time; 1168 u32 s_min_batch_time;
1138 struct block_device *journal_bdev; 1169 struct block_device *journal_bdev;
1139#ifdef CONFIG_JBD2_DEBUG
1140 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
1141 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
1142#endif
1143#ifdef CONFIG_QUOTA 1170#ifdef CONFIG_QUOTA
1144 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 1171 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
1145 int s_jquota_fmt; /* Format of quota to use */ 1172 int s_jquota_fmt; /* Format of quota to use */
@@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1248 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1275 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1249} 1276}
1250 1277
1278static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1279 struct ext4_io_end *io_end)
1280{
1281 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1282 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1283 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
1284 }
1285}
1286
1251/* 1287/*
1252 * Inode dynamic state flags 1288 * Inode dynamic state flags
1253 */ 1289 */
@@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1360#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 1396#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
1361#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 1397#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
1362#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 1398#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
1399#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
1363 1400
1364#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 1401#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
1365#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 1402#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1402 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ 1439 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
1403 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ 1440 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
1404 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ 1441 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
1405 EXT4_FEATURE_RO_COMPAT_HUGE_FILE) 1442 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
1443 EXT4_FEATURE_RO_COMPAT_BIGALLOC)
1406 1444
1407/* 1445/*
1408 * Default values for user and/or group using reserved blocks 1446 * Default values for user and/or group using reserved blocks
@@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1735 unsigned int flags, 1773 unsigned int flags,
1736 unsigned long *count, 1774 unsigned long *count,
1737 int *errp); 1775 int *errp);
1738extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, 1776extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
1739 s64 nblocks, unsigned int flags); 1777 s64 nclusters, unsigned int flags);
1740extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1778extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
1741extern void ext4_check_blocks_bitmap(struct super_block *); 1779extern void ext4_check_blocks_bitmap(struct super_block *);
1742extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1780extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1743 ext4_group_t block_group, 1781 ext4_group_t block_group,
@@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1745extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1783extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1746struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, 1784struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1747 ext4_group_t block_group); 1785 ext4_group_t block_group);
1748extern unsigned ext4_init_block_bitmap(struct super_block *sb, 1786extern void ext4_init_block_bitmap(struct super_block *sb,
1749 struct buffer_head *bh, 1787 struct buffer_head *bh,
1750 ext4_group_t group, 1788 ext4_group_t group,
1751 struct ext4_group_desc *desc); 1789 struct ext4_group_desc *desc);
1752#define ext4_free_blocks_after_init(sb, group, desc) \ 1790extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
1753 ext4_init_block_bitmap(sb, NULL, group, desc) 1791 ext4_group_t block_group,
1792 struct ext4_group_desc *gdp);
1793extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
1794 ext4_group_t block_group);
1795extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
1796 ext4_group_t block_group,
1797 struct ext4_group_desc *gdp);
1754ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 1798ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1755 1799
1756/* dir.c */ 1800/* dir.c */
@@ -1776,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
1776 1820
1777/* ialloc.c */ 1821/* ialloc.c */
1778extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, 1822extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
1779 const struct qstr *qstr, __u32 goal); 1823 const struct qstr *qstr, __u32 goal,
1824 uid_t *owner);
1780extern void ext4_free_inode(handle_t *, struct inode *); 1825extern void ext4_free_inode(handle_t *, struct inode *);
1781extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); 1826extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1782extern unsigned long ext4_count_free_inodes(struct super_block *); 1827extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle,
1839 struct address_space *mapping, loff_t from); 1884 struct address_space *mapping, loff_t from);
1840extern int ext4_block_zero_page_range(handle_t *handle, 1885extern int ext4_block_zero_page_range(handle_t *handle,
1841 struct address_space *mapping, loff_t from, loff_t length); 1886 struct address_space *mapping, loff_t from, loff_t length);
1887extern int ext4_discard_partial_page_buffers(handle_t *handle,
1888 struct address_space *mapping, loff_t from,
1889 loff_t length, int flags);
1890extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
1891 struct inode *inode, struct page *page, loff_t from,
1892 loff_t length, int flags);
1842extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1893extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1843extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1894extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1844extern void ext4_da_update_reserve_space(struct inode *inode, 1895extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1878,40 +1929,40 @@ extern int ext4_group_extend(struct super_block *sb,
1878extern void *ext4_kvmalloc(size_t size, gfp_t flags); 1929extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1879extern void *ext4_kvzalloc(size_t size, gfp_t flags); 1930extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1880extern void ext4_kvfree(void *ptr); 1931extern void ext4_kvfree(void *ptr);
1881extern void __ext4_error(struct super_block *, const char *, unsigned int, 1932extern __printf(4, 5)
1882 const char *, ...) 1933void __ext4_error(struct super_block *, const char *, unsigned int,
1883 __attribute__ ((format (printf, 4, 5))); 1934 const char *, ...);
1884#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ 1935#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1885 __LINE__, ## message) 1936 __LINE__, ## message)
1886extern void ext4_error_inode(struct inode *, const char *, unsigned int, 1937extern __printf(5, 6)
1887 ext4_fsblk_t, const char *, ...) 1938void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
1888 __attribute__ ((format (printf, 5, 6))); 1939 const char *, ...);
1889extern void ext4_error_file(struct file *, const char *, unsigned int, 1940extern __printf(5, 6)
1890 ext4_fsblk_t, const char *, ...) 1941void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
1891 __attribute__ ((format (printf, 5, 6))); 1942 const char *, ...);
1892extern void __ext4_std_error(struct super_block *, const char *, 1943extern void __ext4_std_error(struct super_block *, const char *,
1893 unsigned int, int); 1944 unsigned int, int);
1894extern void __ext4_abort(struct super_block *, const char *, unsigned int, 1945extern __printf(4, 5)
1895 const char *, ...) 1946void __ext4_abort(struct super_block *, const char *, unsigned int,
1896 __attribute__ ((format (printf, 4, 5))); 1947 const char *, ...);
1897#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ 1948#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1898 __LINE__, ## message) 1949 __LINE__, ## message)
1899extern void __ext4_warning(struct super_block *, const char *, unsigned int, 1950extern __printf(4, 5)
1900 const char *, ...) 1951void __ext4_warning(struct super_block *, const char *, unsigned int,
1901 __attribute__ ((format (printf, 4, 5))); 1952 const char *, ...);
1902#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ 1953#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1903 __LINE__, ## message) 1954 __LINE__, ## message)
1904extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1955extern __printf(3, 4)
1905 __attribute__ ((format (printf, 3, 4))); 1956void ext4_msg(struct super_block *, const char *, const char *, ...);
1906extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 1957extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
1907 const char *, unsigned int, const char *); 1958 const char *, unsigned int, const char *);
1908#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ 1959#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
1909 __LINE__, msg) 1960 __LINE__, msg)
1910extern void __ext4_grp_locked_error(const char *, unsigned int, \ 1961extern __printf(7, 8)
1911 struct super_block *, ext4_group_t, \ 1962void __ext4_grp_locked_error(const char *, unsigned int,
1912 unsigned long, ext4_fsblk_t, \ 1963 struct super_block *, ext4_group_t,
1913 const char *, ...) 1964 unsigned long, ext4_fsblk_t,
1914 __attribute__ ((format (printf, 7, 8))); 1965 const char *, ...);
1915#define ext4_grp_locked_error(sb, grp, message...) \ 1966#define ext4_grp_locked_error(sb, grp, message...) \
1916 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) 1967 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1917extern void ext4_update_dynamic_rev(struct super_block *sb); 1968extern void ext4_update_dynamic_rev(struct super_block *sb);
@@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1927 struct ext4_group_desc *bg); 1978 struct ext4_group_desc *bg);
1928extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1979extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1929 struct ext4_group_desc *bg); 1980 struct ext4_group_desc *bg);
1930extern __u32 ext4_free_blks_count(struct super_block *sb, 1981extern __u32 ext4_free_group_clusters(struct super_block *sb,
1931 struct ext4_group_desc *bg); 1982 struct ext4_group_desc *bg);
1932extern __u32 ext4_free_inodes_count(struct super_block *sb, 1983extern __u32 ext4_free_inodes_count(struct super_block *sb,
1933 struct ext4_group_desc *bg); 1984 struct ext4_group_desc *bg);
1934extern __u32 ext4_used_dirs_count(struct super_block *sb, 1985extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
1941 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1992 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1942extern void ext4_inode_table_set(struct super_block *sb, 1993extern void ext4_inode_table_set(struct super_block *sb,
1943 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1994 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1944extern void ext4_free_blks_set(struct super_block *sb, 1995extern void ext4_free_group_clusters_set(struct super_block *sb,
1945 struct ext4_group_desc *bg, __u32 count); 1996 struct ext4_group_desc *bg,
1997 __u32 count);
1946extern void ext4_free_inodes_set(struct super_block *sb, 1998extern void ext4_free_inodes_set(struct super_block *sb,
1947 struct ext4_group_desc *bg, __u32 count); 1999 struct ext4_group_desc *bg, __u32 count);
1948extern void ext4_used_dirs_set(struct super_block *sb, 2000extern void ext4_used_dirs_set(struct super_block *sb,
@@ -2051,13 +2103,13 @@ do { \
2051} while (0) 2103} while (0)
2052 2104
2053#ifdef CONFIG_SMP 2105#ifdef CONFIG_SMP
2054/* Each CPU can accumulate percpu_counter_batch blocks in their local 2106/* Each CPU can accumulate percpu_counter_batch clusters in their local
2055 * counters. So we need to make sure we have free blocks more 2107 * counters. So we need to make sure we have free clusters more
2056 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. 2108 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
2057 */ 2109 */
2058#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) 2110#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
2059#else 2111#else
2060#define EXT4_FREEBLOCKS_WATERMARK 0 2112#define EXT4_FREECLUSTERS_WATERMARK 0
2061#endif 2113#endif
2062 2114
2063static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) 2115static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
@@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2243enum ext4_state_bits { 2295enum ext4_state_bits {
2244 BH_Uninit /* blocks are allocated but uninitialized on disk */ 2296 BH_Uninit /* blocks are allocated but uninitialized on disk */
2245 = BH_JBDPrivateStart, 2297 = BH_JBDPrivateStart,
2298 BH_AllocFromCluster, /* allocated blocks were part of already
2299 * allocated cluster. Note that this flag will
2300 * never, ever appear in a buffer_head's state
2301 * flag. See EXT4_MAP_FROM_CLUSTER to see where
2302 * this is used. */
2303 BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
2304 * flag is set when ext4_map_blocks is called on a
2305 * delayed allocated block to get its real mapping. */
2246}; 2306};
2247 2307
2248BUFFER_FNS(Uninit, uninit) 2308BUFFER_FNS(Uninit, uninit)
2249TAS_BUFFER_FNS(Uninit, uninit) 2309TAS_BUFFER_FNS(Uninit, uninit)
2310BUFFER_FNS(Da_Mapped, da_mapped)
2250 2311
2251/* 2312/*
2252 * Add new method to test wether block and inode bitmaps are properly 2313 * Add new method to test wether block and inode bitmaps are properly
@@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb);
2282 2343
2283#endif /* __KERNEL__ */ 2344#endif /* __KERNEL__ */
2284 2345
2346#include "ext4_extents.h"
2347
2285#endif /* _EXT4_H */ 2348#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 095c36f3b612..a52db3a69a30 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
290 struct ext4_ext_path *); 290 struct ext4_ext_path *);
291extern void ext4_ext_drop_refs(struct ext4_ext_path *); 291extern void ext4_ext_drop_refs(struct ext4_ext_path *);
292extern int ext4_ext_check_inode(struct inode *inode); 292extern int ext4_ext_check_inode(struct inode *inode);
293extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
294 int search_hint_reverse);
293#endif /* _EXT4_EXTENTS */ 295#endif /* _EXT4_EXTENTS */
294 296
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index f5240aa15601..aca179017582 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
109 109
110 if (ext4_handle_valid(handle)) { 110 if (ext4_handle_valid(handle)) {
111 err = jbd2_journal_dirty_metadata(handle, bh); 111 err = jbd2_journal_dirty_metadata(handle, bh);
112 if (err) 112 if (err) {
113 ext4_journal_abort_handle(where, line, __func__, 113 /* Errors can only happen if there is a bug */
114 bh, handle, err); 114 handle->h_err = err;
115 __ext4_journal_stop(where, line, handle);
116 }
115 } else { 117 } else {
116 if (inode) 118 if (inode)
117 mark_buffer_dirty_inode(bh, inode); 119 mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568a98ab..61fa9e1614af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -42,7 +42,6 @@
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h> 43#include <linux/fiemap.h>
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46 45
47#include <trace/events/ext4.h> 46#include <trace/events/ext4.h>
48 47
@@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
96 * - ENOMEM 95 * - ENOMEM
97 * - EIO 96 * - EIO
98 */ 97 */
99static int ext4_ext_dirty(handle_t *handle, struct inode *inode, 98#define ext4_ext_dirty(handle, inode, path) \
100 struct ext4_ext_path *path) 99 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
100static int __ext4_ext_dirty(const char *where, unsigned int line,
101 handle_t *handle, struct inode *inode,
102 struct ext4_ext_path *path)
101{ 103{
102 int err; 104 int err;
103 if (path->p_bh) { 105 if (path->p_bh) {
104 /* path points to block */ 106 /* path points to block */
105 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); 107 err = __ext4_handle_dirty_metadata(where, line, handle,
108 inode, path->p_bh);
106 } else { 109 } else {
107 /* path points to leaf/index in inode body */ 110 /* path points to leaf/index in inode body */
108 err = ext4_mark_inode_dirty(handle, inode); 111 err = ext4_mark_inode_dirty(handle, inode);
@@ -114,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
114 struct ext4_ext_path *path, 117 struct ext4_ext_path *path,
115 ext4_lblk_t block) 118 ext4_lblk_t block)
116{ 119{
117 int depth;
118
119 if (path) { 120 if (path) {
121 int depth = path->p_depth;
120 struct ext4_extent *ex; 122 struct ext4_extent *ex;
121 depth = path->p_depth;
122 123
123 /* 124 /*
124 * Try to predict block placement assuming that we are 125 * Try to predict block placement assuming that we are
@@ -180,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
180 181
181 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 182 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
182 / sizeof(struct ext4_extent); 183 / sizeof(struct ext4_extent);
183 if (!check) {
184#ifdef AGGRESSIVE_TEST 184#ifdef AGGRESSIVE_TEST
185 if (size > 6) 185 if (!check && size > 6)
186 size = 6; 186 size = 6;
187#endif 187#endif
188 }
189 return size; 188 return size;
190} 189}
191 190
@@ -195,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
195 194
196 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 195 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
197 / sizeof(struct ext4_extent_idx); 196 / sizeof(struct ext4_extent_idx);
198 if (!check) {
199#ifdef AGGRESSIVE_TEST 197#ifdef AGGRESSIVE_TEST
200 if (size > 5) 198 if (!check && size > 5)
201 size = 5; 199 size = 5;
202#endif 200#endif
203 }
204 return size; 201 return size;
205} 202}
206 203
@@ -211,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
211 size = sizeof(EXT4_I(inode)->i_data); 208 size = sizeof(EXT4_I(inode)->i_data);
212 size -= sizeof(struct ext4_extent_header); 209 size -= sizeof(struct ext4_extent_header);
213 size /= sizeof(struct ext4_extent); 210 size /= sizeof(struct ext4_extent);
214 if (!check) {
215#ifdef AGGRESSIVE_TEST 211#ifdef AGGRESSIVE_TEST
216 if (size > 3) 212 if (!check && size > 3)
217 size = 3; 213 size = 3;
218#endif 214#endif
219 }
220 return size; 215 return size;
221} 216}
222 217
@@ -227,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
227 size = sizeof(EXT4_I(inode)->i_data); 222 size = sizeof(EXT4_I(inode)->i_data);
228 size -= sizeof(struct ext4_extent_header); 223 size -= sizeof(struct ext4_extent_header);
229 size /= sizeof(struct ext4_extent_idx); 224 size /= sizeof(struct ext4_extent_idx);
230 if (!check) {
231#ifdef AGGRESSIVE_TEST 225#ifdef AGGRESSIVE_TEST
232 if (size > 4) 226 if (!check && size > 4)
233 size = 4; 227 size = 4;
234#endif 228#endif
235 }
236 return size; 229 return size;
237} 230}
238 231
@@ -244,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
244int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 237int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
245{ 238{
246 struct ext4_inode_info *ei = EXT4_I(inode); 239 struct ext4_inode_info *ei = EXT4_I(inode);
247 int idxs, num = 0; 240 int idxs;
248 241
249 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 242 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
250 / sizeof(struct ext4_extent_idx)); 243 / sizeof(struct ext4_extent_idx));
@@ -259,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
259 */ 252 */
260 if (ei->i_da_metadata_calc_len && 253 if (ei->i_da_metadata_calc_len &&
261 ei->i_da_metadata_calc_last_lblock+1 == lblock) { 254 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
255 int num = 0;
256
262 if ((ei->i_da_metadata_calc_len % idxs) == 0) 257 if ((ei->i_da_metadata_calc_len % idxs) == 0)
263 num++; 258 num++;
264 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) 259 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -321,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
321 struct ext4_extent_header *eh, 316 struct ext4_extent_header *eh,
322 int depth) 317 int depth)
323{ 318{
324 struct ext4_extent *ext;
325 struct ext4_extent_idx *ext_idx;
326 unsigned short entries; 319 unsigned short entries;
327 if (eh->eh_entries == 0) 320 if (eh->eh_entries == 0)
328 return 1; 321 return 1;
@@ -331,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
331 324
332 if (depth == 0) { 325 if (depth == 0) {
333 /* leaf entries */ 326 /* leaf entries */
334 ext = EXT_FIRST_EXTENT(eh); 327 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
335 while (entries) { 328 while (entries) {
336 if (!ext4_valid_extent(inode, ext)) 329 if (!ext4_valid_extent(inode, ext))
337 return 0; 330 return 0;
@@ -339,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
339 entries--; 332 entries--;
340 } 333 }
341 } else { 334 } else {
342 ext_idx = EXT_FIRST_INDEX(eh); 335 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
343 while (entries) { 336 while (entries) {
344 if (!ext4_valid_extent_idx(inode, ext_idx)) 337 if (!ext4_valid_extent_idx(inode, ext_idx))
345 return 0; 338 return 0;
@@ -751,31 +744,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
751 return -EIO; 744 return -EIO;
752 } 745 }
753 746
754 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
755 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 747 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756 /* insert after */ 748 /* insert after */
757 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 749 ext_debug("insert new index %d after: %llu\n", logical, ptr);
758 len = (len - 1) * sizeof(struct ext4_extent_idx);
759 len = len < 0 ? 0 : len;
760 ext_debug("insert new index %d after: %llu. "
761 "move %d from 0x%p to 0x%p\n",
762 logical, ptr, len,
763 (curp->p_idx + 1), (curp->p_idx + 2));
764 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
765 }
766 ix = curp->p_idx + 1; 750 ix = curp->p_idx + 1;
767 } else { 751 } else {
768 /* insert before */ 752 /* insert before */
769 len = len * sizeof(struct ext4_extent_idx); 753 ext_debug("insert new index %d before: %llu\n", logical, ptr);
770 len = len < 0 ? 0 : len;
771 ext_debug("insert new index %d before: %llu. "
772 "move %d from 0x%p to 0x%p\n",
773 logical, ptr, len,
774 curp->p_idx, (curp->p_idx + 1));
775 memmove(curp->p_idx + 1, curp->p_idx, len);
776 ix = curp->p_idx; 754 ix = curp->p_idx;
777 } 755 }
778 756
757 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
758 BUG_ON(len < 0);
759 if (len > 0) {
760 ext_debug("insert new index %d: "
761 "move %d indices from 0x%p to 0x%p\n",
762 logical, len, ix, ix + 1);
763 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
764 }
765
766 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
767 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
768 return -EIO;
769 }
770
779 ix->ei_block = cpu_to_le32(logical); 771 ix->ei_block = cpu_to_le32(logical);
780 ext4_idx_store_pblock(ix, ptr); 772 ext4_idx_store_pblock(ix, ptr);
781 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 773 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -1042,16 +1034,14 @@ cleanup:
1042 */ 1034 */
1043static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1035static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1044 unsigned int flags, 1036 unsigned int flags,
1045 struct ext4_ext_path *path,
1046 struct ext4_extent *newext) 1037 struct ext4_extent *newext)
1047{ 1038{
1048 struct ext4_ext_path *curp = path;
1049 struct ext4_extent_header *neh; 1039 struct ext4_extent_header *neh;
1050 struct buffer_head *bh; 1040 struct buffer_head *bh;
1051 ext4_fsblk_t newblock; 1041 ext4_fsblk_t newblock;
1052 int err = 0; 1042 int err = 0;
1053 1043
1054 newblock = ext4_ext_new_meta_block(handle, inode, path, 1044 newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1055 newext, &err, flags); 1045 newext, &err, flags);
1056 if (newblock == 0) 1046 if (newblock == 0)
1057 return err; 1047 return err;
@@ -1071,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1071 } 1061 }
1072 1062
1073 /* move top-level index/leaf into new block */ 1063 /* move top-level index/leaf into new block */
1074 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); 1064 memmove(bh->b_data, EXT4_I(inode)->i_data,
1065 sizeof(EXT4_I(inode)->i_data));
1075 1066
1076 /* set size of new block */ 1067 /* set size of new block */
1077 neh = ext_block_hdr(bh); 1068 neh = ext_block_hdr(bh);
@@ -1089,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1089 if (err) 1080 if (err)
1090 goto out; 1081 goto out;
1091 1082
1092 /* create index in new top-level index: num,max,pointer */ 1083 /* Update top-level index: num,max,pointer */
1093 err = ext4_ext_get_access(handle, inode, curp);
1094 if (err)
1095 goto out;
1096
1097 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1098 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1099 curp->p_hdr->eh_entries = cpu_to_le16(1);
1100 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1101
1102 if (path[0].p_hdr->eh_depth)
1103 curp->p_idx->ei_block =
1104 EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1105 else
1106 curp->p_idx->ei_block =
1107 EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1108 ext4_idx_store_pblock(curp->p_idx, newblock);
1109
1110 neh = ext_inode_hdr(inode); 1084 neh = ext_inode_hdr(inode);
1085 neh->eh_entries = cpu_to_le16(1);
1086 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1087 if (neh->eh_depth == 0) {
1088 /* Root extent block becomes index block */
1089 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1090 EXT_FIRST_INDEX(neh)->ei_block =
1091 EXT_FIRST_EXTENT(neh)->ee_block;
1092 }
1111 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1093 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1112 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1094 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1113 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1095 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1114 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1096 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1115 1097
1116 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1098 neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1117 err = ext4_ext_dirty(handle, inode, curp); 1099 ext4_mark_inode_dirty(handle, inode);
1118out: 1100out:
1119 brelse(bh); 1101 brelse(bh);
1120 1102
@@ -1162,8 +1144,7 @@ repeat:
1162 err = PTR_ERR(path); 1144 err = PTR_ERR(path);
1163 } else { 1145 } else {
1164 /* tree is full, time to grow in depth */ 1146 /* tree is full, time to grow in depth */
1165 err = ext4_ext_grow_indepth(handle, inode, flags, 1147 err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1166 path, newext);
1167 if (err) 1148 if (err)
1168 goto out; 1149 goto out;
1169 1150
@@ -1235,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode,
1235 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1216 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1236 EXT4_ERROR_INODE(inode, 1217 EXT4_ERROR_INODE(inode,
1237 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1218 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1238 ix != NULL ? ix->ei_block : 0, 1219 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1239 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 1220 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1240 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, 1221 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1241 depth); 1222 depth);
1242 return -EIO; 1223 return -EIO;
1243 } 1224 }
@@ -1260,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode,
1260/* 1241/*
1261 * search the closest allocated block to the right for *logical 1242 * search the closest allocated block to the right for *logical
1262 * and returns it at @logical + it's physical address at @phys 1243 * and returns it at @logical + it's physical address at @phys
1263 * if *logical is the smallest allocated block, the function 1244 * if *logical is the largest allocated block, the function
1264 * returns 0 at @phys 1245 * returns 0 at @phys
1265 * return value contains 0 (success) or error code 1246 * return value contains 0 (success) or error code
1266 */ 1247 */
1267static int ext4_ext_search_right(struct inode *inode, 1248static int ext4_ext_search_right(struct inode *inode,
1268 struct ext4_ext_path *path, 1249 struct ext4_ext_path *path,
1269 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1250 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1251 struct ext4_extent **ret_ex)
1270{ 1252{
1271 struct buffer_head *bh = NULL; 1253 struct buffer_head *bh = NULL;
1272 struct ext4_extent_header *eh; 1254 struct ext4_extent_header *eh;
@@ -1308,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode,
1308 return -EIO; 1290 return -EIO;
1309 } 1291 }
1310 } 1292 }
1311 *logical = le32_to_cpu(ex->ee_block); 1293 goto found_extent;
1312 *phys = ext4_ext_pblock(ex);
1313 return 0;
1314 } 1294 }
1315 1295
1316 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1296 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1323,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode,
1323 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1303 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1324 /* next allocated block in this leaf */ 1304 /* next allocated block in this leaf */
1325 ex++; 1305 ex++;
1326 *logical = le32_to_cpu(ex->ee_block); 1306 goto found_extent;
1327 *phys = ext4_ext_pblock(ex);
1328 return 0;
1329 } 1307 }
1330 1308
1331 /* go up and search for index to the right */ 1309 /* go up and search for index to the right */
@@ -1368,9 +1346,12 @@ got_index:
1368 return -EIO; 1346 return -EIO;
1369 } 1347 }
1370 ex = EXT_FIRST_EXTENT(eh); 1348 ex = EXT_FIRST_EXTENT(eh);
1349found_extent:
1371 *logical = le32_to_cpu(ex->ee_block); 1350 *logical = le32_to_cpu(ex->ee_block);
1372 *phys = ext4_ext_pblock(ex); 1351 *phys = ext4_ext_pblock(ex);
1373 put_bh(bh); 1352 *ret_ex = ex;
1353 if (bh)
1354 put_bh(bh);
1374 return 0; 1355 return 0;
1375} 1356}
1376 1357
@@ -1395,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1395 while (depth >= 0) { 1376 while (depth >= 0) {
1396 if (depth == path->p_depth) { 1377 if (depth == path->p_depth) {
1397 /* leaf */ 1378 /* leaf */
1398 if (path[depth].p_ext != 1379 if (path[depth].p_ext &&
1380 path[depth].p_ext !=
1399 EXT_LAST_EXTENT(path[depth].p_hdr)) 1381 EXT_LAST_EXTENT(path[depth].p_hdr))
1400 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1382 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1401 } else { 1383 } else {
@@ -1623,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1623 * such that there will be no overlap, and then returns 1. 1605 * such that there will be no overlap, and then returns 1.
1624 * If there is no overlap found, it returns 0. 1606 * If there is no overlap found, it returns 0.
1625 */ 1607 */
1626static unsigned int ext4_ext_check_overlap(struct inode *inode, 1608static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1609 struct inode *inode,
1627 struct ext4_extent *newext, 1610 struct ext4_extent *newext,
1628 struct ext4_ext_path *path) 1611 struct ext4_ext_path *path)
1629{ 1612{
@@ -1637,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1637 if (!path[depth].p_ext) 1620 if (!path[depth].p_ext)
1638 goto out; 1621 goto out;
1639 b2 = le32_to_cpu(path[depth].p_ext->ee_block); 1622 b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1623 b2 &= ~(sbi->s_cluster_ratio - 1);
1640 1624
1641 /* 1625 /*
1642 * get the next allocated block if the extent in the path 1626 * get the next allocated block if the extent in the path
@@ -1646,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1646 b2 = ext4_ext_next_allocated_block(path); 1630 b2 = ext4_ext_next_allocated_block(path);
1647 if (b2 == EXT_MAX_BLOCKS) 1631 if (b2 == EXT_MAX_BLOCKS)
1648 goto out; 1632 goto out;
1633 b2 &= ~(sbi->s_cluster_ratio - 1);
1649 } 1634 }
1650 1635
1651 /* check for wrap through zero on extent logical start block*/ 1636 /* check for wrap through zero on extent logical start block*/
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1697 /* try to insert block into found extent and return */ 1682 /* try to insert block into found extent and return */
1698 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1683 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1699 && ext4_can_extents_be_merged(inode, ex, newext)) { 1684 && ext4_can_extents_be_merged(inode, ex, newext)) {
1700 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1685 ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
1701 ext4_ext_is_uninitialized(newext), 1686 ext4_ext_is_uninitialized(newext),
1702 ext4_ext_get_actual_len(newext), 1687 ext4_ext_get_actual_len(newext),
1703 le32_to_cpu(ex->ee_block), 1688 le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1735 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 1720 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736 next = ext4_ext_next_leaf_block(path); 1721 next = ext4_ext_next_leaf_block(path);
1737 if (next != EXT_MAX_BLOCKS) { 1722 if (next != EXT_MAX_BLOCKS) {
1738 ext_debug("next leaf block - %d\n", next); 1723 ext_debug("next leaf block - %u\n", next);
1739 BUG_ON(npath != NULL); 1724 BUG_ON(npath != NULL);
1740 npath = ext4_ext_find_extent(inode, next, NULL); 1725 npath = ext4_ext_find_extent(inode, next, NULL);
1741 if (IS_ERR(npath)) 1726 if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
1773 1758
1774 if (!nearex) { 1759 if (!nearex) {
1775 /* there is no extent in this leaf, create first one */ 1760 /* there is no extent in this leaf, create first one */
1776 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1761 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1777 le32_to_cpu(newext->ee_block), 1762 le32_to_cpu(newext->ee_block),
1778 ext4_ext_pblock(newext), 1763 ext4_ext_pblock(newext),
1779 ext4_ext_is_uninitialized(newext), 1764 ext4_ext_is_uninitialized(newext),
1780 ext4_ext_get_actual_len(newext)); 1765 ext4_ext_get_actual_len(newext));
1781 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1766 nearex = EXT_FIRST_EXTENT(eh);
1782 } else if (le32_to_cpu(newext->ee_block) 1767 } else {
1768 if (le32_to_cpu(newext->ee_block)
1783 > le32_to_cpu(nearex->ee_block)) { 1769 > le32_to_cpu(nearex->ee_block)) {
1784/* BUG_ON(newext->ee_block == nearex->ee_block); */ 1770 /* Insert after */
1785 if (nearex != EXT_LAST_EXTENT(eh)) { 1771 ext_debug("insert %u:%llu:[%d]%d before: "
1786 len = EXT_MAX_EXTENT(eh) - nearex; 1772 "nearest %p\n",
1787 len = (len - 1) * sizeof(struct ext4_extent);
1788 len = len < 0 ? 0 : len;
1789 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1790 "move %d from 0x%p to 0x%p\n",
1791 le32_to_cpu(newext->ee_block), 1773 le32_to_cpu(newext->ee_block),
1792 ext4_ext_pblock(newext), 1774 ext4_ext_pblock(newext),
1793 ext4_ext_is_uninitialized(newext), 1775 ext4_ext_is_uninitialized(newext),
1794 ext4_ext_get_actual_len(newext), 1776 ext4_ext_get_actual_len(newext),
1795 nearex, len, nearex + 1, nearex + 2); 1777 nearex);
1796 memmove(nearex + 2, nearex + 1, len); 1778 nearex++;
1779 } else {
1780 /* Insert before */
1781 BUG_ON(newext->ee_block == nearex->ee_block);
1782 ext_debug("insert %u:%llu:[%d]%d after: "
1783 "nearest %p\n",
1784 le32_to_cpu(newext->ee_block),
1785 ext4_ext_pblock(newext),
1786 ext4_ext_is_uninitialized(newext),
1787 ext4_ext_get_actual_len(newext),
1788 nearex);
1789 }
1790 len = EXT_LAST_EXTENT(eh) - nearex + 1;
1791 if (len > 0) {
1792 ext_debug("insert %u:%llu:[%d]%d: "
1793 "move %d extents from 0x%p to 0x%p\n",
1794 le32_to_cpu(newext->ee_block),
1795 ext4_ext_pblock(newext),
1796 ext4_ext_is_uninitialized(newext),
1797 ext4_ext_get_actual_len(newext),
1798 len, nearex, nearex + 1);
1799 memmove(nearex + 1, nearex,
1800 len * sizeof(struct ext4_extent));
1797 } 1801 }
1798 path[depth].p_ext = nearex + 1;
1799 } else {
1800 BUG_ON(newext->ee_block == nearex->ee_block);
1801 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1802 len = len < 0 ? 0 : len;
1803 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1804 "move %d from 0x%p to 0x%p\n",
1805 le32_to_cpu(newext->ee_block),
1806 ext4_ext_pblock(newext),
1807 ext4_ext_is_uninitialized(newext),
1808 ext4_ext_get_actual_len(newext),
1809 nearex, len, nearex, nearex + 1);
1810 memmove(nearex + 1, nearex, len);
1811 path[depth].p_ext = nearex;
1812 } 1802 }
1813 1803
1814 le16_add_cpu(&eh->eh_entries, 1); 1804 le16_add_cpu(&eh->eh_entries, 1);
1815 nearex = path[depth].p_ext; 1805 path[depth].p_ext = nearex;
1816 nearex->ee_block = newext->ee_block; 1806 nearex->ee_block = newext->ee_block;
1817 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 1807 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1818 nearex->ee_len = newext->ee_len; 1808 nearex->ee_len = newext->ee_len;
@@ -1962,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1962 struct ext4_ext_cache *cex; 1952 struct ext4_ext_cache *cex;
1963 BUG_ON(len == 0); 1953 BUG_ON(len == 0);
1964 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1954 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1955 trace_ext4_ext_put_in_cache(inode, block, len, start);
1965 cex = &EXT4_I(inode)->i_cached_extent; 1956 cex = &EXT4_I(inode)->i_cached_extent;
1966 cex->ec_block = block; 1957 cex->ec_block = block;
1967 cex->ec_len = len; 1958 cex->ec_len = len;
@@ -2063,6 +2054,7 @@ errout:
2063 sbi->extent_cache_misses++; 2054 sbi->extent_cache_misses++;
2064 else 2055 else
2065 sbi->extent_cache_hits++; 2056 sbi->extent_cache_hits++;
2057 trace_ext4_ext_in_cache(inode, block, ret);
2066 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2067 return ret; 2059 return ret;
2068} 2060}
@@ -2130,6 +2122,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2130 if (err) 2122 if (err)
2131 return err; 2123 return err;
2132 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2124 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2125 trace_ext4_ext_rm_idx(inode, leaf);
2126
2133 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2127 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2134 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2128 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2135 return err; 2129 return err;
@@ -2158,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2158 * need to account for leaf block credit 2152 * need to account for leaf block credit
2159 * 2153 *
2160 * bitmaps and block group descriptor blocks 2154 * bitmaps and block group descriptor blocks
2161 * and other metadat blocks still need to be 2155 * and other metadata blocks still need to be
2162 * accounted. 2156 * accounted.
2163 */ 2157 */
2164 /* 1 bitmap, 1 block group descriptor */ 2158 /* 1 bitmap, 1 block group descriptor */
@@ -2195,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2195} 2189}
2196 2190
2197static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2191static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2198 struct ext4_extent *ex, 2192 struct ext4_extent *ex,
2199 ext4_lblk_t from, ext4_lblk_t to) 2193 ext4_fsblk_t *partial_cluster,
2194 ext4_lblk_t from, ext4_lblk_t to)
2200{ 2195{
2196 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2201 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2197 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2198 ext4_fsblk_t pblk;
2202 int flags = EXT4_FREE_BLOCKS_FORGET; 2199 int flags = EXT4_FREE_BLOCKS_FORGET;
2203 2200
2204 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2201 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2205 flags |= EXT4_FREE_BLOCKS_METADATA; 2202 flags |= EXT4_FREE_BLOCKS_METADATA;
2203 /*
2204 * For bigalloc file systems, we never free a partial cluster
2205 * at the beginning of the extent. Instead, we make a note
2206 * that we tried freeing the cluster, and check to see if we
2207 * need to free it on a subsequent call to ext4_remove_blocks,
2208 * or at the end of the ext4_truncate() operation.
2209 */
2210 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2211
2212 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2213 /*
2214 * If we have a partial cluster, and it's different from the
2215 * cluster of the last block, we need to explicitly free the
2216 * partial cluster here.
2217 */
2218 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2219 if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2220 ext4_free_blocks(handle, inode, NULL,
2221 EXT4_C2B(sbi, *partial_cluster),
2222 sbi->s_cluster_ratio, flags);
2223 *partial_cluster = 0;
2224 }
2225
2206#ifdef EXTENTS_STATS 2226#ifdef EXTENTS_STATS
2207 { 2227 {
2208 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2222,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2222 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2242 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2223 /* tail removal */ 2243 /* tail removal */
2224 ext4_lblk_t num; 2244 ext4_lblk_t num;
2225 ext4_fsblk_t start;
2226 2245
2227 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2246 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2228 start = ext4_ext_pblock(ex) + ee_len - num; 2247 pblk = ext4_ext_pblock(ex) + ee_len - num;
2229 ext_debug("free last %u blocks starting %llu\n", num, start); 2248 ext_debug("free last %u blocks starting %llu\n", num, pblk);
2230 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2249 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2250 /*
2251 * If the block range to be freed didn't start at the
2252 * beginning of a cluster, and we removed the entire
2253 * extent, save the partial cluster here, since we
2254 * might need to delete if we determine that the
2255 * truncate operation has removed all of the blocks in
2256 * the cluster.
2257 */
2258 if (pblk & (sbi->s_cluster_ratio - 1) &&
2259 (ee_len == num))
2260 *partial_cluster = EXT4_B2C(sbi, pblk);
2261 else
2262 *partial_cluster = 0;
2231 } else if (from == le32_to_cpu(ex->ee_block) 2263 } else if (from == le32_to_cpu(ex->ee_block)
2232 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2264 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2233 /* head removal */ 2265 /* head removal */
@@ -2238,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2238 start = ext4_ext_pblock(ex); 2270 start = ext4_ext_pblock(ex);
2239 2271
2240 ext_debug("free first %u blocks starting %llu\n", num, start); 2272 ext_debug("free first %u blocks starting %llu\n", num, start);
2241 ext4_free_blocks(handle, inode, 0, start, num, flags); 2273 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2242 2274
2243 } else { 2275 } else {
2244 printk(KERN_INFO "strange request: removal(2) " 2276 printk(KERN_INFO "strange request: removal(2) "
@@ -2262,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2262 */ 2294 */
2263static int 2295static int
2264ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2296ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2265 struct ext4_ext_path *path, ext4_lblk_t start, 2297 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2266 ext4_lblk_t end) 2298 ext4_lblk_t start, ext4_lblk_t end)
2267{ 2299{
2300 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2268 int err = 0, correct_index = 0; 2301 int err = 0, correct_index = 0;
2269 int depth = ext_depth(inode), credits; 2302 int depth = ext_depth(inode), credits;
2270 struct ext4_extent_header *eh; 2303 struct ext4_extent_header *eh;
2271 ext4_lblk_t a, b, block; 2304 ext4_lblk_t a, b;
2272 unsigned num; 2305 unsigned num;
2273 ext4_lblk_t ex_ee_block; 2306 ext4_lblk_t ex_ee_block;
2274 unsigned short ex_ee_len; 2307 unsigned short ex_ee_len;
2275 unsigned uninitialized = 0; 2308 unsigned uninitialized = 0;
2276 struct ext4_extent *ex; 2309 struct ext4_extent *ex;
2277 struct ext4_map_blocks map;
2278 2310
2279 /* the header must be checked already in ext4_ext_remove_space() */ 2311 /* the header must be checked already in ext4_ext_remove_space() */
2280 ext_debug("truncate since %u in leaf\n", start); 2312 ext_debug("truncate since %u in leaf\n", start);
@@ -2291,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2291 ex_ee_block = le32_to_cpu(ex->ee_block); 2323 ex_ee_block = le32_to_cpu(ex->ee_block);
2292 ex_ee_len = ext4_ext_get_actual_len(ex); 2324 ex_ee_len = ext4_ext_get_actual_len(ex);
2293 2325
2326 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2327
2294 while (ex >= EXT_FIRST_EXTENT(eh) && 2328 while (ex >= EXT_FIRST_EXTENT(eh) &&
2295 ex_ee_block + ex_ee_len > start) { 2329 ex_ee_block + ex_ee_len > start) {
2296 2330
@@ -2315,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2349 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2350 ex_ee_len = ext4_ext_get_actual_len(ex);
2317 continue; 2351 continue;
2318 } else if (a != ex_ee_block && 2352 } else if (b != ex_ee_block + ex_ee_len - 1) {
2319 b != ex_ee_block + ex_ee_len - 1) { 2353 EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
2320 /* 2354 start, end);
2321 * If this is a truncate, then this condition should 2355 err = -EIO;
2322 * never happen because at least one of the end points 2356 goto out;
2323 * needs to be on the edge of the extent.
2324 */
2325 if (end == EXT_MAX_BLOCKS - 1) {
2326 ext_debug(" bad truncate %u:%u\n",
2327 start, end);
2328 block = 0;
2329 num = 0;
2330 err = -EIO;
2331 goto out;
2332 }
2333 /*
2334 * else this is a hole punch, so the extent needs to
2335 * be split since neither edge of the hole is on the
2336 * extent edge
2337 */
2338 else{
2339 map.m_pblk = ext4_ext_pblock(ex);
2340 map.m_lblk = ex_ee_block;
2341 map.m_len = b - ex_ee_block;
2342
2343 err = ext4_split_extent(handle,
2344 inode, path, &map, 0,
2345 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2346 EXT4_GET_BLOCKS_PRE_IO);
2347
2348 if (err < 0)
2349 goto out;
2350
2351 ex_ee_len = ext4_ext_get_actual_len(ex);
2352
2353 b = ex_ee_block+ex_ee_len - 1 < end ?
2354 ex_ee_block+ex_ee_len - 1 : end;
2355
2356 /* Then remove tail of this extent */
2357 block = ex_ee_block;
2358 num = a - block;
2359 }
2360 } else if (a != ex_ee_block) { 2357 } else if (a != ex_ee_block) {
2361 /* remove tail of the extent */ 2358 /* remove tail of the extent */
2362 block = ex_ee_block; 2359 num = a - ex_ee_block;
2363 num = a - block;
2364 } else if (b != ex_ee_block + ex_ee_len - 1) {
2365 /* remove head of the extent */
2366 block = b;
2367 num = ex_ee_block + ex_ee_len - b;
2368
2369 /*
2370 * If this is a truncate, this condition
2371 * should never happen
2372 */
2373 if (end == EXT_MAX_BLOCKS - 1) {
2374 ext_debug(" bad truncate %u:%u\n",
2375 start, end);
2376 err = -EIO;
2377 goto out;
2378 }
2379 } else { 2360 } else {
2380 /* remove whole extent: excellent! */ 2361 /* remove whole extent: excellent! */
2381 block = ex_ee_block;
2382 num = 0; 2362 num = 0;
2383 if (a != ex_ee_block) {
2384 ext_debug(" bad truncate %u:%u\n",
2385 start, end);
2386 err = -EIO;
2387 goto out;
2388 }
2389
2390 if (b != ex_ee_block + ex_ee_len - 1) {
2391 ext_debug(" bad truncate %u:%u\n",
2392 start, end);
2393 err = -EIO;
2394 goto out;
2395 }
2396 } 2363 }
2397
2398 /* 2364 /*
2399 * 3 for leaf, sb, and inode plus 2 (bmap and group 2365 * 3 for leaf, sb, and inode plus 2 (bmap and group
2400 * descriptor) for each block group; assume two block 2366 * descriptor) for each block group; assume two block
@@ -2416,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2416 if (err) 2382 if (err)
2417 goto out; 2383 goto out;
2418 2384
2419 err = ext4_remove_blocks(handle, inode, ex, a, b); 2385 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2386 a, b);
2420 if (err) 2387 if (err)
2421 goto out; 2388 goto out;
2422 2389
2423 if (num == 0) { 2390 if (num == 0)
2424 /* this extent is removed; mark slot entirely unused */ 2391 /* this extent is removed; mark slot entirely unused */
2425 ext4_ext_store_pblock(ex, 0); 2392 ext4_ext_store_pblock(ex, 0);
2426 } else if (block != ex_ee_block) {
2427 /*
2428 * If this was a head removal, then we need to update
2429 * the physical block since it is now at a different
2430 * location
2431 */
2432 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2433 }
2434 2393
2435 ex->ee_block = cpu_to_le32(block);
2436 ex->ee_len = cpu_to_le16(num); 2394 ex->ee_len = cpu_to_le16(num);
2437 /* 2395 /*
2438 * Do not mark uninitialized if all the blocks in the 2396 * Do not mark uninitialized if all the blocks in the
@@ -2440,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2440 */ 2398 */
2441 if (uninitialized && num) 2399 if (uninitialized && num)
2442 ext4_ext_mark_uninitialized(ex); 2400 ext4_ext_mark_uninitialized(ex);
2443
2444 err = ext4_ext_dirty(handle, inode, path + depth);
2445 if (err)
2446 goto out;
2447
2448 /* 2401 /*
2449 * If the extent was completely released, 2402 * If the extent was completely released,
2450 * we need to remove it from the leaf 2403 * we need to remove it from the leaf
@@ -2464,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2464 sizeof(struct ext4_extent)); 2417 sizeof(struct ext4_extent));
2465 } 2418 }
2466 le16_add_cpu(&eh->eh_entries, -1); 2419 le16_add_cpu(&eh->eh_entries, -1);
2467 } 2420 } else
2421 *partial_cluster = 0;
2468 2422
2469 ext_debug("new extent: %u:%u:%llu\n", block, num, 2423 err = ext4_ext_dirty(handle, inode, path + depth);
2424 if (err)
2425 goto out;
2426
2427 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2470 ext4_ext_pblock(ex)); 2428 ext4_ext_pblock(ex));
2471 ex--; 2429 ex--;
2472 ex_ee_block = le32_to_cpu(ex->ee_block); 2430 ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2476,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2476 if (correct_index && eh->eh_entries) 2434 if (correct_index && eh->eh_entries)
2477 err = ext4_ext_correct_indexes(handle, inode, path); 2435 err = ext4_ext_correct_indexes(handle, inode, path);
2478 2436
2437 /*
2438 * If there is still a entry in the leaf node, check to see if
2439 * it references the partial cluster. This is the only place
2440 * where it could; if it doesn't, we can free the cluster.
2441 */
2442 if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2443 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2444 *partial_cluster)) {
2445 int flags = EXT4_FREE_BLOCKS_FORGET;
2446
2447 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2448 flags |= EXT4_FREE_BLOCKS_METADATA;
2449
2450 ext4_free_blocks(handle, inode, NULL,
2451 EXT4_C2B(sbi, *partial_cluster),
2452 sbi->s_cluster_ratio, flags);
2453 *partial_cluster = 0;
2454 }
2455
2479 /* if this leaf is free, then we should 2456 /* if this leaf is free, then we should
2480 * remove it from index block above */ 2457 * remove it from index block above */
2481 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2458 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2511,6 +2488,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2511 struct super_block *sb = inode->i_sb; 2488 struct super_block *sb = inode->i_sb;
2512 int depth = ext_depth(inode); 2489 int depth = ext_depth(inode);
2513 struct ext4_ext_path *path; 2490 struct ext4_ext_path *path;
2491 ext4_fsblk_t partial_cluster = 0;
2514 handle_t *handle; 2492 handle_t *handle;
2515 int i, err; 2493 int i, err;
2516 2494
@@ -2524,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2524again: 2502again:
2525 ext4_ext_invalidate_cache(inode); 2503 ext4_ext_invalidate_cache(inode);
2526 2504
2505 trace_ext4_ext_remove_space(inode, start, depth);
2506
2527 /* 2507 /*
2528 * We start scanning from right side, freeing all the blocks 2508 * We start scanning from right side, freeing all the blocks
2529 * after i_size and walking into the tree depth-wise. 2509 * after i_size and walking into the tree depth-wise.
@@ -2546,7 +2526,8 @@ again:
2546 if (i == depth) { 2526 if (i == depth) {
2547 /* this is leaf block */ 2527 /* this is leaf block */
2548 err = ext4_ext_rm_leaf(handle, inode, path, 2528 err = ext4_ext_rm_leaf(handle, inode, path,
2549 start, EXT_MAX_BLOCKS - 1); 2529 &partial_cluster, start,
2530 EXT_MAX_BLOCKS - 1);
2550 /* root level has p_bh == NULL, brelse() eats this */ 2531 /* root level has p_bh == NULL, brelse() eats this */
2551 brelse(path[i].p_bh); 2532 brelse(path[i].p_bh);
2552 path[i].p_bh = NULL; 2533 path[i].p_bh = NULL;
@@ -2618,6 +2599,24 @@ again:
2618 } 2599 }
2619 } 2600 }
2620 2601
2602 trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2603 path->p_hdr->eh_entries);
2604
2605 /* If we still have something in the partial cluster and we have removed
2606 * even the first extent, then we should free the blocks in the partial
2607 * cluster as well. */
2608 if (partial_cluster && path->p_hdr->eh_entries == 0) {
2609 int flags = EXT4_FREE_BLOCKS_FORGET;
2610
2611 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2612 flags |= EXT4_FREE_BLOCKS_METADATA;
2613
2614 ext4_free_blocks(handle, inode, NULL,
2615 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2616 EXT4_SB(sb)->s_cluster_ratio, flags);
2617 partial_cluster = 0;
2618 }
2619
2621 /* TODO: flexible tree reduction should be here */ 2620 /* TODO: flexible tree reduction should be here */
2622 if (path->p_hdr->eh_entries == 0) { 2621 if (path->p_hdr->eh_entries == 0) {
2623 /* 2622 /*
@@ -2909,17 +2908,29 @@ out:
2909 * a> There is no split required: Entire extent should be initialized 2908 * a> There is no split required: Entire extent should be initialized
2910 * b> Splits in two extents: Write is happening at either end of the extent 2909 * b> Splits in two extents: Write is happening at either end of the extent
2911 * c> Splits in three extents: Somone is writing in middle of the extent 2910 * c> Splits in three extents: Somone is writing in middle of the extent
2911 *
2912 * Pre-conditions:
2913 * - The extent pointed to by 'path' is uninitialized.
2914 * - The extent pointed to by 'path' contains a superset
2915 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2916 *
2917 * Post-conditions on success:
2918 * - the returned value is the number of blocks beyond map->l_lblk
2919 * that are allocated and initialized.
2920 * It is guaranteed to be >= map->m_len.
2912 */ 2921 */
2913static int ext4_ext_convert_to_initialized(handle_t *handle, 2922static int ext4_ext_convert_to_initialized(handle_t *handle,
2914 struct inode *inode, 2923 struct inode *inode,
2915 struct ext4_map_blocks *map, 2924 struct ext4_map_blocks *map,
2916 struct ext4_ext_path *path) 2925 struct ext4_ext_path *path)
2917{ 2926{
2927 struct ext4_extent_header *eh;
2918 struct ext4_map_blocks split_map; 2928 struct ext4_map_blocks split_map;
2919 struct ext4_extent zero_ex; 2929 struct ext4_extent zero_ex;
2920 struct ext4_extent *ex; 2930 struct ext4_extent *ex;
2921 ext4_lblk_t ee_block, eof_block; 2931 ext4_lblk_t ee_block, eof_block;
2922 unsigned int allocated, ee_len, depth; 2932 unsigned int ee_len, depth;
2933 int allocated;
2923 int err = 0; 2934 int err = 0;
2924 int split_flag = 0; 2935 int split_flag = 0;
2925 2936
@@ -2933,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2933 eof_block = map->m_lblk + map->m_len; 2944 eof_block = map->m_lblk + map->m_len;
2934 2945
2935 depth = ext_depth(inode); 2946 depth = ext_depth(inode);
2947 eh = path[depth].p_hdr;
2936 ex = path[depth].p_ext; 2948 ex = path[depth].p_ext;
2937 ee_block = le32_to_cpu(ex->ee_block); 2949 ee_block = le32_to_cpu(ex->ee_block);
2938 ee_len = ext4_ext_get_actual_len(ex); 2950 ee_len = ext4_ext_get_actual_len(ex);
2939 allocated = ee_len - (map->m_lblk - ee_block); 2951 allocated = ee_len - (map->m_lblk - ee_block);
2940 2952
2953 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2954
2955 /* Pre-conditions */
2956 BUG_ON(!ext4_ext_is_uninitialized(ex));
2957 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2958 BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2959
2960 /*
2961 * Attempt to transfer newly initialized blocks from the currently
2962 * uninitialized extent to its left neighbor. This is much cheaper
2963 * than an insertion followed by a merge as those involve costly
2964 * memmove() calls. This is the common case in steady state for
2965 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2966 * writes.
2967 *
2968 * Limitations of the current logic:
2969 * - L1: we only deal with writes at the start of the extent.
2970 * The approach could be extended to writes at the end
2971 * of the extent but this scenario was deemed less common.
2972 * - L2: we do not deal with writes covering the whole extent.
2973 * This would require removing the extent if the transfer
2974 * is possible.
2975 * - L3: we only attempt to merge with an extent stored in the
2976 * same extent tree node.
2977 */
2978 if ((map->m_lblk == ee_block) && /*L1*/
2979 (map->m_len < ee_len) && /*L2*/
2980 (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
2981 struct ext4_extent *prev_ex;
2982 ext4_lblk_t prev_lblk;
2983 ext4_fsblk_t prev_pblk, ee_pblk;
2984 unsigned int prev_len, write_len;
2985
2986 prev_ex = ex - 1;
2987 prev_lblk = le32_to_cpu(prev_ex->ee_block);
2988 prev_len = ext4_ext_get_actual_len(prev_ex);
2989 prev_pblk = ext4_ext_pblock(prev_ex);
2990 ee_pblk = ext4_ext_pblock(ex);
2991 write_len = map->m_len;
2992
2993 /*
2994 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
2995 * upon those conditions:
2996 * - C1: prev_ex is initialized,
2997 * - C2: prev_ex is logically abutting ex,
2998 * - C3: prev_ex is physically abutting ex,
2999 * - C4: prev_ex can receive the additional blocks without
3000 * overflowing the (initialized) length limit.
3001 */
3002 if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
3003 ((prev_lblk + prev_len) == ee_block) && /*C2*/
3004 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
3005 (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
3006 err = ext4_ext_get_access(handle, inode, path + depth);
3007 if (err)
3008 goto out;
3009
3010 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3011 map, ex, prev_ex);
3012
3013 /* Shift the start of ex by 'write_len' blocks */
3014 ex->ee_block = cpu_to_le32(ee_block + write_len);
3015 ext4_ext_store_pblock(ex, ee_pblk + write_len);
3016 ex->ee_len = cpu_to_le16(ee_len - write_len);
3017 ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3018
3019 /* Extend prev_ex by 'write_len' blocks */
3020 prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3021
3022 /* Mark the block containing both extents as dirty */
3023 ext4_ext_dirty(handle, inode, path + depth);
3024
3025 /* Update path to point to the right extent */
3026 path[depth].p_ext = prev_ex;
3027
3028 /* Result: number of initialized blocks past m_lblk */
3029 allocated = write_len;
3030 goto out;
3031 }
3032 }
3033
2941 WARN_ON(map->m_lblk < ee_block); 3034 WARN_ON(map->m_lblk < ee_block);
2942 /* 3035 /*
2943 * It is safe to convert extent to initialized via explicit 3036 * It is safe to convert extent to initialized via explicit
@@ -3165,6 +3258,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3165 return ext4_mark_inode_dirty(handle, inode); 3258 return ext4_mark_inode_dirty(handle, inode);
3166} 3259}
3167 3260
3261/**
3262 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3263 *
3264 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3265 * whether there are any buffers marked for delayed allocation. It returns '1'
3266 * on the first delalloc'ed buffer head found. If no buffer head in the given
3267 * range is marked for delalloc, it returns 0.
3268 * lblk_start should always be <= lblk_end.
3269 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3270 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3271 * block sooner). This is useful when blocks are truncated sequentially from
3272 * lblk_start towards lblk_end.
3273 */
3274static int ext4_find_delalloc_range(struct inode *inode,
3275 ext4_lblk_t lblk_start,
3276 ext4_lblk_t lblk_end,
3277 int search_hint_reverse)
3278{
3279 struct address_space *mapping = inode->i_mapping;
3280 struct buffer_head *head, *bh = NULL;
3281 struct page *page;
3282 ext4_lblk_t i, pg_lblk;
3283 pgoff_t index;
3284
3285 /* reverse search wont work if fs block size is less than page size */
3286 if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3287 search_hint_reverse = 0;
3288
3289 if (search_hint_reverse)
3290 i = lblk_end;
3291 else
3292 i = lblk_start;
3293
3294 index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3295
3296 while ((i >= lblk_start) && (i <= lblk_end)) {
3297 page = find_get_page(mapping, index);
3298 if (!page)
3299 goto nextpage;
3300
3301 if (!page_has_buffers(page))
3302 goto nextpage;
3303
3304 head = page_buffers(page);
3305 if (!head)
3306 goto nextpage;
3307
3308 bh = head;
3309 pg_lblk = index << (PAGE_CACHE_SHIFT -
3310 inode->i_blkbits);
3311 do {
3312 if (unlikely(pg_lblk < lblk_start)) {
3313 /*
3314 * This is possible when fs block size is less
3315 * than page size and our cluster starts/ends in
3316 * middle of the page. So we need to skip the
3317 * initial few blocks till we reach the 'lblk'
3318 */
3319 pg_lblk++;
3320 continue;
3321 }
3322
3323 /* Check if the buffer is delayed allocated and that it
3324 * is not yet mapped. (when da-buffers are mapped during
3325 * their writeout, their da_mapped bit is set.)
3326 */
3327 if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3328 page_cache_release(page);
3329 trace_ext4_find_delalloc_range(inode,
3330 lblk_start, lblk_end,
3331 search_hint_reverse,
3332 1, i);
3333 return 1;
3334 }
3335 if (search_hint_reverse)
3336 i--;
3337 else
3338 i++;
3339 } while ((i >= lblk_start) && (i <= lblk_end) &&
3340 ((bh = bh->b_this_page) != head));
3341nextpage:
3342 if (page)
3343 page_cache_release(page);
3344 /*
3345 * Move to next page. 'i' will be the first lblk in the next
3346 * page.
3347 */
3348 if (search_hint_reverse)
3349 index--;
3350 else
3351 index++;
3352 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3353 }
3354
3355 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3356 search_hint_reverse, 0, 0);
3357 return 0;
3358}
3359
3360int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3361 int search_hint_reverse)
3362{
3363 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3364 ext4_lblk_t lblk_start, lblk_end;
3365 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3366 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3367
3368 return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3369 search_hint_reverse);
3370}
3371
3372/**
3373 * Determines how many complete clusters (out of those specified by the 'map')
3374 * are under delalloc and were reserved quota for.
3375 * This function is called when we are writing out the blocks that were
3376 * originally written with their allocation delayed, but then the space was
3377 * allocated using fallocate() before the delayed allocation could be resolved.
3378 * The cases to look for are:
3379 * ('=' indicated delayed allocated blocks
3380 * '-' indicates non-delayed allocated blocks)
3381 * (a) partial clusters towards beginning and/or end outside of allocated range
3382 * are not delalloc'ed.
3383 * Ex:
3384 * |----c---=|====c====|====c====|===-c----|
3385 * |++++++ allocated ++++++|
3386 * ==> 4 complete clusters in above example
3387 *
3388 * (b) partial cluster (outside of allocated range) towards either end is
3389 * marked for delayed allocation. In this case, we will exclude that
3390 * cluster.
3391 * Ex:
3392 * |----====c========|========c========|
3393 * |++++++ allocated ++++++|
3394 * ==> 1 complete clusters in above example
3395 *
3396 * Ex:
3397 * |================c================|
3398 * |++++++ allocated ++++++|
3399 * ==> 0 complete clusters in above example
3400 *
3401 * The ext4_da_update_reserve_space will be called only if we
3402 * determine here that there were some "entire" clusters that span
3403 * this 'allocated' range.
3404 * In the non-bigalloc case, this function will just end up returning num_blks
3405 * without ever calling ext4_find_delalloc_range.
3406 */
3407static unsigned int
3408get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3409 unsigned int num_blks)
3410{
3411 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3412 ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3413 ext4_lblk_t lblk_from, lblk_to, c_offset;
3414 unsigned int allocated_clusters = 0;
3415
3416 alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3417 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3418
3419 /* max possible clusters for this allocation */
3420 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3421
3422 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3423
3424 /* Check towards left side */
3425 c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3426 if (c_offset) {
3427 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3428 lblk_to = lblk_from + c_offset - 1;
3429
3430 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3431 allocated_clusters--;
3432 }
3433
3434 /* Now check towards right. */
3435 c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3436 if (allocated_clusters && c_offset) {
3437 lblk_from = lblk_start + num_blks;
3438 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3439
3440 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3441 allocated_clusters--;
3442 }
3443
3444 return allocated_clusters;
3445}
3446
3168static int 3447static int
3169ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3448ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3170 struct ext4_map_blocks *map, 3449 struct ext4_map_blocks *map,
@@ -3181,6 +3460,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3181 flags, allocated); 3460 flags, allocated);
3182 ext4_ext_show_leaf(inode, path); 3461 ext4_ext_show_leaf(inode, path);
3183 3462
3463 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3464 newblock);
3465
3184 /* get_block() before submit the IO, split the extent */ 3466 /* get_block() before submit the IO, split the extent */
3185 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3467 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3186 ret = ext4_split_unwritten_extents(handle, inode, map, 3468 ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3190,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3190 * that this IO needs to conversion to written when IO is 3472 * that this IO needs to conversion to written when IO is
3191 * completed 3473 * completed
3192 */ 3474 */
3193 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3475 if (io)
3194 io->flag = EXT4_IO_END_UNWRITTEN; 3476 ext4_set_io_unwritten_flag(inode, io);
3195 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3477 else
3196 } else
3197 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3478 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3198 if (ext4_should_dioread_nolock(inode)) 3479 if (ext4_should_dioread_nolock(inode))
3199 map->m_flags |= EXT4_MAP_UNINIT; 3480 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3234,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3234 3515
3235 /* buffered write, writepage time, convert*/ 3516 /* buffered write, writepage time, convert*/
3236 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3517 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3237 if (ret >= 0) { 3518 if (ret >= 0)
3238 ext4_update_inode_fsync_trans(handle, inode, 1); 3519 ext4_update_inode_fsync_trans(handle, inode, 1);
3239 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3240 map->m_len);
3241 if (err < 0)
3242 goto out2;
3243 }
3244
3245out: 3520out:
3246 if (ret <= 0) { 3521 if (ret <= 0) {
3247 err = ret; 3522 err = ret;
@@ -3270,11 +3545,24 @@ out:
3270 * But fallocate would have already updated quota and block 3545 * But fallocate would have already updated quota and block
3271 * count for this offset. So cancel these reservation 3546 * count for this offset. So cancel these reservation
3272 */ 3547 */
3273 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3548 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3274 ext4_da_update_reserve_space(inode, allocated, 0); 3549 unsigned int reserved_clusters;
3550 reserved_clusters = get_reserved_cluster_alloc(inode,
3551 map->m_lblk, map->m_len);
3552 if (reserved_clusters)
3553 ext4_da_update_reserve_space(inode,
3554 reserved_clusters,
3555 0);
3556 }
3275 3557
3276map_out: 3558map_out:
3277 map->m_flags |= EXT4_MAP_MAPPED; 3559 map->m_flags |= EXT4_MAP_MAPPED;
3560 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3561 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3562 map->m_len);
3563 if (err < 0)
3564 goto out2;
3565 }
3278out1: 3566out1:
3279 if (allocated > map->m_len) 3567 if (allocated > map->m_len)
3280 allocated = map->m_len; 3568 allocated = map->m_len;
@@ -3290,6 +3578,111 @@ out2:
3290} 3578}
3291 3579
3292/* 3580/*
3581 * get_implied_cluster_alloc - check to see if the requested
3582 * allocation (in the map structure) overlaps with a cluster already
3583 * allocated in an extent.
3584 * @sb The filesystem superblock structure
3585 * @map The requested lblk->pblk mapping
3586 * @ex The extent structure which might contain an implied
3587 * cluster allocation
3588 *
3589 * This function is called by ext4_ext_map_blocks() after we failed to
3590 * find blocks that were already in the inode's extent tree. Hence,
3591 * we know that the beginning of the requested region cannot overlap
3592 * the extent from the inode's extent tree. There are three cases we
3593 * want to catch. The first is this case:
3594 *
3595 * |--- cluster # N--|
3596 * |--- extent ---| |---- requested region ---|
3597 * |==========|
3598 *
3599 * The second case that we need to test for is this one:
3600 *
3601 * |--------- cluster # N ----------------|
3602 * |--- requested region --| |------- extent ----|
3603 * |=======================|
3604 *
3605 * The third case is when the requested region lies between two extents
3606 * within the same cluster:
3607 * |------------- cluster # N-------------|
3608 * |----- ex -----| |---- ex_right ----|
3609 * |------ requested region ------|
3610 * |================|
3611 *
3612 * In each of the above cases, we need to set the map->m_pblk and
3613 * map->m_len so it corresponds to the return the extent labelled as
3614 * "|====|" from cluster #N, since it is already in use for data in
3615 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
3616 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3617 * as a new "allocated" block region. Otherwise, we will return 0 and
3618 * ext4_ext_map_blocks() will then allocate one or more new clusters
3619 * by calling ext4_mb_new_blocks().
3620 */
3621static int get_implied_cluster_alloc(struct super_block *sb,
3622 struct ext4_map_blocks *map,
3623 struct ext4_extent *ex,
3624 struct ext4_ext_path *path)
3625{
3626 struct ext4_sb_info *sbi = EXT4_SB(sb);
3627 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3628 ext4_lblk_t ex_cluster_start, ex_cluster_end;
3629 ext4_lblk_t rr_cluster_start, rr_cluster_end;
3630 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3631 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3632 unsigned short ee_len = ext4_ext_get_actual_len(ex);
3633
3634 /* The extent passed in that we are trying to match */
3635 ex_cluster_start = EXT4_B2C(sbi, ee_block);
3636 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3637
3638 /* The requested region passed into ext4_map_blocks() */
3639 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3640 rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3641
3642 if ((rr_cluster_start == ex_cluster_end) ||
3643 (rr_cluster_start == ex_cluster_start)) {
3644 if (rr_cluster_start == ex_cluster_end)
3645 ee_start += ee_len - 1;
3646 map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3647 c_offset;
3648 map->m_len = min(map->m_len,
3649 (unsigned) sbi->s_cluster_ratio - c_offset);
3650 /*
3651 * Check for and handle this case:
3652 *
3653 * |--------- cluster # N-------------|
3654 * |------- extent ----|
3655 * |--- requested region ---|
3656 * |===========|
3657 */
3658
3659 if (map->m_lblk < ee_block)
3660 map->m_len = min(map->m_len, ee_block - map->m_lblk);
3661
3662 /*
3663 * Check for the case where there is already another allocated
3664 * block to the right of 'ex' but before the end of the cluster.
3665 *
3666 * |------------- cluster # N-------------|
3667 * |----- ex -----| |---- ex_right ----|
3668 * |------ requested region ------|
3669 * |================|
3670 */
3671 if (map->m_lblk > ee_block) {
3672 ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3673 map->m_len = min(map->m_len, next - map->m_lblk);
3674 }
3675
3676 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3677 return 1;
3678 }
3679
3680 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3681 return 0;
3682}
3683
3684
3685/*
3293 * Block allocation/map/preallocation routine for extents based files 3686 * Block allocation/map/preallocation routine for extents based files
3294 * 3687 *
3295 * 3688 *
@@ -3311,15 +3704,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3311 struct ext4_map_blocks *map, int flags) 3704 struct ext4_map_blocks *map, int flags)
3312{ 3705{
3313 struct ext4_ext_path *path = NULL; 3706 struct ext4_ext_path *path = NULL;
3314 struct ext4_extent newex, *ex; 3707 struct ext4_extent newex, *ex, *ex2;
3708 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3315 ext4_fsblk_t newblock = 0; 3709 ext4_fsblk_t newblock = 0;
3316 int err = 0, depth, ret; 3710 int free_on_err = 0, err = 0, depth, ret;
3317 unsigned int allocated = 0; 3711 unsigned int allocated = 0, offset = 0;
3712 unsigned int allocated_clusters = 0;
3318 unsigned int punched_out = 0; 3713 unsigned int punched_out = 0;
3319 unsigned int result = 0; 3714 unsigned int result = 0;
3320 struct ext4_allocation_request ar; 3715 struct ext4_allocation_request ar;
3321 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3716 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3322 struct ext4_map_blocks punch_map; 3717 ext4_lblk_t cluster_offset;
3323 3718
3324 ext_debug("blocks %u/%u requested for inode %lu\n", 3719 ext_debug("blocks %u/%u requested for inode %lu\n",
3325 map->m_lblk, map->m_len, inode->i_ino); 3720 map->m_lblk, map->m_len, inode->i_ino);
@@ -3329,6 +3724,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3329 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && 3724 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3330 ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3725 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3331 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3726 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3727 if ((sbi->s_cluster_ratio > 1) &&
3728 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3729 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3730
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3731 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3732 /*
3334 * block isn't allocated yet and 3733 * block isn't allocated yet and
@@ -3339,6 +3738,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3339 /* we should allocate requested block */ 3738 /* we should allocate requested block */
3340 } else { 3739 } else {
3341 /* block is already allocated */ 3740 /* block is already allocated */
3741 if (sbi->s_cluster_ratio > 1)
3742 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3342 newblock = map->m_lblk 3743 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3744 - le32_to_cpu(newex.ee_block)
3344 + ext4_ext_pblock(&newex); 3745 + ext4_ext_pblock(&newex);
@@ -3384,8 +3785,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3384 * we split out initialized portions during a write. 3785 * we split out initialized portions during a write.
3385 */ 3786 */
3386 ee_len = ext4_ext_get_actual_len(ex); 3787 ee_len = ext4_ext_get_actual_len(ex);
3788
3789 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3790
3387 /* if found extent covers block, simply return it */ 3791 /* if found extent covers block, simply return it */
3388 if (in_range(map->m_lblk, ee_block, ee_len)) { 3792 if (in_range(map->m_lblk, ee_block, ee_len)) {
3793 struct ext4_map_blocks punch_map;
3794 ext4_fsblk_t partial_cluster = 0;
3795
3389 newblock = map->m_lblk - ee_block + ee_start; 3796 newblock = map->m_lblk - ee_block + ee_start;
3390 /* number of remaining blocks in the extent */ 3797 /* number of remaining blocks in the extent */
3391 allocated = ee_len - (map->m_lblk - ee_block); 3798 allocated = ee_len - (map->m_lblk - ee_block);
@@ -3469,7 +3876,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3469 ext4_ext_invalidate_cache(inode); 3876 ext4_ext_invalidate_cache(inode);
3470 3877
3471 err = ext4_ext_rm_leaf(handle, inode, path, 3878 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out); 3879 &partial_cluster, map->m_lblk,
3880 map->m_lblk + punched_out);
3473 3881
3474 if (!err && path->p_hdr->eh_entries == 0) { 3882 if (!err && path->p_hdr->eh_entries == 0) {
3475 /* 3883 /*
@@ -3492,6 +3900,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3492 } 3900 }
3493 } 3901 }
3494 3902
3903 if ((sbi->s_cluster_ratio > 1) &&
3904 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3905 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3906
3495 /* 3907 /*
3496 * requested block isn't allocated yet; 3908 * requested block isn't allocated yet;
3497 * we couldn't try to create block if create flag is zero 3909 * we couldn't try to create block if create flag is zero
@@ -3504,9 +3916,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3504 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3916 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3505 goto out2; 3917 goto out2;
3506 } 3918 }
3919
3507 /* 3920 /*
3508 * Okay, we need to do block allocation. 3921 * Okay, we need to do block allocation.
3509 */ 3922 */
3923 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3924 newex.ee_block = cpu_to_le32(map->m_lblk);
3925 cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3926
3927 /*
3928 * If we are doing bigalloc, check to see if the extent returned
3929 * by ext4_ext_find_extent() implies a cluster we can use.
3930 */
3931 if (cluster_offset && ex &&
3932 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3933 ar.len = allocated = map->m_len;
3934 newblock = map->m_pblk;
3935 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3936 goto got_allocated_blocks;
3937 }
3510 3938
3511 /* find neighbour allocated blocks */ 3939 /* find neighbour allocated blocks */
3512 ar.lleft = map->m_lblk; 3940 ar.lleft = map->m_lblk;
@@ -3514,10 +3942,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3514 if (err) 3942 if (err)
3515 goto out2; 3943 goto out2;
3516 ar.lright = map->m_lblk; 3944 ar.lright = map->m_lblk;
3517 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3945 ex2 = NULL;
3946 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3518 if (err) 3947 if (err)
3519 goto out2; 3948 goto out2;
3520 3949
3950 /* Check if the extent after searching to the right implies a
3951 * cluster we can use. */
3952 if ((sbi->s_cluster_ratio > 1) && ex2 &&
3953 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3954 ar.len = allocated = map->m_len;
3955 newblock = map->m_pblk;
3956 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3957 goto got_allocated_blocks;
3958 }
3959
3521 /* 3960 /*
3522 * See if request is beyond maximum number of blocks we can have in 3961 * See if request is beyond maximum number of blocks we can have in
3523 * a single extent. For an initialized extent this limit is 3962 * a single extent. For an initialized extent this limit is
@@ -3532,9 +3971,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3532 map->m_len = EXT_UNINIT_MAX_LEN; 3971 map->m_len = EXT_UNINIT_MAX_LEN;
3533 3972
3534 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3973 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3535 newex.ee_block = cpu_to_le32(map->m_lblk);
3536 newex.ee_len = cpu_to_le16(map->m_len); 3974 newex.ee_len = cpu_to_le16(map->m_len);
3537 err = ext4_ext_check_overlap(inode, &newex, path); 3975 err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3538 if (err) 3976 if (err)
3539 allocated = ext4_ext_get_actual_len(&newex); 3977 allocated = ext4_ext_get_actual_len(&newex);
3540 else 3978 else
@@ -3544,7 +3982,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3544 ar.inode = inode; 3982 ar.inode = inode;
3545 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3983 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3546 ar.logical = map->m_lblk; 3984 ar.logical = map->m_lblk;
3547 ar.len = allocated; 3985 /*
3986 * We calculate the offset from the beginning of the cluster
3987 * for the logical block number, since when we allocate a
3988 * physical cluster, the physical block should start at the
3989 * same offset from the beginning of the cluster. This is
3990 * needed so that future calls to get_implied_cluster_alloc()
3991 * work correctly.
3992 */
3993 offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3994 ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3995 ar.goal -= offset;
3996 ar.logical -= offset;
3548 if (S_ISREG(inode->i_mode)) 3997 if (S_ISREG(inode->i_mode))
3549 ar.flags = EXT4_MB_HINT_DATA; 3998 ar.flags = EXT4_MB_HINT_DATA;
3550 else 3999 else
@@ -3557,9 +4006,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3557 goto out2; 4006 goto out2;
3558 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 4007 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3559 ar.goal, newblock, allocated); 4008 ar.goal, newblock, allocated);
4009 free_on_err = 1;
4010 allocated_clusters = ar.len;
4011 ar.len = EXT4_C2B(sbi, ar.len) - offset;
4012 if (ar.len > allocated)
4013 ar.len = allocated;
3560 4014
4015got_allocated_blocks:
3561 /* try to insert new extent into found leaf and return */ 4016 /* try to insert new extent into found leaf and return */
3562 ext4_ext_store_pblock(&newex, newblock); 4017 ext4_ext_store_pblock(&newex, newblock + offset);
3563 newex.ee_len = cpu_to_le16(ar.len); 4018 newex.ee_len = cpu_to_le16(ar.len);
3564 /* Mark uninitialized */ 4019 /* Mark uninitialized */
3565 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 4020 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3572,10 +4027,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3572 * that we need to perform conversion when IO is done. 4027 * that we need to perform conversion when IO is done.
3573 */ 4028 */
3574 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4029 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3575 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 4030 if (io)
3576 io->flag = EXT4_IO_END_UNWRITTEN; 4031 ext4_set_io_unwritten_flag(inode, io);
3577 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 4032 else
3578 } else
3579 ext4_set_inode_state(inode, 4033 ext4_set_inode_state(inode,
3580 EXT4_STATE_DIO_UNWRITTEN); 4034 EXT4_STATE_DIO_UNWRITTEN);
3581 } 4035 }
@@ -3583,11 +4037,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3583 map->m_flags |= EXT4_MAP_UNINIT; 4037 map->m_flags |= EXT4_MAP_UNINIT;
3584 } 4038 }
3585 4039
3586 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 4040 err = 0;
4041 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4042 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4043 path, ar.len);
3587 if (!err) 4044 if (!err)
3588 err = ext4_ext_insert_extent(handle, inode, path, 4045 err = ext4_ext_insert_extent(handle, inode, path,
3589 &newex, flags); 4046 &newex, flags);
3590 if (err) { 4047 if (err && free_on_err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4048 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4049 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3593 /* free data blocks we just allocated */ 4050 /* free data blocks we just allocated */
@@ -3610,8 +4067,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3610 * Update reserved blocks/metadata blocks after successful 4067 * Update reserved blocks/metadata blocks after successful
3611 * block allocation which had been deferred till now. 4068 * block allocation which had been deferred till now.
3612 */ 4069 */
3613 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4070 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3614 ext4_da_update_reserve_space(inode, allocated, 1); 4071 unsigned int reserved_clusters;
4072 /*
4073 * Check how many clusters we had reserved this allocated range
4074 */
4075 reserved_clusters = get_reserved_cluster_alloc(inode,
4076 map->m_lblk, allocated);
4077 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4078 if (reserved_clusters) {
4079 /*
4080 * We have clusters reserved for this range.
4081 * But since we are not doing actual allocation
4082 * and are simply using blocks from previously
4083 * allocated cluster, we should release the
4084 * reservation and not claim quota.
4085 */
4086 ext4_da_update_reserve_space(inode,
4087 reserved_clusters, 0);
4088 }
4089 } else {
4090 BUG_ON(allocated_clusters < reserved_clusters);
4091 /* We will claim quota for all newly allocated blocks.*/
4092 ext4_da_update_reserve_space(inode, allocated_clusters,
4093 1);
4094 if (reserved_clusters < allocated_clusters) {
4095 struct ext4_inode_info *ei = EXT4_I(inode);
4096 int reservation = allocated_clusters -
4097 reserved_clusters;
4098 /*
4099 * It seems we claimed few clusters outside of
4100 * the range of this allocation. We should give
4101 * it back to the reservation pool. This can
4102 * happen in the following case:
4103 *
4104 * * Suppose s_cluster_ratio is 4 (i.e., each
4105 * cluster has 4 blocks. Thus, the clusters
4106 * are [0-3],[4-7],[8-11]...
4107 * * First comes delayed allocation write for
4108 * logical blocks 10 & 11. Since there were no
4109 * previous delayed allocated blocks in the
4110 * range [8-11], we would reserve 1 cluster
4111 * for this write.
4112 * * Next comes write for logical blocks 3 to 8.
4113 * In this case, we will reserve 2 clusters
4114 * (for [0-3] and [4-7]; and not for [8-11] as
4115 * that range has a delayed allocated blocks.
4116 * Thus total reserved clusters now becomes 3.
4117 * * Now, during the delayed allocation writeout
4118 * time, we will first write blocks [3-8] and
4119 * allocate 3 clusters for writing these
4120 * blocks. Also, we would claim all these
4121 * three clusters above.
4122 * * Now when we come here to writeout the
4123 * blocks [10-11], we would expect to claim
4124 * the reservation of 1 cluster we had made
4125 * (and we would claim it since there are no
4126 * more delayed allocated blocks in the range
4127 * [8-11]. But our reserved cluster count had
4128 * already gone to 0.
4129 *
4130 * Thus, at the step 4 above when we determine
4131 * that there are still some unwritten delayed
4132 * allocated blocks outside of our current
4133 * block range, we should increment the
4134 * reserved clusters count so that when the
4135 * remaining blocks finally gets written, we
4136 * could claim them.
4137 */
4138 dquot_reserve_block(inode,
4139 EXT4_C2B(sbi, reservation));
4140 spin_lock(&ei->i_block_reservation_lock);
4141 ei->i_reserved_data_blocks += reservation;
4142 spin_unlock(&ei->i_block_reservation_lock);
4143 }
4144 }
4145 }
3615 4146
3616 /* 4147 /*
3617 * Cache the extent and update transaction to commit on fdatasync only 4148 * Cache the extent and update transaction to commit on fdatasync only
@@ -3634,12 +4165,12 @@ out2:
3634 ext4_ext_drop_refs(path); 4165 ext4_ext_drop_refs(path);
3635 kfree(path); 4166 kfree(path);
3636 } 4167 }
3637 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3638 newblock, map->m_len, err ? err : allocated);
3639
3640 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? 4168 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3641 punched_out : allocated; 4169 punched_out : allocated;
3642 4170
4171 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4172 newblock, map->m_len, err ? err : result);
4173
3643 return err ? err : result; 4174 return err ? err : result;
3644} 4175}
3645 4176
@@ -3649,6 +4180,7 @@ void ext4_ext_truncate(struct inode *inode)
3649 struct super_block *sb = inode->i_sb; 4180 struct super_block *sb = inode->i_sb;
3650 ext4_lblk_t last_block; 4181 ext4_lblk_t last_block;
3651 handle_t *handle; 4182 handle_t *handle;
4183 loff_t page_len;
3652 int err = 0; 4184 int err = 0;
3653 4185
3654 /* 4186 /*
@@ -3665,8 +4197,16 @@ void ext4_ext_truncate(struct inode *inode)
3665 if (IS_ERR(handle)) 4197 if (IS_ERR(handle))
3666 return; 4198 return;
3667 4199
3668 if (inode->i_size & (sb->s_blocksize - 1)) 4200 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3669 ext4_block_truncate_page(handle, mapping, inode->i_size); 4201 page_len = PAGE_CACHE_SIZE -
4202 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4203
4204 err = ext4_discard_partial_page_buffers(handle,
4205 mapping, inode->i_size, page_len, 0);
4206
4207 if (err)
4208 goto out_stop;
4209 }
3670 4210
3671 if (ext4_orphan_add(handle, inode)) 4211 if (ext4_orphan_add(handle, inode))
3672 goto out_stop; 4212 goto out_stop;
@@ -3760,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3760 int ret = 0; 4300 int ret = 0;
3761 int ret2 = 0; 4301 int ret2 = 0;
3762 int retries = 0; 4302 int retries = 0;
4303 int flags;
3763 struct ext4_map_blocks map; 4304 struct ext4_map_blocks map;
3764 unsigned int credits, blkbits = inode->i_blkbits; 4305 unsigned int credits, blkbits = inode->i_blkbits;
3765 4306
@@ -3796,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3796 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4337 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3797 return ret; 4338 return ret;
3798 } 4339 }
4340 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4341 if (mode & FALLOC_FL_KEEP_SIZE)
4342 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4343 /*
4344 * Don't normalize the request if it can fit in one extent so
4345 * that it doesn't get unnecessarily split into multiple
4346 * extents.
4347 */
4348 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4349 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
3799retry: 4350retry:
3800 while (ret >= 0 && ret < max_blocks) { 4351 while (ret >= 0 && ret < max_blocks) {
3801 map.m_lblk = map.m_lblk + ret; 4352 map.m_lblk = map.m_lblk + ret;
@@ -3805,9 +4356,7 @@ retry:
3805 ret = PTR_ERR(handle); 4356 ret = PTR_ERR(handle);
3806 break; 4357 break;
3807 } 4358 }
3808 ret = ext4_map_blocks(handle, inode, &map, 4359 ret = ext4_map_blocks(handle, inode, &map, flags);
3809 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3810 EXT4_GET_BLOCKS_NO_NORMALIZE);
3811 if (ret <= 0) { 4360 if (ret <= 0) {
3812#ifdef EXT4FS_DEBUG 4361#ifdef EXT4FS_DEBUG
3813 WARN_ON(ret <= 0); 4362 WARN_ON(ret <= 0);
@@ -4102,7 +4651,6 @@ found_delayed_extent:
4102 return EXT_BREAK; 4651 return EXT_BREAK;
4103 return EXT_CONTINUE; 4652 return EXT_CONTINUE;
4104} 4653}
4105
4106/* fiemap flags we can handle specified here */ 4654/* fiemap flags we can handle specified here */
4107#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4655#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4108 4656
@@ -4162,17 +4710,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4162 struct address_space *mapping = inode->i_mapping; 4710 struct address_space *mapping = inode->i_mapping;
4163 struct ext4_map_blocks map; 4711 struct ext4_map_blocks map;
4164 handle_t *handle; 4712 handle_t *handle;
4165 loff_t first_block_offset, last_block_offset, block_len; 4713 loff_t first_page, last_page, page_len;
4166 loff_t first_page, last_page, first_page_offset, last_page_offset; 4714 loff_t first_page_offset, last_page_offset;
4167 int ret, credits, blocks_released, err = 0; 4715 int ret, credits, blocks_released, err = 0;
4168 4716
4717 /* No need to punch hole beyond i_size */
4718 if (offset >= inode->i_size)
4719 return 0;
4720
4721 /*
4722 * If the hole extends beyond i_size, set the hole
4723 * to end after the page that contains i_size
4724 */
4725 if (offset + length > inode->i_size) {
4726 length = inode->i_size +
4727 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4728 offset;
4729 }
4730
4169 first_block = (offset + sb->s_blocksize - 1) >> 4731 first_block = (offset + sb->s_blocksize - 1) >>
4170 EXT4_BLOCK_SIZE_BITS(sb); 4732 EXT4_BLOCK_SIZE_BITS(sb);
4171 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4733 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4172 4734
4173 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4174 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4175
4176 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4735 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4177 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4736 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4178 4737
@@ -4185,11 +4744,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4185 */ 4744 */
4186 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4745 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4187 err = filemap_write_and_wait_range(mapping, 4746 err = filemap_write_and_wait_range(mapping,
4188 first_page_offset == 0 ? 0 : first_page_offset-1, 4747 offset, offset + length - 1);
4189 last_page_offset);
4190 4748
4191 if (err) 4749 if (err)
4192 return err; 4750 return err;
4193 } 4751 }
4194 4752
4195 /* Now release the pages */ 4753 /* Now release the pages */
@@ -4211,24 +4769,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4211 goto out; 4769 goto out;
4212 4770
4213 /* 4771 /*
4214 * Now we need to zero out the un block aligned data. 4772 * Now we need to zero out the non-page-aligned data in the
4215 * If the file is smaller than a block, just 4773 * pages at the start and tail of the hole, and unmap the buffer
4216 * zero out the middle 4774 * heads for the block aligned regions of the page that were
4775 * completely zeroed.
4217 */ 4776 */
4218 if (first_block > last_block) 4777 if (first_page > last_page) {
4219 ext4_block_zero_page_range(handle, mapping, offset, length); 4778 /*
4220 else { 4779 * If the file space being truncated is contained within a page
4221 /* zero out the head of the hole before the first block */ 4780 * just zero out and unmap the middle of that page
4222 block_len = first_block_offset - offset; 4781 */
4223 if (block_len > 0) 4782 err = ext4_discard_partial_page_buffers(handle,
4224 ext4_block_zero_page_range(handle, mapping, 4783 mapping, offset, length, 0);
4225 offset, block_len); 4784
4226 4785 if (err)
4227 /* zero out the tail of the hole after the last block */ 4786 goto out;
4228 block_len = offset + length - last_block_offset; 4787 } else {
4229 if (block_len > 0) { 4788 /*
4230 ext4_block_zero_page_range(handle, mapping, 4789 * zero out and unmap the partial page that contains
4231 last_block_offset, block_len); 4790 * the start of the hole
4791 */
4792 page_len = first_page_offset - offset;
4793 if (page_len > 0) {
4794 err = ext4_discard_partial_page_buffers(handle, mapping,
4795 offset, page_len, 0);
4796 if (err)
4797 goto out;
4798 }
4799
4800 /*
4801 * zero out and unmap the partial page that contains
4802 * the end of the hole
4803 */
4804 page_len = offset + length - last_page_offset;
4805 if (page_len > 0) {
4806 err = ext4_discard_partial_page_buffers(handle, mapping,
4807 last_page_offset, page_len, 0);
4808 if (err)
4809 goto out;
4810 }
4811 }
4812
4813
4814 /*
4815 * If i_size is contained in the last page, we need to
4816 * unmap and zero the partial page after i_size
4817 */
4818 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4819 inode->i_size % PAGE_CACHE_SIZE != 0) {
4820
4821 page_len = PAGE_CACHE_SIZE -
4822 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4823
4824 if (page_len > 0) {
4825 err = ext4_discard_partial_page_buffers(handle,
4826 mapping, inode->i_size, page_len, 0);
4827
4828 if (err)
4829 goto out;
4232 } 4830 }
4233 } 4831 }
4234 4832
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b9548f477bb8..cb70f1812a70 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
181 path.dentry = mnt->mnt_root; 181 path.dentry = mnt->mnt_root;
182 cp = d_path(&path, buf, sizeof(buf)); 182 cp = d_path(&path, buf, sizeof(buf));
183 if (!IS_ERR(cp)) { 183 if (!IS_ERR(cp)) {
184 memcpy(sbi->s_es->s_last_mounted, cp, 184 strlcpy(sbi->s_es->s_last_mounted, cp,
185 sizeof(sbi->s_es->s_last_mounted)); 185 sizeof(sbi->s_es->s_last_mounted));
186 ext4_mark_super_dirty(sb); 186 ext4_mark_super_dirty(sb);
187 } 187 }
188 } 188 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 036f78f7a1ef..00a2cb753efd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
75 * to written. 75 * to written.
76 * The function return the number of pending IOs on success. 76 * The function return the number of pending IOs on success.
77 */ 77 */
78extern int ext4_flush_completed_IO(struct inode *inode) 78int ext4_flush_completed_IO(struct inode *inode)
79{ 79{
80 ext4_io_end_t *io; 80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode); 81 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode)
83 int ret = 0; 83 int ret = 0;
84 int ret2 = 0; 84 int ret2 = 0;
85 85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode); 86 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 87 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){ 88 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list);
94 /* 92 /*
95 * Calling ext4_end_io_nolock() to convert completed 93 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written. 94 * IO to written.
@@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode)
107 */ 105 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 106 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io); 107 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0) 108 if (ret < 0)
112 ret2 = ret; 109 ret2 = ret;
113 else 110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
114 list_del_init(&io->list);
115 } 111 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0; 113 return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9c63f273b550..00beb4f9cc4f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
78 * allocation, essentially implementing a per-group read-only flag. */ 78 * allocation, essentially implementing a per-group read-only flag. */
79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
80 ext4_error(sb, "Checksum bad for group %u", block_group); 80 ext4_error(sb, "Checksum bad for group %u", block_group);
81 ext4_free_blks_set(sb, gdp, 0); 81 ext4_free_group_clusters_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 82 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 83 ext4_itable_unused_set(sb, gdp, 0);
84 memset(bh->b_data, 0xff, sb->s_blocksize); 84 memset(bh->b_data, 0xff, sb->s_blocksize);
@@ -293,121 +293,9 @@ error_return:
293 ext4_std_error(sb, fatal); 293 ext4_std_error(sb, fatal);
294} 294}
295 295
296/*
297 * There are two policies for allocating an inode. If the new inode is
298 * a directory, then a forward search is made for a block group with both
299 * free space and a low directory-to-inode ratio; if that fails, then of
300 * the groups with above-average free space, that group with the fewest
301 * directories already is chosen.
302 *
303 * For other inodes, search forward from the parent directory\'s block
304 * group to find a free inode.
305 */
306static int find_group_dir(struct super_block *sb, struct inode *parent,
307 ext4_group_t *best_group)
308{
309 ext4_group_t ngroups = ext4_get_groups_count(sb);
310 unsigned int freei, avefreei;
311 struct ext4_group_desc *desc, *best_desc = NULL;
312 ext4_group_t group;
313 int ret = -1;
314
315 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
316 avefreei = freei / ngroups;
317
318 for (group = 0; group < ngroups; group++) {
319 desc = ext4_get_group_desc(sb, group, NULL);
320 if (!desc || !ext4_free_inodes_count(sb, desc))
321 continue;
322 if (ext4_free_inodes_count(sb, desc) < avefreei)
323 continue;
324 if (!best_desc ||
325 (ext4_free_blks_count(sb, desc) >
326 ext4_free_blks_count(sb, best_desc))) {
327 *best_group = group;
328 best_desc = desc;
329 ret = 0;
330 }
331 }
332 return ret;
333}
334
335#define free_block_ratio 10
336
337static int find_group_flex(struct super_block *sb, struct inode *parent,
338 ext4_group_t *best_group)
339{
340 struct ext4_sb_info *sbi = EXT4_SB(sb);
341 struct ext4_group_desc *desc;
342 struct flex_groups *flex_group = sbi->s_flex_groups;
343 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
344 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
345 ext4_group_t ngroups = ext4_get_groups_count(sb);
346 int flex_size = ext4_flex_bg_size(sbi);
347 ext4_group_t best_flex = parent_fbg_group;
348 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
349 int flexbg_free_blocks;
350 int flex_freeb_ratio;
351 ext4_group_t n_fbg_groups;
352 ext4_group_t i;
353
354 n_fbg_groups = (ngroups + flex_size - 1) >>
355 sbi->s_log_groups_per_flex;
356
357find_close_to_parent:
358 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
359 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
360 if (atomic_read(&flex_group[best_flex].free_inodes) &&
361 flex_freeb_ratio > free_block_ratio)
362 goto found_flexbg;
363
364 if (best_flex && best_flex == parent_fbg_group) {
365 best_flex--;
366 goto find_close_to_parent;
367 }
368
369 for (i = 0; i < n_fbg_groups; i++) {
370 if (i == parent_fbg_group || i == parent_fbg_group - 1)
371 continue;
372
373 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
374 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
375
376 if (flex_freeb_ratio > free_block_ratio &&
377 (atomic_read(&flex_group[i].free_inodes))) {
378 best_flex = i;
379 goto found_flexbg;
380 }
381
382 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
383 ((atomic_read(&flex_group[i].free_blocks) >
384 atomic_read(&flex_group[best_flex].free_blocks)) &&
385 atomic_read(&flex_group[i].free_inodes)))
386 best_flex = i;
387 }
388
389 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
390 !atomic_read(&flex_group[best_flex].free_blocks))
391 return -1;
392
393found_flexbg:
394 for (i = best_flex * flex_size; i < ngroups &&
395 i < (best_flex + 1) * flex_size; i++) {
396 desc = ext4_get_group_desc(sb, i, NULL);
397 if (ext4_free_inodes_count(sb, desc)) {
398 *best_group = i;
399 goto out;
400 }
401 }
402
403 return -1;
404out:
405 return 0;
406}
407
408struct orlov_stats { 296struct orlov_stats {
409 __u32 free_inodes; 297 __u32 free_inodes;
410 __u32 free_blocks; 298 __u32 free_clusters;
411 __u32 used_dirs; 299 __u32 used_dirs;
412}; 300};
413 301
@@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
424 312
425 if (flex_size > 1) { 313 if (flex_size > 1) {
426 stats->free_inodes = atomic_read(&flex_group[g].free_inodes); 314 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
427 stats->free_blocks = atomic_read(&flex_group[g].free_blocks); 315 stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
428 stats->used_dirs = atomic_read(&flex_group[g].used_dirs); 316 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
429 return; 317 return;
430 } 318 }
@@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
432 desc = ext4_get_group_desc(sb, g, NULL); 320 desc = ext4_get_group_desc(sb, g, NULL);
433 if (desc) { 321 if (desc) {
434 stats->free_inodes = ext4_free_inodes_count(sb, desc); 322 stats->free_inodes = ext4_free_inodes_count(sb, desc);
435 stats->free_blocks = ext4_free_blks_count(sb, desc); 323 stats->free_clusters = ext4_free_group_clusters(sb, desc);
436 stats->used_dirs = ext4_used_dirs_count(sb, desc); 324 stats->used_dirs = ext4_used_dirs_count(sb, desc);
437 } else { 325 } else {
438 stats->free_inodes = 0; 326 stats->free_inodes = 0;
439 stats->free_blocks = 0; 327 stats->free_clusters = 0;
440 stats->used_dirs = 0; 328 stats->used_dirs = 0;
441 } 329 }
442} 330}
@@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
471 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 359 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
472 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
473 unsigned int freei, avefreei; 361 unsigned int freei, avefreei;
474 ext4_fsblk_t freeb, avefreeb; 362 ext4_fsblk_t freeb, avefreec;
475 unsigned int ndirs; 363 unsigned int ndirs;
476 int max_dirs, min_inodes; 364 int max_dirs, min_inodes;
477 ext4_grpblk_t min_blocks; 365 ext4_grpblk_t min_clusters;
478 ext4_group_t i, grp, g, ngroups; 366 ext4_group_t i, grp, g, ngroups;
479 struct ext4_group_desc *desc; 367 struct ext4_group_desc *desc;
480 struct orlov_stats stats; 368 struct orlov_stats stats;
@@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
490 378
491 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 379 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
492 avefreei = freei / ngroups; 380 avefreei = freei / ngroups;
493 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 381 freeb = EXT4_C2B(sbi,
494 avefreeb = freeb; 382 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
495 do_div(avefreeb, ngroups); 383 avefreec = freeb;
384 do_div(avefreec, ngroups);
496 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 385 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
497 386
498 if (S_ISDIR(mode) && 387 if (S_ISDIR(mode) &&
@@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
518 continue; 407 continue;
519 if (stats.free_inodes < avefreei) 408 if (stats.free_inodes < avefreei)
520 continue; 409 continue;
521 if (stats.free_blocks < avefreeb) 410 if (stats.free_clusters < avefreec)
522 continue; 411 continue;
523 grp = g; 412 grp = g;
524 ret = 0; 413 ret = 0;
@@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
556 min_inodes = avefreei - inodes_per_group*flex_size / 4; 445 min_inodes = avefreei - inodes_per_group*flex_size / 4;
557 if (min_inodes < 1) 446 if (min_inodes < 1)
558 min_inodes = 1; 447 min_inodes = 1;
559 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; 448 min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
560 449
561 /* 450 /*
562 * Start looking in the flex group where we last allocated an 451 * Start looking in the flex group where we last allocated an
@@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
575 continue; 464 continue;
576 if (stats.free_inodes < min_inodes) 465 if (stats.free_inodes < min_inodes)
577 continue; 466 continue;
578 if (stats.free_blocks < min_blocks) 467 if (stats.free_clusters < min_clusters)
579 continue; 468 continue;
580 goto found_flex_bg; 469 goto found_flex_bg;
581 } 470 }
@@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
659 *group = parent_group; 548 *group = parent_group;
660 desc = ext4_get_group_desc(sb, *group, NULL); 549 desc = ext4_get_group_desc(sb, *group, NULL);
661 if (desc && ext4_free_inodes_count(sb, desc) && 550 if (desc && ext4_free_inodes_count(sb, desc) &&
662 ext4_free_blks_count(sb, desc)) 551 ext4_free_group_clusters(sb, desc))
663 return 0; 552 return 0;
664 553
665 /* 554 /*
@@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
683 *group -= ngroups; 572 *group -= ngroups;
684 desc = ext4_get_group_desc(sb, *group, NULL); 573 desc = ext4_get_group_desc(sb, *group, NULL);
685 if (desc && ext4_free_inodes_count(sb, desc) && 574 if (desc && ext4_free_inodes_count(sb, desc) &&
686 ext4_free_blks_count(sb, desc)) 575 ext4_free_group_clusters(sb, desc))
687 return 0; 576 return 0;
688 } 577 }
689 578
@@ -802,7 +691,7 @@ err_ret:
802 * group to find a free inode. 691 * group to find a free inode.
803 */ 692 */
804struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, 693struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
805 const struct qstr *qstr, __u32 goal) 694 const struct qstr *qstr, __u32 goal, uid_t *owner)
806{ 695{
807 struct super_block *sb; 696 struct super_block *sb;
808 struct buffer_head *inode_bitmap_bh = NULL; 697 struct buffer_head *inode_bitmap_bh = NULL;
@@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
816 int ret2, err = 0; 705 int ret2, err = 0;
817 struct inode *ret; 706 struct inode *ret;
818 ext4_group_t i; 707 ext4_group_t i;
819 int free = 0;
820 static int once = 1;
821 ext4_group_t flex_group; 708 ext4_group_t flex_group;
822 709
823 /* Cannot create files in a deleted directory */ 710 /* Cannot create files in a deleted directory */
@@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
843 goto got_group; 730 goto got_group;
844 } 731 }
845 732
846 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 733 if (S_ISDIR(mode))
847 ret2 = find_group_flex(sb, dir, &group); 734 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
848 if (ret2 == -1) { 735 else
849 ret2 = find_group_other(sb, dir, &group, mode);
850 if (ret2 == 0 && once) {
851 once = 0;
852 printk(KERN_NOTICE "ext4: find_group_flex "
853 "failed, fallback succeeded dir %lu\n",
854 dir->i_ino);
855 }
856 }
857 goto got_group;
858 }
859
860 if (S_ISDIR(mode)) {
861 if (test_opt(sb, OLDALLOC))
862 ret2 = find_group_dir(sb, dir, &group);
863 else
864 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
865 } else
866 ret2 = find_group_other(sb, dir, &group, mode); 736 ret2 = find_group_other(sb, dir, &group, mode);
867 737
868got_group: 738got_group:
@@ -950,26 +820,21 @@ got:
950 goto fail; 820 goto fail;
951 } 821 }
952 822
953 free = 0; 823 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
954 ext4_lock_group(sb, group); 824 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
825 brelse(block_bitmap_bh);
826
955 /* recheck and clear flag under lock if we still need to */ 827 /* recheck and clear flag under lock if we still need to */
828 ext4_lock_group(sb, group);
956 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 829 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
957 free = ext4_free_blocks_after_init(sb, group, gdp);
958 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 830 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
959 ext4_free_blks_set(sb, gdp, free); 831 ext4_free_group_clusters_set(sb, gdp,
832 ext4_free_clusters_after_init(sb, group, gdp));
960 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 833 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
961 gdp); 834 gdp);
962 } 835 }
963 ext4_unlock_group(sb, group); 836 ext4_unlock_group(sb, group);
964 837
965 /* Don't need to dirty bitmap block if we didn't change it */
966 if (free) {
967 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
968 err = ext4_handle_dirty_metadata(handle,
969 NULL, block_bitmap_bh);
970 }
971
972 brelse(block_bitmap_bh);
973 if (err) 838 if (err)
974 goto fail; 839 goto fail;
975 } 840 }
@@ -987,8 +852,11 @@ got:
987 flex_group = ext4_flex_group(sbi, group); 852 flex_group = ext4_flex_group(sbi, group);
988 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 853 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
989 } 854 }
990 855 if (owner) {
991 if (test_opt(sb, GRPID)) { 856 inode->i_mode = mode;
857 inode->i_uid = owner[0];
858 inode->i_gid = owner[1];
859 } else if (test_opt(sb, GRPID)) {
992 inode->i_mode = mode; 860 inode->i_mode = mode;
993 inode->i_uid = current_fsuid(); 861 inode->i_uid = current_fsuid();
994 inode->i_gid = dir->i_gid; 862 inode->i_gid = dir->i_gid;
@@ -1005,11 +873,7 @@ got:
1005 ei->i_dir_start_lookup = 0; 873 ei->i_dir_start_lookup = 0;
1006 ei->i_disksize = 0; 874 ei->i_disksize = 0;
1007 875
1008 /* 876 /* Don't inherit extent flag from directory, amongst others. */
1009 * Don't inherit extent flag from directory, amongst others. We set
1010 * extent flag on newly created directory and file only if -o extent
1011 * mount option is specified
1012 */
1013 ei->i_flags = 877 ei->i_flags =
1014 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); 878 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
1015 ei->i_file_acl = 0; 879 ei->i_file_acl = 0;
@@ -1084,7 +948,7 @@ fail_free_drop:
1084fail_drop: 948fail_drop:
1085 dquot_drop(inode); 949 dquot_drop(inode);
1086 inode->i_flags |= S_NOQUOTA; 950 inode->i_flags |= S_NOQUOTA;
1087 inode->i_nlink = 0; 951 clear_nlink(inode);
1088 unlock_new_inode(inode); 952 unlock_new_inode(inode);
1089 iput(inode); 953 iput(inode);
1090 brelse(inode_bitmap_bh); 954 brelse(inode_bitmap_bh);
@@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1235 * inode allocation from the current group, so we take alloc_sem lock, to 1099 * inode allocation from the current group, so we take alloc_sem lock, to
1236 * block ext4_claim_inode until we are finished. 1100 * block ext4_claim_inode until we are finished.
1237 */ 1101 */
1238extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1102int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1239 int barrier) 1103 int barrier)
1240{ 1104{
1241 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1105 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642119c0..3cfc73fbca8e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
699 /* 699 /*
700 * Okay, we need to do block allocation. 700 * Okay, we need to do block allocation.
701 */ 701 */
702 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
703 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
704 EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
705 "non-extent mapped inodes with bigalloc");
706 return -ENOSPC;
707 }
708
702 goal = ext4_find_goal(inode, map->m_lblk, partial); 709 goal = ext4_find_goal(inode, map->m_lblk, partial);
703 710
704 /* the number of blocks need to allocate for [d,t]indirect blocks */ 711 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1343,7 +1350,9 @@ void ext4_ind_truncate(struct inode *inode)
1343 __le32 nr = 0; 1350 __le32 nr = 0;
1344 int n = 0; 1351 int n = 0;
1345 ext4_lblk_t last_block, max_block; 1352 ext4_lblk_t last_block, max_block;
1353 loff_t page_len;
1346 unsigned blocksize = inode->i_sb->s_blocksize; 1354 unsigned blocksize = inode->i_sb->s_blocksize;
1355 int err;
1347 1356
1348 handle = start_transaction(inode); 1357 handle = start_transaction(inode);
1349 if (IS_ERR(handle)) 1358 if (IS_ERR(handle))
@@ -1354,9 +1363,16 @@ void ext4_ind_truncate(struct inode *inode)
1354 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1363 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1355 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1364 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1356 1365
1357 if (inode->i_size & (blocksize - 1)) 1366 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
1358 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 1367 page_len = PAGE_CACHE_SIZE -
1368 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1369
1370 err = ext4_discard_partial_page_buffers(handle,
1371 mapping, inode->i_size, page_len, 0);
1372
1373 if (err)
1359 goto out_stop; 1374 goto out_stop;
1375 }
1360 1376
1361 if (last_block != max_block) { 1377 if (last_block != max_block) {
1362 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1378 n = ext4_block_to_path(inode, last_block, offsets, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986e2388f031..240f6e2dc7ee 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@
42#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
43#include "xattr.h" 43#include "xattr.h"
44#include "acl.h" 44#include "acl.h"
45#include "ext4_extents.h"
46#include "truncate.h" 45#include "truncate.h"
47 46
48#include <trace/events/ext4.h> 47#include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
268 struct ext4_inode_info *ei = EXT4_I(inode); 267 struct ext4_inode_info *ei = EXT4_I(inode);
269 268
270 spin_lock(&ei->i_block_reservation_lock); 269 spin_lock(&ei->i_block_reservation_lock);
271 trace_ext4_da_update_reserve_space(inode, used); 270 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
272 if (unlikely(used > ei->i_reserved_data_blocks)) { 271 if (unlikely(used > ei->i_reserved_data_blocks)) {
273 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 272 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
274 "with only %d reserved data blocks\n", 273 "with only %d reserved data blocks\n",
@@ -281,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
281 /* Update per-inode reservations */ 280 /* Update per-inode reservations */
282 ei->i_reserved_data_blocks -= used; 281 ei->i_reserved_data_blocks -= used;
283 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 282 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
284 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 283 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
285 used + ei->i_allocated_meta_blocks); 284 used + ei->i_allocated_meta_blocks);
286 ei->i_allocated_meta_blocks = 0; 285 ei->i_allocated_meta_blocks = 0;
287 286
@@ -291,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
291 * only when we have written all of the delayed 290 * only when we have written all of the delayed
292 * allocation blocks. 291 * allocation blocks.
293 */ 292 */
294 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 293 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
295 ei->i_reserved_meta_blocks); 294 ei->i_reserved_meta_blocks);
296 ei->i_reserved_meta_blocks = 0; 295 ei->i_reserved_meta_blocks = 0;
297 ei->i_da_metadata_calc_len = 0; 296 ei->i_da_metadata_calc_len = 0;
@@ -300,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
300 299
301 /* Update quota subsystem for data blocks */ 300 /* Update quota subsystem for data blocks */
302 if (quota_claim) 301 if (quota_claim)
303 dquot_claim_block(inode, used); 302 dquot_claim_block(inode, EXT4_C2B(sbi, used));
304 else { 303 else {
305 /* 304 /*
306 * We did fallocate with an offset that is already delayed 305 * We did fallocate with an offset that is already delayed
307 * allocated. So on delayed allocated writeback we should 306 * allocated. So on delayed allocated writeback we should
308 * not re-claim the quota for fallocated blocks. 307 * not re-claim the quota for fallocated blocks.
309 */ 308 */
310 dquot_release_reservation_block(inode, used); 309 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
311 } 310 }
312 311
313 /* 312 /*
@@ -399,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
399} 398}
400 399
401/* 400/*
401 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
402 */
403static void set_buffers_da_mapped(struct inode *inode,
404 struct ext4_map_blocks *map)
405{
406 struct address_space *mapping = inode->i_mapping;
407 struct pagevec pvec;
408 int i, nr_pages;
409 pgoff_t index, end;
410
411 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
412 end = (map->m_lblk + map->m_len - 1) >>
413 (PAGE_CACHE_SHIFT - inode->i_blkbits);
414
415 pagevec_init(&pvec, 0);
416 while (index <= end) {
417 nr_pages = pagevec_lookup(&pvec, mapping, index,
418 min(end - index + 1,
419 (pgoff_t)PAGEVEC_SIZE));
420 if (nr_pages == 0)
421 break;
422 for (i = 0; i < nr_pages; i++) {
423 struct page *page = pvec.pages[i];
424 struct buffer_head *bh, *head;
425
426 if (unlikely(page->mapping != mapping) ||
427 !PageDirty(page))
428 break;
429
430 if (page_has_buffers(page)) {
431 bh = head = page_buffers(page);
432 do {
433 set_buffer_da_mapped(bh);
434 bh = bh->b_this_page;
435 } while (bh != head);
436 }
437 index++;
438 }
439 pagevec_release(&pvec);
440 }
441}
442
443/*
402 * The ext4_map_blocks() function tries to look up the requested blocks, 444 * The ext4_map_blocks() function tries to look up the requested blocks,
403 * and returns if the blocks are already mapped. 445 * and returns if the blocks are already mapped.
404 * 446 *
@@ -416,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
416 * the buffer head is mapped. 458 * the buffer head is mapped.
417 * 459 *
418 * It returns 0 if plain look up failed (blocks have not been allocated), in 460 * It returns 0 if plain look up failed (blocks have not been allocated), in
419 * that casem, buffer head is unmapped 461 * that case, buffer head is unmapped
420 * 462 *
421 * It returns the error in case of allocation failure. 463 * It returns the error in case of allocation failure.
422 */ 464 */
@@ -435,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
435 */ 477 */
436 down_read((&EXT4_I(inode)->i_data_sem)); 478 down_read((&EXT4_I(inode)->i_data_sem));
437 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 479 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
438 retval = ext4_ext_map_blocks(handle, inode, map, 0); 480 retval = ext4_ext_map_blocks(handle, inode, map, flags &
481 EXT4_GET_BLOCKS_KEEP_SIZE);
439 } else { 482 } else {
440 retval = ext4_ind_map_blocks(handle, inode, map, 0); 483 retval = ext4_ind_map_blocks(handle, inode, map, flags &
484 EXT4_GET_BLOCKS_KEEP_SIZE);
441 } 485 }
442 up_read((&EXT4_I(inode)->i_data_sem)); 486 up_read((&EXT4_I(inode)->i_data_sem));
443 487
@@ -455,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
455 * Returns if the blocks have already allocated 499 * Returns if the blocks have already allocated
456 * 500 *
457 * Note that if blocks have been preallocated 501 * Note that if blocks have been preallocated
458 * ext4_ext_get_block() returns th create = 0 502 * ext4_ext_get_block() returns the create = 0
459 * with buffer head unmapped. 503 * with buffer head unmapped.
460 */ 504 */
461 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 505 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
@@ -517,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
517 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 561 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
518 ext4_da_update_reserve_space(inode, retval, 1); 562 ext4_da_update_reserve_space(inode, retval, 1);
519 } 563 }
520 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 564 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
521 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 565 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
522 566
567 /* If we have successfully mapped the delayed allocated blocks,
568 * set the BH_Da_Mapped bit on them. Its important to do this
569 * under the protection of i_data_sem.
570 */
571 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
572 set_buffers_da_mapped(inode, map);
573 }
574
523 up_write((&EXT4_I(inode)->i_data_sem)); 575 up_write((&EXT4_I(inode)->i_data_sem));
524 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
525 int ret = check_block_validity(inode, map); 577 int ret = check_block_validity(inode, map);
@@ -909,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file,
909 ext4_orphan_add(handle, inode); 961 ext4_orphan_add(handle, inode);
910 if (ret2 < 0) 962 if (ret2 < 0)
911 ret = ret2; 963 ret = ret2;
964 } else {
965 unlock_page(page);
966 page_cache_release(page);
912 } 967 }
968
913 ret2 = ext4_journal_stop(handle); 969 ret2 = ext4_journal_stop(handle);
914 if (!ret) 970 if (!ret)
915 ret = ret2; 971 ret = ret2;
@@ -1037,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file,
1037} 1093}
1038 1094
1039/* 1095/*
1040 * Reserve a single block located at lblock 1096 * Reserve a single cluster located at lblock
1041 */ 1097 */
1042static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1098static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1043{ 1099{
1044 int retries = 0; 1100 int retries = 0;
1045 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1101 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1046 struct ext4_inode_info *ei = EXT4_I(inode); 1102 struct ext4_inode_info *ei = EXT4_I(inode);
1047 unsigned long md_needed; 1103 unsigned int md_needed;
1048 int ret; 1104 int ret;
1049 1105
1050 /* 1106 /*
@@ -1054,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1054 */ 1110 */
1055repeat: 1111repeat:
1056 spin_lock(&ei->i_block_reservation_lock); 1112 spin_lock(&ei->i_block_reservation_lock);
1057 md_needed = ext4_calc_metadata_amount(inode, lblock); 1113 md_needed = EXT4_NUM_B2C(sbi,
1114 ext4_calc_metadata_amount(inode, lblock));
1058 trace_ext4_da_reserve_space(inode, md_needed); 1115 trace_ext4_da_reserve_space(inode, md_needed);
1059 spin_unlock(&ei->i_block_reservation_lock); 1116 spin_unlock(&ei->i_block_reservation_lock);
1060 1117
@@ -1063,15 +1120,15 @@ repeat:
1063 * us from metadata over-estimation, though we may go over by 1120 * us from metadata over-estimation, though we may go over by
1064 * a small amount in the end. Here we just reserve for data. 1121 * a small amount in the end. Here we just reserve for data.
1065 */ 1122 */
1066 ret = dquot_reserve_block(inode, 1); 1123 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1067 if (ret) 1124 if (ret)
1068 return ret; 1125 return ret;
1069 /* 1126 /*
1070 * We do still charge estimated metadata to the sb though; 1127 * We do still charge estimated metadata to the sb though;
1071 * we cannot afford to run out of free blocks. 1128 * we cannot afford to run out of free blocks.
1072 */ 1129 */
1073 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1130 if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1074 dquot_release_reservation_block(inode, 1); 1131 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1075 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1132 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1076 yield(); 1133 yield();
1077 goto repeat; 1134 goto repeat;
@@ -1118,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1118 * We can release all of the reserved metadata blocks 1175 * We can release all of the reserved metadata blocks
1119 * only when we have written all of the delayed 1176 * only when we have written all of the delayed
1120 * allocation blocks. 1177 * allocation blocks.
1178 * Note that in case of bigalloc, i_reserved_meta_blocks,
1179 * i_reserved_data_blocks, etc. refer to number of clusters.
1121 */ 1180 */
1122 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1181 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1123 ei->i_reserved_meta_blocks); 1182 ei->i_reserved_meta_blocks);
1124 ei->i_reserved_meta_blocks = 0; 1183 ei->i_reserved_meta_blocks = 0;
1125 ei->i_da_metadata_calc_len = 0; 1184 ei->i_da_metadata_calc_len = 0;
1126 } 1185 }
1127 1186
1128 /* update fs dirty data blocks counter */ 1187 /* update fs dirty data blocks counter */
1129 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1188 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1130 1189
1131 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1190 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1132 1191
1133 dquot_release_reservation_block(inode, to_free); 1192 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1134} 1193}
1135 1194
1136static void ext4_da_page_release_reservation(struct page *page, 1195static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page,
1139 int to_release = 0; 1198 int to_release = 0;
1140 struct buffer_head *head, *bh; 1199 struct buffer_head *head, *bh;
1141 unsigned int curr_off = 0; 1200 unsigned int curr_off = 0;
1201 struct inode *inode = page->mapping->host;
1202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1203 int num_clusters;
1142 1204
1143 head = page_buffers(page); 1205 head = page_buffers(page);
1144 bh = head; 1206 bh = head;
@@ -1148,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page,
1148 if ((offset <= curr_off) && (buffer_delay(bh))) { 1210 if ((offset <= curr_off) && (buffer_delay(bh))) {
1149 to_release++; 1211 to_release++;
1150 clear_buffer_delay(bh); 1212 clear_buffer_delay(bh);
1213 clear_buffer_da_mapped(bh);
1151 } 1214 }
1152 curr_off = next_off; 1215 curr_off = next_off;
1153 } while ((bh = bh->b_this_page) != head); 1216 } while ((bh = bh->b_this_page) != head);
1154 ext4_da_release_space(page->mapping->host, to_release); 1217
1218 /* If we have released all the blocks belonging to a cluster, then we
1219 * need to release the reserved space for that cluster. */
1220 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1221 while (num_clusters > 0) {
1222 ext4_fsblk_t lblk;
1223 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1224 ((num_clusters - 1) << sbi->s_cluster_bits);
1225 if (sbi->s_cluster_ratio == 1 ||
1226 !ext4_find_delalloc_cluster(inode, lblk, 1))
1227 ext4_da_release_space(inode, 1);
1228
1229 num_clusters--;
1230 }
1155} 1231}
1156 1232
1157/* 1233/*
@@ -1253,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1253 clear_buffer_delay(bh); 1329 clear_buffer_delay(bh);
1254 bh->b_blocknr = pblock; 1330 bh->b_blocknr = pblock;
1255 } 1331 }
1332 if (buffer_da_mapped(bh))
1333 clear_buffer_da_mapped(bh);
1256 if (buffer_unwritten(bh) || 1334 if (buffer_unwritten(bh) ||
1257 buffer_mapped(bh)) 1335 buffer_mapped(bh))
1258 BUG_ON(bh->b_blocknr != pblock); 1336 BUG_ON(bh->b_blocknr != pblock);
@@ -1346,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode)
1346{ 1424{
1347 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1425 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1348 printk(KERN_CRIT "Total free blocks count %lld\n", 1426 printk(KERN_CRIT "Total free blocks count %lld\n",
1349 ext4_count_free_blocks(inode->i_sb)); 1427 EXT4_C2B(EXT4_SB(inode->i_sb),
1428 ext4_count_free_clusters(inode->i_sb)));
1350 printk(KERN_CRIT "Free/Dirty block details\n"); 1429 printk(KERN_CRIT "Free/Dirty block details\n");
1351 printk(KERN_CRIT "free_blocks=%lld\n", 1430 printk(KERN_CRIT "free_blocks=%lld\n",
1352 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 1431 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1432 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1353 printk(KERN_CRIT "dirty_blocks=%lld\n", 1433 printk(KERN_CRIT "dirty_blocks=%lld\n",
1354 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1434 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1435 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1355 printk(KERN_CRIT "Block reservation details\n"); 1436 printk(KERN_CRIT "Block reservation details\n");
1356 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1437 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
1357 EXT4_I(inode)->i_reserved_data_blocks); 1438 EXT4_I(inode)->i_reserved_data_blocks);
@@ -1430,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1430 if (err == -EAGAIN) 1511 if (err == -EAGAIN)
1431 goto submit_io; 1512 goto submit_io;
1432 1513
1433 if (err == -ENOSPC && 1514 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1434 ext4_count_free_blocks(sb)) {
1435 mpd->retval = err; 1515 mpd->retval = err;
1436 goto submit_io; 1516 goto submit_io;
1437 } 1517 }
@@ -1471,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1471 1551
1472 for (i = 0; i < map.m_len; i++) 1552 for (i = 0; i < map.m_len; i++)
1473 unmap_underlying_metadata(bdev, map.m_pblk + i); 1553 unmap_underlying_metadata(bdev, map.m_pblk + i);
1474 }
1475 1554
1476 if (ext4_should_order_data(mpd->inode)) { 1555 if (ext4_should_order_data(mpd->inode)) {
1477 err = ext4_jbd2_file_inode(handle, mpd->inode); 1556 err = ext4_jbd2_file_inode(handle, mpd->inode);
1478 if (err) 1557 if (err) {
1479 /* This only happens if the journal is aborted */ 1558 /* Only if the journal is aborted */
1480 return; 1559 mpd->retval = err;
1560 goto submit_io;
1561 }
1562 }
1481 } 1563 }
1482 1564
1483 /* 1565 /*
@@ -1584,6 +1666,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1584} 1666}
1585 1667
1586/* 1668/*
1669 * This function is grabs code from the very beginning of
1670 * ext4_map_blocks, but assumes that the caller is from delayed write
1671 * time. This function looks up the requested blocks and sets the
1672 * buffer delay bit under the protection of i_data_sem.
1673 */
1674static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1675 struct ext4_map_blocks *map,
1676 struct buffer_head *bh)
1677{
1678 int retval;
1679 sector_t invalid_block = ~((sector_t) 0xffff);
1680
1681 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1682 invalid_block = ~0;
1683
1684 map->m_flags = 0;
1685 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1686 "logical block %lu\n", inode->i_ino, map->m_len,
1687 (unsigned long) map->m_lblk);
1688 /*
1689 * Try to see if we can get the block without requesting a new
1690 * file system block.
1691 */
1692 down_read((&EXT4_I(inode)->i_data_sem));
1693 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1694 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1695 else
1696 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1697
1698 if (retval == 0) {
1699 /*
1700 * XXX: __block_prepare_write() unmaps passed block,
1701 * is it OK?
1702 */
1703 /* If the block was allocated from previously allocated cluster,
1704 * then we dont need to reserve it again. */
1705 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
1706 retval = ext4_da_reserve_space(inode, iblock);
1707 if (retval)
1708 /* not enough space to reserve */
1709 goto out_unlock;
1710 }
1711
1712 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1713 * and it should not appear on the bh->b_state.
1714 */
1715 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1716
1717 map_bh(bh, inode->i_sb, invalid_block);
1718 set_buffer_new(bh);
1719 set_buffer_delay(bh);
1720 }
1721
1722out_unlock:
1723 up_read((&EXT4_I(inode)->i_data_sem));
1724
1725 return retval;
1726}
1727
1728/*
1587 * This is a special get_blocks_t callback which is used by 1729 * This is a special get_blocks_t callback which is used by
1588 * ext4_da_write_begin(). It will either return mapped block or 1730 * ext4_da_write_begin(). It will either return mapped block or
1589 * reserve space for a single block. 1731 * reserve space for a single block.
@@ -1600,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1600{ 1742{
1601 struct ext4_map_blocks map; 1743 struct ext4_map_blocks map;
1602 int ret = 0; 1744 int ret = 0;
1603 sector_t invalid_block = ~((sector_t) 0xffff);
1604
1605 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1606 invalid_block = ~0;
1607 1745
1608 BUG_ON(create == 0); 1746 BUG_ON(create == 0);
1609 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1747 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1616,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1616 * preallocated blocks are unmapped but should treated 1754 * preallocated blocks are unmapped but should treated
1617 * the same as allocated blocks. 1755 * the same as allocated blocks.
1618 */ 1756 */
1619 ret = ext4_map_blocks(NULL, inode, &map, 0); 1757 ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1620 if (ret < 0) 1758 if (ret <= 0)
1621 return ret; 1759 return ret;
1622 if (ret == 0) {
1623 if (buffer_delay(bh))
1624 return 0; /* Not sure this could or should happen */
1625 /*
1626 * XXX: __block_write_begin() unmaps passed block, is it OK?
1627 */
1628 ret = ext4_da_reserve_space(inode, iblock);
1629 if (ret)
1630 /* not enough space to reserve */
1631 return ret;
1632
1633 map_bh(bh, inode->i_sb, invalid_block);
1634 set_buffer_new(bh);
1635 set_buffer_delay(bh);
1636 return 0;
1637 }
1638 1760
1639 map_bh(bh, inode->i_sb, map.m_pblk); 1761 map_bh(bh, inode->i_sb, map.m_pblk);
1640 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1762 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -1811,8 +1933,12 @@ static int ext4_writepage(struct page *page,
1811 * We don't want to do block allocation, so redirty 1933 * We don't want to do block allocation, so redirty
1812 * the page and return. We may reach here when we do 1934 * the page and return. We may reach here when we do
1813 * a journal commit via journal_submit_inode_data_buffers. 1935 * a journal commit via journal_submit_inode_data_buffers.
1814 * We can also reach here via shrink_page_list 1936 * We can also reach here via shrink_page_list but it
1937 * should never be for direct reclaim so warn if that
1938 * happens
1815 */ 1939 */
1940 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1941 PF_MEMALLOC);
1816 goto redirty_page; 1942 goto redirty_page;
1817 } 1943 }
1818 if (commit_write) 1944 if (commit_write)
@@ -2046,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2046 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2172 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2047 pgoff_t done_index = 0; 2173 pgoff_t done_index = 0;
2048 pgoff_t end; 2174 pgoff_t end;
2175 struct blk_plug plug;
2049 2176
2050 trace_ext4_da_writepages(inode, wbc); 2177 trace_ext4_da_writepages(inode, wbc);
2051 2178
@@ -2124,6 +2251,7 @@ retry:
2124 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2251 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2125 tag_pages_for_writeback(mapping, index, end); 2252 tag_pages_for_writeback(mapping, index, end);
2126 2253
2254 blk_start_plug(&plug);
2127 while (!ret && wbc->nr_to_write > 0) { 2255 while (!ret && wbc->nr_to_write > 0) {
2128 2256
2129 /* 2257 /*
@@ -2174,11 +2302,12 @@ retry:
2174 ret = 0; 2302 ret = 0;
2175 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2303 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2176 /* 2304 /*
2177 * got one extent now try with 2305 * Got one extent now try with rest of the pages.
2178 * rest of the pages 2306 * If mpd.retval is set -EIO, journal is aborted.
2307 * So we don't need to write any more.
2179 */ 2308 */
2180 pages_written += mpd.pages_written; 2309 pages_written += mpd.pages_written;
2181 ret = 0; 2310 ret = mpd.retval;
2182 io_done = 1; 2311 io_done = 1;
2183 } else if (wbc->nr_to_write) 2312 } else if (wbc->nr_to_write)
2184 /* 2313 /*
@@ -2188,6 +2317,7 @@ retry:
2188 */ 2317 */
2189 break; 2318 break;
2190 } 2319 }
2320 blk_finish_plug(&plug);
2191 if (!io_done && !cycled) { 2321 if (!io_done && !cycled) {
2192 cycled = 1; 2322 cycled = 1;
2193 index = 0; 2323 index = 0;
@@ -2226,10 +2356,11 @@ static int ext4_nonda_switch(struct super_block *sb)
2226 * Delalloc need an accurate free block accounting. So switch 2356 * Delalloc need an accurate free block accounting. So switch
2227 * to non delalloc when we are near to error range. 2357 * to non delalloc when we are near to error range.
2228 */ 2358 */
2229 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2359 free_blocks = EXT4_C2B(sbi,
2230 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2360 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2361 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2231 if (2 * free_blocks < 3 * dirty_blocks || 2362 if (2 * free_blocks < 3 * dirty_blocks ||
2232 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2363 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2233 /* 2364 /*
2234 * free block count is less than 150% of dirty blocks 2365 * free block count is less than 150% of dirty blocks
2235 * or free blocks is less than watermark 2366 * or free blocks is less than watermark
@@ -2241,7 +2372,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2241 * start pushing delalloc when 1/2 of free blocks are dirty. 2372 * start pushing delalloc when 1/2 of free blocks are dirty.
2242 */ 2373 */
2243 if (free_blocks < 2 * dirty_blocks) 2374 if (free_blocks < 2 * dirty_blocks)
2244 writeback_inodes_sb_if_idle(sb); 2375 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2245 2376
2246 return 0; 2377 return 0;
2247} 2378}
@@ -2255,6 +2386,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 pgoff_t index; 2386 pgoff_t index;
2256 struct inode *inode = mapping->host; 2387 struct inode *inode = mapping->host;
2257 handle_t *handle; 2388 handle_t *handle;
2389 loff_t page_len;
2258 2390
2259 index = pos >> PAGE_CACHE_SHIFT; 2391 index = pos >> PAGE_CACHE_SHIFT;
2260 2392
@@ -2301,6 +2433,13 @@ retry:
2301 */ 2433 */
2302 if (pos + len > inode->i_size) 2434 if (pos + len > inode->i_size)
2303 ext4_truncate_failed_write(inode); 2435 ext4_truncate_failed_write(inode);
2436 } else {
2437 page_len = pos & (PAGE_CACHE_SIZE - 1);
2438 if (page_len > 0) {
2439 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2440 inode, page, pos - page_len, page_len,
2441 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2442 }
2304 } 2443 }
2305 2444
2306 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2445 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2343,6 +2482,7 @@ static int ext4_da_write_end(struct file *file,
2343 loff_t new_i_size; 2482 loff_t new_i_size;
2344 unsigned long start, end; 2483 unsigned long start, end;
2345 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2485 loff_t page_len;
2346 2486
2347 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2487 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2348 if (ext4_should_order_data(inode)) { 2488 if (ext4_should_order_data(inode)) {
@@ -2391,6 +2531,16 @@ static int ext4_da_write_end(struct file *file,
2391 } 2531 }
2392 ret2 = generic_write_end(file, mapping, pos, len, copied, 2532 ret2 = generic_write_end(file, mapping, pos, len, copied,
2393 page, fsdata); 2533 page, fsdata);
2534
2535 page_len = PAGE_CACHE_SIZE -
2536 ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
2537
2538 if (page_len > 0) {
2539 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2540 inode, page, pos + copied - 1, page_len,
2541 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2542 }
2543
2394 copied = ret2; 2544 copied = ret2;
2395 if (ret2 < 0) 2545 if (ret2 < 0)
2396 ret = ret2; 2546 ret = ret2;
@@ -2685,10 +2835,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2685 * but being more careful is always safe for the future change. 2835 * but being more careful is always safe for the future change.
2686 */ 2836 */
2687 inode = io_end->inode; 2837 inode = io_end->inode;
2688 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 2838 ext4_set_io_unwritten_flag(inode, io_end);
2689 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2690 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2691 }
2692 2839
2693 /* Add the io_end to per-inode completed io list*/ 2840 /* Add the io_end to per-inode completed io list*/
2694 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2841 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -2854,6 +3001,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2854 struct inode *inode = file->f_mapping->host; 3001 struct inode *inode = file->f_mapping->host;
2855 ssize_t ret; 3002 ssize_t ret;
2856 3003
3004 /*
3005 * If we are doing data journalling we don't support O_DIRECT
3006 */
3007 if (ext4_should_journal_data(inode))
3008 return 0;
3009
2857 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3010 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
2858 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3011 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2859 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3012 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -2923,6 +3076,7 @@ static const struct address_space_operations ext4_journalled_aops = {
2923 .bmap = ext4_bmap, 3076 .bmap = ext4_bmap,
2924 .invalidatepage = ext4_invalidatepage, 3077 .invalidatepage = ext4_invalidatepage,
2925 .releasepage = ext4_releasepage, 3078 .releasepage = ext4_releasepage,
3079 .direct_IO = ext4_direct_IO,
2926 .is_partially_uptodate = block_is_partially_uptodate, 3080 .is_partially_uptodate = block_is_partially_uptodate,
2927 .error_remove_page = generic_error_remove_page, 3081 .error_remove_page = generic_error_remove_page,
2928}; 3082};
@@ -2959,6 +3113,227 @@ void ext4_set_aops(struct inode *inode)
2959 inode->i_mapping->a_ops = &ext4_journalled_aops; 3113 inode->i_mapping->a_ops = &ext4_journalled_aops;
2960} 3114}
2961 3115
3116
3117/*
3118 * ext4_discard_partial_page_buffers()
3119 * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
3120 * This function finds and locks the page containing the offset
3121 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
3122 * Calling functions that already have the page locked should call
3123 * ext4_discard_partial_page_buffers_no_lock directly.
3124 */
3125int ext4_discard_partial_page_buffers(handle_t *handle,
3126 struct address_space *mapping, loff_t from,
3127 loff_t length, int flags)
3128{
3129 struct inode *inode = mapping->host;
3130 struct page *page;
3131 int err = 0;
3132
3133 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3134 mapping_gfp_mask(mapping) & ~__GFP_FS);
3135 if (!page)
3136 return -ENOMEM;
3137
3138 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3139 from, length, flags);
3140
3141 unlock_page(page);
3142 page_cache_release(page);
3143 return err;
3144}
3145
3146/*
3147 * ext4_discard_partial_page_buffers_no_lock()
3148 * Zeros a page range of length 'length' starting from offset 'from'.
3149 * Buffer heads that correspond to the block aligned regions of the
3150 * zeroed range will be unmapped. Unblock aligned regions
3151 * will have the corresponding buffer head mapped if needed so that
3152 * that region of the page can be updated with the partial zero out.
3153 *
3154 * This function assumes that the page has already been locked. The
3155 * The range to be discarded must be contained with in the given page.
3156 * If the specified range exceeds the end of the page it will be shortened
3157 * to the end of the page that corresponds to 'from'. This function is
3158 * appropriate for updating a page and it buffer heads to be unmapped and
3159 * zeroed for blocks that have been either released, or are going to be
3160 * released.
3161 *
3162 * handle: The journal handle
3163 * inode: The files inode
3164 * page: A locked page that contains the offset "from"
3165 * from: The starting byte offset (from the begining of the file)
3166 * to begin discarding
3167 * len: The length of bytes to discard
3168 * flags: Optional flags that may be used:
3169 *
3170 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3171 * Only zero the regions of the page whose buffer heads
3172 * have already been unmapped. This flag is appropriate
3173 * for updateing the contents of a page whose blocks may
3174 * have already been released, and we only want to zero
3175 * out the regions that correspond to those released blocks.
3176 *
3177 * Returns zero on sucess or negative on failure.
3178 */
3179int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3180 struct inode *inode, struct page *page, loff_t from,
3181 loff_t length, int flags)
3182{
3183 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3184 unsigned int offset = from & (PAGE_CACHE_SIZE-1);
3185 unsigned int blocksize, max, pos;
3186 ext4_lblk_t iblock;
3187 struct buffer_head *bh;
3188 int err = 0;
3189
3190 blocksize = inode->i_sb->s_blocksize;
3191 max = PAGE_CACHE_SIZE - offset;
3192
3193 if (index != page->index)
3194 return -EINVAL;
3195
3196 /*
3197 * correct length if it does not fall between
3198 * 'from' and the end of the page
3199 */
3200 if (length > max || length < 0)
3201 length = max;
3202
3203 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3204
3205 if (!page_has_buffers(page)) {
3206 /*
3207 * If the range to be discarded covers a partial block
3208 * we need to get the page buffers. This is because
3209 * partial blocks cannot be released and the page needs
3210 * to be updated with the contents of the block before
3211 * we write the zeros on top of it.
3212 */
3213 if ((from & (blocksize - 1)) ||
3214 ((from + length) & (blocksize - 1))) {
3215 create_empty_buffers(page, blocksize, 0);
3216 } else {
3217 /*
3218 * If there are no partial blocks,
3219 * there is nothing to update,
3220 * so we can return now
3221 */
3222 return 0;
3223 }
3224 }
3225
3226 /* Find the buffer that contains "offset" */
3227 bh = page_buffers(page);
3228 pos = blocksize;
3229 while (offset >= pos) {
3230 bh = bh->b_this_page;
3231 iblock++;
3232 pos += blocksize;
3233 }
3234
3235 pos = offset;
3236 while (pos < offset + length) {
3237 unsigned int end_of_block, range_to_discard;
3238
3239 err = 0;
3240
3241 /* The length of space left to zero and unmap */
3242 range_to_discard = offset + length - pos;
3243
3244 /* The length of space until the end of the block */
3245 end_of_block = blocksize - (pos & (blocksize-1));
3246
3247 /*
3248 * Do not unmap or zero past end of block
3249 * for this buffer head
3250 */
3251 if (range_to_discard > end_of_block)
3252 range_to_discard = end_of_block;
3253
3254
3255 /*
3256 * Skip this buffer head if we are only zeroing unampped
3257 * regions of the page
3258 */
3259 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3260 buffer_mapped(bh))
3261 goto next;
3262
3263 /* If the range is block aligned, unmap */
3264 if (range_to_discard == blocksize) {
3265 clear_buffer_dirty(bh);
3266 bh->b_bdev = NULL;
3267 clear_buffer_mapped(bh);
3268 clear_buffer_req(bh);
3269 clear_buffer_new(bh);
3270 clear_buffer_delay(bh);
3271 clear_buffer_unwritten(bh);
3272 clear_buffer_uptodate(bh);
3273 zero_user(page, pos, range_to_discard);
3274 BUFFER_TRACE(bh, "Buffer discarded");
3275 goto next;
3276 }
3277
3278 /*
3279 * If this block is not completely contained in the range
3280 * to be discarded, then it is not going to be released. Because
3281 * we need to keep this block, we need to make sure this part
3282 * of the page is uptodate before we modify it by writeing
3283 * partial zeros on it.
3284 */
3285 if (!buffer_mapped(bh)) {
3286 /*
3287 * Buffer head must be mapped before we can read
3288 * from the block
3289 */
3290 BUFFER_TRACE(bh, "unmapped");
3291 ext4_get_block(inode, iblock, bh, 0);
3292 /* unmapped? It's a hole - nothing to do */
3293 if (!buffer_mapped(bh)) {
3294 BUFFER_TRACE(bh, "still unmapped");
3295 goto next;
3296 }
3297 }
3298
3299 /* Ok, it's mapped. Make sure it's up-to-date */
3300 if (PageUptodate(page))
3301 set_buffer_uptodate(bh);
3302
3303 if (!buffer_uptodate(bh)) {
3304 err = -EIO;
3305 ll_rw_block(READ, 1, &bh);
3306 wait_on_buffer(bh);
3307 /* Uhhuh. Read error. Complain and punt.*/
3308 if (!buffer_uptodate(bh))
3309 goto next;
3310 }
3311
3312 if (ext4_should_journal_data(inode)) {
3313 BUFFER_TRACE(bh, "get write access");
3314 err = ext4_journal_get_write_access(handle, bh);
3315 if (err)
3316 goto next;
3317 }
3318
3319 zero_user(page, pos, range_to_discard);
3320
3321 err = 0;
3322 if (ext4_should_journal_data(inode)) {
3323 err = ext4_handle_dirty_metadata(handle, inode, bh);
3324 } else
3325 mark_buffer_dirty(bh);
3326
3327 BUFFER_TRACE(bh, "Partial buffer zeroed");
3328next:
3329 bh = bh->b_this_page;
3330 iblock++;
3331 pos += range_to_discard;
3332 }
3333
3334 return err;
3335}
3336
2962/* 3337/*
2963 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3338 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
2964 * up to the end of the block which corresponds to `from'. 3339 * up to the end of the block which corresponds to `from'.
@@ -3001,7 +3376,7 @@ int ext4_block_zero_page_range(handle_t *handle,
3001 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3376 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3002 mapping_gfp_mask(mapping) & ~__GFP_FS); 3377 mapping_gfp_mask(mapping) & ~__GFP_FS);
3003 if (!page) 3378 if (!page)
3004 return -EINVAL; 3379 return -ENOMEM;
3005 3380
3006 blocksize = inode->i_sb->s_blocksize; 3381 blocksize = inode->i_sb->s_blocksize;
3007 max = blocksize - (offset & (blocksize - 1)); 3382 max = blocksize - (offset & (blocksize - 1));
@@ -3070,11 +3445,8 @@ int ext4_block_zero_page_range(handle_t *handle,
3070 err = 0; 3445 err = 0;
3071 if (ext4_should_journal_data(inode)) { 3446 if (ext4_should_journal_data(inode)) {
3072 err = ext4_handle_dirty_metadata(handle, inode, bh); 3447 err = ext4_handle_dirty_metadata(handle, inode, bh);
3073 } else { 3448 } else
3074 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
3075 err = ext4_jbd2_file_inode(handle, inode);
3076 mark_buffer_dirty(bh); 3449 mark_buffer_dirty(bh);
3077 }
3078 3450
3079unlock: 3451unlock:
3080 unlock_page(page); 3452 unlock_page(page);
@@ -3115,6 +3487,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3115 return -ENOTSUPP; 3487 return -ENOTSUPP;
3116 } 3488 }
3117 3489
3490 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3491 /* TODO: Add support for bigalloc file systems */
3492 return -ENOTSUPP;
3493 }
3494
3118 return ext4_ext_punch_hole(file, offset, length); 3495 return ext4_ext_punch_hole(file, offset, length);
3119} 3496}
3120 3497
@@ -3414,7 +3791,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3414 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 3791 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3415 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 3792 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3416 } 3793 }
3417 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 3794 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3418 3795
3419 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3796 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
3420 ei->i_dir_start_lookup = 0; 3797 ei->i_dir_start_lookup = 0;
@@ -4416,6 +4793,7 @@ retry_alloc:
4416 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4793 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4417 unlock_page(page); 4794 unlock_page(page);
4418 ret = VM_FAULT_SIGBUS; 4795 ret = VM_FAULT_SIGBUS;
4796 ext4_journal_stop(handle);
4419 goto out; 4797 goto out;
4420 } 4798 }
4421 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 4799 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f18bfe37aff8..a56796814d6a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -21,6 +21,7 @@
21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22{ 22{
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct super_block *sb = inode->i_sb;
24 struct ext4_inode_info *ei = EXT4_I(inode); 25 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 26 unsigned int flags;
26 27
@@ -173,33 +174,8 @@ setversion_out:
173 mnt_drop_write(filp->f_path.mnt); 174 mnt_drop_write(filp->f_path.mnt);
174 return err; 175 return err;
175 } 176 }
176#ifdef CONFIG_JBD2_DEBUG
177 case EXT4_IOC_WAIT_FOR_READONLY:
178 /*
179 * This is racy - by the time we're woken up and running,
180 * the superblock could be released. And the module could
181 * have been unloaded. So sue me.
182 *
183 * Returns 1 if it slept, else zero.
184 */
185 {
186 struct super_block *sb = inode->i_sb;
187 DECLARE_WAITQUEUE(wait, current);
188 int ret = 0;
189
190 set_current_state(TASK_INTERRUPTIBLE);
191 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
192 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
193 schedule();
194 ret = 1;
195 }
196 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
197 return ret;
198 }
199#endif
200 case EXT4_IOC_GROUP_EXTEND: { 177 case EXT4_IOC_GROUP_EXTEND: {
201 ext4_fsblk_t n_blocks_count; 178 ext4_fsblk_t n_blocks_count;
202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 179 int err, err2=0;
204 180
205 err = ext4_resize_begin(sb); 181 err = ext4_resize_begin(sb);
@@ -209,6 +185,13 @@ setversion_out:
209 if (get_user(n_blocks_count, (__u32 __user *)arg)) 185 if (get_user(n_blocks_count, (__u32 __user *)arg))
210 return -EFAULT; 186 return -EFAULT;
211 187
188 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
189 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
190 ext4_msg(sb, KERN_ERR,
191 "Online resizing not supported with bigalloc");
192 return -EOPNOTSUPP;
193 }
194
212 err = mnt_want_write(filp->f_path.mnt); 195 err = mnt_want_write(filp->f_path.mnt);
213 if (err) 196 if (err)
214 return err; 197 return err;
@@ -250,6 +233,13 @@ setversion_out:
250 goto mext_out; 233 goto mext_out;
251 } 234 }
252 235
236 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
237 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
238 ext4_msg(sb, KERN_ERR,
239 "Online defrag not supported with bigalloc");
240 return -EOPNOTSUPP;
241 }
242
253 err = mnt_want_write(filp->f_path.mnt); 243 err = mnt_want_write(filp->f_path.mnt);
254 if (err) 244 if (err)
255 goto mext_out; 245 goto mext_out;
@@ -270,7 +260,6 @@ mext_out:
270 260
271 case EXT4_IOC_GROUP_ADD: { 261 case EXT4_IOC_GROUP_ADD: {
272 struct ext4_new_group_data input; 262 struct ext4_new_group_data input;
273 struct super_block *sb = inode->i_sb;
274 int err, err2=0; 263 int err, err2=0;
275 264
276 err = ext4_resize_begin(sb); 265 err = ext4_resize_begin(sb);
@@ -281,6 +270,13 @@ mext_out:
281 sizeof(input))) 270 sizeof(input)))
282 return -EFAULT; 271 return -EFAULT;
283 272
273 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
274 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
275 ext4_msg(sb, KERN_ERR,
276 "Online resizing not supported with bigalloc");
277 return -EOPNOTSUPP;
278 }
279
284 err = mnt_want_write(filp->f_path.mnt); 280 err = mnt_want_write(filp->f_path.mnt);
285 if (err) 281 if (err)
286 return err; 282 return err;
@@ -337,7 +333,6 @@ mext_out:
337 333
338 case FITRIM: 334 case FITRIM:
339 { 335 {
340 struct super_block *sb = inode->i_sb;
341 struct request_queue *q = bdev_get_queue(sb->s_bdev); 336 struct request_queue *q = bdev_get_queue(sb->s_bdev);
342 struct fstrim_range range; 337 struct fstrim_range range;
343 int ret = 0; 338 int ret = 0;
@@ -348,7 +343,14 @@ mext_out:
348 if (!blk_queue_discard(q)) 343 if (!blk_queue_discard(q))
349 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
350 345
351 if (copy_from_user(&range, (struct fstrim_range *)arg, 346 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
347 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
348 ext4_msg(sb, KERN_ERR,
349 "FITRIM not supported with bigalloc");
350 return -EOPNOTSUPP;
351 }
352
353 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
352 sizeof(range))) 354 sizeof(range)))
353 return -EFAULT; 355 return -EFAULT;
354 356
@@ -358,7 +360,7 @@ mext_out:
358 if (ret < 0) 360 if (ret < 0)
359 return ret; 361 return ret;
360 362
361 if (copy_to_user((struct fstrim_range *)arg, &range, 363 if (copy_to_user((struct fstrim_range __user *)arg, &range,
362 sizeof(range))) 364 sizeof(range)))
363 return -EFAULT; 365 return -EFAULT;
364 366
@@ -396,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
396 case EXT4_IOC32_SETVERSION_OLD: 398 case EXT4_IOC32_SETVERSION_OLD:
397 cmd = EXT4_IOC_SETVERSION_OLD; 399 cmd = EXT4_IOC_SETVERSION_OLD;
398 break; 400 break;
399#ifdef CONFIG_JBD2_DEBUG
400 case EXT4_IOC32_WAIT_FOR_READONLY:
401 cmd = EXT4_IOC_WAIT_FOR_READONLY;
402 break;
403#endif
404 case EXT4_IOC32_GETRSVSZ: 401 case EXT4_IOC32_GETRSVSZ:
405 cmd = EXT4_IOC_GETRSVSZ; 402 cmd = EXT4_IOC_GETRSVSZ;
406 break; 403 break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57c415a..e2d8be8f28bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -70,8 +70,8 @@
70 * 70 *
71 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
72 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
73 * pa_len -> length for this prealloc space 73 * pa_len -> length for this prealloc space (in clusters)
74 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space (in clusters)
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
@@ -126,7 +126,8 @@
126 * list. In case of inode preallocation we follow a list of heuristics 126 * list. In case of inode preallocation we follow a list of heuristics
127 * based on file size. This can be found in ext4_mb_normalize_request. If 127 * based on file size. This can be found in ext4_mb_normalize_request. If
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
130 * dependent on the cluster size; for non-bigalloc file systems, it is
130 * 512 blocks. This can be tuned via 131 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in 132 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 133 * terms of number of blocks. If we have mounted the file system with -O
@@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
459 ext4_fsblk_t blocknr; 460 ext4_fsblk_t blocknr;
460 461
461 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 462 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
462 blocknr += first + i; 463 blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
463 ext4_grp_locked_error(sb, e4b->bd_group, 464 ext4_grp_locked_error(sb, e4b->bd_group,
464 inode ? inode->i_ino : 0, 465 inode ? inode->i_ino : 0,
465 blocknr, 466 blocknr,
@@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
580 continue; 581 continue;
581 } 582 }
582 583
583 /* both bits in buddy2 must be 0 */ 584 /* both bits in buddy2 must be 1 */
584 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 585 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
585 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 586 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
586 587
@@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
653 ext4_grpblk_t chunk; 654 ext4_grpblk_t chunk;
654 unsigned short border; 655 unsigned short border;
655 656
656 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 657 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
657 658
658 border = 2 << sb->s_blocksize_bits; 659 border = 2 << sb->s_blocksize_bits;
659 660
@@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
705 void *buddy, void *bitmap, ext4_group_t group) 706 void *buddy, void *bitmap, ext4_group_t group)
706{ 707{
707 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 708 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
708 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); 709 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
709 ext4_grpblk_t i = 0; 710 ext4_grpblk_t i = 0;
710 ext4_grpblk_t first; 711 ext4_grpblk_t first;
711 ext4_grpblk_t len; 712 ext4_grpblk_t len;
@@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
734 735
735 if (free != grp->bb_free) { 736 if (free != grp->bb_free) {
736 ext4_grp_locked_error(sb, group, 0, 0, 737 ext4_grp_locked_error(sb, group, 0, 0,
737 "%u blocks in bitmap, %u in gd", 738 "%u clusters in bitmap, %u in gd",
738 free, grp->bb_free); 739 free, grp->bb_free);
739 /* 740 /*
740 * If we intent to continue, we consider group descritor 741 * If we intent to continue, we consider group descritor
@@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1339 ext4_fsblk_t blocknr; 1340 ext4_fsblk_t blocknr;
1340 1341
1341 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1342 blocknr += block; 1343 blocknr += EXT4_C2B(EXT4_SB(sb), block);
1343 ext4_grp_locked_error(sb, e4b->bd_group, 1344 ext4_grp_locked_error(sb, e4b->bd_group,
1344 inode ? inode->i_ino : 0, 1345 inode ? inode->i_ino : 0,
1345 blocknr, 1346 blocknr,
@@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1390{ 1391{
1391 int next = block; 1392 int next = block;
1392 int max; 1393 int max;
1393 int ord;
1394 void *buddy; 1394 void *buddy;
1395 1395
1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1433 break; 1433 break;
1434 1434
1435 ord = mb_find_order_for_block(e4b, next); 1435 order = mb_find_order_for_block(e4b, next);
1436 1436
1437 order = ord;
1438 block = next >> order; 1437 block = next >> order;
1439 ex->fe_len += 1 << order; 1438 ex->fe_len += 1 << order;
1440 } 1439 }
@@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1624 struct ext4_free_extent *gex = &ac->ac_g_ex; 1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1625 1624
1626 BUG_ON(ex->fe_len <= 0); 1625 BUG_ON(ex->fe_len <= 0);
1627 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1626 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1627 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1629 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1630 1629
1631 ac->ac_found++; 1630 ac->ac_found++;
@@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1823 1822
1824 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1823 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1825 i = mb_find_next_zero_bit(bitmap, 1824 i = mb_find_next_zero_bit(bitmap,
1826 EXT4_BLOCKS_PER_GROUP(sb), i); 1825 EXT4_CLUSTERS_PER_GROUP(sb), i);
1827 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1826 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1828 /* 1827 /*
1829 * IF we have corrupt bitmap, we won't find any 1828 * IF we have corrupt bitmap, we won't find any
1830 * free blocks even though group info says we 1829 * free blocks even though group info says we
1831 * we have free blocks 1830 * we have free blocks
1832 */ 1831 */
1833 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1832 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1834 "%d free blocks as per " 1833 "%d free clusters as per "
1835 "group info. But bitmap says 0", 1834 "group info. But bitmap says 0",
1836 free); 1835 free);
1837 break; 1836 break;
@@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1841 BUG_ON(ex.fe_len <= 0); 1840 BUG_ON(ex.fe_len <= 0);
1842 if (free < ex.fe_len) { 1841 if (free < ex.fe_len) {
1843 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1842 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1844 "%d free blocks as per " 1843 "%d free clusters as per "
1845 "group info. But got %d blocks", 1844 "group info. But got %d blocks",
1846 free, ex.fe_len); 1845 free, ex.fe_len);
1847 /* 1846 /*
@@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1887 do_div(a, sbi->s_stripe); 1886 do_div(a, sbi->s_stripe);
1888 i = (a * sbi->s_stripe) - first_group_block; 1887 i = (a * sbi->s_stripe) - first_group_block;
1889 1888
1890 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1889 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1891 if (!mb_test_bit(i, bitmap)) { 1890 if (!mb_test_bit(i, bitmap)) {
1892 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1891 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1893 if (max >= sbi->s_stripe) { 1892 if (max >= sbi->s_stripe) {
@@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2252 */ 2251 */
2253 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2252 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2254 meta_group_info[i]->bb_free = 2253 meta_group_info[i]->bb_free =
2255 ext4_free_blocks_after_init(sb, group, desc); 2254 ext4_free_clusters_after_init(sb, group, desc);
2256 } else { 2255 } else {
2257 meta_group_info[i]->bb_free = 2256 meta_group_info[i]->bb_free =
2258 ext4_free_blks_count(sb, desc); 2257 ext4_free_group_clusters(sb, desc);
2259 } 2258 }
2260 2259
2261 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2260 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2473 sbi->s_mb_stats = MB_DEFAULT_STATS; 2472 sbi->s_mb_stats = MB_DEFAULT_STATS;
2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2473 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2474 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2475 /*
2476 * The default group preallocation is 512, which for 4k block
2477 * sizes translates to 2 megabytes. However for bigalloc file
2478 * systems, this is probably too big (i.e, if the cluster size
2479 * is 1 megabyte, then group preallocation size becomes half a
2480 * gigabyte!). As a default, we will keep a two megabyte
2481 * group pralloc size for cluster sizes up to 64k, and after
2482 * that, we will force a minimum group preallocation size of
2483 * 32 clusters. This translates to 8 megs when the cluster
2484 * size is 256k, and 32 megs when the cluster size is 1 meg,
2485 * which seems reasonable as a default.
2486 */
2487 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2488 sbi->s_cluster_bits, 32);
2477 /* 2489 /*
2478 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 2490 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than 2491 * to the lowest multiple of s_stripe which is bigger than
@@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2502 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2491 if (sbi->s_locality_groups == NULL) { 2503 if (sbi->s_locality_groups == NULL) {
2492 ret = -ENOMEM; 2504 ret = -ENOMEM;
2493 goto out; 2505 goto out_free_groupinfo_slab;
2494 } 2506 }
2495 for_each_possible_cpu(i) { 2507 for_each_possible_cpu(i) {
2496 struct ext4_locality_group *lg; 2508 struct ext4_locality_group *lg;
@@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2503 2515
2504 /* init file for buddy data */ 2516 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) { 2518 if (ret != 0)
2507 goto out; 2519 goto out_free_locality_groups;
2508 }
2509 2520
2510 if (sbi->s_proc) 2521 if (sbi->s_proc)
2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
@@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2513 2524
2514 if (sbi->s_journal) 2525 if (sbi->s_journal)
2515 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0;
2529
2530out_free_locality_groups:
2531 free_percpu(sbi->s_locality_groups);
2532 sbi->s_locality_groups = NULL;
2533out_free_groupinfo_slab:
2534 ext4_groupinfo_destroy_slabs();
2516out: 2535out:
2517 if (ret) { 2536 kfree(sbi->s_mb_offsets);
2518 kfree(sbi->s_mb_offsets); 2537 sbi->s_mb_offsets = NULL;
2519 kfree(sbi->s_mb_maxs); 2538 kfree(sbi->s_mb_maxs);
2520 } 2539 sbi->s_mb_maxs = NULL;
2521 return ret; 2540 return ret;
2522} 2541}
2523 2542
@@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb)
2602} 2621}
2603 2622
2604static inline int ext4_issue_discard(struct super_block *sb, 2623static inline int ext4_issue_discard(struct super_block *sb,
2605 ext4_group_t block_group, ext4_grpblk_t block, int count) 2624 ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2606{ 2625{
2607 ext4_fsblk_t discard_block; 2626 ext4_fsblk_t discard_block;
2608 2627
2609 discard_block = block + ext4_group_first_block_no(sb, block_group); 2628 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2629 ext4_group_first_block_no(sb, block_group));
2630 count = EXT4_C2B(EXT4_SB(sb), count);
2610 trace_ext4_discard_blocks(sb, 2631 trace_ext4_discard_blocks(sb,
2611 (unsigned long long) discard_block, count); 2632 (unsigned long long) discard_block, count);
2612 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2633 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
@@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2633 2654
2634 if (test_opt(sb, DISCARD)) 2655 if (test_opt(sb, DISCARD))
2635 ext4_issue_discard(sb, entry->group, 2656 ext4_issue_discard(sb, entry->group,
2636 entry->start_blk, entry->count); 2657 entry->start_cluster, entry->count);
2637 2658
2638 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2639 /* we expect to find existing buddy because it's pinned */ 2660 /* we expect to find existing buddy because it's pinned */
@@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2646 ext4_lock_group(sb, entry->group); 2667 ext4_lock_group(sb, entry->group);
2647 /* Take it out of per group rb tree */ 2668 /* Take it out of per group rb tree */
2648 rb_erase(&entry->node, &(db->bb_free_root)); 2669 rb_erase(&entry->node, &(db->bb_free_root));
2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
2650 2671
2651 /* 2672 /*
2652 * Clear the trimmed flag for the group so that the next 2673 * Clear the trimmed flag for the group so that the next
@@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void)
2752 */ 2773 */
2753static noinline_for_stack int 2774static noinline_for_stack int
2754ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2775ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2755 handle_t *handle, unsigned int reserv_blks) 2776 handle_t *handle, unsigned int reserv_clstrs)
2756{ 2777{
2757 struct buffer_head *bitmap_bh = NULL; 2778 struct buffer_head *bitmap_bh = NULL;
2758 struct ext4_group_desc *gdp; 2779 struct ext4_group_desc *gdp;
@@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2783 goto out_err; 2804 goto out_err;
2784 2805
2785 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2806 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2786 ext4_free_blks_count(sb, gdp)); 2807 ext4_free_group_clusters(sb, gdp));
2787 2808
2788 err = ext4_journal_get_write_access(handle, gdp_bh); 2809 err = ext4_journal_get_write_access(handle, gdp_bh);
2789 if (err) 2810 if (err)
@@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2791 2812
2792 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2813 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2793 2814
2794 len = ac->ac_b_ex.fe_len; 2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2795 if (!ext4_data_block_valid(sbi, block, len)) { 2816 if (!ext4_data_block_valid(sbi, block, len)) {
2796 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2797 "fs metadata\n", block, block+len); 2818 "fs metadata\n", block, block+len);
@@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2823 ac->ac_b_ex.fe_len); 2844 ac->ac_b_ex.fe_len);
2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2845 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2846 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2826 ext4_free_blks_set(sb, gdp, 2847 ext4_free_group_clusters_set(sb, gdp,
2827 ext4_free_blocks_after_init(sb, 2848 ext4_free_clusters_after_init(sb,
2828 ac->ac_b_ex.fe_group, gdp)); 2849 ac->ac_b_ex.fe_group, gdp));
2829 } 2850 }
2830 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 2851 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2831 ext4_free_blks_set(sb, gdp, len); 2852 ext4_free_group_clusters_set(sb, gdp, len);
2832 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2853 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2833 2854
2834 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2855 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2835 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2856 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2836 /* 2857 /*
2837 * Now reduce the dirty block count also. Should not go negative 2858 * Now reduce the dirty block count also. Should not go negative
2838 */ 2859 */
2839 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2860 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2840 /* release all the reserved blocks if non delalloc */ 2861 /* release all the reserved blocks if non delalloc */
2841 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2862 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2863 reserv_clstrs);
2842 2864
2843 if (sbi->s_log_groups_per_flex) { 2865 if (sbi->s_log_groups_per_flex) {
2844 ext4_group_t flex_group = ext4_flex_group(sbi, 2866 ext4_group_t flex_group = ext4_flex_group(sbi,
2845 ac->ac_b_ex.fe_group); 2867 ac->ac_b_ex.fe_group);
2846 atomic_sub(ac->ac_b_ex.fe_len, 2868 atomic_sub(ac->ac_b_ex.fe_len,
2847 &sbi->s_flex_groups[flex_group].free_blocks); 2869 &sbi->s_flex_groups[flex_group].free_clusters);
2848 } 2870 }
2849 2871
2850 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2872 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2886,6 +2908,7 @@ static noinline_for_stack void
2886ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2908ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2887 struct ext4_allocation_request *ar) 2909 struct ext4_allocation_request *ar)
2888{ 2910{
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2889 int bsbits, max; 2912 int bsbits, max;
2890 ext4_lblk_t end; 2913 ext4_lblk_t end;
2891 loff_t size, orig_size, start_off; 2914 loff_t size, orig_size, start_off;
@@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2916 2939
2917 /* first, let's learn actual file size 2940 /* first, let's learn actual file size
2918 * given current request is allocated */ 2941 * given current request is allocated */
2919 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 2942 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
2920 size = size << bsbits; 2943 size = size << bsbits;
2921 if (size < i_size_read(ac->ac_inode)) 2944 if (size < i_size_read(ac->ac_inode))
2922 size = i_size_read(ac->ac_inode); 2945 size = i_size_read(ac->ac_inode);
@@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2988 continue; 3011 continue;
2989 } 3012 }
2990 3013
2991 pa_end = pa->pa_lstart + pa->pa_len; 3014 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3015 pa->pa_len);
2992 3016
2993 /* PA must not overlap original request */ 3017 /* PA must not overlap original request */
2994 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3018 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3018 rcu_read_lock(); 3042 rcu_read_lock();
3019 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3043 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3020 ext4_lblk_t pa_end; 3044 ext4_lblk_t pa_end;
3045
3021 spin_lock(&pa->pa_lock); 3046 spin_lock(&pa->pa_lock);
3022 if (pa->pa_deleted == 0) { 3047 if (pa->pa_deleted == 0) {
3023 pa_end = pa->pa_lstart + pa->pa_len; 3048 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3049 pa->pa_len);
3024 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3050 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3025 } 3051 }
3026 spin_unlock(&pa->pa_lock); 3052 spin_unlock(&pa->pa_lock);
@@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3036 } 3062 }
3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3063 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3038 start > ac->ac_o_ex.fe_logical); 3064 start > ac->ac_o_ex.fe_logical);
3039 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3065 BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
3040 3066
3041 /* now prepare goal request */ 3067 /* now prepare goal request */
3042 3068
3043 /* XXX: is it better to align blocks WRT to logical 3069 /* XXX: is it better to align blocks WRT to logical
3044 * placement or satisfy big request as is */ 3070 * placement or satisfy big request as is */
3045 ac->ac_g_ex.fe_logical = start; 3071 ac->ac_g_ex.fe_logical = start;
3046 ac->ac_g_ex.fe_len = size; 3072 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3047 3073
3048 /* define goal start in order to merge */ 3074 /* define goal start in order to merge */
3049 if (ar->pright && (ar->lright == (start + size))) { 3075 if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3112static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3138static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3113 struct ext4_prealloc_space *pa) 3139 struct ext4_prealloc_space *pa)
3114{ 3140{
3141 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3115 ext4_fsblk_t start; 3142 ext4_fsblk_t start;
3116 ext4_fsblk_t end; 3143 ext4_fsblk_t end;
3117 int len; 3144 int len;
3118 3145
3119 /* found preallocated blocks, use them */ 3146 /* found preallocated blocks, use them */
3120 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3147 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3121 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3148 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3122 len = end - start; 3149 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3150 len = EXT4_NUM_B2C(sbi, end - start);
3123 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3151 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3124 &ac->ac_b_ex.fe_start); 3152 &ac->ac_b_ex.fe_start);
3125 ac->ac_b_ex.fe_len = len; 3153 ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3127 ac->ac_pa = pa; 3155 ac->ac_pa = pa;
3128 3156
3129 BUG_ON(start < pa->pa_pstart); 3157 BUG_ON(start < pa->pa_pstart);
3130 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3158 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3131 BUG_ON(pa->pa_free < len); 3159 BUG_ON(pa->pa_free < len);
3132 pa->pa_free -= len; 3160 pa->pa_free -= len;
3133 3161
@@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3193static noinline_for_stack int 3221static noinline_for_stack int
3194ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3222ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3195{ 3223{
3224 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3196 int order, i; 3225 int order, i;
3197 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3226 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3198 struct ext4_locality_group *lg; 3227 struct ext4_locality_group *lg;
@@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3210 /* all fields in this condition don't change, 3239 /* all fields in this condition don't change,
3211 * so we can skip locking for them */ 3240 * so we can skip locking for them */
3212 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3241 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3213 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3242 ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3243 EXT4_C2B(sbi, pa->pa_len)))
3214 continue; 3244 continue;
3215 3245
3216 /* non-extent files can't have physical blocks past 2^32 */ 3246 /* non-extent files can't have physical blocks past 2^32 */
3217 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3247 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3218 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3248 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3249 EXT4_MAX_BLOCK_FILE_PHYS))
3219 continue; 3250 continue;
3220 3251
3221 /* found preallocated blocks, use them */ 3252 /* found preallocated blocks, use them */
@@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3291 3322
3292 while (n) { 3323 while (n) {
3293 entry = rb_entry(n, struct ext4_free_data, node); 3324 entry = rb_entry(n, struct ext4_free_data, node);
3294 ext4_set_bits(bitmap, entry->start_blk, entry->count); 3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count);
3295 n = rb_next(n); 3326 n = rb_next(n);
3296 } 3327 }
3297 return; 3328 return;
@@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3312 ext4_group_t groupnr; 3343 ext4_group_t groupnr;
3313 ext4_grpblk_t start; 3344 ext4_grpblk_t start;
3314 int preallocated = 0; 3345 int preallocated = 0;
3315 int count = 0;
3316 int len; 3346 int len;
3317 3347
3318 /* all form of preallocation discards first load group, 3348 /* all form of preallocation discards first load group,
@@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3335 BUG_ON(groupnr != group); 3365 BUG_ON(groupnr != group);
3336 ext4_set_bits(bitmap, start, len); 3366 ext4_set_bits(bitmap, start, len);
3337 preallocated += len; 3367 preallocated += len;
3338 count++;
3339 } 3368 }
3340 mb_debug(1, "prellocated %u for group %u\n", preallocated, group); 3369 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3341} 3370}
@@ -3412,6 +3441,7 @@ static noinline_for_stack int
3412ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3441ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3413{ 3442{
3414 struct super_block *sb = ac->ac_sb; 3443 struct super_block *sb = ac->ac_sb;
3444 struct ext4_sb_info *sbi = EXT4_SB(sb);
3415 struct ext4_prealloc_space *pa; 3445 struct ext4_prealloc_space *pa;
3416 struct ext4_group_info *grp; 3446 struct ext4_group_info *grp;
3417 struct ext4_inode_info *ei; 3447 struct ext4_inode_info *ei;
@@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3443 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3473 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3444 3474
3445 /* also, we should cover whole original request */ 3475 /* also, we should cover whole original request */
3446 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3476 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3447 3477
3448 /* the smallest one defines real window */ 3478 /* the smallest one defines real window */
3449 win = min(winl, wins); 3479 win = min(winl, wins);
3450 3480
3451 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3481 offs = ac->ac_o_ex.fe_logical %
3482 EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3452 if (offs && offs < win) 3483 if (offs && offs < win)
3453 win = offs; 3484 win = offs;
3454 3485
3455 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3487 EXT4_B2C(sbi, win);
3456 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3488 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3457 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3489 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3458 } 3490 }
@@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3477 trace_ext4_mb_new_inode_pa(ac, pa); 3509 trace_ext4_mb_new_inode_pa(ac, pa);
3478 3510
3479 ext4_mb_use_inode_pa(ac, pa); 3511 ext4_mb_use_inode_pa(ac, pa);
3480 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3512 atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3481 3513
3482 ei = EXT4_I(ac->ac_inode); 3514 ei = EXT4_I(ac->ac_inode);
3483 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3515 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3592 3624
3593 BUG_ON(pa->pa_deleted == 0); 3625 BUG_ON(pa->pa_deleted == 0);
3594 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3626 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3595 grp_blk_start = pa->pa_pstart - bit; 3627 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3596 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3628 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3597 end = bit + pa->pa_len; 3629 end = bit + pa->pa_len;
3598 3630
@@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3607 free += next - bit; 3639 free += next - bit;
3608 3640
3609 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3641 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3610 trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, 3642 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3643 EXT4_C2B(sbi, bit)),
3611 next - bit); 3644 next - bit);
3612 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3645 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3613 bit = next + 1; 3646 bit = next + 1;
@@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3690 } 3723 }
3691 3724
3692 if (needed == 0) 3725 if (needed == 0)
3693 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3726 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3694 3727
3695 INIT_LIST_HEAD(&list); 3728 INIT_LIST_HEAD(&list);
3696repeat: 3729repeat:
@@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3958 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3991 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3959 return; 3992 return;
3960 3993
3961 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3994 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3962 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3995 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3963 >> bsbits; 3996 >> bsbits;
3964 3997
@@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3969 return; 4002 return;
3970 } 4003 }
3971 4004
4005 if (sbi->s_mb_group_prealloc <= 0) {
4006 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4007 return;
4008 }
4009
3972 /* don't use group allocation for large files */ 4010 /* don't use group allocation for large files */
3973 size = max(size, isize); 4011 size = max(size, isize);
3974 if (size > sbi->s_mb_stream_request) { 4012 if (size > sbi->s_mb_stream_request) {
@@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4007 len = ar->len; 4045 len = ar->len;
4008 4046
4009 /* just a dirty hack to filter too big requests */ 4047 /* just a dirty hack to filter too big requests */
4010 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4048 if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
4011 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4049 len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
4012 4050
4013 /* start searching from the goal */ 4051 /* start searching from the goal */
4014 goal = ar->goal; 4052 goal = ar->goal;
@@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4019 4057
4020 /* set up allocation goals */ 4058 /* set up allocation goals */
4021 memset(ac, 0, sizeof(struct ext4_allocation_context)); 4059 memset(ac, 0, sizeof(struct ext4_allocation_context));
4022 ac->ac_b_ex.fe_logical = ar->logical; 4060 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
4023 ac->ac_status = AC_STATUS_CONTINUE; 4061 ac->ac_status = AC_STATUS_CONTINUE;
4024 ac->ac_sb = sb; 4062 ac->ac_sb = sb;
4025 ac->ac_inode = ar->inode; 4063 ac->ac_inode = ar->inode;
4026 ac->ac_o_ex.fe_logical = ar->logical; 4064 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4027 ac->ac_o_ex.fe_group = group; 4065 ac->ac_o_ex.fe_group = group;
4028 ac->ac_o_ex.fe_start = block; 4066 ac->ac_o_ex.fe_start = block;
4029 ac->ac_o_ex.fe_len = len; 4067 ac->ac_o_ex.fe_len = len;
4030 ac->ac_g_ex.fe_logical = ar->logical; 4068 ac->ac_g_ex = ac->ac_o_ex;
4031 ac->ac_g_ex.fe_group = group;
4032 ac->ac_g_ex.fe_start = block;
4033 ac->ac_g_ex.fe_len = len;
4034 ac->ac_flags = ar->flags; 4069 ac->ac_flags = ar->flags;
4035 4070
4036 /* we have to define context: we'll we work with a file or 4071 /* we have to define context: we'll we work with a file or
@@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4182 */ 4217 */
4183static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4218static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4184{ 4219{
4220 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4185 struct ext4_prealloc_space *pa = ac->ac_pa; 4221 struct ext4_prealloc_space *pa = ac->ac_pa;
4186 if (pa) { 4222 if (pa) {
4187 if (pa->pa_type == MB_GROUP_PA) { 4223 if (pa->pa_type == MB_GROUP_PA) {
4188 /* see comment in ext4_mb_use_group_pa() */ 4224 /* see comment in ext4_mb_use_group_pa() */
4189 spin_lock(&pa->pa_lock); 4225 spin_lock(&pa->pa_lock);
4190 pa->pa_pstart += ac->ac_b_ex.fe_len; 4226 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4191 pa->pa_lstart += ac->ac_b_ex.fe_len; 4227 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4192 pa->pa_free -= ac->ac_b_ex.fe_len; 4228 pa->pa_free -= ac->ac_b_ex.fe_len;
4193 pa->pa_len -= ac->ac_b_ex.fe_len; 4229 pa->pa_len -= ac->ac_b_ex.fe_len;
4194 spin_unlock(&pa->pa_lock); 4230 spin_unlock(&pa->pa_lock);
@@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4249 struct super_block *sb; 4285 struct super_block *sb;
4250 ext4_fsblk_t block = 0; 4286 ext4_fsblk_t block = 0;
4251 unsigned int inquota = 0; 4287 unsigned int inquota = 0;
4252 unsigned int reserv_blks = 0; 4288 unsigned int reserv_clstrs = 0;
4253 4289
4254 sb = ar->inode->i_sb; 4290 sb = ar->inode->i_sb;
4255 sbi = EXT4_SB(sb); 4291 sbi = EXT4_SB(sb);
4256 4292
4257 trace_ext4_request_blocks(ar); 4293 trace_ext4_request_blocks(ar);
4258 4294
4295 /* Allow to use superuser reservation for quota file */
4296 if (IS_NOQUOTA(ar->inode))
4297 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4298
4259 /* 4299 /*
4260 * For delayed allocation, we could skip the ENOSPC and 4300 * For delayed allocation, we could skip the ENOSPC and
4261 * EDQUOT check, as blocks and quotas have been already 4301 * EDQUOT check, as blocks and quotas have been already
@@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4269 * and verify allocation doesn't exceed the quota limits. 4309 * and verify allocation doesn't exceed the quota limits.
4270 */ 4310 */
4271 while (ar->len && 4311 while (ar->len &&
4272 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { 4312 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4273 4313
4274 /* let others to free the space */ 4314 /* let others to free the space */
4275 yield(); 4315 yield();
@@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4279 *errp = -ENOSPC; 4319 *errp = -ENOSPC;
4280 return 0; 4320 return 0;
4281 } 4321 }
4282 reserv_blks = ar->len; 4322 reserv_clstrs = ar->len;
4283 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 4323 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4284 dquot_alloc_block_nofail(ar->inode, ar->len); 4324 dquot_alloc_block_nofail(ar->inode,
4325 EXT4_C2B(sbi, ar->len));
4285 } else { 4326 } else {
4286 while (ar->len && 4327 while (ar->len &&
4287 dquot_alloc_block(ar->inode, ar->len)) { 4328 dquot_alloc_block(ar->inode,
4329 EXT4_C2B(sbi, ar->len))) {
4288 4330
4289 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4331 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4290 ar->len--; 4332 ar->len--;
@@ -4328,7 +4370,7 @@ repeat:
4328 ext4_mb_new_preallocation(ac); 4370 ext4_mb_new_preallocation(ac);
4329 } 4371 }
4330 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4372 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4331 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4373 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4332 if (*errp == -EAGAIN) { 4374 if (*errp == -EAGAIN) {
4333 /* 4375 /*
4334 * drop the reference that we took 4376 * drop the reference that we took
@@ -4364,13 +4406,13 @@ out:
4364 if (ac) 4406 if (ac)
4365 kmem_cache_free(ext4_ac_cachep, ac); 4407 kmem_cache_free(ext4_ac_cachep, ac);
4366 if (inquota && ar->len < inquota) 4408 if (inquota && ar->len < inquota)
4367 dquot_free_block(ar->inode, inquota - ar->len); 4409 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4368 if (!ar->len) { 4410 if (!ar->len) {
4369 if (!ext4_test_inode_state(ar->inode, 4411 if (!ext4_test_inode_state(ar->inode,
4370 EXT4_STATE_DELALLOC_RESERVED)) 4412 EXT4_STATE_DELALLOC_RESERVED))
4371 /* release all the reserved blocks if non delalloc */ 4413 /* release all the reserved blocks if non delalloc */
4372 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4414 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4373 reserv_blks); 4415 reserv_clstrs);
4374 } 4416 }
4375 4417
4376 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4418 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1,
4388{ 4430{
4389 if ((entry1->t_tid == entry2->t_tid) && 4431 if ((entry1->t_tid == entry2->t_tid) &&
4390 (entry1->group == entry2->group) && 4432 (entry1->group == entry2->group) &&
4391 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
4392 return 1; 4434 return 1;
4393 return 0; 4435 return 0;
4394} 4436}
@@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4398 struct ext4_free_data *new_entry) 4440 struct ext4_free_data *new_entry)
4399{ 4441{
4400 ext4_group_t group = e4b->bd_group; 4442 ext4_group_t group = e4b->bd_group;
4401 ext4_grpblk_t block; 4443 ext4_grpblk_t cluster;
4402 struct ext4_free_data *entry; 4444 struct ext4_free_data *entry;
4403 struct ext4_group_info *db = e4b->bd_info; 4445 struct ext4_group_info *db = e4b->bd_info;
4404 struct super_block *sb = e4b->bd_sb; 4446 struct super_block *sb = e4b->bd_sb;
@@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4411 BUG_ON(e4b->bd_buddy_page == NULL); 4453 BUG_ON(e4b->bd_buddy_page == NULL);
4412 4454
4413 new_node = &new_entry->node; 4455 new_node = &new_entry->node;
4414 block = new_entry->start_blk; 4456 cluster = new_entry->start_cluster;
4415 4457
4416 if (!*n) { 4458 if (!*n) {
4417 /* first free block exent. We need to 4459 /* first free block exent. We need to
@@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4425 while (*n) { 4467 while (*n) {
4426 parent = *n; 4468 parent = *n;
4427 entry = rb_entry(parent, struct ext4_free_data, node); 4469 entry = rb_entry(parent, struct ext4_free_data, node);
4428 if (block < entry->start_blk) 4470 if (cluster < entry->start_cluster)
4429 n = &(*n)->rb_left; 4471 n = &(*n)->rb_left;
4430 else if (block >= (entry->start_blk + entry->count)) 4472 else if (cluster >= (entry->start_cluster + entry->count))
4431 n = &(*n)->rb_right; 4473 n = &(*n)->rb_right;
4432 else { 4474 else {
4433 ext4_grp_locked_error(sb, group, 0, 4475 ext4_grp_locked_error(sb, group, 0,
4434 ext4_group_first_block_no(sb, group) + block, 4476 ext4_group_first_block_no(sb, group) +
4477 EXT4_C2B(sbi, cluster),
4435 "Block already on to-be-freed list"); 4478 "Block already on to-be-freed list");
4436 return 0; 4479 return 0;
4437 } 4480 }
@@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4445 if (node) { 4488 if (node) {
4446 entry = rb_entry(node, struct ext4_free_data, node); 4489 entry = rb_entry(node, struct ext4_free_data, node);
4447 if (can_merge(entry, new_entry)) { 4490 if (can_merge(entry, new_entry)) {
4448 new_entry->start_blk = entry->start_blk; 4491 new_entry->start_cluster = entry->start_cluster;
4449 new_entry->count += entry->count; 4492 new_entry->count += entry->count;
4450 rb_erase(node, &(db->bb_free_root)); 4493 rb_erase(node, &(db->bb_free_root));
4451 spin_lock(&sbi->s_md_lock); 4494 spin_lock(&sbi->s_md_lock);
@@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4496 ext4_group_t block_group; 4539 ext4_group_t block_group;
4497 struct ext4_sb_info *sbi; 4540 struct ext4_sb_info *sbi;
4498 struct ext4_buddy e4b; 4541 struct ext4_buddy e4b;
4542 unsigned int count_clusters;
4499 int err = 0; 4543 int err = 0;
4500 int ret; 4544 int ret;
4501 4545
@@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4544 if (!ext4_should_writeback_data(inode)) 4588 if (!ext4_should_writeback_data(inode))
4545 flags |= EXT4_FREE_BLOCKS_METADATA; 4589 flags |= EXT4_FREE_BLOCKS_METADATA;
4546 4590
4591 /*
4592 * If the extent to be freed does not begin on a cluster
4593 * boundary, we need to deal with partial clusters at the
4594 * beginning and end of the extent. Normally we will free
4595 * blocks at the beginning or the end unless we are explicitly
4596 * requested to avoid doing so.
4597 */
4598 overflow = block & (sbi->s_cluster_ratio - 1);
4599 if (overflow) {
4600 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4601 overflow = sbi->s_cluster_ratio - overflow;
4602 block += overflow;
4603 if (count > overflow)
4604 count -= overflow;
4605 else
4606 return;
4607 } else {
4608 block -= overflow;
4609 count += overflow;
4610 }
4611 }
4612 overflow = count & (sbi->s_cluster_ratio - 1);
4613 if (overflow) {
4614 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4615 if (count > overflow)
4616 count -= overflow;
4617 else
4618 return;
4619 } else
4620 count += sbi->s_cluster_ratio - overflow;
4621 }
4622
4547do_more: 4623do_more:
4548 overflow = 0; 4624 overflow = 0;
4549 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4625 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4552,10 +4628,12 @@ do_more:
4552 * Check to see if we are freeing blocks across a group 4628 * Check to see if we are freeing blocks across a group
4553 * boundary. 4629 * boundary.
4554 */ 4630 */
4555 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4631 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4556 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4632 overflow = EXT4_C2B(sbi, bit) + count -
4633 EXT4_BLOCKS_PER_GROUP(sb);
4557 count -= overflow; 4634 count -= overflow;
4558 } 4635 }
4636 count_clusters = EXT4_B2C(sbi, count);
4559 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4637 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4560 if (!bitmap_bh) { 4638 if (!bitmap_bh) {
4561 err = -EIO; 4639 err = -EIO;
@@ -4570,9 +4648,9 @@ do_more:
4570 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4648 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4571 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4649 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4572 in_range(block, ext4_inode_table(sb, gdp), 4650 in_range(block, ext4_inode_table(sb, gdp),
4573 EXT4_SB(sb)->s_itb_per_group) || 4651 EXT4_SB(sb)->s_itb_per_group) ||
4574 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4652 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4575 EXT4_SB(sb)->s_itb_per_group)) { 4653 EXT4_SB(sb)->s_itb_per_group)) {
4576 4654
4577 ext4_error(sb, "Freeing blocks in system zone - " 4655 ext4_error(sb, "Freeing blocks in system zone - "
4578 "Block = %llu, count = %lu", block, count); 4656 "Block = %llu, count = %lu", block, count);
@@ -4597,11 +4675,11 @@ do_more:
4597#ifdef AGGRESSIVE_CHECK 4675#ifdef AGGRESSIVE_CHECK
4598 { 4676 {
4599 int i; 4677 int i;
4600 for (i = 0; i < count; i++) 4678 for (i = 0; i < count_clusters; i++)
4601 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4679 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4602 } 4680 }
4603#endif 4681#endif
4604 trace_ext4_mballoc_free(sb, inode, block_group, bit, count); 4682 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4605 4683
4606 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4684 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4607 if (err) 4685 if (err)
@@ -4618,13 +4696,13 @@ do_more:
4618 err = -ENOMEM; 4696 err = -ENOMEM;
4619 goto error_return; 4697 goto error_return;
4620 } 4698 }
4621 new_entry->start_blk = bit; 4699 new_entry->start_cluster = bit;
4622 new_entry->group = block_group; 4700 new_entry->group = block_group;
4623 new_entry->count = count; 4701 new_entry->count = count_clusters;
4624 new_entry->t_tid = handle->h_transaction->t_tid; 4702 new_entry->t_tid = handle->h_transaction->t_tid;
4625 4703
4626 ext4_lock_group(sb, block_group); 4704 ext4_lock_group(sb, block_group);
4627 mb_clear_bits(bitmap_bh->b_data, bit, count); 4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4628 ext4_mb_free_metadata(handle, &e4b, new_entry); 4706 ext4_mb_free_metadata(handle, &e4b, new_entry);
4629 } else { 4707 } else {
4630 /* need to update group_info->bb_free and bitmap 4708 /* need to update group_info->bb_free and bitmap
@@ -4632,25 +4710,29 @@ do_more:
4632 * them with group lock_held 4710 * them with group lock_held
4633 */ 4711 */
4634 ext4_lock_group(sb, block_group); 4712 ext4_lock_group(sb, block_group);
4635 mb_clear_bits(bitmap_bh->b_data, bit, count); 4713 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4636 mb_free_blocks(inode, &e4b, bit, count); 4714 mb_free_blocks(inode, &e4b, bit, count_clusters);
4637 } 4715 }
4638 4716
4639 ret = ext4_free_blks_count(sb, gdp) + count; 4717 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4640 ext4_free_blks_set(sb, gdp, ret); 4718 ext4_free_group_clusters_set(sb, gdp, ret);
4641 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4719 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4642 ext4_unlock_group(sb, block_group); 4720 ext4_unlock_group(sb, block_group);
4643 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4721 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4644 4722
4645 if (sbi->s_log_groups_per_flex) { 4723 if (sbi->s_log_groups_per_flex) {
4646 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4724 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4647 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4725 atomic_add(count_clusters,
4726 &sbi->s_flex_groups[flex_group].free_clusters);
4648 } 4727 }
4649 4728
4650 ext4_mb_unload_buddy(&e4b); 4729 ext4_mb_unload_buddy(&e4b);
4651 4730
4652 freed += count; 4731 freed += count;
4653 4732
4733 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4734 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4735
4654 /* We dirtied the bitmap block */ 4736 /* We dirtied the bitmap block */
4655 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4737 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4656 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4738 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4669,8 +4751,6 @@ do_more:
4669 } 4751 }
4670 ext4_mark_super_dirty(sb); 4752 ext4_mark_super_dirty(sb);
4671error_return: 4753error_return:
4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4673 dquot_free_block(inode, freed);
4674 brelse(bitmap_bh); 4754 brelse(bitmap_bh);
4675 ext4_std_error(sb, err); 4755 ext4_std_error(sb, err);
4676 return; 4756 return;
@@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4778 ext4_lock_group(sb, block_group); 4858 ext4_lock_group(sb, block_group);
4779 mb_clear_bits(bitmap_bh->b_data, bit, count); 4859 mb_clear_bits(bitmap_bh->b_data, bit, count);
4780 mb_free_blocks(NULL, &e4b, bit, count); 4860 mb_free_blocks(NULL, &e4b, bit, count);
4781 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); 4861 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4782 ext4_free_blks_set(sb, desc, blk_free_count); 4862 ext4_free_group_clusters_set(sb, desc, blk_free_count);
4783 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 4863 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4784 ext4_unlock_group(sb, block_group); 4864 ext4_unlock_group(sb, block_group);
4785 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); 4865 percpu_counter_add(&sbi->s_freeclusters_counter,
4866 EXT4_B2C(sbi, blocks_freed));
4786 4867
4787 if (sbi->s_log_groups_per_flex) { 4868 if (sbi->s_log_groups_per_flex) {
4788 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4869 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4789 atomic_add(blocks_freed, 4870 atomic_add(EXT4_B2C(sbi, blocks_freed),
4790 &sbi->s_flex_groups[flex_group].free_blocks); 4871 &sbi->s_flex_groups[flex_group].free_clusters);
4791 } 4872 }
4792 4873
4793 ext4_mb_unload_buddy(&e4b); 4874 ext4_mb_unload_buddy(&e4b);
@@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4948 struct ext4_group_info *grp; 5029 struct ext4_group_info *grp;
4949 ext4_group_t first_group, last_group; 5030 ext4_group_t first_group, last_group;
4950 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4951 ext4_grpblk_t cnt = 0, first_block, last_block; 5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
4952 uint64_t start, len, minlen, trimmed = 0; 5033 uint64_t start, len, minlen, trimmed = 0;
4953 ext4_fsblk_t first_data_blk = 5034 ext4_fsblk_t first_data_blk =
4954 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
@@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4958 len = range->len >> sb->s_blocksize_bits; 5039 len = range->len >> sb->s_blocksize_bits;
4959 minlen = range->minlen >> sb->s_blocksize_bits; 5040 minlen = range->minlen >> sb->s_blocksize_bits;
4960 5041
4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
4962 return -EINVAL; 5043 return -EINVAL;
4963 if (start + len <= first_data_blk) 5044 if (start + len <= first_data_blk)
4964 goto out; 5045 goto out;
@@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4969 5050
4970 /* Determine first and last group to examine based on start and len */ 5051 /* Determine first and last group to examine based on start and len */
4971 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4972 &first_group, &first_block); 5053 &first_group, &first_cluster);
4973 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4974 &last_group, &last_block); 5055 &last_group, &last_cluster);
4975 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; 5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4976 last_block = EXT4_BLOCKS_PER_GROUP(sb); 5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
4977 5058
4978 if (first_group > last_group) 5059 if (first_group > last_group)
4979 return -EINVAL; 5060 return -EINVAL;
@@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4993 * change it for the last group in which case start + 5074 * change it for the last group in which case start +
4994 * len < EXT4_BLOCKS_PER_GROUP(sb). 5075 * len < EXT4_BLOCKS_PER_GROUP(sb).
4995 */ 5076 */
4996 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) 5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
4997 last_block = first_block + len; 5078 last_cluster = first_cluster + len;
4998 len -= last_block - first_block; 5079 len -= last_cluster - first_cluster;
4999 5080
5000 if (grp->bb_free >= minlen) { 5081 if (grp->bb_free >= minlen) {
5001 cnt = ext4_trim_all_free(sb, group, first_block, 5082 cnt = ext4_trim_all_free(sb, group, first_cluster,
5002 last_block, minlen); 5083 last_cluster, minlen);
5003 if (cnt < 0) { 5084 if (cnt < 0) {
5004 ret = cnt; 5085 ret = cnt;
5005 break; 5086 break;
5006 } 5087 }
5007 } 5088 }
5008 trimmed += cnt; 5089 trimmed += cnt;
5009 first_block = 0; 5090 first_cluster = 0;
5010 } 5091 }
5011 range->len = trimmed * sb->s_blocksize; 5092 range->len = trimmed * sb->s_blocksize;
5012 5093
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9d4a636b546c..47705f3285e3 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -106,7 +106,7 @@ struct ext4_free_data {
106 ext4_group_t group; 106 ext4_group_t group;
107 107
108 /* free block extent */ 108 /* free block extent */
109 ext4_grpblk_t start_blk; 109 ext4_grpblk_t start_cluster;
110 ext4_grpblk_t count; 110 ext4_grpblk_t count;
111 111
112 /* transaction which freed this extent */ 112 /* transaction which freed this extent */
@@ -139,9 +139,9 @@ enum {
139 139
140struct ext4_free_extent { 140struct ext4_free_extent {
141 ext4_lblk_t fe_logical; 141 ext4_lblk_t fe_logical;
142 ext4_grpblk_t fe_start; 142 ext4_grpblk_t fe_start; /* In cluster units */
143 ext4_group_t fe_group; 143 ext4_group_t fe_group;
144 ext4_grpblk_t fe_len; 144 ext4_grpblk_t fe_len; /* In cluster units */
145}; 145};
146 146
147/* 147/*
@@ -175,7 +175,7 @@ struct ext4_allocation_context {
175 /* the best found extent */ 175 /* the best found extent */
176 struct ext4_free_extent ac_b_ex; 176 struct ext4_free_extent ac_b_ex;
177 177
178 /* copy of the bext found extent taken before preallocation efforts */ 178 /* copy of the best found extent taken before preallocation efforts */
179 struct ext4_free_extent ac_f_ex; 179 struct ext4_free_extent ac_f_ex;
180 180
181 /* number of iterations done. we have to track to limit searching */ 181 /* number of iterations done. we have to track to limit searching */
@@ -216,6 +216,7 @@ struct ext4_buddy {
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
218{ 218{
219 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; 219 return ext4_group_first_block_no(sb, fex->fe_group) +
220 (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
220} 221}
221#endif 222#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98fb44d1..16ac228dbec6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -15,19 +15,18 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18#include "ext4_extents.h"
19 18
20/* 19/*
21 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
22 * represented by a single extent 21 * represented by a single extent
23 */ 22 */
24struct list_blocks_struct { 23struct migrate_struct {
25 ext4_lblk_t first_block, last_block; 24 ext4_lblk_t first_block, last_block, curr_block;
26 ext4_fsblk_t first_pblock, last_pblock; 25 ext4_fsblk_t first_pblock, last_pblock;
27}; 26};
28 27
29static int finish_range(handle_t *handle, struct inode *inode, 28static int finish_range(handle_t *handle, struct inode *inode,
30 struct list_blocks_struct *lb) 29 struct migrate_struct *lb)
31 30
32{ 31{
33 int retval = 0, needed; 32 int retval = 0, needed;
@@ -87,8 +86,7 @@ err_out:
87} 86}
88 87
89static int update_extent_range(handle_t *handle, struct inode *inode, 88static int update_extent_range(handle_t *handle, struct inode *inode,
90 ext4_fsblk_t pblock, ext4_lblk_t blk_num, 89 ext4_fsblk_t pblock, struct migrate_struct *lb)
91 struct list_blocks_struct *lb)
92{ 90{
93 int retval; 91 int retval;
94 /* 92 /*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
96 */ 94 */
97 if (lb->first_pblock && 95 if (lb->first_pblock &&
98 (lb->last_pblock+1 == pblock) && 96 (lb->last_pblock+1 == pblock) &&
99 (lb->last_block+1 == blk_num)) { 97 (lb->last_block+1 == lb->curr_block)) {
100 lb->last_pblock = pblock; 98 lb->last_pblock = pblock;
101 lb->last_block = blk_num; 99 lb->last_block = lb->curr_block;
100 lb->curr_block++;
102 return 0; 101 return 0;
103 } 102 }
104 /* 103 /*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
106 */ 105 */
107 retval = finish_range(handle, inode, lb); 106 retval = finish_range(handle, inode, lb);
108 lb->first_pblock = lb->last_pblock = pblock; 107 lb->first_pblock = lb->last_pblock = pblock;
109 lb->first_block = lb->last_block = blk_num; 108 lb->first_block = lb->last_block = lb->curr_block;
110 109 lb->curr_block++;
111 return retval; 110 return retval;
112} 111}
113 112
114static int update_ind_extent_range(handle_t *handle, struct inode *inode, 113static int update_ind_extent_range(handle_t *handle, struct inode *inode,
115 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 114 ext4_fsblk_t pblock,
116 struct list_blocks_struct *lb) 115 struct migrate_struct *lb)
117{ 116{
118 struct buffer_head *bh; 117 struct buffer_head *bh;
119 __le32 *i_data; 118 __le32 *i_data;
120 int i, retval = 0; 119 int i, retval = 0;
121 ext4_lblk_t blk_count = *blk_nump;
122 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 120 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
123 121
124 if (!pblock) {
125 /* Only update the file block number */
126 *blk_nump += max_entries;
127 return 0;
128 }
129
130 bh = sb_bread(inode->i_sb, pblock); 122 bh = sb_bread(inode->i_sb, pblock);
131 if (!bh) 123 if (!bh)
132 return -EIO; 124 return -EIO;
133 125
134 i_data = (__le32 *)bh->b_data; 126 i_data = (__le32 *)bh->b_data;
135 for (i = 0; i < max_entries; i++, blk_count++) { 127 for (i = 0; i < max_entries; i++) {
136 if (i_data[i]) { 128 if (i_data[i]) {
137 retval = update_extent_range(handle, inode, 129 retval = update_extent_range(handle, inode,
138 le32_to_cpu(i_data[i]), 130 le32_to_cpu(i_data[i]), lb);
139 blk_count, lb);
140 if (retval) 131 if (retval)
141 break; 132 break;
133 } else {
134 lb->curr_block++;
142 } 135 }
143 } 136 }
144
145 /* Update the file block number */
146 *blk_nump = blk_count;
147 put_bh(bh); 137 put_bh(bh);
148 return retval; 138 return retval;
149 139
150} 140}
151 141
152static int update_dind_extent_range(handle_t *handle, struct inode *inode, 142static int update_dind_extent_range(handle_t *handle, struct inode *inode,
153 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 143 ext4_fsblk_t pblock,
154 struct list_blocks_struct *lb) 144 struct migrate_struct *lb)
155{ 145{
156 struct buffer_head *bh; 146 struct buffer_head *bh;
157 __le32 *i_data; 147 __le32 *i_data;
158 int i, retval = 0; 148 int i, retval = 0;
159 ext4_lblk_t blk_count = *blk_nump;
160 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 149 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
161 150
162 if (!pblock) {
163 /* Only update the file block number */
164 *blk_nump += max_entries * max_entries;
165 return 0;
166 }
167 bh = sb_bread(inode->i_sb, pblock); 151 bh = sb_bread(inode->i_sb, pblock);
168 if (!bh) 152 if (!bh)
169 return -EIO; 153 return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
172 for (i = 0; i < max_entries; i++) { 156 for (i = 0; i < max_entries; i++) {
173 if (i_data[i]) { 157 if (i_data[i]) {
174 retval = update_ind_extent_range(handle, inode, 158 retval = update_ind_extent_range(handle, inode,
175 le32_to_cpu(i_data[i]), 159 le32_to_cpu(i_data[i]), lb);
176 &blk_count, lb);
177 if (retval) 160 if (retval)
178 break; 161 break;
179 } else { 162 } else {
180 /* Only update the file block number */ 163 /* Only update the file block number */
181 blk_count += max_entries; 164 lb->curr_block += max_entries;
182 } 165 }
183 } 166 }
184
185 /* Update the file block number */
186 *blk_nump = blk_count;
187 put_bh(bh); 167 put_bh(bh);
188 return retval; 168 return retval;
189 169
190} 170}
191 171
192static int update_tind_extent_range(handle_t *handle, struct inode *inode, 172static int update_tind_extent_range(handle_t *handle, struct inode *inode,
193 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 173 ext4_fsblk_t pblock,
194 struct list_blocks_struct *lb) 174 struct migrate_struct *lb)
195{ 175{
196 struct buffer_head *bh; 176 struct buffer_head *bh;
197 __le32 *i_data; 177 __le32 *i_data;
198 int i, retval = 0; 178 int i, retval = 0;
199 ext4_lblk_t blk_count = *blk_nump;
200 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 179 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
201 180
202 if (!pblock) {
203 /* Only update the file block number */
204 *blk_nump += max_entries * max_entries * max_entries;
205 return 0;
206 }
207 bh = sb_bread(inode->i_sb, pblock); 181 bh = sb_bread(inode->i_sb, pblock);
208 if (!bh) 182 if (!bh)
209 return -EIO; 183 return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
212 for (i = 0; i < max_entries; i++) { 186 for (i = 0; i < max_entries; i++) {
213 if (i_data[i]) { 187 if (i_data[i]) {
214 retval = update_dind_extent_range(handle, inode, 188 retval = update_dind_extent_range(handle, inode,
215 le32_to_cpu(i_data[i]), 189 le32_to_cpu(i_data[i]), lb);
216 &blk_count, lb);
217 if (retval) 190 if (retval)
218 break; 191 break;
219 } else 192 } else {
220 /* Only update the file block number */ 193 /* Only update the file block number */
221 blk_count += max_entries * max_entries; 194 lb->curr_block += max_entries * max_entries;
195 }
222 } 196 }
223 /* Update the file block number */
224 *blk_nump = blk_count;
225 put_bh(bh); 197 put_bh(bh);
226 return retval; 198 return retval;
227 199
@@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode)
462 handle_t *handle; 434 handle_t *handle;
463 int retval = 0, i; 435 int retval = 0, i;
464 __le32 *i_data; 436 __le32 *i_data;
465 ext4_lblk_t blk_count = 0;
466 struct ext4_inode_info *ei; 437 struct ext4_inode_info *ei;
467 struct inode *tmp_inode = NULL; 438 struct inode *tmp_inode = NULL;
468 struct list_blocks_struct lb; 439 struct migrate_struct lb;
469 unsigned long max_entries; 440 unsigned long max_entries;
470 __u32 goal; 441 __u32 goal;
442 uid_t owner[2];
471 443
472 /* 444 /*
473 * If the filesystem does not support extents, or the inode 445 * If the filesystem does not support extents, or the inode
@@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode)
495 } 467 }
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 468 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 469 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
470 owner[0] = inode->i_uid;
471 owner[1] = inode->i_gid;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 472 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, NULL, goal); 473 S_IFREG, NULL, goal, owner);
500 if (IS_ERR(tmp_inode)) { 474 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 475 retval = PTR_ERR(inode);
502 ext4_journal_stop(handle); 476 ext4_journal_stop(handle);
503 return retval; 477 return retval;
504 } 478 }
@@ -507,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
507 * Set the i_nlink to zero so it will be deleted later 481 * Set the i_nlink to zero so it will be deleted later
508 * when we drop inode reference. 482 * when we drop inode reference.
509 */ 483 */
510 tmp_inode->i_nlink = 0; 484 clear_nlink(tmp_inode);
511 485
512 ext4_ext_tree_init(handle, tmp_inode); 486 ext4_ext_tree_init(handle, tmp_inode);
513 ext4_orphan_add(handle, tmp_inode); 487 ext4_orphan_add(handle, tmp_inode);
@@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode)
551 525
552 /* 32 bit block address 4 bytes */ 526 /* 32 bit block address 4 bytes */
553 max_entries = inode->i_sb->s_blocksize >> 2; 527 max_entries = inode->i_sb->s_blocksize >> 2;
554 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { 528 for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
555 if (i_data[i]) { 529 if (i_data[i]) {
556 retval = update_extent_range(handle, tmp_inode, 530 retval = update_extent_range(handle, tmp_inode,
557 le32_to_cpu(i_data[i]), 531 le32_to_cpu(i_data[i]), &lb);
558 blk_count, &lb);
559 if (retval) 532 if (retval)
560 goto err_out; 533 goto err_out;
561 } 534 } else
535 lb.curr_block++;
562 } 536 }
563 if (i_data[EXT4_IND_BLOCK]) { 537 if (i_data[EXT4_IND_BLOCK]) {
564 retval = update_ind_extent_range(handle, tmp_inode, 538 retval = update_ind_extent_range(handle, tmp_inode,
565 le32_to_cpu(i_data[EXT4_IND_BLOCK]), 539 le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
566 &blk_count, &lb);
567 if (retval) 540 if (retval)
568 goto err_out; 541 goto err_out;
569 } else 542 } else
570 blk_count += max_entries; 543 lb.curr_block += max_entries;
571 if (i_data[EXT4_DIND_BLOCK]) { 544 if (i_data[EXT4_DIND_BLOCK]) {
572 retval = update_dind_extent_range(handle, tmp_inode, 545 retval = update_dind_extent_range(handle, tmp_inode,
573 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), 546 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
574 &blk_count, &lb);
575 if (retval) 547 if (retval)
576 goto err_out; 548 goto err_out;
577 } else 549 } else
578 blk_count += max_entries * max_entries; 550 lb.curr_block += max_entries * max_entries;
579 if (i_data[EXT4_TIND_BLOCK]) { 551 if (i_data[EXT4_TIND_BLOCK]) {
580 retval = update_tind_extent_range(handle, tmp_inode, 552 retval = update_tind_extent_range(handle, tmp_inode,
581 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), 553 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
582 &blk_count, &lb);
583 if (retval) 554 if (retval)
584 goto err_out; 555 goto err_out;
585 } 556 }
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9bdef3f537c5..7ea4ba4eff2a 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -109,7 +109,7 @@ static int kmmpd(void *data)
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname); 110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111 111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname, 112 memcpy(mmp->mmp_nodename, init_utsname()->nodename,
113 sizeof(mmp->mmp_nodename)); 113 sizeof(mmp->mmp_nodename));
114 114
115 while (!kthread_should_stop()) { 115 while (!kthread_should_stop()) {
@@ -125,8 +125,9 @@ static int kmmpd(void *data)
125 * Don't spew too many error messages. Print one every 125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds. 126 * (s_mmp_update_interval * 60) seconds.
127 */ 127 */
128 if (retval && (failed_writes % 60) == 0) { 128 if (retval) {
129 ext4_error(sb, "Error writing to MMP block"); 129 if ((failed_writes % 60) == 0)
130 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++; 131 failed_writes++;
131 } 132 }
132 133
@@ -295,7 +296,8 @@ skip:
295 /* 296 /*
296 * write a new random sequence number. 297 * write a new random sequence number.
297 */ 298 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); 299 seq = mmp_new_seq();
300 mmp->mmp_seq = cpu_to_le32(seq);
299 301
300 retval = write_mmp_block(bh); 302 retval = write_mmp_block(bh);
301 if (retval) 303 if (retval)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f57455a1b1b2..c5826c623e7a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,7 +17,6 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4_extents.h"
21#include "ext4.h" 20#include "ext4.h"
22 21
23/** 22/**
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1c924faeb6c8..aa4c782c9dd7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1586 dxtrace(dx_show_index("node", frames[1].entries)); 1586 dxtrace(dx_show_index("node", frames[1].entries));
1587 dxtrace(dx_show_index("node", 1587 dxtrace(dx_show_index("node",
1588 ((struct dx_node *) bh2->b_data)->entries)); 1588 ((struct dx_node *) bh2->b_data)->entries));
1589 err = ext4_handle_dirty_metadata(handle, inode, bh2); 1589 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1590 if (err) 1590 if (err)
1591 goto journal_error; 1591 goto journal_error;
1592 brelse (bh2); 1592 brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1612 if (err) 1612 if (err)
1613 goto journal_error; 1613 goto journal_error;
1614 } 1614 }
1615 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1615 err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
1616 if (err) { 1616 if (err) {
1617 ext4_std_error(inode->i_sb, err); 1617 ext4_std_error(inode->i_sb, err);
1618 goto cleanup; 1618 goto cleanup;
@@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
1694 if (is_dx(inode) && inode->i_nlink > 1) { 1694 if (is_dx(inode) && inode->i_nlink > 1) {
1695 /* limit is 16-bit i_links_count */ 1695 /* limit is 16-bit i_links_count */
1696 if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { 1696 if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
1697 inode->i_nlink = 1; 1697 set_nlink(inode, 1);
1698 EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, 1698 EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
1699 EXT4_FEATURE_RO_COMPAT_DIR_NLINK); 1699 EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
1700 } 1700 }
@@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
1707 */ 1707 */
1708static void ext4_dec_count(handle_t *handle, struct inode *inode) 1708static void ext4_dec_count(handle_t *handle, struct inode *inode)
1709{ 1709{
1710 drop_nlink(inode); 1710 if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
1711 if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) 1711 drop_nlink(inode);
1712 inc_nlink(inode);
1713} 1712}
1714 1713
1715 1714
@@ -1756,7 +1755,7 @@ retry:
1756 if (IS_DIRSYNC(dir)) 1755 if (IS_DIRSYNC(dir))
1757 ext4_handle_sync(handle); 1756 ext4_handle_sync(handle);
1758 1757
1759 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1758 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1760 err = PTR_ERR(inode); 1759 err = PTR_ERR(inode);
1761 if (!IS_ERR(inode)) { 1760 if (!IS_ERR(inode)) {
1762 inode->i_op = &ext4_file_inode_operations; 1761 inode->i_op = &ext4_file_inode_operations;
@@ -1792,7 +1791,7 @@ retry:
1792 if (IS_DIRSYNC(dir)) 1791 if (IS_DIRSYNC(dir))
1793 ext4_handle_sync(handle); 1792 ext4_handle_sync(handle);
1794 1793
1795 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1794 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1796 err = PTR_ERR(inode); 1795 err = PTR_ERR(inode);
1797 if (!IS_ERR(inode)) { 1796 if (!IS_ERR(inode)) {
1798 init_special_inode(inode, inode->i_mode, rdev); 1797 init_special_inode(inode, inode->i_mode, rdev);
@@ -1832,7 +1831,7 @@ retry:
1832 ext4_handle_sync(handle); 1831 ext4_handle_sync(handle);
1833 1832
1834 inode = ext4_new_inode(handle, dir, S_IFDIR | mode, 1833 inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
1835 &dentry->d_name, 0); 1834 &dentry->d_name, 0, NULL);
1836 err = PTR_ERR(inode); 1835 err = PTR_ERR(inode);
1837 if (IS_ERR(inode)) 1836 if (IS_ERR(inode))
1838 goto out_stop; 1837 goto out_stop;
@@ -1861,9 +1860,9 @@ retry:
1861 de->name_len = 2; 1860 de->name_len = 2;
1862 strcpy(de->name, ".."); 1861 strcpy(de->name, "..");
1863 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1862 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1864 inode->i_nlink = 2; 1863 set_nlink(inode, 2);
1865 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1864 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1866 err = ext4_handle_dirty_metadata(handle, dir, dir_block); 1865 err = ext4_handle_dirty_metadata(handle, inode, dir_block);
1867 if (err) 1866 if (err)
1868 goto out_clear_inode; 1867 goto out_clear_inode;
1869 err = ext4_mark_inode_dirty(handle, inode); 1868 err = ext4_mark_inode_dirty(handle, inode);
@@ -2214,7 +2213,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2214 ext4_warning(inode->i_sb, 2213 ext4_warning(inode->i_sb,
2215 "Deleting nonexistent file (%lu), %d", 2214 "Deleting nonexistent file (%lu), %d",
2216 inode->i_ino, inode->i_nlink); 2215 inode->i_ino, inode->i_nlink);
2217 inode->i_nlink = 1; 2216 set_nlink(inode, 1);
2218 } 2217 }
2219 retval = ext4_delete_entry(handle, dir, de, bh); 2218 retval = ext4_delete_entry(handle, dir, de, bh);
2220 if (retval) 2219 if (retval)
@@ -2279,7 +2278,7 @@ retry:
2279 ext4_handle_sync(handle); 2278 ext4_handle_sync(handle);
2280 2279
2281 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, 2280 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
2282 &dentry->d_name, 0); 2281 &dentry->d_name, 0, NULL);
2283 err = PTR_ERR(inode); 2282 err = PTR_ERR(inode);
2284 if (IS_ERR(inode)) 2283 if (IS_ERR(inode))
2285 goto out_stop; 2284 goto out_stop;
@@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2530 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2529 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2531 cpu_to_le32(new_dir->i_ino); 2530 cpu_to_le32(new_dir->i_ino);
2532 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2531 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2533 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2532 retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
2534 if (retval) { 2533 if (retval) {
2535 ext4_std_error(old_dir->i_sb, retval); 2534 ext4_std_error(old_dir->i_sb, retval);
2536 goto end_rename; 2535 goto end_rename;
@@ -2539,7 +2538,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2539 if (new_inode) { 2538 if (new_inode) {
2540 /* checked empty_dir above, can't have another parent, 2539 /* checked empty_dir above, can't have another parent,
2541 * ext4_dec_count() won't work for many-linked dirs */ 2540 * ext4_dec_count() won't work for many-linked dirs */
2542 new_inode->i_nlink = 0; 2541 clear_nlink(new_inode);
2543 } else { 2542 } else {
2544 ext4_inc_count(handle, new_dir); 2543 ext4_inc_count(handle, new_dir);
2545 ext4_update_dx_flag(new_dir); 2544 ext4_update_dx_flag(new_dir);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 92f38ee13f8a..7ce1d0b19c94 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page)
70void ext4_free_io_end(ext4_io_end_t *io) 70void ext4_free_io_end(ext4_io_end_t *io)
71{ 71{
72 int i; 72 int i;
73 wait_queue_head_t *wq;
74 73
75 BUG_ON(!io); 74 BUG_ON(!io);
76 if (io->page) 75 if (io->page)
@@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io)
78 for (i = 0; i < io->num_io_pages; i++) 77 for (i = 0; i < io->num_io_pages; i++)
79 put_io_page(io->pages[i]); 78 put_io_page(io->pages[i]);
80 io->num_io_pages = 0; 79 io->num_io_pages = 0;
81 wq = ext4_ioend_wq(io->inode); 80 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 81 wake_up_all(ext4_ioend_wq(io->inode));
83 waitqueue_active(wq))
84 wake_up_all(wq);
85 kmem_cache_free(io_end_cachep, io); 82 kmem_cache_free(io_end_cachep, io);
86} 83}
87 84
88/* 85/*
89 * check a range of space and convert unwritten extents to written. 86 * check a range of space and convert unwritten extents to written.
87 *
88 * Called with inode->i_mutex; we depend on this when we manipulate
89 * io->flag, since we could otherwise race with ext4_flush_completed_IO()
90 */ 90 */
91int ext4_end_io_nolock(ext4_io_end_t *io) 91int ext4_end_io_nolock(ext4_io_end_t *io)
92{ 92{
93 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
94 loff_t offset = io->offset; 94 loff_t offset = io->offset;
95 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
97 int ret = 0; 96 int ret = 0;
98 97
99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 98 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
100 "list->prev 0x%p\n", 99 "list->prev 0x%p\n",
101 io, inode->i_ino, io->list.next, io->list.prev); 100 io, inode->i_ino, io->list.next, io->list.prev);
102 101
103 if (list_empty(&io->list))
104 return ret;
105
106 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
107 return ret;
108
109 ret = ext4_convert_unwritten_extents(inode, offset, size); 102 ret = ext4_convert_unwritten_extents(inode, offset, size);
110 if (ret < 0) { 103 if (ret < 0) {
111 printk(KERN_EMERG "%s: failed to convert unwritten " 104 ext4_msg(inode->i_sb, KERN_EMERG,
112 "extents to written extents, error is %d " 105 "failed to convert unwritten extents to written "
113 "io is still on inode %lu aio dio list\n", 106 "extents -- potential data loss! "
114 __func__, ret, inode->i_ino); 107 "(inode %lu, offset %llu, size %zd, error %d)",
115 return ret; 108 inode->i_ino, offset, size, ret);
116 } 109 }
117 110
118 if (io->iocb) 111 if (io->iocb)
119 aio_complete(io->iocb, io->result, 0); 112 aio_complete(io->iocb, io->result, 0);
120 /* clear the DIO AIO unwritten flag */
121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130 113
114 /* Wake up anyone waiting on unwritten extent conversion */
115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
116 wake_up_all(ext4_ioend_wq(io->inode));
131 return ret; 117 return ret;
132} 118}
133 119
@@ -140,9 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
140 struct inode *inode = io->inode; 126 struct inode *inode = io->inode;
141 struct ext4_inode_info *ei = EXT4_I(inode); 127 struct ext4_inode_info *ei = EXT4_I(inode);
142 unsigned long flags; 128 unsigned long flags;
143 int ret; 129
130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (list_empty(&io->list)) {
132 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
133 goto free;
134 }
144 135
145 if (!mutex_trylock(&inode->i_mutex)) { 136 if (!mutex_trylock(&inode->i_mutex)) {
137 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
146 /* 138 /*
147 * Requeue the work instead of waiting so that the work 139 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed. 140 * items queued after this can be processed.
@@ -159,17 +151,11 @@ static void ext4_end_io_work(struct work_struct *work)
159 io->flag |= EXT4_IO_END_QUEUED; 151 io->flag |= EXT4_IO_END_QUEUED;
160 return; 152 return;
161 } 153 }
162 ret = ext4_end_io_nolock(io); 154 list_del_init(&io->list);
163 if (ret < 0) {
164 mutex_unlock(&inode->i_mutex);
165 return;
166 }
167
168 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
169 if (!list_empty(&io->list))
170 list_del_init(&io->list);
171 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 155 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
156 (void) ext4_end_io_nolock(io);
172 mutex_unlock(&inode->i_mutex); 157 mutex_unlock(&inode->i_mutex);
158free:
173 ext4_free_io_end(io); 159 ext4_free_io_end(io);
174} 160}
175 161
@@ -350,10 +336,8 @@ submit_and_retry:
350 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 336 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
351 (io_end->pages[io_end->num_io_pages-1] != io_page)) 337 (io_end->pages[io_end->num_io_pages-1] != io_page))
352 goto submit_and_retry; 338 goto submit_and_retry;
353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 339 if (buffer_uninit(bh))
354 io_end->flag |= EXT4_IO_END_UNWRITTEN; 340 ext4_set_io_unwritten_flag(inode, io_end);
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
357 io->io_end->size += bh->b_size; 341 io->io_end->size += bh->b_size;
358 io->io_next_block++; 342 io->io_next_block++;
359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 343 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 707d3f16f7ce..996780ab4f4e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
878 ext4_free_blks_set(sb, gdp, input->free_blocks_count); 878 ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); 880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
@@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 input->reserved_blocks); 937 input->reserved_blocks);
938 938
939 /* Update the free space counts */ 939 /* Update the free space counts */
940 percpu_counter_add(&sbi->s_freeblocks_counter, 940 percpu_counter_add(&sbi->s_freeclusters_counter,
941 input->free_blocks_count); 941 EXT4_B2C(sbi, input->free_blocks_count));
942 percpu_counter_add(&sbi->s_freeinodes_counter, 942 percpu_counter_add(&sbi->s_freeinodes_counter,
943 EXT4_INODES_PER_GROUP(sb)); 943 EXT4_INODES_PER_GROUP(sb));
944 944
@@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
946 sbi->s_log_groups_per_flex) { 946 sbi->s_log_groups_per_flex) {
947 ext4_group_t flex_group; 947 ext4_group_t flex_group;
948 flex_group = ext4_flex_group(sbi, input->group); 948 flex_group = ext4_flex_group(sbi, input->group);
949 atomic_add(input->free_blocks_count, 949 atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
950 &sbi->s_flex_groups[flex_group].free_blocks); 950 &sbi->s_flex_groups[flex_group].free_clusters);
951 atomic_add(EXT4_INODES_PER_GROUP(sb), 951 atomic_add(EXT4_INODES_PER_GROUP(sb),
952 &sbi->s_flex_groups[flex_group].free_inodes); 952 &sbi->s_flex_groups[flex_group].free_inodes);
953 } 953 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8db2239..9953d80145ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h"
48#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
49#include "xattr.h" 50#include "xattr.h"
50#include "acl.h" 51#include "acl.h"
@@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
163 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 164 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
164} 165}
165 166
166__u32 ext4_free_blks_count(struct super_block *sb, 167__u32 ext4_free_group_clusters(struct super_block *sb,
167 struct ext4_group_desc *bg) 168 struct ext4_group_desc *bg)
168{ 169{
169 return le16_to_cpu(bg->bg_free_blocks_count_lo) | 170 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
170 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 171 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb,
219 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 220 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
220} 221}
221 222
222void ext4_free_blks_set(struct super_block *sb, 223void ext4_free_group_clusters_set(struct super_block *sb,
223 struct ext4_group_desc *bg, __u32 count) 224 struct ext4_group_desc *bg, __u32 count)
224{ 225{
225 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); 226 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
226 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 227 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func,
414 ext4_commit_super(sb, 1); 415 ext4_commit_super(sb, 1);
415} 416}
416 417
418/*
419 * The del_gendisk() function uninitializes the disk-specific data
420 * structures, including the bdi structure, without telling anyone
421 * else. Once this happens, any attempt to call mark_buffer_dirty()
422 * (for example, by ext4_commit_super), will cause a kernel OOPS.
423 * This is a kludge to prevent these oops until we can put in a proper
424 * hook in del_gendisk() to inform the VFS and file system layers.
425 */
426static int block_device_ejected(struct super_block *sb)
427{
428 struct inode *bd_inode = sb->s_bdev->bd_inode;
429 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
430
431 return bdi->dev == NULL;
432}
433
417 434
418/* Deal with the reporting of failure conditions on a filesystem such as 435/* Deal with the reporting of failure conditions on a filesystem such as
419 * inconsistencies detected or read IO failures. 436 * inconsistencies detected or read IO failures.
@@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb)
821 brelse(sbi->s_group_desc[i]); 838 brelse(sbi->s_group_desc[i]);
822 ext4_kvfree(sbi->s_group_desc); 839 ext4_kvfree(sbi->s_group_desc);
823 ext4_kvfree(sbi->s_flex_groups); 840 ext4_kvfree(sbi->s_flex_groups);
824 percpu_counter_destroy(&sbi->s_freeblocks_counter); 841 percpu_counter_destroy(&sbi->s_freeclusters_counter);
825 percpu_counter_destroy(&sbi->s_freeinodes_counter); 842 percpu_counter_destroy(&sbi->s_freeinodes_counter);
826 percpu_counter_destroy(&sbi->s_dirs_counter); 843 percpu_counter_destroy(&sbi->s_dirs_counter);
827 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 844 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
828 brelse(sbi->s_sbh); 845 brelse(sbi->s_sbh);
829#ifdef CONFIG_QUOTA 846#ifdef CONFIG_QUOTA
830 for (i = 0; i < MAXQUOTAS; i++) 847 for (i = 0; i < MAXQUOTAS; i++)
@@ -1057,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1057 seq_puts(seq, ",nouid32"); 1074 seq_puts(seq, ",nouid32");
1058 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) 1075 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1059 seq_puts(seq, ",debug"); 1076 seq_puts(seq, ",debug");
1060 if (test_opt(sb, OLDALLOC))
1061 seq_puts(seq, ",oldalloc");
1062#ifdef CONFIG_EXT4_FS_XATTR 1077#ifdef CONFIG_EXT4_FS_XATTR
1063 if (test_opt(sb, XATTR_USER)) 1078 if (test_opt(sb, XATTR_USER))
1064 seq_puts(seq, ",user_xattr"); 1079 seq_puts(seq, ",user_xattr");
@@ -1567,10 +1582,12 @@ static int parse_options(char *options, struct super_block *sb,
1567 set_opt(sb, DEBUG); 1582 set_opt(sb, DEBUG);
1568 break; 1583 break;
1569 case Opt_oldalloc: 1584 case Opt_oldalloc:
1570 set_opt(sb, OLDALLOC); 1585 ext4_msg(sb, KERN_WARNING,
1586 "Ignoring deprecated oldalloc option");
1571 break; 1587 break;
1572 case Opt_orlov: 1588 case Opt_orlov:
1573 clear_opt(sb, OLDALLOC); 1589 ext4_msg(sb, KERN_WARNING,
1590 "Ignoring deprecated orlov option");
1574 break; 1591 break;
1575#ifdef CONFIG_EXT4_FS_XATTR 1592#ifdef CONFIG_EXT4_FS_XATTR
1576 case Opt_user_xattr: 1593 case Opt_user_xattr:
@@ -1801,6 +1818,7 @@ set_qf_format:
1801 break; 1818 break;
1802 case Opt_nodelalloc: 1819 case Opt_nodelalloc:
1803 clear_opt(sb, DELALLOC); 1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1804 break; 1822 break;
1805 case Opt_mblk_io_submit: 1823 case Opt_mblk_io_submit:
1806 set_opt(sb, MBLK_IO_SUBMIT); 1824 set_opt(sb, MBLK_IO_SUBMIT);
@@ -1817,6 +1835,7 @@ set_qf_format:
1817 break; 1835 break;
1818 case Opt_delalloc: 1836 case Opt_delalloc:
1819 set_opt(sb, DELALLOC); 1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1820 break; 1839 break;
1821 case Opt_block_validity: 1840 case Opt_block_validity:
1822 set_opt(sb, BLOCK_VALIDITY); 1841 set_opt(sb, BLOCK_VALIDITY);
@@ -1935,7 +1954,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1935 res = MS_RDONLY; 1954 res = MS_RDONLY;
1936 } 1955 }
1937 if (read_only) 1956 if (read_only)
1938 return res; 1957 goto done;
1939 if (!(sbi->s_mount_state & EXT4_VALID_FS)) 1958 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1940 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " 1959 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1941 "running e2fsck is recommended"); 1960 "running e2fsck is recommended");
@@ -1966,6 +1985,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1966 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1985 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1967 1986
1968 ext4_commit_super(sb, 1); 1987 ext4_commit_super(sb, 1);
1988done:
1969 if (test_opt(sb, DEBUG)) 1989 if (test_opt(sb, DEBUG))
1970 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1990 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1971 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", 1991 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2015,8 +2035,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
2015 flex_group = ext4_flex_group(sbi, i); 2035 flex_group = ext4_flex_group(sbi, i);
2016 atomic_add(ext4_free_inodes_count(sb, gdp), 2036 atomic_add(ext4_free_inodes_count(sb, gdp),
2017 &sbi->s_flex_groups[flex_group].free_inodes); 2037 &sbi->s_flex_groups[flex_group].free_inodes);
2018 atomic_add(ext4_free_blks_count(sb, gdp), 2038 atomic_add(ext4_free_group_clusters(sb, gdp),
2019 &sbi->s_flex_groups[flex_group].free_blocks); 2039 &sbi->s_flex_groups[flex_group].free_clusters);
2020 atomic_add(ext4_used_dirs_count(sb, gdp), 2040 atomic_add(ext4_used_dirs_count(sb, gdp),
2021 &sbi->s_flex_groups[flex_group].used_dirs); 2041 &sbi->s_flex_groups[flex_group].used_dirs);
2022 } 2042 }
@@ -2134,7 +2154,8 @@ static int ext4_check_descriptors(struct super_block *sb,
2134 if (NULL != first_not_zeroed) 2154 if (NULL != first_not_zeroed)
2135 *first_not_zeroed = grp; 2155 *first_not_zeroed = grp;
2136 2156
2137 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2157 ext4_free_blocks_count_set(sbi->s_es,
2158 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
2138 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2159 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2139 return 1; 2160 return 1;
2140} 2161}
@@ -2454,7 +2475,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2454 char *buf) 2475 char *buf)
2455{ 2476{
2456 return snprintf(buf, PAGE_SIZE, "%llu\n", 2477 return snprintf(buf, PAGE_SIZE, "%llu\n",
2457 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2478 (s64) EXT4_C2B(sbi,
2479 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
2458} 2480}
2459 2481
2460static ssize_t session_write_kbytes_show(struct ext4_attr *a, 2482static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2682,6 +2704,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2682 return 0; 2704 return 0;
2683 } 2705 }
2684 } 2706 }
2707 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
2708 !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2709 ext4_msg(sb, KERN_ERR,
2710 "Can't support bigalloc feature without "
2711 "extents feature\n");
2712 return 0;
2713 }
2685 return 1; 2714 return 1;
2686} 2715}
2687 2716
@@ -3087,10 +3116,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3087 char *cp; 3116 char *cp;
3088 const char *descr; 3117 const char *descr;
3089 int ret = -ENOMEM; 3118 int ret = -ENOMEM;
3090 int blocksize; 3119 int blocksize, clustersize;
3091 unsigned int db_count; 3120 unsigned int db_count;
3092 unsigned int i; 3121 unsigned int i;
3093 int needs_recovery, has_huge_files; 3122 int needs_recovery, has_huge_files, has_bigalloc;
3094 __u64 blocks_count; 3123 __u64 blocks_count;
3095 int err; 3124 int err;
3096 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3125 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3224,6 +3253,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3224 &journal_ioprio, NULL, 0)) 3253 &journal_ioprio, NULL, 0))
3225 goto failed_mount; 3254 goto failed_mount;
3226 3255
3256 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3257 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3258 "with data=journal disables delayed "
3259 "allocation and O_DIRECT support!\n");
3260 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3261 ext4_msg(sb, KERN_ERR, "can't mount with "
3262 "both data=journal and delalloc");
3263 goto failed_mount;
3264 }
3265 if (test_opt(sb, DIOREAD_NOLOCK)) {
3266 ext4_msg(sb, KERN_ERR, "can't mount with "
3267 "both data=journal and delalloc");
3268 goto failed_mount;
3269 }
3270 if (test_opt(sb, DELALLOC))
3271 clear_opt(sb, DELALLOC);
3272 }
3273
3274 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3275 if (test_opt(sb, DIOREAD_NOLOCK)) {
3276 if (blocksize < PAGE_SIZE) {
3277 ext4_msg(sb, KERN_ERR, "can't mount with "
3278 "dioread_nolock if block size != PAGE_SIZE");
3279 goto failed_mount;
3280 }
3281 }
3282
3227 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3283 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3228 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3284 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3229 3285
@@ -3265,8 +3321,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3265 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) 3321 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3266 goto failed_mount; 3322 goto failed_mount;
3267 3323
3268 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3269
3270 if (blocksize < EXT4_MIN_BLOCK_SIZE || 3324 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3271 blocksize > EXT4_MAX_BLOCK_SIZE) { 3325 blocksize > EXT4_MAX_BLOCK_SIZE) {
3272 ext4_msg(sb, KERN_ERR, 3326 ext4_msg(sb, KERN_ERR,
@@ -3369,12 +3423,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 sb->s_dirt = 1; 3423 sb->s_dirt = 1;
3370 } 3424 }
3371 3425
3372 if (sbi->s_blocks_per_group > blocksize * 8) { 3426 /* Handle clustersize */
3373 ext4_msg(sb, KERN_ERR, 3427 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
3374 "#blocks per group too big: %lu", 3428 has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3375 sbi->s_blocks_per_group); 3429 EXT4_FEATURE_RO_COMPAT_BIGALLOC);
3376 goto failed_mount; 3430 if (has_bigalloc) {
3431 if (clustersize < blocksize) {
3432 ext4_msg(sb, KERN_ERR,
3433 "cluster size (%d) smaller than "
3434 "block size (%d)", clustersize, blocksize);
3435 goto failed_mount;
3436 }
3437 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
3438 le32_to_cpu(es->s_log_block_size);
3439 sbi->s_clusters_per_group =
3440 le32_to_cpu(es->s_clusters_per_group);
3441 if (sbi->s_clusters_per_group > blocksize * 8) {
3442 ext4_msg(sb, KERN_ERR,
3443 "#clusters per group too big: %lu",
3444 sbi->s_clusters_per_group);
3445 goto failed_mount;
3446 }
3447 if (sbi->s_blocks_per_group !=
3448 (sbi->s_clusters_per_group * (clustersize / blocksize))) {
3449 ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
3450 "clusters per group (%lu) inconsistent",
3451 sbi->s_blocks_per_group,
3452 sbi->s_clusters_per_group);
3453 goto failed_mount;
3454 }
3455 } else {
3456 if (clustersize != blocksize) {
3457 ext4_warning(sb, "fragment/cluster size (%d) != "
3458 "block size (%d)", clustersize,
3459 blocksize);
3460 clustersize = blocksize;
3461 }
3462 if (sbi->s_blocks_per_group > blocksize * 8) {
3463 ext4_msg(sb, KERN_ERR,
3464 "#blocks per group too big: %lu",
3465 sbi->s_blocks_per_group);
3466 goto failed_mount;
3467 }
3468 sbi->s_clusters_per_group = sbi->s_blocks_per_group;
3469 sbi->s_cluster_bits = 0;
3377 } 3470 }
3471 sbi->s_cluster_ratio = clustersize / blocksize;
3472
3378 if (sbi->s_inodes_per_group > blocksize * 8) { 3473 if (sbi->s_inodes_per_group > blocksize * 8) {
3379 ext4_msg(sb, KERN_ERR, 3474 ext4_msg(sb, KERN_ERR,
3380 "#inodes per group too big: %lu", 3475 "#inodes per group too big: %lu",
@@ -3446,10 +3541,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 goto failed_mount; 3541 goto failed_mount;
3447 } 3542 }
3448 3543
3449#ifdef CONFIG_PROC_FS
3450 if (ext4_proc_root) 3544 if (ext4_proc_root)
3451 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3545 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3452#endif
3453 3546
3454 bgl_lock_init(sbi->s_blockgroup_lock); 3547 bgl_lock_init(sbi->s_blockgroup_lock);
3455 3548
@@ -3483,8 +3576,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3483 sbi->s_err_report.function = print_daily_error_info; 3576 sbi->s_err_report.function = print_daily_error_info;
3484 sbi->s_err_report.data = (unsigned long) sb; 3577 sbi->s_err_report.data = (unsigned long) sb;
3485 3578
3486 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3579 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3487 ext4_count_free_blocks(sb)); 3580 ext4_count_free_clusters(sb));
3488 if (!err) { 3581 if (!err) {
3489 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3582 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3490 ext4_count_free_inodes(sb)); 3583 ext4_count_free_inodes(sb));
@@ -3494,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3494 ext4_count_dirs(sb)); 3587 ext4_count_dirs(sb));
3495 } 3588 }
3496 if (!err) { 3589 if (!err) {
3497 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3590 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
3498 } 3591 }
3499 if (err) { 3592 if (err) {
3500 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3593 ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3609,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3609 * The journal may have updated the bg summary counts, so we 3702 * The journal may have updated the bg summary counts, so we
3610 * need to update the global counters. 3703 * need to update the global counters.
3611 */ 3704 */
3612 percpu_counter_set(&sbi->s_freeblocks_counter, 3705 percpu_counter_set(&sbi->s_freeclusters_counter,
3613 ext4_count_free_blocks(sb)); 3706 ext4_count_free_clusters(sb));
3614 percpu_counter_set(&sbi->s_freeinodes_counter, 3707 percpu_counter_set(&sbi->s_freeinodes_counter,
3615 ext4_count_free_inodes(sb)); 3708 ext4_count_free_inodes(sb));
3616 percpu_counter_set(&sbi->s_dirs_counter, 3709 percpu_counter_set(&sbi->s_dirs_counter,
3617 ext4_count_dirs(sb)); 3710 ext4_count_dirs(sb));
3618 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3711 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
3619 3712
3620no_journal: 3713no_journal:
3621 /* 3714 /*
@@ -3679,25 +3772,6 @@ no_journal:
3679 "available"); 3772 "available");
3680 } 3773 }
3681 3774
3682 if (test_opt(sb, DELALLOC) &&
3683 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3684 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3685 "requested data journaling mode");
3686 clear_opt(sb, DELALLOC);
3687 }
3688 if (test_opt(sb, DIOREAD_NOLOCK)) {
3689 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3690 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3691 "option - requested data journaling mode");
3692 clear_opt(sb, DIOREAD_NOLOCK);
3693 }
3694 if (sb->s_blocksize < PAGE_SIZE) {
3695 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3696 "option - block size is too small");
3697 clear_opt(sb, DIOREAD_NOLOCK);
3698 }
3699 }
3700
3701 err = ext4_setup_system_zone(sb); 3775 err = ext4_setup_system_zone(sb);
3702 if (err) { 3776 if (err) {
3703 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3777 ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3710,22 +3784,19 @@ no_journal:
3710 if (err) { 3784 if (err) {
3711 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 3785 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3712 err); 3786 err);
3713 goto failed_mount4; 3787 goto failed_mount5;
3714 } 3788 }
3715 3789
3716 err = ext4_register_li_request(sb, first_not_zeroed); 3790 err = ext4_register_li_request(sb, first_not_zeroed);
3717 if (err) 3791 if (err)
3718 goto failed_mount4; 3792 goto failed_mount6;
3719 3793
3720 sbi->s_kobj.kset = ext4_kset; 3794 sbi->s_kobj.kset = ext4_kset;
3721 init_completion(&sbi->s_kobj_unregister); 3795 init_completion(&sbi->s_kobj_unregister);
3722 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3796 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
3723 "%s", sb->s_id); 3797 "%s", sb->s_id);
3724 if (err) { 3798 if (err)
3725 ext4_mb_release(sb); 3799 goto failed_mount7;
3726 ext4_ext_release(sb);
3727 goto failed_mount4;
3728 };
3729 3800
3730 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 3801 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
3731 ext4_orphan_cleanup(sb, es); 3802 ext4_orphan_cleanup(sb, es);
@@ -3759,13 +3830,19 @@ cantfind_ext4:
3759 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 3830 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
3760 goto failed_mount; 3831 goto failed_mount;
3761 3832
3833failed_mount7:
3834 ext4_unregister_li_request(sb);
3835failed_mount6:
3836 ext4_ext_release(sb);
3837failed_mount5:
3838 ext4_mb_release(sb);
3839 ext4_release_system_zone(sb);
3762failed_mount4: 3840failed_mount4:
3763 iput(root); 3841 iput(root);
3764 sb->s_root = NULL; 3842 sb->s_root = NULL;
3765 ext4_msg(sb, KERN_ERR, "mount failed"); 3843 ext4_msg(sb, KERN_ERR, "mount failed");
3766 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3844 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3767failed_mount_wq: 3845failed_mount_wq:
3768 ext4_release_system_zone(sb);
3769 if (sbi->s_journal) { 3846 if (sbi->s_journal) {
3770 jbd2_journal_destroy(sbi->s_journal); 3847 jbd2_journal_destroy(sbi->s_journal);
3771 sbi->s_journal = NULL; 3848 sbi->s_journal = NULL;
@@ -3774,10 +3851,10 @@ failed_mount3:
3774 del_timer(&sbi->s_err_report); 3851 del_timer(&sbi->s_err_report);
3775 if (sbi->s_flex_groups) 3852 if (sbi->s_flex_groups)
3776 ext4_kvfree(sbi->s_flex_groups); 3853 ext4_kvfree(sbi->s_flex_groups);
3777 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3854 percpu_counter_destroy(&sbi->s_freeclusters_counter);
3778 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3855 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3779 percpu_counter_destroy(&sbi->s_dirs_counter); 3856 percpu_counter_destroy(&sbi->s_dirs_counter);
3780 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3857 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
3781 if (sbi->s_mmp_tsk) 3858 if (sbi->s_mmp_tsk)
3782 kthread_stop(sbi->s_mmp_tsk); 3859 kthread_stop(sbi->s_mmp_tsk);
3783failed_mount2: 3860failed_mount2:
@@ -4064,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4064 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 4141 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4065 int error = 0; 4142 int error = 0;
4066 4143
4067 if (!sbh) 4144 if (!sbh || block_device_ejected(sb))
4068 return error; 4145 return error;
4069 if (buffer_write_io_error(sbh)) { 4146 if (buffer_write_io_error(sbh)) {
4070 /* 4147 /*
@@ -4100,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4100 else 4177 else
4101 es->s_kbytes_written = 4178 es->s_kbytes_written =
4102 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4179 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4103 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 4180 ext4_free_blocks_count_set(es,
4104 &EXT4_SB(sb)->s_freeblocks_counter)); 4181 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4182 &EXT4_SB(sb)->s_freeclusters_counter)));
4105 es->s_free_inodes_count = 4183 es->s_free_inodes_count =
4106 cpu_to_le32(percpu_counter_sum_positive( 4184 cpu_to_le32(percpu_counter_sum_positive(
4107 &EXT4_SB(sb)->s_freeinodes_counter)); 4185 &EXT4_SB(sb)->s_freeinodes_counter));
@@ -4506,16 +4584,34 @@ restore_opts:
4506 return err; 4584 return err;
4507} 4585}
4508 4586
4587/*
4588 * Note: calculating the overhead so we can be compatible with
4589 * historical BSD practice is quite difficult in the face of
4590 * clusters/bigalloc. This is because multiple metadata blocks from
4591 * different block group can end up in the same allocation cluster.
4592 * Calculating the exact overhead in the face of clustered allocation
4593 * requires either O(all block bitmaps) in memory or O(number of block
4594 * groups**2) in time. We will still calculate the superblock for
4595 * older file systems --- and if we come across with a bigalloc file
4596 * system with zero in s_overhead_clusters the estimate will be close to
4597 * correct especially for very large cluster sizes --- but for newer
4598 * file systems, it's better to calculate this figure once at mkfs
4599 * time, and store it in the superblock. If the superblock value is
4600 * present (even for non-bigalloc file systems), we will use it.
4601 */
4509static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) 4602static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4510{ 4603{
4511 struct super_block *sb = dentry->d_sb; 4604 struct super_block *sb = dentry->d_sb;
4512 struct ext4_sb_info *sbi = EXT4_SB(sb); 4605 struct ext4_sb_info *sbi = EXT4_SB(sb);
4513 struct ext4_super_block *es = sbi->s_es; 4606 struct ext4_super_block *es = sbi->s_es;
4607 struct ext4_group_desc *gdp;
4514 u64 fsid; 4608 u64 fsid;
4515 s64 bfree; 4609 s64 bfree;
4516 4610
4517 if (test_opt(sb, MINIX_DF)) { 4611 if (test_opt(sb, MINIX_DF)) {
4518 sbi->s_overhead_last = 0; 4612 sbi->s_overhead_last = 0;
4613 } else if (es->s_overhead_clusters) {
4614 sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
4519 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 4615 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
4520 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4616 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4521 ext4_fsblk_t overhead = 0; 4617 ext4_fsblk_t overhead = 0;
@@ -4530,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4530 * All of the blocks before first_data_block are 4626 * All of the blocks before first_data_block are
4531 * overhead 4627 * overhead
4532 */ 4628 */
4533 overhead = le32_to_cpu(es->s_first_data_block); 4629 overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4534 4630
4535 /* 4631 /*
4536 * Add the overhead attributed to the superblock and 4632 * Add the overhead found in each block group
4537 * block group descriptors. If the sparse superblocks
4538 * feature is turned on, then not all groups have this.
4539 */ 4633 */
4540 for (i = 0; i < ngroups; i++) { 4634 for (i = 0; i < ngroups; i++) {
4541 overhead += ext4_bg_has_super(sb, i) + 4635 gdp = ext4_get_group_desc(sb, i, NULL);
4542 ext4_bg_num_gdb(sb, i); 4636 overhead += ext4_num_overhead_clusters(sb, i, gdp);
4543 cond_resched(); 4637 cond_resched();
4544 } 4638 }
4545
4546 /*
4547 * Every block group has an inode bitmap, a block
4548 * bitmap, and an inode table.
4549 */
4550 overhead += ngroups * (2 + sbi->s_itb_per_group);
4551 sbi->s_overhead_last = overhead; 4639 sbi->s_overhead_last = overhead;
4552 smp_wmb(); 4640 smp_wmb();
4553 sbi->s_blocks_last = ext4_blocks_count(es); 4641 sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4555,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4555 4643
4556 buf->f_type = EXT4_SUPER_MAGIC; 4644 buf->f_type = EXT4_SUPER_MAGIC;
4557 buf->f_bsize = sb->s_blocksize; 4645 buf->f_bsize = sb->s_blocksize;
4558 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4646 buf->f_blocks = (ext4_blocks_count(es) -
4559 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4647 EXT4_C2B(sbi, sbi->s_overhead_last));
4560 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4648 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4649 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4561 /* prevent underflow in case that few free space is available */ 4650 /* prevent underflow in case that few free space is available */
4562 buf->f_bfree = max_t(s64, bfree, 0); 4651 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4563 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4652 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4564 if (buf->f_bfree < ext4_r_blocks_count(es)) 4653 if (buf->f_bfree < ext4_r_blocks_count(es))
4565 buf->f_bavail = 0; 4654 buf->f_bavail = 0;
@@ -4980,13 +5069,11 @@ static int __init ext4_init_fs(void)
4980 return err; 5069 return err;
4981 err = ext4_init_system_zone(); 5070 err = ext4_init_system_zone();
4982 if (err) 5071 if (err)
4983 goto out7; 5072 goto out6;
4984 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5073 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4985 if (!ext4_kset) 5074 if (!ext4_kset)
4986 goto out6;
4987 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4988 if (!ext4_proc_root)
4989 goto out5; 5075 goto out5;
5076 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4990 5077
4991 err = ext4_init_feat_adverts(); 5078 err = ext4_init_feat_adverts();
4992 if (err) 5079 if (err)
@@ -5022,12 +5109,12 @@ out2:
5022out3: 5109out3:
5023 ext4_exit_feat_adverts(); 5110 ext4_exit_feat_adverts();
5024out4: 5111out4:
5025 remove_proc_entry("fs/ext4", NULL); 5112 if (ext4_proc_root)
5026out5: 5113 remove_proc_entry("fs/ext4", NULL);
5027 kset_unregister(ext4_kset); 5114 kset_unregister(ext4_kset);
5028out6: 5115out5:
5029 ext4_exit_system_zone(); 5116 ext4_exit_system_zone();
5030out7: 5117out6:
5031 ext4_exit_pageio(); 5118 ext4_exit_pageio();
5032 return err; 5119 return err;
5033} 5120}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc97250..93a00d89a220 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,14 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 /*
824 * take i_data_sem because we will test
825 * i_delalloc_reserved_flag in ext4_mb_new_blocks
826 */
827 down_read((&EXT4_I(inode)->i_data_sem));
823 block = ext4_new_meta_blocks(handle, inode, goal, 0, 828 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 NULL, &error); 829 NULL, &error);
830 up_read((&EXT4_I(inode)->i_data_sem));
825 if (error) 831 if (error)
826 goto cleanup; 832 goto cleanup;
827 833
@@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
985 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); 991 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
986 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); 992 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
987 993
988 error = ext4_get_inode_loc(inode, &is.iloc); 994 error = ext4_reserve_inode_write(handle, inode, &is.iloc);
989 if (error)
990 goto cleanup;
991
992 error = ext4_journal_get_write_access(handle, is.iloc.bh);
993 if (error) 995 if (error)
994 goto cleanup; 996 goto cleanup;
995 997
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 5efbd5d7701a..aca191bd5f8f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
156 } else { 156 } else {
157 if (uni_xlate == 1) { 157 if (uni_xlate == 1) {
158 *op++ = ':'; 158 *op++ = ':';
159 op = pack_hex_byte(op, ec >> 8); 159 op = hex_byte_pack(op, ec >> 8);
160 op = pack_hex_byte(op, ec); 160 op = hex_byte_pack(op, ec);
161 len -= 5; 161 len -= 5;
162 } else { 162 } else {
163 *op++ = '?'; 163 *op++ = '?';
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a5d3853822e0..1510a4d51990 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
326extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 326extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
327 struct inode *i2); 327 struct inode *i2);
328/* fat/misc.c */ 328/* fat/misc.c */
329extern void 329extern __printf(3, 4) __cold
330__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) 330void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
331 __attribute__ ((format (printf, 3, 4))) __cold;
332#define fat_fs_error(sb, fmt, args...) \ 331#define fat_fs_error(sb, fmt, args...) \
333 __fat_fs_error(sb, 1, fmt , ## args) 332 __fat_fs_error(sb, 1, fmt , ## args)
334#define fat_fs_error_ratelimit(sb, fmt, args...) \ 333#define fat_fs_error_ratelimit(sb, fmt, args...) \
335 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) 334 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
336void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) 335__printf(3, 4) __cold
337 __attribute__ ((format (printf, 3, 4))) __cold; 336void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
338extern int fat_clusters_flush(struct super_block *sb); 337extern int fat_clusters_flush(struct super_block *sb);
339extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 338extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
340extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 339extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1726d7303047..808cac7edcfb 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
379 return error; 379 return error;
380 MSDOS_I(inode)->mmu_private = inode->i_size; 380 MSDOS_I(inode)->mmu_private = inode->i_size;
381 381
382 inode->i_nlink = fat_subdirs(inode); 382 set_nlink(inode, fat_subdirs(inode));
383 } else { /* not a directory */ 383 } else { /* not a directory */
384 inode->i_generation |= 1; 384 inode->i_generation |= 1;
385 inode->i_mode = fat_make_mode(sbi, de->attr, 385 inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode)
1233 fat_save_attrs(inode, ATTR_DIR); 1233 fat_save_attrs(inode, ATTR_DIR);
1234 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; 1234 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
1235 inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; 1235 inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
1236 inode->i_nlink = fat_subdirs(inode)+2; 1236 set_nlink(inode, fat_subdirs(inode)+2);
1237 1237
1238 return 0; 1238 return 0;
1239} 1239}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 66e83b845455..216b419f30e2 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
387 /* the directory was completed, just return a error */ 387 /* the directory was completed, just return a error */
388 goto out; 388 goto out;
389 } 389 }
390 inode->i_nlink = 2; 390 set_nlink(inode, 2);
391 inode->i_mtime = inode->i_atime = inode->i_ctime = ts; 391 inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
392 /* timestamp is already written, so mark_inode_dirty() is unneeded. */ 392 /* timestamp is already written, so mark_inode_dirty() is unneeded. */
393 393
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bb3f29c3557b..a87a65663c25 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
900 goto out; 900 goto out;
901 } 901 }
902 inode->i_version++; 902 inode->i_version++;
903 inode->i_nlink = 2; 903 set_nlink(inode, 2);
904 inode->i_mtime = inode->i_atime = inode->i_ctime = ts; 904 inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
905 /* timestamp is already written, so mark_inode_dirty() is unneeded. */ 905 /* timestamp is already written, so mark_inode_dirty() is unneeded. */
906 906
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 1a4311437a8b..7b2af5abe2fa 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
227 ip->i_uid = (uid_t)vip->vii_uid; 227 ip->i_uid = (uid_t)vip->vii_uid;
228 ip->i_gid = (gid_t)vip->vii_gid; 228 ip->i_gid = (gid_t)vip->vii_gid;
229 229
230 ip->i_nlink = vip->vii_nlink; 230 set_nlink(ip, vip->vii_nlink);
231 ip->i_size = vip->vii_size; 231 ip->i_size = vip->vii_size;
232 232
233 ip->i_atime.tv_sec = vip->vii_atime; 233 ip->i_atime.tv_sec = vip->vii_atime;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e501..73c3992b2bb4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 163 * completion. Caller need not hold sb s_umount semaphore.
151 * 164 *
152 */ 165 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 166void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
167 enum wb_reason reason)
154{ 168{
155 __bdi_start_writeback(bdi, nr_pages, true); 169 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 170}
157 171
158/** 172/**
@@ -251,7 +265,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 265 */
252static int move_expired_inodes(struct list_head *delaying_queue, 266static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 267 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 268 struct wb_writeback_work *work)
255{ 269{
256 LIST_HEAD(tmp); 270 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 271 struct list_head *pos, *node;
@@ -262,8 +276,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 276
263 while (!list_empty(delaying_queue)) { 277 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 278 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 279 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 280 inode_dirtied_after(inode, *work->older_than_this))
267 break; 281 break;
268 if (sb && sb != inode->i_sb) 282 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 283 do_sb_sort = 1;
@@ -302,13 +316,13 @@ out:
302 * | 316 * |
303 * +--> dequeue for IO 317 * +--> dequeue for IO
304 */ 318 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 319static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 320{
307 int moved; 321 int moved;
308 assert_spin_locked(&wb->list_lock); 322 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 323 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 324 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 325 trace_writeback_queue_io(wb, work, moved);
312} 326}
313 327
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 328static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +655,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 655 return wrote;
642} 656}
643 657
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 658long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
659 enum wb_reason reason)
645{ 660{
646 struct wb_writeback_work work = { 661 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 662 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 663 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 664 .range_cyclic = 1,
665 .reason = reason,
650 }; 666 };
651 667
652 spin_lock(&wb->list_lock); 668 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 669 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 670 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 671 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 672 spin_unlock(&wb->list_lock);
657 673
658 return nr_pages - work.nr_pages; 674 return nr_pages - work.nr_pages;
659} 675}
660 676
661static inline bool over_bground_thresh(void) 677static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 678{
663 unsigned long background_thresh, dirty_thresh; 679 unsigned long background_thresh, dirty_thresh;
664 680
665 global_dirty_limits(&background_thresh, &dirty_thresh); 681 global_dirty_limits(&background_thresh, &dirty_thresh);
666 682
667 return (global_page_state(NR_FILE_DIRTY) + 683 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 684 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
685 return true;
686
687 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
688 bdi_dirty_limit(bdi, background_thresh))
689 return true;
690
691 return false;
669} 692}
670 693
671/* 694/*
@@ -675,7 +698,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 698static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 699 unsigned long start_time)
677{ 700{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 701 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 702}
680 703
681/* 704/*
@@ -727,7 +750,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 750 * For background writeout, stop when we are below the
728 * background dirty threshold 751 * background dirty threshold
729 */ 752 */
730 if (work->for_background && !over_bground_thresh()) 753 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 754 break;
732 755
733 if (work->for_kupdate) { 756 if (work->for_kupdate) {
@@ -738,7 +761,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 761
739 trace_writeback_start(wb->bdi, work); 762 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 763 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 764 queue_io(wb, work);
742 if (work->sb) 765 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 766 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 767 else
@@ -811,13 +834,14 @@ static unsigned long get_nr_dirty_pages(void)
811 834
812static long wb_check_background_flush(struct bdi_writeback *wb) 835static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 836{
814 if (over_bground_thresh()) { 837 if (over_bground_thresh(wb->bdi)) {
815 838
816 struct wb_writeback_work work = { 839 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 840 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 841 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 842 .for_background = 1,
820 .range_cyclic = 1, 843 .range_cyclic = 1,
844 .reason = WB_REASON_BACKGROUND,
821 }; 845 };
822 846
823 return wb_writeback(wb, &work); 847 return wb_writeback(wb, &work);
@@ -851,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 875 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 876 .for_kupdate = 1,
853 .range_cyclic = 1, 877 .range_cyclic = 1,
878 .reason = WB_REASON_PERIODIC,
854 }; 879 };
855 880
856 return wb_writeback(wb, &work); 881 return wb_writeback(wb, &work);
@@ -969,7 +994,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 994 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 995 * the whole world.
971 */ 996 */
972void wakeup_flusher_threads(long nr_pages) 997void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 998{
974 struct backing_dev_info *bdi; 999 struct backing_dev_info *bdi;
975 1000
@@ -982,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1007 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1008 if (!bdi_has_dirty_io(bdi))
984 continue; 1009 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1010 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1011 }
987 rcu_read_unlock(); 1012 rcu_read_unlock();
988} 1013}
@@ -1203,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb)
1203 * on how many (if any) will be written, and this function does not wait 1228 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1229 * for IO completion of submitted IO.
1205 */ 1230 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1231void writeback_inodes_sb_nr(struct super_block *sb,
1232 unsigned long nr,
1233 enum wb_reason reason)
1207{ 1234{
1208 DECLARE_COMPLETION_ONSTACK(done); 1235 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1236 struct wb_writeback_work work = {
@@ -1212,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1239 .tagged_writepages = 1,
1213 .done = &done, 1240 .done = &done,
1214 .nr_pages = nr, 1241 .nr_pages = nr,
1242 .reason = reason,
1215 }; 1243 };
1216 1244
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1245 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1228,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1228 * on how many (if any) will be written, and this function does not wait 1256 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1257 * for IO completion of submitted IO.
1230 */ 1258 */
1231void writeback_inodes_sb(struct super_block *sb) 1259void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1260{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1261 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1262}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1263EXPORT_SYMBOL(writeback_inodes_sb);
1236 1264
@@ -1241,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1269 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1270 * Returns 1 if writeback was started, 0 if not.
1243 */ 1271 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1272int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1273{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1274 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1275 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1276 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1277 up_read(&sb->s_umount);
1250 return 1; 1278 return 1;
1251 } else 1279 } else
@@ -1262,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1262 * Returns 1 if writeback was started, 0 if not. 1290 * Returns 1 if writeback was started, 0 if not.
1263 */ 1291 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1292int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1293 unsigned long nr,
1294 enum wb_reason reason)
1266{ 1295{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1296 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1297 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1298 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1299 up_read(&sb->s_umount);
1271 return 1; 1300 return 1;
1272 } else 1301 } else
@@ -1290,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1319 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1320 .range_cyclic = 0,
1292 .done = &done, 1321 .done = &done,
1322 .reason = WB_REASON_SYNC,
1293 }; 1323 };
1294 1324
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1325 WARN_ON(!rwsem_is_locked(&sb->s_umount));
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 85542a7daf40..42593c587d48 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
231 if (iop) 231 if (iop)
232 inode->i_op = iop; 232 inode->i_op = iop;
233 inode->i_fop = fop; 233 inode->i_fop = fop;
234 inode->i_nlink = nlink; 234 set_nlink(inode, nlink);
235 inode->i_private = fc; 235 inode->i_private = fc;
236 d_add(dentry, inode); 236 d_add(dentry, inode);
237 return dentry; 237 return dentry;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index add96f6ffda5..3e6d72756479 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
151 151
152 inode->i_ino = attr->ino; 152 inode->i_ino = attr->ino;
153 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 153 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
154 inode->i_nlink = attr->nlink; 154 set_nlink(inode, attr->nlink);
155 inode->i_uid = attr->uid; 155 inode->i_uid = attr->uid;
156 inode->i_gid = attr->gid; 156 inode->i_gid = attr->gid;
157 inode->i_blocks = attr->blocks; 157 inode->i_blocks = attr->blocks;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 66707118af25..2553b858a72e 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 202void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
203 203
204__attribute__ ((format(printf, 2, 3))) 204__printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
206 206
207/** 207/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78418b4fa857..1656df7aacd2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -299,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink)
299 if (nlink == 0) 299 if (nlink == 0)
300 clear_nlink(inode); 300 clear_nlink(inode);
301 else 301 else
302 inode->i_nlink = nlink; 302 set_nlink(inode, nlink);
303 } 303 }
304} 304}
305 305
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 3ebc437736fe..1cbdeea1db44 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
46 case HFS_EXT_CNID: 46 case HFS_EXT_CNID:
47 hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, 47 hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
48 mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); 48 mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
49 if (HFS_I(tree->inode)->alloc_blocks >
50 HFS_I(tree->inode)->first_blocks) {
51 printk(KERN_ERR "hfs: invalid btree extent records\n");
52 unlock_new_inode(tree->inode);
53 goto free_inode;
54 }
55
49 tree->inode->i_mapping->a_ops = &hfs_btree_aops; 56 tree->inode->i_mapping->a_ops = &hfs_btree_aops;
50 break; 57 break;
51 case HFS_CAT_CNID: 58 case HFS_CAT_CNID:
52 hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, 59 hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize,
53 mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); 60 mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
61
62 if (!HFS_I(tree->inode)->first_blocks) {
63 printk(KERN_ERR "hfs: invalid btree extent records "
64 "(0 size).\n");
65 unlock_new_inode(tree->inode);
66 goto free_inode;
67 }
68
54 tree->inode->i_mapping->a_ops = &hfs_btree_aops; 69 tree->inode->i_mapping->a_ops = &hfs_btree_aops;
55 break; 70 break;
56 default: 71 default:
@@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
59 } 74 }
60 unlock_new_inode(tree->inode); 75 unlock_new_inode(tree->inode);
61 76
62 if (!HFS_I(tree->inode)->first_blocks) {
63 printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
64 goto free_inode;
65 }
66
67 mapping = tree->inode->i_mapping; 77 mapping = tree->inode->i_mapping;
68 page = read_mapping_page(mapping, 0, NULL); 78 page = read_mapping_page(mapping, 0, NULL);
69 if (IS_ERR(page)) 79 if (IS_ERR(page))
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..bce4eef91a06 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
198 198
199 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); 199 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
200 if (res) { 200 if (res) {
201 inode->i_nlink = 0; 201 clear_nlink(inode);
202 hfs_delete_inode(inode); 202 hfs_delete_inode(inode);
203 iput(inode); 203 iput(inode);
204 return res; 204 return res;
@@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
227 227
228 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); 228 res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
229 if (res) { 229 if (res) {
230 inode->i_nlink = 0; 230 clear_nlink(inode);
231 hfs_delete_inode(inode); 231 hfs_delete_inode(inode);
232 iput(inode); 232 iput(inode);
233 return res; 233 return res;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 96a1b625fc74..a1a9fdcd2a00 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
183 inode->i_mode = mode; 183 inode->i_mode = mode;
184 inode->i_uid = current_fsuid(); 184 inode->i_uid = current_fsuid();
185 inode->i_gid = current_fsgid(); 185 inode->i_gid = current_fsgid();
186 inode->i_nlink = 1; 186 set_nlink(inode, 1);
187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
188 HFS_I(inode)->flags = 0; 188 HFS_I(inode)->flags = 0;
189 HFS_I(inode)->rsrc_inode = NULL; 189 HFS_I(inode)->rsrc_inode = NULL;
@@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
313 /* Initialize the inode */ 313 /* Initialize the inode */
314 inode->i_uid = hsb->s_uid; 314 inode->i_uid = hsb->s_uid;
315 inode->i_gid = hsb->s_gid; 315 inode->i_gid = hsb->s_gid;
316 inode->i_nlink = 1; 316 set_nlink(inode, 1);
317 317
318 if (idata->key) 318 if (idata->key)
319 HFS_I(inode)->cat_key = *idata->key; 319 HFS_I(inode)->cat_key = *idata->key;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 25b2443a004c..4536cd3f15ae 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
415 goto out; 415 goto out;
416 416
417out_err: 417out_err:
418 inode->i_nlink = 0; 418 clear_nlink(inode);
419 hfsplus_delete_inode(inode); 419 hfsplus_delete_inode(inode);
420 iput(inode); 420 iput(inode);
421out: 421out:
@@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
440 440
441 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 441 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
442 if (res) { 442 if (res) {
443 inode->i_nlink = 0; 443 clear_nlink(inode);
444 hfsplus_delete_inode(inode); 444 hfsplus_delete_inode(inode);
445 iput(inode); 445 iput(inode);
446 goto out; 446 goto out;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 4cc1e3a36ec7..40e1413be4cf 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
391 inode->i_mode = mode; 391 inode->i_mode = mode;
392 inode->i_uid = current_fsuid(); 392 inode->i_uid = current_fsuid();
393 inode->i_gid = current_fsgid(); 393 inode->i_gid = current_fsgid();
394 inode->i_nlink = 1; 394 set_nlink(inode, 1);
395 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 395 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
396 396
397 hip = HFSPLUS_I(inode); 397 hip = HFSPLUS_I(inode);
@@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
512 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 512 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
513 sizeof(struct hfsplus_cat_folder)); 513 sizeof(struct hfsplus_cat_folder));
514 hfsplus_get_perms(inode, &folder->permissions, 1); 514 hfsplus_get_perms(inode, &folder->permissions, 1);
515 inode->i_nlink = 1; 515 set_nlink(inode, 1);
516 inode->i_size = 2 + be32_to_cpu(folder->valence); 516 inode->i_size = 2 + be32_to_cpu(folder->valence);
517 inode->i_atime = hfsp_mt2ut(folder->access_date); 517 inode->i_atime = hfsp_mt2ut(folder->access_date);
518 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 518 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
@@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
532 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 532 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
533 &file->rsrc_fork : &file->data_fork); 533 &file->rsrc_fork : &file->data_fork);
534 hfsplus_get_perms(inode, &file->permissions, 0); 534 hfsplus_get_perms(inode, &file->permissions, 0);
535 inode->i_nlink = 1; 535 set_nlink(inode, 1);
536 if (S_ISREG(inode->i_mode)) { 536 if (S_ISREG(inode->i_mode)) {
537 if (file->permissions.dev) 537 if (file->permissions.dev)
538 inode->i_nlink = 538 set_nlink(inode,
539 be32_to_cpu(file->permissions.dev); 539 be32_to_cpu(file->permissions.dev));
540 inode->i_op = &hfsplus_file_inode_operations; 540 inode->i_op = &hfsplus_file_inode_operations;
541 inode->i_fop = &hfsplus_file_operations; 541 inode->i_fop = &hfsplus_file_operations;
542 inode->i_mapping->a_ops = &hfsplus_aops; 542 inode->i_mapping->a_ops = &hfsplus_aops;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 0d22afdd4611..2f72da5ae686 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name)
541 541
542 ino->i_ino = st.ino; 542 ino->i_ino = st.ino;
543 ino->i_mode = st.mode; 543 ino->i_mode = st.mode;
544 ino->i_nlink = st.nlink; 544 set_nlink(ino, st.nlink);
545 ino->i_uid = st.uid; 545 ino->i_uid = st.uid;
546 ino->i_gid = st.gid; 546 ino->i_gid = st.gid;
547 ino->i_atime = st.atime; 547 ino->i_atime = st.atime;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index d51a98384bc0..dd7bc38a3825 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -16,7 +16,6 @@
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include "hostfs.h" 17#include "hostfs.h"
18#include "os.h" 18#include "os.h"
19#include "user.h"
20#include <utime.h> 19#include <utime.h>
21 20
22static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) 21static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 96a8ed91cedd..2fa0089a02a8 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
247 result->i_mode &= ~0111; 247 result->i_mode &= ~0111;
248 result->i_op = &hpfs_file_iops; 248 result->i_op = &hpfs_file_iops;
249 result->i_fop = &hpfs_file_ops; 249 result->i_fop = &hpfs_file_ops;
250 result->i_nlink = 1; 250 set_nlink(result, 1);
251 } 251 }
252 unlock_new_inode(result); 252 unlock_new_inode(result);
253 } 253 }
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 331b5e234ef3..de946170ebb1 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
311 311
312/* super.c */ 312/* super.c */
313 313
314void hpfs_error(struct super_block *, const char *, ...) 314__printf(2, 3)
315 __attribute__((format (printf, 2, 3))); 315void hpfs_error(struct super_block *, const char *, ...);
316int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); 316int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
317unsigned hpfs_count_one_bitmap(struct super_block *, secno); 317unsigned hpfs_count_one_bitmap(struct super_block *, secno);
318 318
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 338cd8368451..3b2cec29972b 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i)
53 i->i_mode &= ~0111; 53 i->i_mode &= ~0111;
54 i->i_op = &hpfs_file_iops; 54 i->i_op = &hpfs_file_iops;
55 i->i_fop = &hpfs_file_ops; 55 i->i_fop = &hpfs_file_ops;
56 i->i_nlink = 0;*/ 56 clear_nlink(i);*/
57 make_bad_inode(i); 57 make_bad_inode(i);
58 return; 58 return;
59 } 59 }
@@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i)
77 i->i_mode = S_IFLNK | 0777; 77 i->i_mode = S_IFLNK | 0777;
78 i->i_op = &page_symlink_inode_operations; 78 i->i_op = &page_symlink_inode_operations;
79 i->i_data.a_ops = &hpfs_symlink_aops; 79 i->i_data.a_ops = &hpfs_symlink_aops;
80 i->i_nlink = 1; 80 set_nlink(i, 1);
81 i->i_size = ea_size; 81 i->i_size = ea_size;
82 i->i_blocks = 1; 82 i->i_blocks = 1;
83 brelse(bh); 83 brelse(bh);
@@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i)
101 } 101 }
102 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { 102 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
103 brelse(bh); 103 brelse(bh);
104 i->i_nlink = 1; 104 set_nlink(i, 1);
105 i->i_size = 0; 105 i->i_size = 0;
106 i->i_blocks = 1; 106 i->i_blocks = 1;
107 init_special_inode(i, mode, 107 init_special_inode(i, mode,
@@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i)
125 hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); 125 hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL);
126 i->i_blocks = 4 * n_dnodes; 126 i->i_blocks = 4 * n_dnodes;
127 i->i_size = 2048 * n_dnodes; 127 i->i_size = 2048 * n_dnodes;
128 i->i_nlink = 2 + n_subdirs; 128 set_nlink(i, 2 + n_subdirs);
129 } else { 129 } else {
130 i->i_mode |= S_IFREG; 130 i->i_mode |= S_IFREG;
131 if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; 131 if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111;
132 i->i_op = &hpfs_file_iops; 132 i->i_op = &hpfs_file_iops;
133 i->i_fop = &hpfs_file_ops; 133 i->i_fop = &hpfs_file_ops;
134 i->i_nlink = 1; 134 set_nlink(i, 1);
135 i->i_size = le32_to_cpu(fnode->file_size); 135 i->i_size = le32_to_cpu(fnode->file_size);
136 i->i_blocks = ((i->i_size + 511) >> 9) + 1; 136 i->i_blocks = ((i->i_size + 511) >> 9) + 1;
137 i->i_data.a_ops = &hpfs_aops; 137 i->i_data.a_ops = &hpfs_aops;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 2df69e2f07cf..ea91fcb0ef9b 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
56 result->i_fop = &hpfs_dir_ops; 56 result->i_fop = &hpfs_dir_ops;
57 result->i_blocks = 4; 57 result->i_blocks = 4;
58 result->i_size = 2048; 58 result->i_size = 2048;
59 result->i_nlink = 2; 59 set_nlink(result, 2);
60 if (dee.read_only) 60 if (dee.read_only)
61 result->i_mode &= ~0222; 61 result->i_mode &= ~0222;
62 62
@@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
150 result->i_mode &= ~0111; 150 result->i_mode &= ~0111;
151 result->i_op = &hpfs_file_iops; 151 result->i_op = &hpfs_file_iops;
152 result->i_fop = &hpfs_file_ops; 152 result->i_fop = &hpfs_file_ops;
153 result->i_nlink = 1; 153 set_nlink(result, 1);
154 hpfs_i(result)->i_parent_dir = dir->i_ino; 154 hpfs_i(result)->i_parent_dir = dir->i_ino;
155 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); 155 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
156 result->i_ctime.tv_nsec = 0; 156 result->i_ctime.tv_nsec = 0;
@@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
242 hpfs_i(result)->i_ea_size = 0; 242 hpfs_i(result)->i_ea_size = 0;
243 result->i_uid = current_fsuid(); 243 result->i_uid = current_fsuid();
244 result->i_gid = current_fsgid(); 244 result->i_gid = current_fsgid();
245 result->i_nlink = 1; 245 set_nlink(result, 1);
246 result->i_size = 0; 246 result->i_size = 0;
247 result->i_blocks = 1; 247 result->i_blocks = 1;
248 init_special_inode(result, mode, rdev); 248 init_special_inode(result, mode, rdev);
@@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
318 result->i_uid = current_fsuid(); 318 result->i_uid = current_fsuid();
319 result->i_gid = current_fsgid(); 319 result->i_gid = current_fsgid();
320 result->i_blocks = 1; 320 result->i_blocks = 1;
321 result->i_nlink = 1; 321 set_nlink(result, 1);
322 result->i_size = strlen(symlink); 322 result->i_size = strlen(symlink);
323 result->i_op = &page_symlink_inode_operations; 323 result->i_op = &page_symlink_inode_operations;
324 result->i_data.a_ops = &hpfs_symlink_aops; 324 result->i_data.a_ops = &hpfs_symlink_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 970ea987b3f6..f590b1160c6c 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -702,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
702 inode->i_ctime = proc_ino->i_ctime; 702 inode->i_ctime = proc_ino->i_ctime;
703 inode->i_ino = proc_ino->i_ino; 703 inode->i_ino = proc_ino->i_ino;
704 inode->i_mode = proc_ino->i_mode; 704 inode->i_mode = proc_ino->i_mode;
705 inode->i_nlink = proc_ino->i_nlink; 705 set_nlink(inode, proc_ino->i_nlink);
706 inode->i_size = proc_ino->i_size; 706 inode->i_size = proc_ino->i_size;
707 inode->i_blocks = proc_ino->i_blocks; 707 inode->i_blocks = proc_ino->i_blocks;
708 708
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ec889538e5a6..0be5a78598d0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -970,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
970 970
971 d_instantiate(path.dentry, inode); 971 d_instantiate(path.dentry, inode);
972 inode->i_size = size; 972 inode->i_size = size;
973 inode->i_nlink = 0; 973 clear_nlink(inode);
974 974
975 error = -ENFILE; 975 error = -ENFILE;
976 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 976 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
diff --git a/fs/inode.c b/fs/inode.c
index ecbb68dc7e2a..ee4e66b998f4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -142,7 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
142 atomic_set(&inode->i_count, 1); 142 atomic_set(&inode->i_count, 1);
143 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
144 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
145 inode->i_nlink = 1; 145 inode->__i_nlink = 1;
146 inode->i_opflags = 0; 146 inode->i_opflags = 0;
147 inode->i_uid = 0; 147 inode->i_uid = 0;
148 inode->i_gid = 0; 148 inode->i_gid = 0;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a5d03672d04e..f950059525fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -20,6 +20,7 @@
20#include <linux/statfs.h> 20#include <linux/statfs.h>
21#include <linux/cdrom.h> 21#include <linux/cdrom.h>
22#include <linux/parser.h> 22#include <linux/parser.h>
23#include <linux/mpage.h>
23 24
24#include "isofs.h" 25#include "isofs.h"
25#include "zisofs.h" 26#include "zisofs.h"
@@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
1148 1149
1149static int isofs_readpage(struct file *file, struct page *page) 1150static int isofs_readpage(struct file *file, struct page *page)
1150{ 1151{
1151 return block_read_full_page(page,isofs_get_block); 1152 return mpage_readpage(page, isofs_get_block);
1153}
1154
1155static int isofs_readpages(struct file *file, struct address_space *mapping,
1156 struct list_head *pages, unsigned nr_pages)
1157{
1158 return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
1152} 1159}
1153 1160
1154static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) 1161static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1165
1159static const struct address_space_operations isofs_aops = { 1166static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1167 .readpage = isofs_readpage,
1168 .readpages = isofs_readpages,
1161 .bmap = _isofs_bmap 1169 .bmap = _isofs_bmap
1162}; 1170};
1163 1171
@@ -1319,7 +1327,7 @@ static int isofs_read_inode(struct inode *inode)
1319 inode->i_mode = S_IFDIR | sbi->s_dmode; 1327 inode->i_mode = S_IFDIR | sbi->s_dmode;
1320 else 1328 else
1321 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 1329 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
1322 inode->i_nlink = 1; /* 1330 set_nlink(inode, 1); /*
1323 * Set to 1. We know there are 2, but 1331 * Set to 1. We know there are 2, but
1324 * the find utility tries to optimize 1332 * the find utility tries to optimize
1325 * if it is 2, and it screws up. It is 1333 * if it is 2, and it screws up. It is
@@ -1337,7 +1345,7 @@ static int isofs_read_inode(struct inode *inode)
1337 */ 1345 */
1338 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; 1346 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
1339 } 1347 }
1340 inode->i_nlink = 1; 1348 set_nlink(inode, 1);
1341 } 1349 }
1342 inode->i_uid = sbi->s_uid; 1350 inode->i_uid = sbi->s_uid;
1343 inode->i_gid = sbi->s_gid; 1351 inode->i_gid = sbi->s_gid;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 1fbc7de88f50..70e79d0c756a 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -363,7 +363,7 @@ repeat:
363 break; 363 break;
364 case SIG('P', 'X'): 364 case SIG('P', 'X'):
365 inode->i_mode = isonum_733(rr->u.PX.mode); 365 inode->i_mode = isonum_733(rr->u.PX.mode);
366 inode->i_nlink = isonum_733(rr->u.PX.n_links); 366 set_nlink(inode, isonum_733(rr->u.PX.n_links));
367 inode->i_uid = isonum_733(rr->u.PX.uid); 367 inode->i_uid = isonum_733(rr->u.PX.uid);
368 inode->i_gid = isonum_733(rr->u.PX.gid); 368 inode->i_gid = isonum_733(rr->u.PX.gid);
369 break; 369 break;
@@ -496,7 +496,7 @@ repeat:
496 goto out; 496 goto out;
497 } 497 }
498 inode->i_mode = reloc->i_mode; 498 inode->i_mode = reloc->i_mode;
499 inode->i_nlink = reloc->i_nlink; 499 set_nlink(inode, reloc->i_nlink);
500 inode->i_uid = reloc->i_uid; 500 inode->i_uid = reloc->i_uid;
501 inode->i_gid = reloc->i_gid; 501 inode->i_gid = reloc->i_gid;
502 inode->i_rdev = reloc->i_rdev; 502 inode->i_rdev = reloc->i_rdev;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9fe061fb8779..fea8dd661d2b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal)
1135 goto out; 1135 goto out;
1136 } 1136 }
1137 1137
1138 if (be32_to_cpu(sb->s_first) == 0 ||
1139 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1140 printk(KERN_WARNING
1141 "JBD: Invalid start block of journal: %u\n",
1142 be32_to_cpu(sb->s_first));
1143 goto out;
1144 }
1145
1138 return 0; 1146 return 0;
1139 1147
1140out: 1148out:
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eef6979821a4..68d704db787f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
352 J_ASSERT(commit_transaction->t_state == T_RUNNING); 352 J_ASSERT(commit_transaction->t_state == T_RUNNING);
353 353
354 trace_jbd2_start_commit(journal, commit_transaction); 354 trace_jbd2_start_commit(journal, commit_transaction);
355 jbd_debug(1, "JBD: starting commit of transaction %d\n", 355 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
356 commit_transaction->t_tid); 356 commit_transaction->t_tid);
357 357
358 write_lock(&journal->j_state_lock); 358 write_lock(&journal->j_state_lock);
@@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
427 __jbd2_journal_clean_checkpoint_list(journal); 427 __jbd2_journal_clean_checkpoint_list(journal);
428 spin_unlock(&journal->j_list_lock); 428 spin_unlock(&journal->j_list_lock);
429 429
430 jbd_debug (3, "JBD: commit phase 1\n"); 430 jbd_debug(3, "JBD2: commit phase 1\n");
431 431
432 /* 432 /*
433 * Switch to a new revoke table. 433 * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
447 wake_up(&journal->j_wait_transaction_locked); 447 wake_up(&journal->j_wait_transaction_locked);
448 write_unlock(&journal->j_state_lock); 448 write_unlock(&journal->j_state_lock);
449 449
450 jbd_debug (3, "JBD: commit phase 2\n"); 450 jbd_debug(3, "JBD2: commit phase 2\n");
451 451
452 /* 452 /*
453 * Now start flushing things to disk, in the order they appear 453 * Now start flushing things to disk, in the order they appear
@@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
462 WRITE_SYNC); 462 WRITE_SYNC);
463 blk_finish_plug(&plug); 463 blk_finish_plug(&plug);
464 464
465 jbd_debug(3, "JBD: commit phase 2\n"); 465 jbd_debug(3, "JBD2: commit phase 2\n");
466 466
467 /* 467 /*
468 * Way to go: we have now written out all of the data for a 468 * Way to go: we have now written out all of the data for a
@@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
522 522
523 J_ASSERT (bufs == 0); 523 J_ASSERT (bufs == 0);
524 524
525 jbd_debug(4, "JBD: get descriptor\n"); 525 jbd_debug(4, "JBD2: get descriptor\n");
526 526
527 descriptor = jbd2_journal_get_descriptor_buffer(journal); 527 descriptor = jbd2_journal_get_descriptor_buffer(journal);
528 if (!descriptor) { 528 if (!descriptor) {
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
531 } 531 }
532 532
533 bh = jh2bh(descriptor); 533 bh = jh2bh(descriptor);
534 jbd_debug(4, "JBD: got buffer %llu (%p)\n", 534 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
535 (unsigned long long)bh->b_blocknr, bh->b_data); 535 (unsigned long long)bh->b_blocknr, bh->b_data);
536 header = (journal_header_t *)&bh->b_data[0]; 536 header = (journal_header_t *)&bh->b_data[0];
537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
@@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 commit_transaction->t_buffers == NULL || 625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) { 626 space_left < tag_bytes + 16) {
627 627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs); 628 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
629 629
630 /* Write an end-of-descriptor marker before 630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to 631 submitting the IOs. "tag" still points to
@@ -707,7 +707,7 @@ start_journal_io:
707 so we incur less scheduling load. 707 so we incur less scheduling load.
708 */ 708 */
709 709
710 jbd_debug(3, "JBD: commit phase 3\n"); 710 jbd_debug(3, "JBD2: commit phase 3\n");
711 711
712 /* 712 /*
713 * akpm: these are BJ_IO, and j_list_lock is not needed. 713 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -771,7 +771,7 @@ wait_for_iobuf:
771 771
772 J_ASSERT (commit_transaction->t_shadow_list == NULL); 772 J_ASSERT (commit_transaction->t_shadow_list == NULL);
773 773
774 jbd_debug(3, "JBD: commit phase 4\n"); 774 jbd_debug(3, "JBD2: commit phase 4\n");
775 775
776 /* Here we wait for the revoke record and descriptor record buffers */ 776 /* Here we wait for the revoke record and descriptor record buffers */
777 wait_for_ctlbuf: 777 wait_for_ctlbuf:
@@ -801,7 +801,7 @@ wait_for_iobuf:
801 if (err) 801 if (err)
802 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
803 803
804 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD2: commit phase 5\n");
805 write_lock(&journal->j_state_lock); 805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH; 807 commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -830,7 +830,7 @@ wait_for_iobuf:
830 transaction can be removed from any checkpoint list it was on 830 transaction can be removed from any checkpoint list it was on
831 before. */ 831 before. */
832 832
833 jbd_debug(3, "JBD: commit phase 6\n"); 833 jbd_debug(3, "JBD2: commit phase 6\n");
834 834
835 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 835 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
836 J_ASSERT(commit_transaction->t_buffers == NULL); 836 J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -964,7 +964,7 @@ restart_loop:
964 964
965 /* Done with this transaction! */ 965 /* Done with this transaction! */
966 966
967 jbd_debug(3, "JBD: commit phase 7\n"); 967 jbd_debug(3, "JBD2: commit phase 7\n");
968 968
969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
970 970
@@ -1039,7 +1039,7 @@ restart_loop:
1039 journal->j_commit_callback(journal, commit_transaction); 1039 journal->j_commit_callback(journal, commit_transaction);
1040 1040
1041 trace_jbd2_end_commit(journal, commit_transaction); 1041 trace_jbd2_end_commit(journal, commit_transaction);
1042 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1042 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1043 journal->j_commit_sequence, journal->j_tail_sequence); 1043 journal->j_commit_sequence, journal->j_tail_sequence);
1044 if (to_free) 1044 if (to_free)
1045 kfree(commit_transaction); 1045 kfree(commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f24df13adc4e..0fa0123151d3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
491 */ 491 */
492 492
493 journal->j_commit_request = target; 493 journal->j_commit_request = target;
494 jbd_debug(1, "JBD: requesting commit %d/%d\n", 494 jbd_debug(1, "JBD2: requesting commit %d/%d\n",
495 journal->j_commit_request, 495 journal->j_commit_request,
496 journal->j_commit_sequence); 496 journal->j_commit_sequence);
497 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
500 /* This should never happen, but if it does, preserve 500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and 501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */ 502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", 503 WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request, 504 journal->j_commit_request,
505 journal->j_commit_sequence, 505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ? 506 target, journal->j_running_transaction ?
@@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
645 } 645 }
646#endif 646#endif
647 while (tid_gt(tid, journal->j_commit_sequence)) { 647 while (tid_gt(tid, journal->j_commit_sequence)) {
648 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 648 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
649 tid, journal->j_commit_sequence); 649 tid, journal->j_commit_sequence);
650 wake_up(&journal->j_wait_commit); 650 wake_up(&journal->j_wait_commit);
651 read_unlock(&journal->j_state_lock); 651 read_unlock(&journal->j_state_lock);
@@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal)
1093 first = be32_to_cpu(sb->s_first); 1093 first = be32_to_cpu(sb->s_first);
1094 last = be32_to_cpu(sb->s_maxlen); 1094 last = be32_to_cpu(sb->s_maxlen);
1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { 1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1096 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", 1096 printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
1097 first, last); 1097 first, last);
1098 journal_fail_superblock(journal); 1098 journal_fail_superblock(journal);
1099 return -EINVAL; 1099 return -EINVAL;
@@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1139 */ 1139 */
1140 if (sb->s_start == 0 && journal->j_tail_sequence == 1140 if (sb->s_start == 0 && journal->j_tail_sequence ==
1141 journal->j_transaction_sequence) { 1141 journal->j_transaction_sequence) {
1142 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 1142 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1143 "(start %ld, seq %d, errno %d)\n", 1143 "(start %ld, seq %d, errno %d)\n",
1144 journal->j_tail, journal->j_tail_sequence, 1144 journal->j_tail, journal->j_tail_sequence,
1145 journal->j_errno); 1145 journal->j_errno);
@@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1163 } 1163 }
1164 1164
1165 read_lock(&journal->j_state_lock); 1165 read_lock(&journal->j_state_lock);
1166 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1166 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1168 1168
1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal)
1216 ll_rw_block(READ, 1, &bh); 1216 ll_rw_block(READ, 1, &bh);
1217 wait_on_buffer(bh); 1217 wait_on_buffer(bh);
1218 if (!buffer_uptodate(bh)) { 1218 if (!buffer_uptodate(bh)) {
1219 printk (KERN_ERR 1219 printk(KERN_ERR
1220 "JBD: IO error reading journal superblock\n"); 1220 "JBD2: IO error reading journal superblock\n");
1221 goto out; 1221 goto out;
1222 } 1222 }
1223 } 1223 }
@@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal)
1228 1228
1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || 1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1231 printk(KERN_WARNING "JBD: no valid journal superblock found\n"); 1231 printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
1232 goto out; 1232 goto out;
1233 } 1233 }
1234 1234
@@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal)
1240 journal->j_format_version = 2; 1240 journal->j_format_version = 2;
1241 break; 1241 break;
1242 default: 1242 default:
1243 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); 1243 printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
1244 goto out; 1244 goto out;
1245 } 1245 }
1246 1246
1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) 1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen); 1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { 1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1250 printk (KERN_WARNING "JBD: journal file too short\n"); 1250 printk(KERN_WARNING "JBD2: journal file too short\n");
1251 goto out;
1252 }
1253
1254 if (be32_to_cpu(sb->s_first) == 0 ||
1255 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1256 printk(KERN_WARNING
1257 "JBD2: Invalid start block of journal: %u\n",
1258 be32_to_cpu(sb->s_first));
1251 goto out; 1259 goto out;
1252 } 1260 }
1253 1261
@@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal)
1310 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || 1318 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1311 (sb->s_feature_incompat & 1319 (sb->s_feature_incompat &
1312 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { 1320 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1313 printk (KERN_WARNING 1321 printk(KERN_WARNING
1314 "JBD: Unrecognised features on journal\n"); 1322 "JBD2: Unrecognised features on journal\n");
1315 return -EINVAL; 1323 return -EINVAL;
1316 } 1324 }
1317 } 1325 }
@@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal)
1346 return 0; 1354 return 0;
1347 1355
1348recovery_error: 1356recovery_error:
1349 printk (KERN_WARNING "JBD: recovery failed\n"); 1357 printk(KERN_WARNING "JBD2: recovery failed\n");
1350 return -EIO; 1358 return -EIO;
1351} 1359}
1352 1360
@@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
1577 struct buffer_head *bh; 1585 struct buffer_head *bh;
1578 1586
1579 printk(KERN_WARNING 1587 printk(KERN_WARNING
1580 "JBD: Converting superblock from version 1 to 2.\n"); 1588 "JBD2: Converting superblock from version 1 to 2.\n");
1581 1589
1582 /* Pre-initialise new fields to zero */ 1590 /* Pre-initialise new fields to zero */
1583 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); 1591 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
@@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1694 if (!journal->j_tail) 1702 if (!journal->j_tail)
1695 goto no_recovery; 1703 goto no_recovery;
1696 1704
1697 printk (KERN_WARNING "JBD: %s recovery information on journal\n", 1705 printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
1698 write ? "Clearing" : "Ignoring"); 1706 write ? "Clearing" : "Ignoring");
1699 1707
1700 err = jbd2_journal_skip_recovery(journal); 1708 err = jbd2_journal_skip_recovery(journal);
@@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2020 retval = 0; 2028 retval = 0;
2021 if (!jbd2_journal_head_cache) { 2029 if (!jbd2_journal_head_cache) {
2022 retval = -ENOMEM; 2030 retval = -ENOMEM;
2023 printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); 2031 printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
2024 } 2032 }
2025 return retval; 2033 return retval;
2026} 2034}
@@ -2383,7 +2391,7 @@ static void __exit journal_exit(void)
2383#ifdef CONFIG_JBD2_DEBUG 2391#ifdef CONFIG_JBD2_DEBUG
2384 int n = atomic_read(&nr_journal_heads); 2392 int n = atomic_read(&nr_journal_heads);
2385 if (n) 2393 if (n)
2386 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2394 printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
2387#endif 2395#endif
2388 jbd2_remove_debugfs_entry(); 2396 jbd2_remove_debugfs_entry();
2389 jbd2_remove_jbd_stats_proc_entry(); 2397 jbd2_remove_jbd_stats_proc_entry();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1cad869494f0..da6d7baf1390 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
89 err = jbd2_journal_bmap(journal, next, &blocknr); 89 err = jbd2_journal_bmap(journal, next, &blocknr);
90 90
91 if (err) { 91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n", 92 printk(KERN_ERR "JBD2: bad block at offset %u\n",
93 next); 93 next);
94 goto failed; 94 goto failed;
95 } 95 }
@@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
138 *bhp = NULL; 138 *bhp = NULL;
139 139
140 if (offset >= journal->j_maxlen) { 140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n"); 141 printk(KERN_ERR "JBD2: corrupted journal superblock\n");
142 return -EIO; 142 return -EIO;
143 } 143 }
144 144
145 err = jbd2_journal_bmap(journal, offset, &blocknr); 145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146 146
147 if (err) { 147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n", 148 printk(KERN_ERR "JBD2: bad block at offset %u\n",
149 offset); 149 offset);
150 return err; 150 return err;
151 } 151 }
@@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
163 } 163 }
164 164
165 if (!buffer_uptodate(bh)) { 165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n", 166 printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
167 offset); 167 offset);
168 brelse(bh); 168 brelse(bh);
169 return -EIO; 169 return -EIO;
@@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal)
251 if (!err) 251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY); 252 err = do_one_pass(journal, &info, PASS_REPLAY);
253 253
254 jbd_debug(1, "JBD: recovery, exit status %d, " 254 jbd_debug(1, "JBD2: recovery, exit status %d, "
255 "recovered transactions %u to %u\n", 255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction); 256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", 257 jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259 259
260 /* Restart the log at the next transaction ID, thus invalidating 260 /* Restart the log at the next transaction ID, thus invalidating
@@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal)
293 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
294 294
295 if (err) { 295 if (err) {
296 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 296 printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
297 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
298 } else { 298 } else {
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302 jbd_debug(1, 302 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD2: ignoring %d transaction%s from the journal.\n",
304 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif 305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
@@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
338 wrap(journal, *next_log_block); 338 wrap(journal, *next_log_block);
339 err = jread(&obh, journal, io_block); 339 err = jread(&obh, journal, io_block);
340 if (err) { 340 if (err) {
341 printk(KERN_ERR "JBD: IO error %d recovering block " 341 printk(KERN_ERR "JBD2: IO error %d recovering block "
342 "%lu in log\n", err, io_block); 342 "%lu in log\n", err, io_block);
343 return 1; 343 return 1;
344 } else { 344 } else {
@@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal,
411 * either the next descriptor block or the final commit 411 * either the next descriptor block or the final commit
412 * record. */ 412 * record. */
413 413
414 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 414 jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
415 err = jread(&bh, journal, next_log_block); 415 err = jread(&bh, journal, next_log_block);
416 if (err) 416 if (err)
417 goto failed; 417 goto failed;
@@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal,
491 /* Recover what we can, but 491 /* Recover what we can, but
492 * report failure at the end. */ 492 * report failure at the end. */
493 success = err; 493 success = err;
494 printk (KERN_ERR 494 printk(KERN_ERR
495 "JBD: IO error %d recovering " 495 "JBD2: IO error %d recovering "
496 "block %ld in log\n", 496 "block %ld in log\n",
497 err, io_block); 497 err, io_block);
498 } else { 498 } else {
@@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal,
520 journal->j_blocksize); 520 journal->j_blocksize);
521 if (nbh == NULL) { 521 if (nbh == NULL) {
522 printk(KERN_ERR 522 printk(KERN_ERR
523 "JBD: Out of memory " 523 "JBD2: Out of memory "
524 "during recovery.\n"); 524 "during recovery.\n");
525 err = -ENOMEM; 525 err = -ENOMEM;
526 brelse(bh); 526 brelse(bh);
@@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal,
689 /* It's really bad news if different passes end up at 689 /* It's really bad news if different passes end up at
690 * different places (but possible due to IO errors). */ 690 * different places (but possible due to IO errors). */
691 if (info->end_transaction != next_commit_ID) { 691 if (info->end_transaction != next_commit_ID) {
692 printk (KERN_ERR "JBD: recovery pass %d ended at " 692 printk(KERN_ERR "JBD2: recovery pass %d ended at "
693 "transaction %u, expected %u\n", 693 "transaction %u, expected %u\n",
694 pass, next_commit_ID, info->end_transaction); 694 pass, next_commit_ID, info->end_transaction);
695 if (!success) 695 if (!success)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2d7109414cdd..a0e41a4c080e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/bug.h>
30#include <linux/module.h> 31#include <linux/module.h>
31 32
32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction,
115 */ 116 */
116 117
117static int start_this_handle(journal_t *journal, handle_t *handle, 118static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 119 gfp_t gfp_mask)
119{ 120{
120 transaction_t *transaction, *new_transaction = NULL; 121 transaction_t *transaction, *new_transaction = NULL;
121 tid_t tid; 122 tid_t tid;
@@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
124 unsigned long ts = jiffies; 125 unsigned long ts = jiffies;
125 126
126 if (nblocks > journal->j_max_transaction_buffers) { 127 if (nblocks > journal->j_max_transaction_buffers) {
127 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 128 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
128 current->comm, nblocks, 129 current->comm, nblocks,
129 journal->j_max_transaction_buffers); 130 journal->j_max_transaction_buffers);
130 return -ENOSPC; 131 return -ENOSPC;
@@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks)
320 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 321 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
321 * on failure. 322 * on failure.
322 */ 323 */
323handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 324handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
324{ 325{
325 handle_t *handle = journal_current_handle(); 326 handle_t *handle = journal_current_handle();
326 int err; 327 int err;
@@ -443,7 +444,7 @@ out:
443 * transaction capabable of guaranteeing the requested number of 444 * transaction capabable of guaranteeing the requested number of
444 * credits. 445 * credits.
445 */ 446 */
446int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) 447int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
447{ 448{
448 transaction_t *transaction = handle->h_transaction; 449 transaction_t *transaction = handle->h_transaction;
449 journal_t *journal = transaction->t_journal; 450 journal_t *journal = transaction->t_journal;
@@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh)
563 char b[BDEVNAME_SIZE]; 564 char b[BDEVNAME_SIZE];
564 565
565 printk(KERN_WARNING 566 printk(KERN_WARNING
566 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 567 "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
567 "There's a risk of filesystem corruption in case of system " 568 "There's a risk of filesystem corruption in case of system "
568 "crash.\n", 569 "crash.\n",
569 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 570 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
@@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1049 * mark dirty metadata which needs to be journaled as part of the current 1050 * mark dirty metadata which needs to be journaled as part of the current
1050 * transaction. 1051 * transaction.
1051 * 1052 *
1053 * The buffer must have previously had jbd2_journal_get_write_access()
1054 * called so that it has a valid journal_head attached to the buffer
1055 * head.
1056 *
1052 * The buffer is placed on the transaction's metadata list and is marked 1057 * The buffer is placed on the transaction's metadata list and is marked
1053 * as belonging to the transaction. 1058 * as belonging to the transaction.
1054 * 1059 *
@@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1065 transaction_t *transaction = handle->h_transaction; 1070 transaction_t *transaction = handle->h_transaction;
1066 journal_t *journal = transaction->t_journal; 1071 journal_t *journal = transaction->t_journal;
1067 struct journal_head *jh = bh2jh(bh); 1072 struct journal_head *jh = bh2jh(bh);
1073 int ret = 0;
1068 1074
1069 jbd_debug(5, "journal_head %p\n", jh); 1075 jbd_debug(5, "journal_head %p\n", jh);
1070 JBUFFER_TRACE(jh, "entry"); 1076 JBUFFER_TRACE(jh, "entry");
1071 if (is_handle_aborted(handle)) 1077 if (is_handle_aborted(handle))
1072 goto out; 1078 goto out;
1079 if (!buffer_jbd(bh)) {
1080 ret = -EUCLEAN;
1081 goto out;
1082 }
1073 1083
1074 jbd_lock_bh_state(bh); 1084 jbd_lock_bh_state(bh);
1075 1085
@@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1093 */ 1103 */
1094 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { 1104 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1095 JBUFFER_TRACE(jh, "fastpath"); 1105 JBUFFER_TRACE(jh, "fastpath");
1096 J_ASSERT_JH(jh, jh->b_transaction == 1106 if (unlikely(jh->b_transaction !=
1097 journal->j_running_transaction); 1107 journal->j_running_transaction)) {
1108 printk(KERN_EMERG "JBD: %s: "
1109 "jh->b_transaction (%llu, %p, %u) != "
1110 "journal->j_running_transaction (%p, %u)",
1111 journal->j_devname,
1112 (unsigned long long) bh->b_blocknr,
1113 jh->b_transaction,
1114 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1115 journal->j_running_transaction,
1116 journal->j_running_transaction ?
1117 journal->j_running_transaction->t_tid : 0);
1118 ret = -EINVAL;
1119 }
1098 goto out_unlock_bh; 1120 goto out_unlock_bh;
1099 } 1121 }
1100 1122
@@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1108 */ 1130 */
1109 if (jh->b_transaction != transaction) { 1131 if (jh->b_transaction != transaction) {
1110 JBUFFER_TRACE(jh, "already on other transaction"); 1132 JBUFFER_TRACE(jh, "already on other transaction");
1111 J_ASSERT_JH(jh, jh->b_transaction == 1133 if (unlikely(jh->b_transaction !=
1112 journal->j_committing_transaction); 1134 journal->j_committing_transaction)) {
1113 J_ASSERT_JH(jh, jh->b_next_transaction == transaction); 1135 printk(KERN_EMERG "JBD: %s: "
1136 "jh->b_transaction (%llu, %p, %u) != "
1137 "journal->j_committing_transaction (%p, %u)",
1138 journal->j_devname,
1139 (unsigned long long) bh->b_blocknr,
1140 jh->b_transaction,
1141 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1142 journal->j_committing_transaction,
1143 journal->j_committing_transaction ?
1144 journal->j_committing_transaction->t_tid : 0);
1145 ret = -EINVAL;
1146 }
1147 if (unlikely(jh->b_next_transaction != transaction)) {
1148 printk(KERN_EMERG "JBD: %s: "
1149 "jh->b_next_transaction (%llu, %p, %u) != "
1150 "transaction (%p, %u)",
1151 journal->j_devname,
1152 (unsigned long long) bh->b_blocknr,
1153 jh->b_next_transaction,
1154 jh->b_next_transaction ?
1155 jh->b_next_transaction->t_tid : 0,
1156 transaction, transaction->t_tid);
1157 ret = -EINVAL;
1158 }
1114 /* And this case is illegal: we can't reuse another 1159 /* And this case is illegal: we can't reuse another
1115 * transaction's data buffer, ever. */ 1160 * transaction's data buffer, ever. */
1116 goto out_unlock_bh; 1161 goto out_unlock_bh;
@@ -1127,7 +1172,8 @@ out_unlock_bh:
1127 jbd_unlock_bh_state(bh); 1172 jbd_unlock_bh_state(bh);
1128out: 1173out:
1129 JBUFFER_TRACE(jh, "exit"); 1174 JBUFFER_TRACE(jh, "exit");
1130 return 0; 1175 WARN_ON(ret); /* All errors are bugs, so dump the stack */
1176 return ret;
1131} 1177}
1132 1178
1133/* 1179/*
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9659b7c00468..be6169bd8acd 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
245 ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, 245 ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
246 dentry->d_name.len, dead_f, now); 246 dentry->d_name.len, dead_f, now);
247 if (dead_f->inocache) 247 if (dead_f->inocache)
248 dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; 248 set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink);
249 if (!ret) 249 if (!ret)
250 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 250 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
251 return ret; 251 return ret;
@@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
278 278
279 if (!ret) { 279 if (!ret) {
280 mutex_lock(&f->sem); 280 mutex_lock(&f->sem);
281 old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; 281 set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink);
282 mutex_unlock(&f->sem); 282 mutex_unlock(&f->sem);
283 d_instantiate(dentry, old_dentry->d_inode); 283 d_instantiate(dentry, old_dentry->d_inode);
284 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 284 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
@@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
497 f = JFFS2_INODE_INFO(inode); 497 f = JFFS2_INODE_INFO(inode);
498 498
499 /* Directories get nlink 2 at start */ 499 /* Directories get nlink 2 at start */
500 inode->i_nlink = 2; 500 set_nlink(inode, 2);
501 /* but ic->pino_nlink is the parent ino# */ 501 /* but ic->pino_nlink is the parent ino# */
502 f->inocache->pino_nlink = dir_i->i_ino; 502 f->inocache->pino_nlink = dir_i->i_ino;
503 503
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index bbcb9755dd2b..7286e44ac665 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
278 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); 278 inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
279 inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); 279 inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
280 280
281 inode->i_nlink = f->inocache->pino_nlink; 281 set_nlink(inode, f->inocache->pino_nlink);
282 282
283 inode->i_blocks = (inode->i_size + 511) >> 9; 283 inode->i_blocks = (inode->i_size + 511) >> 9;
284 284
@@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
291 case S_IFDIR: 291 case S_IFDIR:
292 { 292 {
293 struct jffs2_full_dirent *fd; 293 struct jffs2_full_dirent *fd;
294 inode->i_nlink = 2; /* parent and '.' */ 294 set_nlink(inode, 2); /* parent and '.' */
295 295
296 for (fd=f->dents; fd; fd = fd->next) { 296 for (fd=f->dents; fd; fd = fd->next) {
297 if (fd->type == DT_DIR && fd->ino) 297 if (fd->type == DT_DIR && fd->ino)
@@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
453 iput(inode); 453 iput(inode);
454 return ERR_PTR(ret); 454 return ERR_PTR(ret);
455 } 455 }
456 inode->i_nlink = 1; 456 set_nlink(inode, 1);
457 inode->i_ino = je32_to_cpu(ri->ino); 457 inode->i_ino = je32_to_cpu(ri->ino);
458 inode->i_mode = jemode_to_cpu(ri->mode); 458 inode->i_mode = jemode_to_cpu(ri->mode);
459 inode->i_gid = je16_to_cpu(ri->gid); 459 inode->i_gid = je16_to_cpu(ri->gid);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index b78b2f978f04..1b6f15f191b3 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
457 /* read the page of fixed disk inode (AIT) in raw mode */ 457 /* read the page of fixed disk inode (AIT) in raw mode */
458 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); 458 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
459 if (mp == NULL) { 459 if (mp == NULL) {
460 ip->i_nlink = 1; /* Don't want iput() deleting it */ 460 set_nlink(ip, 1); /* Don't want iput() deleting it */
461 iput(ip); 461 iput(ip);
462 return (NULL); 462 return (NULL);
463 } 463 }
@@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
469 /* copy on-disk inode to in-memory inode */ 469 /* copy on-disk inode to in-memory inode */
470 if ((copy_from_dinode(dp, ip)) != 0) { 470 if ((copy_from_dinode(dp, ip)) != 0) {
471 /* handle bad return by returning NULL for ip */ 471 /* handle bad return by returning NULL for ip */
472 ip->i_nlink = 1; /* Don't want iput() deleting it */ 472 set_nlink(ip, 1); /* Don't want iput() deleting it */
473 iput(ip); 473 iput(ip);
474 /* release the page */ 474 /* release the page */
475 release_metapage(mp); 475 release_metapage(mp);
@@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3076 ip->i_mode |= 0001; 3076 ip->i_mode |= 0001;
3077 } 3077 }
3078 } 3078 }
3079 ip->i_nlink = le32_to_cpu(dip->di_nlink); 3079 set_nlink(ip, le32_to_cpu(dip->di_nlink));
3080 3080
3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); 3081 jfs_ip->saved_uid = le32_to_cpu(dip->di_uid);
3082 if (sbi->uid == -1) 3082 if (sbi->uid == -1)
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 2686531e235a..c1a3e603279c 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -157,7 +157,7 @@ fail_drop:
157 dquot_drop(inode); 157 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 158 inode->i_flags |= S_NOQUOTA;
159fail_unlock: 159fail_unlock:
160 inode->i_nlink = 0; 160 clear_nlink(inode);
161 unlock_new_inode(inode); 161 unlock_new_inode(inode);
162fail_put: 162fail_put:
163 iput(inode); 163 iput(inode);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e17545e15664..a112ad96e474 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
172 mutex_unlock(&JFS_IP(dip)->commit_mutex); 172 mutex_unlock(&JFS_IP(dip)->commit_mutex);
173 if (rc) { 173 if (rc) {
174 free_ea_wmap(ip); 174 free_ea_wmap(ip);
175 ip->i_nlink = 0; 175 clear_nlink(ip);
176 unlock_new_inode(ip); 176 unlock_new_inode(ip);
177 iput(ip); 177 iput(ip);
178 } else { 178 } else {
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
292 goto out3; 292 goto out3;
293 } 293 }
294 294
295 ip->i_nlink = 2; /* for '.' */ 295 set_nlink(ip, 2); /* for '.' */
296 ip->i_op = &jfs_dir_inode_operations; 296 ip->i_op = &jfs_dir_inode_operations;
297 ip->i_fop = &jfs_dir_operations; 297 ip->i_fop = &jfs_dir_operations;
298 298
@@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
311 mutex_unlock(&JFS_IP(dip)->commit_mutex); 311 mutex_unlock(&JFS_IP(dip)->commit_mutex);
312 if (rc) { 312 if (rc) {
313 free_ea_wmap(ip); 313 free_ea_wmap(ip);
314 ip->i_nlink = 0; 314 clear_nlink(ip);
315 unlock_new_inode(ip); 315 unlock_new_inode(ip);
316 iput(ip); 316 iput(ip);
317 } else { 317 } else {
@@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry,
844 rc = txCommit(tid, 2, &iplist[0], 0); 844 rc = txCommit(tid, 2, &iplist[0], 0);
845 845
846 if (rc) { 846 if (rc) {
847 ip->i_nlink--; /* never instantiated */ 847 drop_nlink(ip); /* never instantiated */
848 iput(ip); 848 iput(ip);
849 } else 849 } else
850 d_instantiate(dentry, ip); 850 d_instantiate(dentry, ip);
@@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1048 mutex_unlock(&JFS_IP(dip)->commit_mutex); 1048 mutex_unlock(&JFS_IP(dip)->commit_mutex);
1049 if (rc) { 1049 if (rc) {
1050 free_ea_wmap(ip); 1050 free_ea_wmap(ip);
1051 ip->i_nlink = 0; 1051 clear_nlink(ip);
1052 unlock_new_inode(ip); 1052 unlock_new_inode(ip);
1053 iput(ip); 1053 iput(ip);
1054 } else { 1054 } else {
@@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1433 mutex_unlock(&JFS_IP(dir)->commit_mutex); 1433 mutex_unlock(&JFS_IP(dir)->commit_mutex);
1434 if (rc) { 1434 if (rc) {
1435 free_ea_wmap(ip); 1435 free_ea_wmap(ip);
1436 ip->i_nlink = 0; 1436 clear_nlink(ip);
1437 unlock_new_inode(ip); 1437 unlock_new_inode(ip);
1438 iput(ip); 1438 iput(ip);
1439 } else { 1439 } else {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 06c8a67cbe76..a44eff076c17 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
485 goto out_unload; 485 goto out_unload;
486 } 486 }
487 inode->i_ino = 0; 487 inode->i_ino = 0;
488 inode->i_nlink = 1;
489 inode->i_size = sb->s_bdev->bd_inode->i_size; 488 inode->i_size = sb->s_bdev->bd_inode->i_size;
490 inode->i_mapping->a_ops = &jfs_metapage_aops; 489 inode->i_mapping->a_ops = &jfs_metapage_aops;
491 insert_inode_hash(inode); 490 insert_inode_hash(inode);
diff --git a/fs/libfs.c b/fs/libfs.c
index c18e9a1235b6..f6d411eef1e7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
490 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 490 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
491 inode->i_op = &simple_dir_inode_operations; 491 inode->i_op = &simple_dir_inode_operations;
492 inode->i_fop = &simple_dir_operations; 492 inode->i_fop = &simple_dir_operations;
493 inode->i_nlink = 2; 493 set_nlink(inode, 2);
494 root = d_alloc_root(inode); 494 root = d_alloc_root(inode);
495 if (!root) { 495 if (!root) {
496 iput(inode); 496 iput(inode);
@@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
510 if (!dentry) 510 if (!dentry)
511 goto out; 511 goto out;
512 inode = new_inode(s); 512 inode = new_inode(s);
513 if (!inode) 513 if (!inode) {
514 dput(dentry);
514 goto out; 515 goto out;
516 }
515 inode->i_mode = S_IFREG | files->mode; 517 inode->i_mode = S_IFREG | files->mode;
516 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 518 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
517 inode->i_fop = files->ops; 519 inode->i_fop = files->ops;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b3ff3d894165..b7d7f67cee5a 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode)
197{ 197{
198 int ret; 198 int ret;
199 199
200 inode->i_nlink--; 200 drop_nlink(inode);
201 ret = write_inode(inode); 201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb); 202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret; 203 return ret;
@@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
433 433
434 ta = kzalloc(sizeof(*ta), GFP_KERNEL); 434 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
435 if (!ta) { 435 if (!ta) {
436 inode->i_nlink--; 436 drop_nlink(inode);
437 iput(inode); 437 iput(inode);
438 return -ENOMEM; 438 return -ENOMEM;
439 } 439 }
@@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
456 abort_transaction(inode, ta); 456 abort_transaction(inode, ta);
457 li->li_flags |= LOGFS_IF_STILLBORN; 457 li->li_flags |= LOGFS_IF_STILLBORN;
458 /* FIXME: truncate symlink */ 458 /* FIXME: truncate symlink */
459 inode->i_nlink--; 459 drop_nlink(inode);
460 iput(inode); 460 iput(inode);
461 goto out; 461 goto out;
462 } 462 }
@@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
563 563
564 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 564 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
565 ihold(inode); 565 ihold(inode);
566 inode->i_nlink++; 566 inc_nlink(inode);
567 mark_inode_dirty_sync(inode); 567 mark_inode_dirty_sync(inode);
568 568
569 return __logfs_create(dir, dentry, inode, NULL, 0); 569 return __logfs_create(dir, dentry, inode, NULL, 0);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index edfea7a3a747..7e441ad5f792 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
93 /* inode->i_nlink == 0 can be true when called from 93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */ 94 * block validator */
95 /* set i_nlink to 0 to prevent caching */ 95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0; 96 clear_nlink(inode);
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; 97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode); 98 iget_failed(inode);
99 if (!err) 99 if (!err)
@@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
199 inode->i_blocks = 0; 199 inode->i_blocks = 0;
200 inode->i_ctime = CURRENT_TIME; 200 inode->i_ctime = CURRENT_TIME;
201 inode->i_mtime = CURRENT_TIME; 201 inode->i_mtime = CURRENT_TIME;
202 inode->i_nlink = 1;
203 li->li_refcount = 1; 202 li->li_refcount = 1;
204 INIT_LIST_HEAD(&li->li_freeing_list); 203 INIT_LIST_HEAD(&li->li_freeing_list);
205 204
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index f22d108bfa5d..398ecff6e548 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
618struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); 618struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
619void emergency_read_end(struct page *page); 619void emergency_read_end(struct page *page);
620void logfs_crash_dump(struct super_block *sb); 620void logfs_crash_dump(struct super_block *sb);
621void *memchr_inv(const void *s, int c, size_t n);
622int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); 621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
623int logfs_check_ds(struct logfs_disk_super *ds); 622int logfs_check_ds(struct logfs_disk_super *ds);
624int logfs_write_sb(struct super_block *sb); 623int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index d8d09380c7de..2ac4217b7901 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
126 inode->i_atime = be64_to_timespec(di->di_atime); 126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime); 127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime); 128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount); 129 set_nlink(inode, be32_to_cpu(di->di_refcount));
130 inode->i_generation = be32_to_cpu(di->di_generation); 130 inode->i_generation = be32_to_cpu(di->di_generation);
131 131
132 switch (inode->i_mode & S_IFMT) { 132 switch (inode->i_mode & S_IFMT) {
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index b9b3154b0485..e795c234ea33 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -92,28 +92,6 @@ void logfs_crash_dump(struct super_block *sb)
92} 92}
93 93
94/* 94/*
95 * TODO: move to lib/string.c
96 */
97/**
98 * memchr_inv - Find a character in an area of memory.
99 * @s: The memory area
100 * @c: The byte to search for
101 * @n: The size of the area.
102 *
103 * returns the address of the first character other than @c, or %NULL
104 * if the whole buffer contains just @c.
105 */
106void *memchr_inv(const void *s, int c, size_t n)
107{
108 const unsigned char *p = s;
109 while (n-- != 0)
110 if ((unsigned char)c != *p++)
111 return (void *)(p - 1);
112
113 return NULL;
114}
115
116/*
117 * FIXME: There should be a reserve for root, similar to ext2. 95 * FIXME: There should be a reserve for root, similar to ext2.
118 */ 96 */
119int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) 97int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e7d23e25bf1d..64cdcd662ffc 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -446,7 +446,7 @@ static struct inode *V1_minix_iget(struct inode *inode)
446 inode->i_mode = raw_inode->i_mode; 446 inode->i_mode = raw_inode->i_mode;
447 inode->i_uid = (uid_t)raw_inode->i_uid; 447 inode->i_uid = (uid_t)raw_inode->i_uid;
448 inode->i_gid = (gid_t)raw_inode->i_gid; 448 inode->i_gid = (gid_t)raw_inode->i_gid;
449 inode->i_nlink = raw_inode->i_nlinks; 449 set_nlink(inode, raw_inode->i_nlinks);
450 inode->i_size = raw_inode->i_size; 450 inode->i_size = raw_inode->i_size;
451 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; 451 inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
452 inode->i_mtime.tv_nsec = 0; 452 inode->i_mtime.tv_nsec = 0;
@@ -479,7 +479,7 @@ static struct inode *V2_minix_iget(struct inode *inode)
479 inode->i_mode = raw_inode->i_mode; 479 inode->i_mode = raw_inode->i_mode;
480 inode->i_uid = (uid_t)raw_inode->i_uid; 480 inode->i_uid = (uid_t)raw_inode->i_uid;
481 inode->i_gid = (gid_t)raw_inode->i_gid; 481 inode->i_gid = (gid_t)raw_inode->i_gid;
482 inode->i_nlink = raw_inode->i_nlinks; 482 set_nlink(inode, raw_inode->i_nlinks);
483 inode->i_size = raw_inode->i_size; 483 inode->i_size = raw_inode->i_size;
484 inode->i_mtime.tv_sec = raw_inode->i_mtime; 484 inode->i_mtime.tv_sec = raw_inode->i_mtime;
485 inode->i_atime.tv_sec = raw_inode->i_atime; 485 inode->i_atime.tv_sec = raw_inode->i_atime;
diff --git a/fs/namei.c b/fs/namei.c
index 7657be4352bf..ac6d214da827 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -137,7 +137,7 @@ static int do_getname(const char __user *filename, char *page)
137 return retval; 137 return retval;
138} 138}
139 139
140static char *getname_flags(const char __user * filename, int flags) 140static char *getname_flags(const char __user *filename, int flags, int *empty)
141{ 141{
142 char *tmp, *result; 142 char *tmp, *result;
143 143
@@ -148,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags)
148 148
149 result = tmp; 149 result = tmp;
150 if (retval < 0) { 150 if (retval < 0) {
151 if (retval == -ENOENT && empty)
152 *empty = 1;
151 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
152 __putname(tmp); 154 __putname(tmp);
153 result = ERR_PTR(retval); 155 result = ERR_PTR(retval);
@@ -160,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags)
160 162
161char *getname(const char __user * filename) 163char *getname(const char __user * filename)
162{ 164{
163 return getname_flags(filename, 0); 165 return getname_flags(filename, 0, 0);
164} 166}
165 167
166#ifdef CONFIG_AUDITSYSCALL 168#ifdef CONFIG_AUDITSYSCALL
@@ -1798,11 +1800,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1798 return __lookup_hash(&this, base, NULL); 1800 return __lookup_hash(&this, base, NULL);
1799} 1801}
1800 1802
1801int user_path_at(int dfd, const char __user *name, unsigned flags, 1803int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
1802 struct path *path) 1804 struct path *path, int *empty)
1803{ 1805{
1804 struct nameidata nd; 1806 struct nameidata nd;
1805 char *tmp = getname_flags(name, flags); 1807 char *tmp = getname_flags(name, flags, empty);
1806 int err = PTR_ERR(tmp); 1808 int err = PTR_ERR(tmp);
1807 if (!IS_ERR(tmp)) { 1809 if (!IS_ERR(tmp)) {
1808 1810
@@ -1816,6 +1818,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1816 return err; 1818 return err;
1817} 1819}
1818 1820
1821int user_path_at(int dfd, const char __user *name, unsigned flags,
1822 struct path *path)
1823{
1824 return user_path_at_empty(dfd, name, flags, path, 0);
1825}
1826
1819static int user_path_parent(int dfd, const char __user *path, 1827static int user_path_parent(int dfd, const char __user *path,
1820 struct nameidata *nd, char **name) 1828 struct nameidata *nd, char **name)
1821{ 1829{
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 202f370526a7..5b5fa33b6b9d 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
228 228
229 DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); 229 DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
230 230
231 inode->i_nlink = 1; 231 set_nlink(inode, 1);
232 inode->i_uid = server->m.uid; 232 inode->i_uid = server->m.uid;
233 inode->i_gid = server->m.gid; 233 inode->i_gid = server->m.gid;
234 234
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afea..726e59a9e50f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 91c01f0a4c3b..0a1f8312b4dc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 137static int
138nfs_file_release(struct inode *inode, struct file *filp) 138nfs_file_release(struct inode *inode, struct file *filp)
139{ 139{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 140 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 141 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 142 filp->f_path.dentry->d_name.name);
145 143
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 144 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 145 return nfs_release(inode, filp);
@@ -228,14 +226,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
228 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 226 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
229 struct inode * inode = dentry->d_inode; 227 struct inode * inode = dentry->d_inode;
230 ssize_t result; 228 ssize_t result;
231 size_t count = iov_length(iov, nr_segs);
232 229
233 if (iocb->ki_filp->f_flags & O_DIRECT) 230 if (iocb->ki_filp->f_flags & O_DIRECT)
234 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 231 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
235 232
236 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 233 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
237 dentry->d_parent->d_name.name, dentry->d_name.name, 234 dentry->d_parent->d_name.name, dentry->d_name.name,
238 (unsigned long) count, (unsigned long) pos); 235 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
239 236
240 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 237 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
241 if (!result) { 238 if (!result) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4dc6d078f108..c07a55aec838 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -320,7 +320,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
320 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); 320 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
321 inode->i_version = 0; 321 inode->i_version = 0;
322 inode->i_size = 0; 322 inode->i_size = 0;
323 inode->i_nlink = 0; 323 clear_nlink(inode);
324 inode->i_uid = -2; 324 inode->i_uid = -2;
325 inode->i_gid = -2; 325 inode->i_gid = -2;
326 inode->i_blocks = 0; 326 inode->i_blocks = 0;
@@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
355 | NFS_INO_INVALID_DATA 355 | NFS_INO_INVALID_DATA
356 | NFS_INO_REVAL_PAGECACHE; 356 | NFS_INO_REVAL_PAGECACHE;
357 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 357 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
358 inode->i_nlink = fattr->nlink; 358 set_nlink(inode, fattr->nlink);
359 else if (nfs_server_capable(inode, NFS_CAP_NLINK)) 359 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
361 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 361 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
@@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1361 invalid |= NFS_INO_INVALID_ATTR; 1361 invalid |= NFS_INO_INVALID_ATTR;
1362 if (S_ISDIR(inode->i_mode)) 1362 if (S_ISDIR(inode->i_mode))
1363 invalid |= NFS_INO_INVALID_DATA; 1363 invalid |= NFS_INO_INVALID_DATA;
1364 inode->i_nlink = fattr->nlink; 1364 set_nlink(inode, fattr->nlink);
1365 } 1365 }
1366 } else if (server->caps & NFS_CAP_NLINK) 1366 } else if (server->caps & NFS_CAP_NLINK)
1367 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1367 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 955699515e70..a62d36b9a99e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -450,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
450 450
451 fl->dsaddr = dsaddr; 451 fl->dsaddr = dsaddr;
452 452
453 if (fl->first_stripe_index < 0 || 453 if (fl->first_stripe_index >= dsaddr->stripe_count) {
454 fl->first_stripe_index >= dsaddr->stripe_count) { 454 dprintk("%s Bad first_stripe_index %u\n",
455 dprintk("%s Bad first_stripe_index %d\n",
456 __func__, fl->first_stripe_index); 455 __func__, fl->first_stripe_index);
457 goto out_put; 456 goto out_put;
458 } 457 }
@@ -553,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
553 552
554 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
555 * Futher checking is done in filelayout_check_layout */ 554 * Futher checking is done in filelayout_check_layout */
556 if (fl->num_fh < 0 || fl->num_fh > 555 if (fl->num_fh >
557 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
558 goto out_err; 557 goto out_err;
559 558
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413c986a..b60fddf606f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5950{ 5950{
5951 struct nfs4_layoutcommit_data *data = calldata; 5951 struct nfs4_layoutcommit_data *data = calldata;
5952 struct pnfs_layout_segment *lseg, *tmp; 5952 struct pnfs_layout_segment *lseg, *tmp;
5953 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5953 5954
5954 pnfs_cleanup_layoutcommit(data); 5955 pnfs_cleanup_layoutcommit(data);
5955 /* Matched by references in pnfs_set_layoutcommit */ 5956 /* Matched by references in pnfs_set_layoutcommit */
@@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5959 &lseg->pls_flags)) 5960 &lseg->pls_flags))
5960 put_lseg(lseg); 5961 put_lseg(lseg);
5961 } 5962 }
5963
5964 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5965 smp_mb__after_clear_bit();
5966 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5967
5962 put_rpccred(data->cred); 5968 put_rpccred(data->cred);
5963 kfree(data); 5969 kfree(data);
5964} 5970}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4f..e6161b213ed1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc3..c807ab93140e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
/*
 * Build the per-device I/O descriptors for a single raid group: consume
 * @length bytes starting at the striping position in @si, handing out one
 * stripe_unit (or the remaining tail) at a time to the component device
 * that owns it, then advancing round-robin across the group's data
 * devices (mirrors are skipped here; they are cloned at submit time).
 * *last_pg is the page-array cursor, carried across calls for multi-group
 * I/Os.  On exit ios->numdevs reflects the highest component touched,
 * including its mirrors.
 */
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636		struct _striping_info *si, unsigned *last_pg,
637		gfp_t gfp_flags)
638{
639	unsigned stripe_unit = ios->layout->stripe_unit;
640	unsigned mirrors_p1 = ios->layout->mirrors_p1;
641	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642	unsigned dev = si->dev;
643	unsigned first_dev = dev - (dev % devs_in_group);
644	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645	unsigned cur_pg = *last_pg;
646	int ret = 0;
647
648	while (length) {
649		struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650		unsigned cur_len, page_off = 0;
651
652		if (!per_dev->length) {
653			per_dev->dev = dev;
			/* First chunk for this component: its starting object
			 * offset depends on whether it sits before, on, or
			 * after the device holding the unaligned head
			 * (si->unit_off) of the I/O. */
654			if (dev < si->dev) {
655				per_dev->offset = si->obj_offset + stripe_unit -
656								si->unit_off;
657				cur_len = stripe_unit;
658			} else if (dev == si->dev) {
659				per_dev->offset = si->obj_offset;
660				cur_len = stripe_unit - si->unit_off;
661				page_off = si->unit_off & ~PAGE_MASK;
662				BUG_ON(page_off &&
663				      (page_off != ios->ol_state.pgbase));
664			} else { /* dev > si->dev */
665				per_dev->offset = si->obj_offset - si->unit_off;
666				cur_len = stripe_unit;
667			}
668
669			if (max_comp < dev - first_dev)
670				max_comp = dev - first_dev;
671		} else {
672			cur_len = stripe_unit;
673		}
		/* Never hand out more than what remains of this group. */
674		if (cur_len >= length)
675			cur_len = length;
676
677		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678				       cur_len, gfp_flags);
679		if (unlikely(ret))
680			goto out;
681
		/* Step to the next data device (skip this one's mirrors)
		 * and wrap within the group. */
682		dev += mirrors_p1;
683		dev = (dev % devs_in_group) + first_dev;
684
685		length -= cur_len;
686		ios->length += cur_len;
687	}
688out:
689	ios->numdevs = max_comp + mirrors_p1;
690	*last_pg = cur_pg;
691	return ret;
692}
693
/*
 * Split the I/O described by ios->ol_state into per-raid-group chunks
 * (one _calc_stripe_info()/_prepare_one_group() pass per group) and
 * prepare the per-device bios for each.  On a mid-way failure the error
 * is returned only if nothing at all was prepared; otherwise the
 * already-prepared prefix is kept and submitted as a short I/O.
 */
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696	u64 length = ios->ol_state.count;
697	u64 offset = ios->ol_state.offset;
698	struct _striping_info si;
699	unsigned last_pg = 0;
700	int ret = 0;
701
702	while (length) {
703		_calc_stripe_info(ios, offset, &si);
704
		/* The last group may be shorter than a full group_length. */
705		if (length < si.group_length)
706			si.group_length = length;
707
708		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709		if (unlikely(ret))
710			goto out;
711
712		offset += si.group_length;
713		length -= si.group_length;
714	}
715
716out:
	/* If any bytes were prepared, proceed with that much (short I/O)
	 * and swallow the error; only a total failure is reported. */
717	if (!ios->length)
718		return ret;
719
720	return 0;
721}
722
/*
 * Completion hook installed by _io_exec() in sync mode: wake the waiter
 * parked on the on-stack completion stored in ios->private.
 */
723static ssize_t _sync_done(struct objio_state *ios)
724{
725	struct completion *waiting = ios->private;
726
727	complete(waiting);
728	return 0;
729}
730
/* kref release function: the last in-flight request has completed, so
 * run the I/O's done hook. */
731static void _last_io(struct kref *kref)
732{
733	struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735	ios->done(ios);
736}
737
/* Async completion callback for one osd_request: drop its ref; the last
 * ref to go fires _last_io() and thus ios->done(). */
738static void _done_io(struct osd_request *or, void *p)
739{
740	struct objio_state *ios = p;
741
742	kref_put(&ios->kref, _last_io);
743}
744
/*
 * Launch every prepared per-device request asynchronously.  A kref
 * counts in-flight requests; when the last one completes, ios->done()
 * runs.  In sync mode the done hook is temporarily swapped for
 * _sync_done so this function can wait on the completion and then
 * return the I/O status from the saved (real) done hook.
 */
745static ssize_t _io_exec(struct objio_state *ios)
746{
747	DECLARE_COMPLETION_ONSTACK(wait);
748	ssize_t status = 0; /* sync status */
749	unsigned i;
750	objio_done_fn saved_done_fn = ios->done;
751	bool sync = ios->ol_state.sync;
752
753	if (sync) {
754		ios->done = _sync_done;
755		ios->private = &wait;
756	}
757
	/* Initial ref pins the state until all requests are launched;
	 * it is dropped below. */
758	kref_init(&ios->kref);
759
760	for (i = 0; i < ios->numdevs; i++) {
761		struct osd_request *or = ios->per_dev[i].or;
762
		/* Holes are legal: components with no I/O have no request. */
763		if (!or)
764			continue;
765
766		kref_get(&ios->kref);
767		osd_execute_request_async(or, _done_io, ios);
768	}
769
	/* Drop the launch ref; if everything already finished this fires
	 * _last_io() right here. */
770	kref_put(&ios->kref, _last_io);
771
772	if (sync) {
773		wait_for_completion(&wait);
774		status = saved_done_fn(ios);
775	}
776
777	return status;
778}                                                               400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788	_io_free(ios); 411 /* FIXME: _io_free(ios) can we deallocate the libosd resources? */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878	_io_free(ios); 448 /* FIXME: _io_free(ios) can we deallocate the libosd resources? */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2adea..72074e3a04f9 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042d..8ec34727ed21 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs received from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a788d8522c88..5668f7c54c41 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
42 42
43/** 43/**
44 * nfs_create_request - Create an NFS read/write request. 44 * nfs_create_request - Create an NFS read/write request.
45 * @file: file descriptor to use 45 * @ctx: open context to use
46 * @inode: inode to which the request is attached 46 * @inode: inode to which the request is attached
47 * @page: page to write 47 * @page: page to write
48 * @offset: starting offset within the page for the write 48 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ba1d5388fafd..baf73536bc04 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1444,17 +1444,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1444 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1444 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1445 data = kzalloc(sizeof(*data), GFP_NOFS); 1445 data = kzalloc(sizeof(*data), GFP_NOFS);
1446 if (!data) { 1446 if (!data) {
1447 mark_inode_dirty_sync(inode);
1448 status = -ENOMEM; 1447 status = -ENOMEM;
1449 goto out; 1448 goto out;
1450 } 1449 }
1451 1450
1451 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1452 goto out_free;
1453
1454 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1455 if (!sync) {
1456 status = -EAGAIN;
1457 goto out_free;
1458 }
1459 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1460 nfs_wait_bit_killable, TASK_KILLABLE);
1461 if (status)
1462 goto out_free;
1463 }
1464
1452 INIT_LIST_HEAD(&data->lseg_list); 1465 INIT_LIST_HEAD(&data->lseg_list);
1453 spin_lock(&inode->i_lock); 1466 spin_lock(&inode->i_lock);
1454 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1467 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1468 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1455 spin_unlock(&inode->i_lock); 1469 spin_unlock(&inode->i_lock);
1456 kfree(data); 1470 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1457 goto out; 1471 goto out_free;
1458 } 1472 }
1459 1473
1460 pnfs_list_write_lseg(inode, &data->lseg_list); 1474 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1476,6 +1490,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1476 1490
1477 status = nfs4_proc_layoutcommit(data, sync); 1491 status = nfs4_proc_layoutcommit(data, sync);
1478out: 1492out:
1493 if (status)
1494 mark_inode_dirty_sync(inode);
1479 dprintk("<-- %s status %d\n", __func__, status); 1495 dprintk("<-- %s status %d\n", __func__, status);
1480 return status; 1496 return status;
1497out_free:
1498 kfree(data);
1499 goto out;
1481} 1500}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cd1edfd8c2d0..1dda78db6a73 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1244,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1244{ 1244{
1245 struct nfs_writeargs *argp = &data->args; 1245 struct nfs_writeargs *argp = &data->args;
1246 struct nfs_writeres *resp = &data->res; 1246 struct nfs_writeres *resp = &data->res;
1247 struct nfs_server *server = NFS_SERVER(data->inode);
1248 int status; 1247 int status;
1249 1248
1250 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1278,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1278 if (time_before(complain, jiffies)) { 1277 if (time_before(complain, jiffies)) {
1279 dprintk("NFS: faulty NFS server %s:" 1278 dprintk("NFS: faulty NFS server %s:"
1280 " (committed = %d) != (stable = %d)\n", 1279 " (committed = %d) != (stable = %d)\n",
1281 server->nfs_client->cl_hostname, 1280 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1282 resp->verf->committed, argp->stable); 1281 resp->verf->committed, argp->stable);
1283 complain = jiffies + 300 * HZ; 1282 complain = jiffies + 300 * HZ;
1284 } 1283 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 66d095d7955e..b6fa792d6b85 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -655,7 +655,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
655 default: 655 default:
656 return nfserr_bad_xdr; 656 return nfserr_bad_xdr;
657 } 657 }
658 w &= !NFS4_SHARE_ACCESS_MASK; 658 w &= ~NFS4_SHARE_ACCESS_MASK;
659 if (!w) 659 if (!w)
660 return nfs_ok; 660 return nfs_ok;
661 if (!argp->minorversion) 661 if (!argp->minorversion)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 4b8e828ae15f..eda7d7e55e05 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -257,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
257 nfsd_serv = NULL; 257 nfsd_serv = NULL;
258 nfsd_shutdown(); 258 nfsd_shutdown();
259 259
260 svc_rpcb_cleanup(serv);
261
260 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
261 "cache\n"); 263 "cache\n");
262 nfsd_export_flush(); 264 nfsd_export_flush();
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 666628b395f1..b50ffb72e5b3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
354 354
355 failed_acl: 355 failed_acl:
356 failed_bmap: 356 failed_bmap:
357 inode->i_nlink = 0; 357 clear_nlink(inode);
358 iput(inode); /* raw_inode will be deleted through 358 iput(inode); /* raw_inode will be deleted through
359 generic_delete_inode() */ 359 generic_delete_inode() */
360 goto failed; 360 goto failed;
@@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode,
396 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 396 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); 397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); 398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 399 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
400 inode->i_size = le64_to_cpu(raw_inode->i_size); 400 inode->i_size = le64_to_cpu(raw_inode->i_size);
401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a3141990061e..768982de10e4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
289 nilfs_warning(inode->i_sb, __func__, 289 nilfs_warning(inode->i_sb, __func__,
290 "deleting nonexistent file (%lu), %d\n", 290 "deleting nonexistent file (%lu), %d\n",
291 inode->i_ino, inode->i_nlink); 291 inode->i_ino, inode->i_nlink);
292 inode->i_nlink = 1; 292 set_nlink(inode, 1);
293 } 293 }
294 err = nilfs_delete_entry(de, page); 294 err = nilfs_delete_entry(de, page);
295 if (err) 295 if (err)
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 255d5e1c03b7..3777d138f895 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
276/* super.c */ 276/* super.c */
277extern struct inode *nilfs_alloc_inode(struct super_block *); 277extern struct inode *nilfs_alloc_inode(struct super_block *);
278extern void nilfs_destroy_inode(struct inode *); 278extern void nilfs_destroy_inode(struct inode *);
279extern void nilfs_error(struct super_block *, const char *, const char *, ...) 279extern __printf(3, 4)
280 __attribute__ ((format (printf, 3, 4))); 280void nilfs_error(struct super_block *, const char *, const char *, ...);
281extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 281extern __printf(3, 4)
282 __attribute__ ((format (printf, 3, 4))); 282void nilfs_warning(struct super_block *, const char *, const char *, ...);
283extern struct nilfs_super_block * 283extern struct nilfs_super_block *
284nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); 284nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
285extern int nilfs_store_magic_and_option(struct super_block *, 285extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 2142b1c68b61..53c27eaf2307 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,8 +30,9 @@
30 30
31extern int debug_msgs; 31extern int debug_msgs;
32 32
33extern void __ntfs_debug(const char *file, int line, const char *function, 33extern __printf(4, 5)
34 const char *format, ...) __attribute__ ((format (printf, 4, 5))); 34void __ntfs_debug(const char *file, int line, const char *function,
35 const char *format, ...);
35/** 36/**
36 * ntfs_debug - write a debug level message to syslog 37 * ntfs_debug - write a debug level message to syslog
37 * @f: a printf format string containing the message 38 * @f: a printf format string containing the message
@@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
52 53
53#endif /* !DEBUG */ 54#endif /* !DEBUG */
54 55
55extern void __ntfs_warning(const char *function, const struct super_block *sb, 56extern __printf(3, 4)
56 const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); 57void __ntfs_warning(const char *function, const struct super_block *sb,
58 const char *fmt, ...);
57#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) 59#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
58 60
59extern void __ntfs_error(const char *function, const struct super_block *sb, 61extern __printf(3, 4)
60 const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); 62void __ntfs_error(const char *function, const struct super_block *sb,
63 const char *fmt, ...);
61#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) 64#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
62 65
63#endif /* _LINUX_NTFS_DEBUG_H */ 66#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1371487da955..97e2dacbc867 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
612 * might be tricky due to vfs interactions. Need to think about this 612 * might be tricky due to vfs interactions. Need to think about this
613 * some more when implementing the unlink command. 613 * some more when implementing the unlink command.
614 */ 614 */
615 vi->i_nlink = le16_to_cpu(m->link_count); 615 set_nlink(vi, le16_to_cpu(m->link_count));
616 /* 616 /*
617 * FIXME: Reparse points can have the directory bit set even though 617 * FIXME: Reparse points can have the directory bit set even though
618 * they would be S_IFLNK. Need to deal with this further below when we 618 * they would be S_IFLNK. Need to deal with this further below when we
@@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
634 vi->i_mode &= ~vol->dmask; 634 vi->i_mode &= ~vol->dmask;
635 /* Things break without this kludge! */ 635 /* Things break without this kludge! */
636 if (vi->i_nlink > 1) 636 if (vi->i_nlink > 1)
637 vi->i_nlink = 1; 637 set_nlink(vi, 1);
638 } else { 638 } else {
639 vi->i_mode |= S_IFREG; 639 vi->i_mode |= S_IFREG;
640 /* Apply the file permissions mask set in the mount options. */ 640 /* Apply the file permissions mask set in the mount options. */
@@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1242 vi->i_version = base_vi->i_version; 1242 vi->i_version = base_vi->i_version;
1243 vi->i_uid = base_vi->i_uid; 1243 vi->i_uid = base_vi->i_uid;
1244 vi->i_gid = base_vi->i_gid; 1244 vi->i_gid = base_vi->i_gid;
1245 vi->i_nlink = base_vi->i_nlink; 1245 set_nlink(vi, base_vi->i_nlink);
1246 vi->i_mtime = base_vi->i_mtime; 1246 vi->i_mtime = base_vi->i_mtime;
1247 vi->i_ctime = base_vi->i_ctime; 1247 vi->i_ctime = base_vi->i_ctime;
1248 vi->i_atime = base_vi->i_atime; 1248 vi->i_atime = base_vi->i_atime;
@@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1508 vi->i_version = base_vi->i_version; 1508 vi->i_version = base_vi->i_version;
1509 vi->i_uid = base_vi->i_uid; 1509 vi->i_uid = base_vi->i_uid;
1510 vi->i_gid = base_vi->i_gid; 1510 vi->i_gid = base_vi->i_gid;
1511 vi->i_nlink = base_vi->i_nlink; 1511 set_nlink(vi, base_vi->i_nlink);
1512 vi->i_mtime = base_vi->i_mtime; 1512 vi->i_mtime = base_vi->i_mtime;
1513 vi->i_ctime = base_vi->i_ctime; 1513 vi->i_ctime = base_vi->i_ctime;
1514 vi->i_atime = base_vi->i_atime; 1514 vi->i_atime = base_vi->i_atime;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8582e3f4f120..e2878b5895fb 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2292,7 +2292,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2292 ocfs2_journal_dirty(handle, di_bh); 2292 ocfs2_journal_dirty(handle, di_bh);
2293 2293
2294 i_size_write(inode, size); 2294 i_size_write(inode, size);
2295 inode->i_nlink = 2; 2295 set_nlink(inode, 2);
2296 inode->i_blocks = ocfs2_inode_sector_count(inode); 2296 inode->i_blocks = ocfs2_inode_sector_count(inode);
2297 2297
2298 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 2298 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2354,7 +2354,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2354 ocfs2_journal_dirty(handle, new_bh); 2354 ocfs2_journal_dirty(handle, new_bh);
2355 2355
2356 i_size_write(inode, inode->i_sb->s_blocksize); 2356 i_size_write(inode, inode->i_sb->s_blocksize);
2357 inode->i_nlink = 2; 2357 set_nlink(inode, 2);
2358 inode->i_blocks = ocfs2_inode_sector_count(inode); 2358 inode->i_blocks = ocfs2_inode_sector_count(inode);
2359 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 2359 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
2360 if (status < 0) { 2360 if (status < 0) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7642d7ca73e5..e1ed5e502ff2 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2092,7 +2092,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2092 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 2092 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
2093 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 2093 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
2094 inode->i_mode = be16_to_cpu(lvb->lvb_imode); 2094 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
2095 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); 2095 set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
2096 ocfs2_unpack_timespec(&inode->i_atime, 2096 ocfs2_unpack_timespec(&inode->i_atime,
2097 be64_to_cpu(lvb->lvb_iatime_packed)); 2097 be64_to_cpu(lvb->lvb_iatime_packed));
2098 ocfs2_unpack_timespec(&inode->i_mtime, 2098 ocfs2_unpack_timespec(&inode->i_mtime,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b4c8bb6b8d28..a22d2c098890 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
291 (unsigned long long)OCFS2_I(inode)->ip_blkno, 291 (unsigned long long)OCFS2_I(inode)->ip_blkno,
292 (unsigned long long)le64_to_cpu(fe->i_blkno)); 292 (unsigned long long)le64_to_cpu(fe->i_blkno));
293 293
294 inode->i_nlink = ocfs2_read_links_count(fe); 294 set_nlink(inode, ocfs2_read_links_count(fe));
295 295
296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, 296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
297 le32_to_cpu(fe->i_flags)); 297 le32_to_cpu(fe->i_flags));
@@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1290 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1290 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1291 ocfs2_set_inode_flags(inode); 1291 ocfs2_set_inode_flags(inode);
1292 i_size_write(inode, le64_to_cpu(fe->i_size)); 1292 i_size_write(inode, le64_to_cpu(fe->i_size));
1293 inode->i_nlink = ocfs2_read_links_count(fe); 1293 set_nlink(inode, ocfs2_read_links_count(fe));
1294 inode->i_uid = le32_to_cpu(fe->i_uid); 1294 inode->i_uid = le32_to_cpu(fe->i_uid);
1295 inode->i_gid = le32_to_cpu(fe->i_gid); 1295 inode->i_gid = le32_to_cpu(fe->i_gid);
1296 inode->i_mode = le16_to_cpu(fe->i_mode); 1296 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 53aa41ed7bf3..a8b2bfea574e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
199 * these are used by the support functions here and in 199 * these are used by the support functions here and in
200 * callers. */ 200 * callers. */
201 if (S_ISDIR(mode)) 201 if (S_ISDIR(mode))
202 inode->i_nlink = 2; 202 set_nlink(inode, 2);
203 else
204 inode->i_nlink = 1;
205 inode_init_owner(inode, dir, mode); 203 inode_init_owner(inode, dir, mode);
206 dquot_initialize(inode); 204 dquot_initialize(inode);
207 return inode; 205 return inode;
@@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir,
1379 } 1377 }
1380 1378
1381 if (new_inode) { 1379 if (new_inode) {
1382 new_inode->i_nlink--; 1380 drop_nlink(new_inode);
1383 new_inode->i_ctime = CURRENT_TIME; 1381 new_inode->i_ctime = CURRENT_TIME;
1384 } 1382 }
1385 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1383 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir,
1387 if (update_dot_dot) { 1385 if (update_dot_dot) {
1388 status = ocfs2_update_entry(old_inode, handle, 1386 status = ocfs2_update_entry(old_inode, handle,
1389 &old_inode_dot_dot_res, new_dir); 1387 &old_inode_dot_dot_res, new_dir);
1390 old_dir->i_nlink--; 1388 drop_nlink(old_dir);
1391 if (new_inode) { 1389 if (new_inode) {
1392 new_inode->i_nlink--; 1390 drop_nlink(new_inode);
1393 } else { 1391 } else {
1394 inc_nlink(new_dir); 1392 inc_nlink(new_dir);
1395 mark_inode_dirty(new_dir); 1393 mark_inode_dirty(new_dir);
@@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2018 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2016 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2019 if (S_ISDIR(inode->i_mode)) 2017 if (S_ISDIR(inode->i_mode))
2020 ocfs2_add_links_count(orphan_fe, 1); 2018 ocfs2_add_links_count(orphan_fe, 1);
2021 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2019 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2022 ocfs2_journal_dirty(handle, orphan_dir_bh); 2020 ocfs2_journal_dirty(handle, orphan_dir_bh);
2023 2021
2024 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2022 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
@@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2116 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2114 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2117 if (S_ISDIR(inode->i_mode)) 2115 if (S_ISDIR(inode->i_mode))
2118 ocfs2_add_links_count(orphan_fe, -1); 2116 ocfs2_add_links_count(orphan_fe, -1);
2119 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2117 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2120 ocfs2_journal_dirty(handle, orphan_dir_bh); 2118 ocfs2_journal_dirty(handle, orphan_dir_bh);
2121 2119
2122leave: 2120leave:
@@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2282 goto leave; 2280 goto leave;
2283 } 2281 }
2284 2282
2285 inode->i_nlink = 0; 2283 clear_nlink(inode);
2286 /* do the real work now. */ 2284 /* do the real work now. */
2287 status = __ocfs2_mknod_locked(dir, inode, 2285 status = __ocfs2_mknod_locked(dir, inode,
2288 0, &new_di_bh, parent_di_bh, handle, 2286 0, &new_di_bh, parent_di_bh, handle,
@@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2437 di = (struct ocfs2_dinode *)di_bh->b_data; 2435 di = (struct ocfs2_dinode *)di_bh->b_data;
2438 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2436 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2439 di->i_orphaned_slot = 0; 2437 di->i_orphaned_slot = 0;
2440 inode->i_nlink = 1; 2438 set_nlink(inode, 1);
2441 ocfs2_set_links_count(di, inode->i_nlink); 2439 ocfs2_set_links_count(di, inode->i_nlink);
2442 ocfs2_journal_dirty(handle, di_bh); 2440 ocfs2_journal_dirty(handle, di_bh);
2443 2441
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 40c7de084c10..74ff74cf78fe 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq;
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 32 int node_num);
33 33
34void __ocfs2_error(struct super_block *sb, 34__printf(3, 4)
35 const char *function, 35void __ocfs2_error(struct super_block *sb, const char *function,
36 const char *fmt, ...) 36 const char *fmt, ...);
37 __attribute__ ((format (printf, 3, 4)));
38 37
39#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) 38#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
40 39
41void __ocfs2_abort(struct super_block *sb, 40__printf(3, 4)
42 const char *function, 41void __ocfs2_abort(struct super_block *sb, const char *function,
43 const char *fmt, ...) 42 const char *fmt, ...);
44 __attribute__ ((format (printf, 3, 4)));
45 43
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 44#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 45
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a2a5bff774e3..e4e0ff7962e2 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -242,7 +242,7 @@ found:
242 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 242 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
243 inode->i_op = &openprom_inode_operations; 243 inode->i_op = &openprom_inode_operations;
244 inode->i_fop = &openprom_operations; 244 inode->i_fop = &openprom_operations;
245 inode->i_nlink = 2; 245 set_nlink(inode, 2);
246 break; 246 break;
247 case op_inode_prop: 247 case op_inode_prop:
248 if (!strcmp(dp->name, "options") && (len == 17) && 248 if (!strcmp(dp->name, "options") && (len == 17) &&
@@ -251,7 +251,7 @@ found:
251 else 251 else
252 inode->i_mode = S_IFREG | S_IRUGO; 252 inode->i_mode = S_IFREG | S_IRUGO;
253 inode->i_fop = &openpromfs_prop_ops; 253 inode->i_fop = &openpromfs_prop_ops;
254 inode->i_nlink = 1; 254 set_nlink(inode, 1);
255 inode->i_size = ent_oi->u.prop->length; 255 inode->i_size = ent_oi->u.prop->length;
256 break; 256 break;
257 } 257 }
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index af9fdf046769..bd8ae788f689 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -49,18 +49,20 @@
49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) 49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) 50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
51 51
52__attribute__ ((format (printf, 3, 4))) 52static __printf(3, 4)
53static void _ldm_printk (const char *level, const char *function, 53void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
54 const char *fmt, ...)
55{ 54{
56 static char buf[128]; 55 struct va_format vaf;
57 va_list args; 56 va_list args;
58 57
59 va_start (args, fmt); 58 va_start (args, fmt);
60 vsnprintf (buf, sizeof (buf), fmt, args);
61 va_end (args);
62 59
63 printk ("%s%s(): %s\n", level, function, buf); 60 vaf.fmt = fmt;
61 vaf.va = &args;
62
63 printk("%s%s(): %pV\n", level, function, &vaf);
64
65 va_end(args);
64} 66}
65 67
66/** 68/**
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0be1dc0f8e..4065f07366b3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1254,6 +1254,7 @@ out:
1254 1254
1255static const struct super_operations pipefs_ops = { 1255static const struct super_operations pipefs_ops = {
1256 .destroy_inode = free_inode_nonrcu, 1256 .destroy_inode = free_inode_nonrcu,
1257 .statfs = simple_statfs,
1257}; 1258};
1258 1259
1259/* 1260/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5eb02069e1b8..2db1bd3173b2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1107 goto err_sighand; 1107 goto err_sighand;
1108 } 1108 }
1109 1109
1110 if (oom_adjust != task->signal->oom_adj) {
1111 if (oom_adjust == OOM_DISABLE)
1112 atomic_inc(&task->mm->oom_disable_count);
1113 if (task->signal->oom_adj == OOM_DISABLE)
1114 atomic_dec(&task->mm->oom_disable_count);
1115 }
1116
1117 /* 1110 /*
1118 * Warn that /proc/pid/oom_adj is deprecated, see 1111 * Warn that /proc/pid/oom_adj is deprecated, see
1119 * Documentation/feature-removal-schedule.txt. 1112 * Documentation/feature-removal-schedule.txt.
@@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1215 goto err_sighand; 1208 goto err_sighand;
1216 } 1209 }
1217 1210
1218 if (oom_score_adj != task->signal->oom_score_adj) {
1219 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1220 atomic_inc(&task->mm->oom_disable_count);
1221 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1222 atomic_dec(&task->mm->oom_disable_count);
1223 }
1224 task->signal->oom_score_adj = oom_score_adj; 1211 task->signal->oom_score_adj = oom_score_adj;
1225 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1212 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1226 task->signal->oom_score_adj_min = oom_score_adj; 1213 task->signal->oom_score_adj_min = oom_score_adj;
@@ -1665,12 +1652,46 @@ out:
1665 return error; 1652 return error;
1666} 1653}
1667 1654
1655static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
1656 struct kstat *stat)
1657{
1658 struct inode *inode = dentry->d_inode;
1659 struct task_struct *task = get_proc_task(inode);
1660 int rc;
1661
1662 if (task == NULL)
1663 return -ESRCH;
1664
1665 rc = -EACCES;
1666 if (lock_trace(task))
1667 goto out_task;
1668
1669 generic_fillattr(inode, stat);
1670 unlock_trace(task);
1671 rc = 0;
1672out_task:
1673 put_task_struct(task);
1674 return rc;
1675}
1676
1668static const struct inode_operations proc_pid_link_inode_operations = { 1677static const struct inode_operations proc_pid_link_inode_operations = {
1669 .readlink = proc_pid_readlink, 1678 .readlink = proc_pid_readlink,
1670 .follow_link = proc_pid_follow_link, 1679 .follow_link = proc_pid_follow_link,
1671 .setattr = proc_setattr, 1680 .setattr = proc_setattr,
1672}; 1681};
1673 1682
1683static const struct inode_operations proc_fdinfo_link_inode_operations = {
1684 .setattr = proc_setattr,
1685 .getattr = proc_pid_fd_link_getattr,
1686};
1687
1688static const struct inode_operations proc_fd_link_inode_operations = {
1689 .readlink = proc_pid_readlink,
1690 .follow_link = proc_pid_follow_link,
1691 .setattr = proc_setattr,
1692 .getattr = proc_pid_fd_link_getattr,
1693};
1694
1674 1695
1675/* building an inode */ 1696/* building an inode */
1676 1697
@@ -1902,49 +1923,61 @@ out:
1902 1923
1903static int proc_fd_info(struct inode *inode, struct path *path, char *info) 1924static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1904{ 1925{
1905 struct task_struct *task = get_proc_task(inode); 1926 struct task_struct *task;
1906 struct files_struct *files = NULL; 1927 struct files_struct *files;
1907 struct file *file; 1928 struct file *file;
1908 int fd = proc_fd(inode); 1929 int fd = proc_fd(inode);
1930 int rc;
1909 1931
1910 if (task) { 1932 task = get_proc_task(inode);
1911 files = get_files_struct(task); 1933 if (!task)
1912 put_task_struct(task); 1934 return -ENOENT;
1913 } 1935
1914 if (files) { 1936 rc = -EACCES;
1915 /* 1937 if (lock_trace(task))
1916 * We are not taking a ref to the file structure, so we must 1938 goto out_task;
1917 * hold ->file_lock. 1939
1918 */ 1940 rc = -ENOENT;
1919 spin_lock(&files->file_lock); 1941 files = get_files_struct(task);
1920 file = fcheck_files(files, fd); 1942 if (files == NULL)
1921 if (file) { 1943 goto out_unlock;
1922 unsigned int f_flags; 1944
1923 struct fdtable *fdt; 1945 /*
1924 1946 * We are not taking a ref to the file structure, so we must
1925 fdt = files_fdtable(files); 1947 * hold ->file_lock.
1926 f_flags = file->f_flags & ~O_CLOEXEC; 1948 */
1927 if (FD_ISSET(fd, fdt->close_on_exec)) 1949 spin_lock(&files->file_lock);
1928 f_flags |= O_CLOEXEC; 1950 file = fcheck_files(files, fd);
1929 1951 if (file) {
1930 if (path) { 1952 unsigned int f_flags;
1931 *path = file->f_path; 1953 struct fdtable *fdt;
1932 path_get(&file->f_path); 1954
1933 } 1955 fdt = files_fdtable(files);
1934 if (info) 1956 f_flags = file->f_flags & ~O_CLOEXEC;
1935 snprintf(info, PROC_FDINFO_MAX, 1957 if (FD_ISSET(fd, fdt->close_on_exec))
1936 "pos:\t%lli\n" 1958 f_flags |= O_CLOEXEC;
1937 "flags:\t0%o\n", 1959
1938 (long long) file->f_pos, 1960 if (path) {
1939 f_flags); 1961 *path = file->f_path;
1940 spin_unlock(&files->file_lock); 1962 path_get(&file->f_path);
1941 put_files_struct(files);
1942 return 0;
1943 } 1963 }
1944 spin_unlock(&files->file_lock); 1964 if (info)
1945 put_files_struct(files); 1965 snprintf(info, PROC_FDINFO_MAX,
1946 } 1966 "pos:\t%lli\n"
1947 return -ENOENT; 1967 "flags:\t0%o\n",
1968 (long long) file->f_pos,
1969 f_flags);
1970 rc = 0;
1971 } else
1972 rc = -ENOENT;
1973 spin_unlock(&files->file_lock);
1974 put_files_struct(files);
1975
1976out_unlock:
1977 unlock_trace(task);
1978out_task:
1979 put_task_struct(task);
1980 return rc;
1948} 1981}
1949 1982
1950static int proc_fd_link(struct inode *inode, struct path *path) 1983static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2039,7 +2072,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
2039 spin_unlock(&files->file_lock); 2072 spin_unlock(&files->file_lock);
2040 put_files_struct(files); 2073 put_files_struct(files);
2041 2074
2042 inode->i_op = &proc_pid_link_inode_operations; 2075 inode->i_op = &proc_fd_link_inode_operations;
2043 inode->i_size = 64; 2076 inode->i_size = 64;
2044 ei->op.proc_get_link = proc_fd_link; 2077 ei->op.proc_get_link = proc_fd_link;
2045 d_set_d_op(dentry, &tid_fd_dentry_operations); 2078 d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2071,7 +2104,12 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
2071 if (fd == ~0U) 2104 if (fd == ~0U)
2072 goto out; 2105 goto out;
2073 2106
2107 result = ERR_PTR(-EACCES);
2108 if (lock_trace(task))
2109 goto out;
2110
2074 result = instantiate(dir, dentry, task, &fd); 2111 result = instantiate(dir, dentry, task, &fd);
2112 unlock_trace(task);
2075out: 2113out:
2076 put_task_struct(task); 2114 put_task_struct(task);
2077out_no_task: 2115out_no_task:
@@ -2091,23 +2129,28 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2091 retval = -ENOENT; 2129 retval = -ENOENT;
2092 if (!p) 2130 if (!p)
2093 goto out_no_task; 2131 goto out_no_task;
2132
2133 retval = -EACCES;
2134 if (lock_trace(p))
2135 goto out;
2136
2094 retval = 0; 2137 retval = 0;
2095 2138
2096 fd = filp->f_pos; 2139 fd = filp->f_pos;
2097 switch (fd) { 2140 switch (fd) {
2098 case 0: 2141 case 0:
2099 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) 2142 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
2100 goto out; 2143 goto out_unlock;
2101 filp->f_pos++; 2144 filp->f_pos++;
2102 case 1: 2145 case 1:
2103 ino = parent_ino(dentry); 2146 ino = parent_ino(dentry);
2104 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 2147 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2105 goto out; 2148 goto out_unlock;
2106 filp->f_pos++; 2149 filp->f_pos++;
2107 default: 2150 default:
2108 files = get_files_struct(p); 2151 files = get_files_struct(p);
2109 if (!files) 2152 if (!files)
2110 goto out; 2153 goto out_unlock;
2111 rcu_read_lock(); 2154 rcu_read_lock();
2112 for (fd = filp->f_pos-2; 2155 for (fd = filp->f_pos-2;
2113 fd < files_fdtable(files)->max_fds; 2156 fd < files_fdtable(files)->max_fds;
@@ -2131,6 +2174,9 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2131 rcu_read_unlock(); 2174 rcu_read_unlock();
2132 put_files_struct(files); 2175 put_files_struct(files);
2133 } 2176 }
2177
2178out_unlock:
2179 unlock_trace(p);
2134out: 2180out:
2135 put_task_struct(p); 2181 put_task_struct(p);
2136out_no_task: 2182out_no_task:
@@ -2208,6 +2254,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2208 ei->fd = fd; 2254 ei->fd = fd;
2209 inode->i_mode = S_IFREG | S_IRUSR; 2255 inode->i_mode = S_IFREG | S_IRUSR;
2210 inode->i_fop = &proc_fdinfo_file_operations; 2256 inode->i_fop = &proc_fdinfo_file_operations;
2257 inode->i_op = &proc_fdinfo_link_inode_operations;
2211 d_set_d_op(dentry, &tid_fd_dentry_operations); 2258 d_set_d_op(dentry, &tid_fd_dentry_operations);
2212 d_add(dentry, inode); 2259 d_add(dentry, inode);
2213 /* Close the race of the process dying before we return the dentry */ 2260 /* Close the race of the process dying before we return the dentry */
@@ -2261,7 +2308,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2261 ei = PROC_I(inode); 2308 ei = PROC_I(inode);
2262 inode->i_mode = p->mode; 2309 inode->i_mode = p->mode;
2263 if (S_ISDIR(inode->i_mode)) 2310 if (S_ISDIR(inode->i_mode))
2264 inode->i_nlink = 2; /* Use getattr to fix if necessary */ 2311 set_nlink(inode, 2); /* Use getattr to fix if necessary */
2265 if (p->iop) 2312 if (p->iop)
2266 inode->i_op = p->iop; 2313 inode->i_op = p->iop;
2267 if (p->fop) 2314 if (p->fop)
@@ -2655,7 +2702,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2655 2702
2656 inode->i_mode = p->mode; 2703 inode->i_mode = p->mode;
2657 if (S_ISDIR(inode->i_mode)) 2704 if (S_ISDIR(inode->i_mode))
2658 inode->i_nlink = 2; 2705 set_nlink(inode, 2);
2659 if (S_ISLNK(inode->i_mode)) 2706 if (S_ISLNK(inode->i_mode))
2660 inode->i_size = 64; 2707 inode->i_size = 64;
2661 if (p->iop) 2708 if (p->iop)
@@ -2994,8 +3041,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2994 inode->i_fop = &proc_tgid_base_operations; 3041 inode->i_fop = &proc_tgid_base_operations;
2995 inode->i_flags|=S_IMMUTABLE; 3042 inode->i_flags|=S_IMMUTABLE;
2996 3043
2997 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 3044 set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
2998 ARRAY_SIZE(tgid_base_stuff)); 3045 ARRAY_SIZE(tgid_base_stuff)));
2999 3046
3000 d_set_d_op(dentry, &pid_dentry_operations); 3047 d_set_d_op(dentry, &pid_dentry_operations);
3001 3048
@@ -3246,8 +3293,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3246 inode->i_fop = &proc_tid_base_operations; 3293 inode->i_fop = &proc_tid_base_operations;
3247 inode->i_flags|=S_IMMUTABLE; 3294 inode->i_flags|=S_IMMUTABLE;
3248 3295
3249 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3296 set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
3250 ARRAY_SIZE(tid_base_stuff)); 3297 ARRAY_SIZE(tid_base_stuff)));
3251 3298
3252 d_set_d_op(dentry, &pid_dentry_operations); 3299 d_set_d_op(dentry, &pid_dentry_operations);
3253 3300
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 9d99131d0d65..10090d9c7ad5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
283 struct inode *inode = dentry->d_inode; 283 struct inode *inode = dentry->d_inode;
284 struct proc_dir_entry *de = PROC_I(inode)->pde; 284 struct proc_dir_entry *de = PROC_I(inode)->pde;
285 if (de && de->nlink) 285 if (de && de->nlink)
286 inode->i_nlink = de->nlink; 286 set_nlink(inode, de->nlink);
287 287
288 generic_fillattr(inode, stat); 288 generic_fillattr(inode, stat);
289 return 0; 289 return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ed72d6c1c6f..7737c5468a40 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
445 if (de->size) 445 if (de->size)
446 inode->i_size = de->size; 446 inode->i_size = de->size;
447 if (de->nlink) 447 if (de->nlink)
448 inode->i_nlink = de->nlink; 448 set_nlink(inode, de->nlink);
449 if (de->proc_iops) 449 if (de->proc_iops)
450 inode->i_op = de->proc_iops; 450 inode->i_op = de->proc_iops;
451 if (de->proc_fops) { 451 if (de->proc_fops) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbef226f..a6b62173d4c3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/sysctl.h> 5#include <linux/sysctl.h>
6#include <linux/poll.h>
6#include <linux/proc_fs.h> 7#include <linux/proc_fs.h>
7#include <linux/security.h> 8#include <linux/security.h>
8#include <linux/namei.h> 9#include <linux/namei.h>
@@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations;
14static const struct file_operations proc_sys_dir_file_operations; 15static const struct file_operations proc_sys_dir_file_operations;
15static const struct inode_operations proc_sys_dir_operations; 16static const struct inode_operations proc_sys_dir_operations;
16 17
18void proc_sys_poll_notify(struct ctl_table_poll *poll)
19{
20 if (!poll)
21 return;
22
23 atomic_inc(&poll->event);
24 wake_up_interruptible(&poll->wait);
25}
26
17static struct inode *proc_sys_make_inode(struct super_block *sb, 27static struct inode *proc_sys_make_inode(struct super_block *sb,
18 struct ctl_table_header *head, struct ctl_table *table) 28 struct ctl_table_header *head, struct ctl_table *table)
19{ 29{
@@ -39,7 +49,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
39 inode->i_fop = &proc_sys_file_operations; 49 inode->i_fop = &proc_sys_file_operations;
40 } else { 50 } else {
41 inode->i_mode |= S_IFDIR; 51 inode->i_mode |= S_IFDIR;
42 inode->i_nlink = 0; 52 clear_nlink(inode);
43 inode->i_op = &proc_sys_dir_operations; 53 inode->i_op = &proc_sys_dir_operations;
44 inode->i_fop = &proc_sys_dir_file_operations; 54 inode->i_fop = &proc_sys_dir_file_operations;
45 } 55 }
@@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
176 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); 186 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
177} 187}
178 188
189static int proc_sys_open(struct inode *inode, struct file *filp)
190{
191 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
192
193 if (table->poll)
194 filp->private_data = proc_sys_poll_event(table->poll);
195
196 return 0;
197}
198
199static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
200{
201 struct inode *inode = filp->f_path.dentry->d_inode;
202 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
203 unsigned long event = (unsigned long)filp->private_data;
204 unsigned int ret = DEFAULT_POLLMASK;
205
206 if (!table->proc_handler)
207 goto out;
208
209 if (!table->poll)
210 goto out;
211
212 poll_wait(filp, &table->poll->wait, wait);
213
214 if (event != atomic_read(&table->poll->event)) {
215 filp->private_data = proc_sys_poll_event(table->poll);
216 ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
217 }
218
219out:
220 return ret;
221}
179 222
180static int proc_sys_fill_cache(struct file *filp, void *dirent, 223static int proc_sys_fill_cache(struct file *filp, void *dirent,
181 filldir_t filldir, 224 filldir_t filldir,
@@ -364,12 +407,15 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
364} 407}
365 408
366static const struct file_operations proc_sys_file_operations = { 409static const struct file_operations proc_sys_file_operations = {
410 .open = proc_sys_open,
411 .poll = proc_sys_poll,
367 .read = proc_sys_read, 412 .read = proc_sys_read,
368 .write = proc_sys_write, 413 .write = proc_sys_write,
369 .llseek = default_llseek, 414 .llseek = default_llseek,
370}; 415};
371 416
372static const struct file_operations proc_sys_dir_file_operations = { 417static const struct file_operations proc_sys_dir_file_operations = {
418 .read = generic_read_dir,
373 .readdir = proc_sys_readdir, 419 .readdir = proc_sys_readdir,
374 .llseek = generic_file_llseek, 420 .llseek = generic_file_llseek,
375}; 421};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5afaa58a8630..e418c5abdb0e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
44 "VmPeak:\t%8lu kB\n" 44 "VmPeak:\t%8lu kB\n"
45 "VmSize:\t%8lu kB\n" 45 "VmSize:\t%8lu kB\n"
46 "VmLck:\t%8lu kB\n" 46 "VmLck:\t%8lu kB\n"
47 "VmPin:\t%8lu kB\n"
47 "VmHWM:\t%8lu kB\n" 48 "VmHWM:\t%8lu kB\n"
48 "VmRSS:\t%8lu kB\n" 49 "VmRSS:\t%8lu kB\n"
49 "VmData:\t%8lu kB\n" 50 "VmData:\t%8lu kB\n"
@@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
55 hiwater_vm << (PAGE_SHIFT-10), 56 hiwater_vm << (PAGE_SHIFT-10),
56 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 57 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
57 mm->locked_vm << (PAGE_SHIFT-10), 58 mm->locked_vm << (PAGE_SHIFT-10),
59 mm->pinned_vm << (PAGE_SHIFT-10),
58 hiwater_rss << (PAGE_SHIFT-10), 60 hiwater_rss << (PAGE_SHIFT-10),
59 total_rss << (PAGE_SHIFT-10), 61 total_rss << (PAGE_SHIFT-10),
60 data << (PAGE_SHIFT-10), 62 data << (PAGE_SHIFT-10),
@@ -1039,6 +1041,9 @@ static int show_numa_map(struct seq_file *m, void *v)
1039 seq_printf(m, " stack"); 1041 seq_printf(m, " stack");
1040 } 1042 }
1041 1043
1044 if (is_vm_hugetlb_page(vma))
1045 seq_printf(m, " huge");
1046
1042 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1047 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1043 1048
1044 if (!md->pages) 1049 if (!md->pages)
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 893b961dcfd8..379a02dc1217 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/list.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/ramfs.h> 30#include <linux/ramfs.h>
@@ -32,13 +33,18 @@
32#include <linux/magic.h> 33#include <linux/magic.h>
33#include <linux/pstore.h> 34#include <linux/pstore.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/spinlock.h>
35#include <linux/uaccess.h> 37#include <linux/uaccess.h>
36 38
37#include "internal.h" 39#include "internal.h"
38 40
39#define PSTORE_NAMELEN 64 41#define PSTORE_NAMELEN 64
40 42
43static DEFINE_SPINLOCK(allpstore_lock);
44static LIST_HEAD(allpstore);
45
41struct pstore_private { 46struct pstore_private {
47 struct list_head list;
42 struct pstore_info *psi; 48 struct pstore_info *psi;
43 enum pstore_type_id type; 49 enum pstore_type_id type;
44 u64 id; 50 u64 id;
@@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
81 87
82static void pstore_evict_inode(struct inode *inode) 88static void pstore_evict_inode(struct inode *inode)
83{ 89{
90 struct pstore_private *p = inode->i_private;
91 unsigned long flags;
92
84 end_writeback(inode); 93 end_writeback(inode);
85 kfree(inode->i_private); 94 if (p) {
95 spin_lock_irqsave(&allpstore_lock, flags);
96 list_del(&p->list);
97 spin_unlock_irqrestore(&allpstore_lock, flags);
98 kfree(p);
99 }
86} 100}
87 101
88static const struct inode_operations pstore_dir_inode_operations = { 102static const struct inode_operations pstore_dir_inode_operations = {
@@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
182 struct dentry *root = pstore_sb->s_root; 196 struct dentry *root = pstore_sb->s_root;
183 struct dentry *dentry; 197 struct dentry *dentry;
184 struct inode *inode; 198 struct inode *inode;
185 int rc; 199 int rc = 0;
186 char name[PSTORE_NAMELEN]; 200 char name[PSTORE_NAMELEN];
187 struct pstore_private *private; 201 struct pstore_private *private, *pos;
202 unsigned long flags;
203
204 spin_lock_irqsave(&allpstore_lock, flags);
205 list_for_each_entry(pos, &allpstore, list) {
206 if (pos->type == type &&
207 pos->id == id &&
208 pos->psi == psi) {
209 rc = -EEXIST;
210 break;
211 }
212 }
213 spin_unlock_irqrestore(&allpstore_lock, flags);
214 if (rc)
215 return rc;
188 216
189 rc = -ENOMEM; 217 rc = -ENOMEM;
190 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); 218 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
@@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
229 257
230 d_add(dentry, inode); 258 d_add(dentry, inode);
231 259
260 spin_lock_irqsave(&allpstore_lock, flags);
261 list_add(&private->list, &allpstore);
262 spin_unlock_irqrestore(&allpstore_lock, flags);
263
232 mutex_unlock(&root->d_inode->i_mutex); 264 mutex_unlock(&root->d_inode->i_mutex);
233 265
234 return 0; 266 return 0;
@@ -277,7 +309,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)
277 goto fail; 309 goto fail;
278 } 310 }
279 311
280 pstore_get_records(); 312 pstore_get_records(0);
281 313
282 return 0; 314 return 0;
283fail: 315fail:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 611c1b3c46fa..3bde461c3f34 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -1,5 +1,5 @@
1extern void pstore_set_kmsg_bytes(int); 1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void); 2extern void pstore_get_records(int);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size, 4 char *data, size_t size,
5 struct timespec time, struct pstore_info *psi); 5 struct timespec time, struct pstore_info *psi);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c5300ec31696..2bd620f0d796 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -25,12 +25,30 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/pstore.h> 26#include <linux/pstore.h>
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/timer.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/hardirq.h>
32#include <linux/workqueue.h>
30 33
31#include "internal.h" 34#include "internal.h"
32 35
33/* 36/*
37 * We defer making "oops" entries appear in pstore - see
38 * whether the system is actually still running well enough
39 * to let someone see the entry
40 */
41#define PSTORE_INTERVAL (60 * HZ)
42
43static int pstore_new_entry;
44
45static void pstore_timefunc(unsigned long);
46static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
47
48static void pstore_dowork(struct work_struct *);
49static DECLARE_WORK(pstore_work, pstore_dowork);
50
51/*
34 * pstore_lock just protects "psinfo" during 52 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register() 53 * calls to pstore_register()
36 */ 54 */
@@ -69,15 +87,22 @@ static void pstore_dump(struct kmsg_dumper *dumper,
69 unsigned long size, total = 0; 87 unsigned long size, total = 0;
70 char *dst, *why; 88 char *dst, *why;
71 u64 id; 89 u64 id;
72 int hsize; 90 int hsize, ret;
73 unsigned int part = 1; 91 unsigned int part = 1;
92 unsigned long flags = 0;
93 int is_locked = 0;
74 94
75 if (reason < ARRAY_SIZE(reason_str)) 95 if (reason < ARRAY_SIZE(reason_str))
76 why = reason_str[reason]; 96 why = reason_str[reason];
77 else 97 else
78 why = "Unknown"; 98 why = "Unknown";
79 99
80 mutex_lock(&psinfo->buf_mutex); 100 if (in_nmi()) {
101 is_locked = spin_trylock(&psinfo->buf_lock);
102 if (!is_locked)
103 pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
104 } else
105 spin_lock_irqsave(&psinfo->buf_lock, flags);
81 oopscount++; 106 oopscount++;
82 while (total < kmsg_bytes) { 107 while (total < kmsg_bytes) {
83 dst = psinfo->buf; 108 dst = psinfo->buf;
@@ -97,18 +122,20 @@ static void pstore_dump(struct kmsg_dumper *dumper,
97 memcpy(dst, s1 + s1_start, l1_cpy); 122 memcpy(dst, s1 + s1_start, l1_cpy);
98 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); 123 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
99 124
100 id = psinfo->write(PSTORE_TYPE_DMESG, part, 125 ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part,
101 hsize + l1_cpy + l2_cpy, psinfo); 126 hsize + l1_cpy + l2_cpy, psinfo);
102 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 127 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
103 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, 128 pstore_new_entry = 1;
104 psinfo->buf, hsize + l1_cpy + l2_cpy,
105 CURRENT_TIME, psinfo);
106 l1 -= l1_cpy; 129 l1 -= l1_cpy;
107 l2 -= l2_cpy; 130 l2 -= l2_cpy;
108 total += l1_cpy + l2_cpy; 131 total += l1_cpy + l2_cpy;
109 part++; 132 part++;
110 } 133 }
111 mutex_unlock(&psinfo->buf_mutex); 134 if (in_nmi()) {
135 if (is_locked)
136 spin_unlock(&psinfo->buf_lock);
137 } else
138 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
112} 139}
113 140
114static struct kmsg_dumper pstore_dumper = { 141static struct kmsg_dumper pstore_dumper = {
@@ -148,19 +175,24 @@ int pstore_register(struct pstore_info *psi)
148 } 175 }
149 176
150 if (pstore_is_mounted()) 177 if (pstore_is_mounted())
151 pstore_get_records(); 178 pstore_get_records(0);
152 179
153 kmsg_dump_register(&pstore_dumper); 180 kmsg_dump_register(&pstore_dumper);
154 181
182 pstore_timer.expires = jiffies + PSTORE_INTERVAL;
183 add_timer(&pstore_timer);
184
155 return 0; 185 return 0;
156} 186}
157EXPORT_SYMBOL_GPL(pstore_register); 187EXPORT_SYMBOL_GPL(pstore_register);
158 188
159/* 189/*
160 * Read all the records from the persistent store. Create and 190 * Read all the records from the persistent store. Create
161 * file files in our filesystem. 191 * files in our filesystem. Don't warn about -EEXIST errors
192 * when we are re-scanning the backing store looking to add new
193 * error records.
162 */ 194 */
163void pstore_get_records(void) 195void pstore_get_records(int quiet)
164{ 196{
165 struct pstore_info *psi = psinfo; 197 struct pstore_info *psi = psinfo;
166 ssize_t size; 198 ssize_t size;
@@ -168,36 +200,55 @@ void pstore_get_records(void)
168 enum pstore_type_id type; 200 enum pstore_type_id type;
169 struct timespec time; 201 struct timespec time;
170 int failed = 0, rc; 202 int failed = 0, rc;
203 unsigned long flags;
171 204
172 if (!psi) 205 if (!psi)
173 return; 206 return;
174 207
175 mutex_lock(&psinfo->buf_mutex); 208 spin_lock_irqsave(&psinfo->buf_lock, flags);
176 rc = psi->open(psi); 209 rc = psi->open(psi);
177 if (rc) 210 if (rc)
178 goto out; 211 goto out;
179 212
180 while ((size = psi->read(&id, &type, &time, psi)) > 0) { 213 while ((size = psi->read(&id, &type, &time, psi)) > 0) {
181 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 214 rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
182 time, psi)) 215 time, psi);
216 if (rc && (rc != -EEXIST || !quiet))
183 failed++; 217 failed++;
184 } 218 }
185 psi->close(psi); 219 psi->close(psi);
186out: 220out:
187 mutex_unlock(&psinfo->buf_mutex); 221 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
188 222
189 if (failed) 223 if (failed)
190 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", 224 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
191 failed, psi->name); 225 failed, psi->name);
192} 226}
193 227
228static void pstore_dowork(struct work_struct *work)
229{
230 pstore_get_records(1);
231}
232
233static void pstore_timefunc(unsigned long dummy)
234{
235 if (pstore_new_entry) {
236 pstore_new_entry = 0;
237 schedule_work(&pstore_work);
238 }
239
240 mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL);
241}
242
194/* 243/*
195 * Call platform driver to write a record to the 244 * Call platform driver to write a record to the
196 * persistent store. 245 * persistent store.
197 */ 246 */
198int pstore_write(enum pstore_type_id type, char *buf, size_t size) 247int pstore_write(enum pstore_type_id type, char *buf, size_t size)
199{ 248{
200 u64 id; 249 u64 id;
250 int ret;
251 unsigned long flags;
201 252
202 if (!psinfo) 253 if (!psinfo)
203 return -ENODEV; 254 return -ENODEV;
@@ -205,13 +256,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
205 if (size > psinfo->bufsize) 256 if (size > psinfo->bufsize)
206 return -EFBIG; 257 return -EFBIG;
207 258
208 mutex_lock(&psinfo->buf_mutex); 259 spin_lock_irqsave(&psinfo->buf_lock, flags);
209 memcpy(psinfo->buf, buf, size); 260 memcpy(psinfo->buf, buf, size);
210 id = psinfo->write(type, 0, size, psinfo); 261 ret = psinfo->write(type, &id, 0, size, psinfo);
211 if (pstore_is_mounted()) 262 if (ret == 0 && pstore_is_mounted())
212 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, 263 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
213 size, CURRENT_TIME, psinfo); 264 size, CURRENT_TIME, psinfo);
214 mutex_unlock(&psinfo->buf_mutex); 265 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
215 266
216 return 0; 267 return 0;
217} 268}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2b0646613f5a..3bdd21418432 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
379 inode->i_mode = le16_to_cpu(raw_inode->di_mode); 379 inode->i_mode = le16_to_cpu(raw_inode->di_mode);
380 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); 380 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid);
381 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); 381 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid);
382 inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); 382 set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
383 inode->i_size = le32_to_cpu(raw_inode->di_size); 383 inode->i_size = le32_to_cpu(raw_inode->di_size);
384 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); 384 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
385 inode->i_mtime.tv_nsec = 0; 385 inode->i_mtime.tv_nsec = 0;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 10b6be3ca280..35f4b0ecdeb3 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
@@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
363 } 363 }
364 364
365 sb = quotactl_block(special); 365 sb = quotactl_block(special);
366 if (IS_ERR(sb)) 366 if (IS_ERR(sb)) {
367 return PTR_ERR(sb); 367 ret = PTR_ERR(sb);
368 goto out;
369 }
368 370
369 ret = do_quotactl(sb, type, cmds, id, addr, pathp); 371 ret = do_quotactl(sb, type, cmds, id, addr, pathp);
370 372
371 drop_super(sb); 373 drop_super(sb);
374out:
372 if (pathp && !IS_ERR(pathp)) 375 if (pathp && !IS_ERR(pathp))
373 path_put(pathp); 376 path_put(pathp);
374 return ret; 377 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eacb166fb259..462ceb38fec6 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -23,7 +23,6 @@
23 * caches is sufficient. 23 * caches is sufficient.
24 */ 24 */
25 25
26#include <linux/module.h>
27#include <linux/fs.h> 26#include <linux/fs.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
@@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void)
288{ 287{
289 return register_filesystem(&ramfs_fs_type); 288 return register_filesystem(&ramfs_fs_type);
290} 289}
291
292static void __exit exit_ramfs_fs(void)
293{
294 unregister_filesystem(&ramfs_fs_type);
295}
296
297module_init(init_ramfs_fs) 290module_init(init_ramfs_fs)
298module_exit(exit_ramfs_fs)
299 291
300int __init init_rootfs(void) 292int __init init_rootfs(void)
301{ 293{
@@ -311,5 +303,3 @@ int __init init_rootfs(void)
311 303
312 return err; 304 return err;
313} 305}
314
315MODULE_LICENSE("GPL");
diff --git a/fs/read_write.c b/fs/read_write.c
index dfd125798791..5ad4248b0cd8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -633,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
634 unsigned long nr_segs, unsigned long fast_segs, 634 unsigned long nr_segs, unsigned long fast_segs,
635 struct iovec *fast_pointer, 635 struct iovec *fast_pointer,
636 struct iovec **ret_pointer) 636 struct iovec **ret_pointer,
637 int check_access)
637{ 638{
638 unsigned long seg; 639 unsigned long seg;
639 ssize_t ret; 640 ssize_t ret;
@@ -689,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
689 ret = -EINVAL; 690 ret = -EINVAL;
690 goto out; 691 goto out;
691 } 692 }
692 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { 693 if (check_access
694 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
693 ret = -EFAULT; 695 ret = -EFAULT;
694 goto out; 696 goto out;
695 } 697 }
@@ -721,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
721 } 723 }
722 724
723 ret = rw_copy_check_uvector(type, uvector, nr_segs, 725 ret = rw_copy_check_uvector(type, uvector, nr_segs,
724 ARRAY_SIZE(iovstack), iovstack, &iov); 726 ARRAY_SIZE(iovstack), iovstack, &iov, 1);
725 if (ret <= 0) 727 if (ret <= 0)
726 goto out; 728 goto out;
727 729
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9b0d4b78b4fb..950f13af0951 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1155 set_inode_sd_version(inode, STAT_DATA_V1); 1155 set_inode_sd_version(inode, STAT_DATA_V1);
1156 inode->i_mode = sd_v1_mode(sd); 1156 inode->i_mode = sd_v1_mode(sd);
1157 inode->i_nlink = sd_v1_nlink(sd); 1157 set_nlink(inode, sd_v1_nlink(sd));
1158 inode->i_uid = sd_v1_uid(sd); 1158 inode->i_uid = sd_v1_uid(sd);
1159 inode->i_gid = sd_v1_gid(sd); 1159 inode->i_gid = sd_v1_gid(sd);
1160 inode->i_size = sd_v1_size(sd); 1160 inode->i_size = sd_v1_size(sd);
@@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); 1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1200 1200
1201 inode->i_mode = sd_v2_mode(sd); 1201 inode->i_mode = sd_v2_mode(sd);
1202 inode->i_nlink = sd_v2_nlink(sd); 1202 set_nlink(inode, sd_v2_nlink(sd));
1203 inode->i_uid = sd_v2_uid(sd); 1203 inode->i_uid = sd_v2_uid(sd);
1204 inode->i_size = sd_v2_size(sd); 1204 inode->i_size = sd_v2_size(sd);
1205 inode->i_gid = sd_v2_gid(sd); 1205 inode->i_gid = sd_v2_gid(sd);
@@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
1444 /* a stale NFS handle can trigger this without it being an error */ 1444 /* a stale NFS handle can trigger this without it being an error */
1445 pathrelse(&path_to_sd); 1445 pathrelse(&path_to_sd);
1446 reiserfs_make_bad_inode(inode); 1446 reiserfs_make_bad_inode(inode);
1447 inode->i_nlink = 0; 1447 clear_nlink(inode);
1448 return; 1448 return;
1449 } 1449 }
1450 1450
@@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1832#endif 1832#endif
1833 1833
1834 /* fill stat data */ 1834 /* fill stat data */
1835 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); 1835 set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
1836 1836
1837 /* uid and gid must already be set by the caller for quota init */ 1837 /* uid and gid must already be set by the caller for quota init */
1838 1838
@@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1987 make_bad_inode(inode); 1987 make_bad_inode(inode);
1988 1988
1989 out_inserted_sd: 1989 out_inserted_sd:
1990 inode->i_nlink = 0; 1990 clear_nlink(inode);
1991 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1991 th->t_trans_id = 0; /* so the caller can't use this handle later */
1992 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ 1992 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1993 iput(inode); 1993 iput(inode);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ef392324bbf1..80058e8ce361 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
19#include <linux/reiserfs_xattr.h> 19#include <linux/reiserfs_xattr.h>
20#include <linux/quotaops.h> 20#include <linux/quotaops.h>
21 21
22#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } 22#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
23#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); 23#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
24 24
25// directory item contains array of entry headers. This performs 25// directory item contains array of entry headers. This performs
@@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
622 dentry->d_name.len, inode, 1 /*visible */ ); 622 dentry->d_name.len, inode, 1 /*visible */ );
623 if (retval) { 623 if (retval) {
624 int err; 624 int err;
625 inode->i_nlink--; 625 drop_nlink(inode);
626 reiserfs_update_sd(&th, inode); 626 reiserfs_update_sd(&th, inode);
627 err = journal_end(&th, dir->i_sb, jbegin_count); 627 err = journal_end(&th, dir->i_sb, jbegin_count);
628 if (err) 628 if (err)
@@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
702 dentry->d_name.len, inode, 1 /*visible */ ); 702 dentry->d_name.len, inode, 1 /*visible */ );
703 if (retval) { 703 if (retval) {
704 int err; 704 int err;
705 inode->i_nlink--; 705 drop_nlink(inode);
706 reiserfs_update_sd(&th, inode); 706 reiserfs_update_sd(&th, inode);
707 err = journal_end(&th, dir->i_sb, jbegin_count); 707 err = journal_end(&th, dir->i_sb, jbegin_count);
708 if (err) 708 if (err)
@@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
787 dentry->d_name.len, inode, 1 /*visible */ ); 787 dentry->d_name.len, inode, 1 /*visible */ );
788 if (retval) { 788 if (retval) {
789 int err; 789 int err;
790 inode->i_nlink = 0; 790 clear_nlink(inode);
791 DEC_DIR_INODE_NLINK(dir); 791 DEC_DIR_INODE_NLINK(dir);
792 reiserfs_update_sd(&th, inode); 792 reiserfs_update_sd(&th, inode);
793 err = journal_end(&th, dir->i_sb, jbegin_count); 793 err = journal_end(&th, dir->i_sb, jbegin_count);
@@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
964 reiserfs_warning(inode->i_sb, "reiserfs-7042", 964 reiserfs_warning(inode->i_sb, "reiserfs-7042",
965 "deleting nonexistent file (%lu), %d", 965 "deleting nonexistent file (%lu), %d",
966 inode->i_ino, inode->i_nlink); 966 inode->i_ino, inode->i_nlink);
967 inode->i_nlink = 1; 967 set_nlink(inode, 1);
968 } 968 }
969 969
970 drop_nlink(inode); 970 drop_nlink(inode);
@@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
1086 dentry->d_name.len, inode, 1 /*visible */ ); 1086 dentry->d_name.len, inode, 1 /*visible */ );
1087 if (retval) { 1087 if (retval) {
1088 int err; 1088 int err;
1089 inode->i_nlink--; 1089 drop_nlink(inode);
1090 reiserfs_update_sd(&th, inode); 1090 reiserfs_update_sd(&th, inode);
1091 err = journal_end(&th, parent_dir->i_sb, jbegin_count); 1091 err = journal_end(&th, parent_dir->i_sb, jbegin_count);
1092 if (err) 1092 if (err)
@@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1129 1129
1130 retval = journal_begin(&th, dir->i_sb, jbegin_count); 1130 retval = journal_begin(&th, dir->i_sb, jbegin_count);
1131 if (retval) { 1131 if (retval) {
1132 inode->i_nlink--; 1132 drop_nlink(inode);
1133 reiserfs_write_unlock(dir->i_sb); 1133 reiserfs_write_unlock(dir->i_sb);
1134 return retval; 1134 return retval;
1135 } 1135 }
@@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1144 1144
1145 if (retval) { 1145 if (retval) {
1146 int err; 1146 int err;
1147 inode->i_nlink--; 1147 drop_nlink(inode);
1148 err = journal_end(&th, dir->i_sb, jbegin_count); 1148 err = journal_end(&th, dir->i_sb, jbegin_count);
1149 reiserfs_write_unlock(dir->i_sb); 1149 reiserfs_write_unlock(dir->i_sb);
1150 return err ? err : retval; 1150 return err ? err : retval;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 2305e3121cb1..8b4089f30408 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
337 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; 337 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
338 inode->i_dataoffset = pos + inode->i_metasize; 338 inode->i_dataoffset = pos + inode->i_metasize;
339 339
340 i->i_nlink = 1; /* Hard to decide.. */ 340 set_nlink(i, 1); /* Hard to decide.. */
341 i->i_size = be32_to_cpu(ri.size); 341 i->i_size = be32_to_cpu(ri.size);
342 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; 342 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
343 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 343 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f0..c70111ebefd4 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
87 (if larger). This, because blocks are packed together and
88 unaligned in Squashfs, should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 04bebcaa2373..fd7b3b3bda13 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
159 frag_offset = 0; 159 frag_offset = 0;
160 } 160 }
161 161
162 inode->i_nlink = 1; 162 set_nlink(inode, 1);
163 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 163 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
164 inode->i_fop = &generic_ro_fops; 164 inode->i_fop = &generic_ro_fops;
165 inode->i_mode |= S_IFREG; 165 inode->i_mode |= S_IFREG;
@@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
203 } 203 }
204 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr); 205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
207 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops; 208 inode->i_op = &squashfs_inode_ops;
209 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
@@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
232 if (err < 0) 232 if (err < 0)
233 goto failed_read; 233 goto failed_read;
234 234
235 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 235 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
236 inode->i_size = le16_to_cpu(sqsh_ino->file_size); 236 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
237 inode->i_op = &squashfs_dir_inode_ops; 237 inode->i_op = &squashfs_dir_inode_ops;
238 inode->i_fop = &squashfs_dir_ops; 238 inode->i_fop = &squashfs_dir_ops;
@@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
257 goto failed_read; 257 goto failed_read;
258 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr); 259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
261 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
262 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
263 inode->i_fop = &squashfs_dir_ops; 263 inode->i_fop = &squashfs_dir_ops;
@@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
284 if (err < 0) 284 if (err < 0)
285 goto failed_read; 285 goto failed_read;
286 286
287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
289 inode->i_op = &squashfs_symlink_inode_ops; 289 inode->i_op = &squashfs_symlink_inode_ops;
290 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
@@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
325 inode->i_mode |= S_IFCHR; 325 inode->i_mode |= S_IFCHR;
326 else 326 else
327 inode->i_mode |= S_IFBLK; 327 inode->i_mode |= S_IFBLK;
328 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 328 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
329 rdev = le32_to_cpu(sqsh_ino->rdev); 329 rdev = le32_to_cpu(sqsh_ino->rdev);
330 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 330 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
331 331
@@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
349 inode->i_mode |= S_IFBLK; 349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr); 350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops; 351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 352 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
353 rdev = le32_to_cpu(sqsh_ino->rdev); 353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355 355
@@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
370 inode->i_mode |= S_IFIFO; 370 inode->i_mode |= S_IFIFO;
371 else 371 else
372 inode->i_mode |= S_IFSOCK; 372 inode->i_mode |= S_IFSOCK;
373 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 373 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
374 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
375 break; 375 break;
376 } 376 }
@@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
389 inode->i_mode |= S_IFSOCK; 389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr); 390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops; 391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 392 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
393 init_special_inode(inode, inode->i_mode, 0); 393 init_special_inode(inode, inode->i_mode, 0);
394 break; 394 break;
395 } 395 }
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08c..e8e14645de9a 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d0..2da1715452ac 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
diff --git a/fs/stack.c b/fs/stack.c
index b4f2ab48a61f..9c11519245a6 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
71 dest->i_ctime = src->i_ctime; 71 dest->i_ctime = src->i_ctime;
72 dest->i_blkbits = src->i_blkbits; 72 dest->i_blkbits = src->i_blkbits;
73 dest->i_flags = src->i_flags; 73 dest->i_flags = src->i_flags;
74 dest->i_nlink = src->i_nlink; 74 set_nlink(dest, src->i_nlink);
75} 75}
76EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); 76EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/fs/stat.c b/fs/stat.c
index 78a3aa83c7ea..8806b8997d2e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
294{ 294{
295 struct path path; 295 struct path path;
296 int error; 296 int error;
297 int empty = 0;
297 298
298 if (bufsiz <= 0) 299 if (bufsiz <= 0)
299 return -EINVAL; 300 return -EINVAL;
300 301
301 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); 302 error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
302 if (!error) { 303 if (!error) {
303 struct inode *inode = path.dentry->d_inode; 304 struct inode *inode = path.dentry->d_inode;
304 305
305 error = -EINVAL; 306 error = empty ? -ENOENT : -EINVAL;
306 if (inode->i_op->readlink) { 307 if (inode->i_op->readlink) {
307 error = security_inode_readlink(path.dentry); 308 error = security_inode_readlink(path.dentry);
308 if (!error) { 309 if (!error) {
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec55..9cf04a118965 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
diff --git a/fs/super.c b/fs/super.c
index 3f56a269a4f4..afd0f1ad45e0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
61 return -1; 61 return -1;
62 62
63 if (!grab_super_passive(sb)) 63 if (!grab_super_passive(sb))
64 return -1; 64 return !sc->nr_to_scan ? 0 : -1;
65 65
66 if (sb->s_op && sb->s_op->nr_cached_objects) 66 if (sb->s_op && sb->s_op->nr_cached_objects)
67 fs_objects = sb->s_op->nr_cached_objects(sb); 67 fs_objects = sb->s_op->nr_cached_objects(sb);
@@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
727 727
728 if (sb->s_op->remount_fs) { 728 if (sb->s_op->remount_fs) {
729 retval = sb->s_op->remount_fs(sb, &flags, data); 729 retval = sb->s_op->remount_fs(sb, &flags, data);
730 if (retval) 730 if (retval) {
731 return retval; 731 if (!force)
732 return retval;
733 /* If forced remount, go ahead despite any errors */
734 WARN(1, "forced remount of a %s fs returned %i\n",
735 sb->s_type->name, retval);
736 }
732 } 737 }
733 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 738 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
734 739
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edfd..101b8ef901d7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 48ffbdf0d017..7fdf6a7b7436 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -865,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd,
865 sd->s_name = new_name; 865 sd->s_name = new_name;
866 } 866 }
867 867
868 /* Remove from old parent's list and insert into new parent's list. */ 868 /* Move to the appropriate place in the appropriate directories rbtree. */
869 if (sd->s_parent != new_parent_sd) { 869 sysfs_unlink_sibling(sd);
870 sysfs_unlink_sibling(sd); 870 sysfs_get(new_parent_sd);
871 sysfs_get(new_parent_sd); 871 sysfs_put(sd->s_parent);
872 sysfs_put(sd->s_parent);
873 sd->s_parent = new_parent_sd;
874 sysfs_link_sibling(sd);
875 }
876 sd->s_ns = new_ns; 872 sd->s_ns = new_ns;
873 sd->s_parent = new_parent_sd;
874 sysfs_link_sibling(sd);
877 875
878 error = 0; 876 error = 0;
879 out: 877 out:
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e23f28894a3a..c81b22f3ace1 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -218,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
218 } 218 }
219 219
220 if (sysfs_type(sd) == SYSFS_DIR) 220 if (sysfs_type(sd) == SYSFS_DIR)
221 inode->i_nlink = sd->s_dir.subdirs + 2; 221 set_nlink(inode, sd->s_dir.subdirs + 2);
222} 222}
223 223
224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0630eb969a28..25ffb3e9a3f8 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
219 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); 219 inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
220 inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); 220 inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid);
221 inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); 221 inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid);
222 inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); 222 set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
223 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); 223 inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
224 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); 224 inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
225 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); 225 inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b2..bc4f94b28706 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b28121278d46..20403dc5d437 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
129 goto out_ino; 129 goto out_ino;
130 130
131 inode->i_flags |= (S_NOCMTIME | S_NOATIME); 131 inode->i_flags |= (S_NOCMTIME | S_NOATIME);
132 inode->i_nlink = le32_to_cpu(ino->nlink); 132 set_nlink(inode, le32_to_cpu(ino->nlink));
133 inode->i_uid = le32_to_cpu(ino->uid); 133 inode->i_uid = le32_to_cpu(ino->uid);
134 inode->i_gid = le32_to_cpu(ino->gid); 134 inode->i_gid = le32_to_cpu(ino->gid);
135 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); 135 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 16f19f55e63f..bf18f7a04544 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
558 } 558 }
559 559
560 ubifs_assert(inode->i_nlink == 1); 560 ubifs_assert(inode->i_nlink == 1);
561 inode->i_nlink = 0; 561 clear_nlink(inode);
562 err = remove_xattr(c, host, inode, &nm); 562 err = remove_xattr(c, host, inode, &nm);
563 if (err) 563 if (err)
564 inode->i_nlink = 1; 564 set_nlink(inode, 1);
565 565
566 /* If @i_nlink is 0, 'iput()' will delete the inode */ 566 /* If @i_nlink is 0, 'iput()' will delete the inode */
567 iput(inode); 567 iput(inode);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 95518a9f589e..987585bb0a1d 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb,
59 int nr_groups = bitmap->s_nr_groups; 59 int nr_groups = bitmap->s_nr_groups;
60 60
61 if (block_group >= nr_groups) { 61 if (block_group >= nr_groups) {
62 udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, 62 udf_debug("block_group (%d) > nr_groups (%d)\n",
63 nr_groups); 63 block_group, nr_groups);
64 } 64 }
65 65
66 if (bitmap->s_block_bitmap[block_group]) { 66 if (bitmap->s_block_bitmap[block_group]) {
@@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
126 if (bloc->logicalBlockNum + count < count || 126 if (bloc->logicalBlockNum + count < count ||
127 (bloc->logicalBlockNum + count) > partmap->s_partition_len) { 127 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
128 udf_debug("%d < %d || %d + %d > %d\n", 128 udf_debug("%d < %d || %d + %d > %d\n",
129 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 129 bloc->logicalBlockNum, 0,
130 count, partmap->s_partition_len); 130 bloc->logicalBlockNum, count,
131 partmap->s_partition_len);
131 goto error_return; 132 goto error_return;
132 } 133 }
133 134
@@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
155 if (udf_set_bit(bit + i, bh->b_data)) { 156 if (udf_set_bit(bit + i, bh->b_data)) {
156 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
157 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
158 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
159 } 160 }
160 } 161 }
161 udf_add_free_space(sb, sbi->s_partition, count); 162 udf_add_free_space(sb, sbi->s_partition, count);
@@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb,
369 if (bloc->logicalBlockNum + count < count || 370 if (bloc->logicalBlockNum + count < count ||
370 (bloc->logicalBlockNum + count) > partmap->s_partition_len) { 371 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
371 udf_debug("%d < %d || %d + %d > %d\n", 372 udf_debug("%d < %d || %d + %d > %d\n",
372 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 373 bloc->logicalBlockNum, 0,
374 bloc->logicalBlockNum, count,
373 partmap->s_partition_len); 375 partmap->s_partition_len);
374 goto error_return; 376 goto error_return;
375 } 377 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2ffdb6733af1..3e44f575fb9c 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
162 int padlen; 162 int padlen;
163 163
164 if ((!buffer) || (!offset)) { 164 if ((!buffer) || (!offset)) {
165 udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, 165 udf_debug("invalidparms, buffer=%p, offset=%p\n",
166 offset); 166 buffer, offset);
167 return NULL; 167 return NULL;
168 } 168 }
169 169
@@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs
201 struct short_ad *sa; 201 struct short_ad *sa;
202 202
203 if ((!ptr) || (!offset)) { 203 if ((!ptr) || (!offset)) {
204 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); 204 pr_err("%s: invalidparms\n", __func__);
205 return NULL; 205 return NULL;
206 } 206 }
207 207
@@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset
223 struct long_ad *la; 223 struct long_ad *la;
224 224
225 if ((!ptr) || (!offset)) { 225 if ((!ptr) || (!offset)) {
226 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); 226 pr_err("%s: invalidparms\n", __func__);
227 return NULL; 227 return NULL;
228 } 228 }
229 229
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 1d1358ed80c1..4fd1d809738c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/crc-itu-t.h> 39#include <linux/crc-itu-t.h>
40#include <linux/mpage.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -83,12 +84,10 @@ void udf_evict_inode(struct inode *inode)
83 end_writeback(inode); 84 end_writeback(inode);
84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 85 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
85 inode->i_size != iinfo->i_lenExtents) { 86 inode->i_size != iinfo->i_lenExtents) {
86 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 87 udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
87 "inode size %llu different from extent length %llu. " 88 inode->i_ino, inode->i_mode,
88 "Filesystem need not be standards compliant.\n", 89 (unsigned long long)inode->i_size,
89 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 90 (unsigned long long)iinfo->i_lenExtents);
90 (unsigned long long)inode->i_size,
91 (unsigned long long)iinfo->i_lenExtents);
92 } 91 }
93 kfree(iinfo->i_ext.i_data); 92 kfree(iinfo->i_ext.i_data);
94 iinfo->i_ext.i_data = NULL; 93 iinfo->i_ext.i_data = NULL;
@@ -104,7 +103,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc)
104 103
105static int udf_readpage(struct file *file, struct page *page) 104static int udf_readpage(struct file *file, struct page *page)
106{ 105{
107 return block_read_full_page(page, udf_get_block); 106 return mpage_readpage(page, udf_get_block);
107}
108
109static int udf_readpages(struct file *file, struct address_space *mapping,
110 struct list_head *pages, unsigned nr_pages)
111{
112 return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
108} 113}
109 114
110static int udf_write_begin(struct file *file, struct address_space *mapping, 115static int udf_write_begin(struct file *file, struct address_space *mapping,
@@ -139,6 +144,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
139 144
140const struct address_space_operations udf_aops = { 145const struct address_space_operations udf_aops = {
141 .readpage = udf_readpage, 146 .readpage = udf_readpage,
147 .readpages = udf_readpages,
142 .writepage = udf_writepage, 148 .writepage = udf_writepage,
143 .write_begin = udf_write_begin, 149 .write_begin = udf_write_begin,
144 .write_end = generic_write_end, 150 .write_end = generic_write_end,
@@ -1169,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode)
1169 */ 1175 */
1170 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); 1176 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
1171 if (!bh) { 1177 if (!bh) {
1172 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", 1178 udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
1173 inode->i_ino);
1174 make_bad_inode(inode); 1179 make_bad_inode(inode);
1175 return; 1180 return;
1176 } 1181 }
1177 1182
1178 if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && 1183 if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
1179 ident != TAG_IDENT_USE) { 1184 ident != TAG_IDENT_USE) {
1180 printk(KERN_ERR "udf: udf_read_inode(ino %ld) " 1185 udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
1181 "failed ident=%d\n", inode->i_ino, ident); 1186 inode->i_ino, ident);
1182 brelse(bh); 1187 brelse(bh);
1183 make_bad_inode(inode); 1188 make_bad_inode(inode);
1184 return; 1189 return;
@@ -1218,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode)
1218 } 1223 }
1219 brelse(ibh); 1224 brelse(ibh);
1220 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { 1225 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
1221 printk(KERN_ERR "udf: unsupported strategy type: %d\n", 1226 udf_err(inode->i_sb, "unsupported strategy type: %d\n",
1222 le16_to_cpu(fe->icbTag.strategyType)); 1227 le16_to_cpu(fe->icbTag.strategyType));
1223 brelse(bh); 1228 brelse(bh);
1224 make_bad_inode(inode); 1229 make_bad_inode(inode);
1225 return; 1230 return;
@@ -1236,6 +1241,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1236 int offset; 1241 int offset;
1237 struct udf_sb_info *sbi = UDF_SB(inode->i_sb); 1242 struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
1238 struct udf_inode_info *iinfo = UDF_I(inode); 1243 struct udf_inode_info *iinfo = UDF_I(inode);
1244 unsigned int link_count;
1239 1245
1240 fe = (struct fileEntry *)bh->b_data; 1246 fe = (struct fileEntry *)bh->b_data;
1241 efe = (struct extendedFileEntry *)bh->b_data; 1247 efe = (struct extendedFileEntry *)bh->b_data;
@@ -1318,9 +1324,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1318 inode->i_mode &= ~sbi->s_umask; 1324 inode->i_mode &= ~sbi->s_umask;
1319 read_unlock(&sbi->s_cred_lock); 1325 read_unlock(&sbi->s_cred_lock);
1320 1326
1321 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1327 link_count = le16_to_cpu(fe->fileLinkCount);
1322 if (!inode->i_nlink) 1328 if (!link_count)
1323 inode->i_nlink = 1; 1329 link_count = 1;
1330 set_nlink(inode, link_count);
1324 1331
1325 inode->i_size = le64_to_cpu(fe->informationLength); 1332 inode->i_size = le64_to_cpu(fe->informationLength);
1326 iinfo->i_lenExtents = inode->i_size; 1333 iinfo->i_lenExtents = inode->i_size;
@@ -1413,9 +1420,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1413 udf_debug("METADATA BITMAP FILE-----\n"); 1420 udf_debug("METADATA BITMAP FILE-----\n");
1414 break; 1421 break;
1415 default: 1422 default:
1416 printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " 1423 udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
1417 "file type=%d\n", inode->i_ino, 1424 inode->i_ino, fe->icbTag.fileType);
1418 fe->icbTag.fileType);
1419 make_bad_inode(inode); 1425 make_bad_inode(inode);
1420 return; 1426 return;
1421 } 1427 }
@@ -1438,8 +1444,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
1438 iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); 1444 iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
1439 1445
1440 if (!iinfo->i_ext.i_data) { 1446 if (!iinfo->i_ext.i_data) {
1441 printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " 1447 udf_err(inode->i_sb, "(ino %ld) no free memory\n",
1442 "no free memory\n", inode->i_ino); 1448 inode->i_ino);
1443 return -ENOMEM; 1449 return -ENOMEM;
1444 } 1450 }
1445 1451
@@ -1689,9 +1695,8 @@ out:
1689 if (do_sync) { 1695 if (do_sync) {
1690 sync_dirty_buffer(bh); 1696 sync_dirty_buffer(bh);
1691 if (buffer_write_io_error(bh)) { 1697 if (buffer_write_io_error(bh)) {
1692 printk(KERN_WARNING "IO error syncing udf inode " 1698 udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
1693 "[%s:%08lx]\n", inode->i_sb->s_id, 1699 inode->i_ino);
1694 inode->i_ino);
1695 err = -EIO; 1700 err = -EIO;
1696 } 1701 }
1697 } 1702 }
@@ -1982,8 +1987,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1982 *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; 1987 *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK;
1983 break; 1988 break;
1984 default: 1989 default:
1985 udf_debug("alloc_type = %d unsupported\n", 1990 udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type);
1986 iinfo->i_alloc_type);
1987 return -1; 1991 return -1;
1988 } 1992 }
1989 1993
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 43e24a3b8e10..6583fe9b0645 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
38 38
39 if (i == 0) { 39 if (i == 0) {
40 udf_debug("XA disk: %s, vol_desc_start=%d\n", 40 udf_debug("XA disk: %s, vol_desc_start=%d\n",
41 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); 41 ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba);
42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
43 vol_desc_start = ms_info.addr.lba; 43 vol_desc_start = ms_info.addr.lba;
44 } else { 44 } else {
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 9215700c00a4..c175b4dabc14 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
204{ 204{
205 struct tag *tag_p; 205 struct tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 u8 checksum;
207 208
208 /* Read the block */ 209 /* Read the block */
209 if (block == 0xFFFFFFFF) 210 if (block == 0xFFFFFFFF)
@@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
211 212
212 bh = udf_tread(sb, block); 213 bh = udf_tread(sb, block);
213 if (!bh) { 214 if (!bh) {
214 udf_debug("block=%d, location=%d: read failed\n", 215 udf_err(sb, "read failed, block=%u, location=%d\n",
215 block, location); 216 block, location);
216 return NULL; 217 return NULL;
217 } 218 }
218 219
@@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
227 } 228 }
228 229
229 /* Verify the tag checksum */ 230 /* Verify the tag checksum */
230 if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { 231 checksum = udf_tag_checksum(tag_p);
231 printk(KERN_ERR "udf: tag checksum failed block %d\n", block); 232 if (checksum != tag_p->tagChecksum) {
233 udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
234 block, checksum, tag_p->tagChecksum);
232 goto error_out; 235 goto error_out;
233 } 236 }
234 237
235 /* Verify the tag version */ 238 /* Verify the tag version */
236 if (tag_p->descVersion != cpu_to_le16(0x0002U) && 239 if (tag_p->descVersion != cpu_to_le16(0x0002U) &&
237 tag_p->descVersion != cpu_to_le16(0x0003U)) { 240 tag_p->descVersion != cpu_to_le16(0x0003U)) {
238 udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", 241 udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
239 le16_to_cpu(tag_p->descVersion), block); 242 le16_to_cpu(tag_p->descVersion), block);
240 goto error_out; 243 goto error_out;
241 } 244 }
242 245
@@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
248 return bh; 251 return bh;
249 252
250 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, 253 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
251 le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); 254 le16_to_cpu(tag_p->descCRC),
252 255 le16_to_cpu(tag_p->descCRCLength));
253error_out: 256error_out:
254 brelse(bh); 257 brelse(bh);
255 return NULL; 258 return NULL;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4639e137222f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
577 577
578 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 578 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
579 if (!fi) { 579 if (!fi) {
580 inode->i_nlink--; 580 inode_dec_link_count(inode);
581 mark_inode_dirty(inode);
582 iput(inode); 581 iput(inode);
583 return err; 582 return err;
584 } 583 }
@@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
618 init_special_inode(inode, mode, rdev); 617 init_special_inode(inode, mode, rdev);
619 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 618 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
620 if (!fi) { 619 if (!fi) {
621 inode->i_nlink--; 620 inode_dec_link_count(inode);
622 mark_inode_dirty(inode);
623 iput(inode); 621 iput(inode);
624 return err; 622 return err;
625 } 623 }
@@ -665,12 +663,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
665 inode->i_fop = &udf_dir_operations; 663 inode->i_fop = &udf_dir_operations;
666 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); 664 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
667 if (!fi) { 665 if (!fi) {
668 inode->i_nlink--; 666 inode_dec_link_count(inode);
669 mark_inode_dirty(inode);
670 iput(inode); 667 iput(inode);
671 goto out; 668 goto out;
672 } 669 }
673 inode->i_nlink = 2; 670 set_nlink(inode, 2);
674 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 671 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
675 cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); 672 cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
676 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 673 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
@@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
683 680
684 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 681 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
685 if (!fi) { 682 if (!fi) {
686 inode->i_nlink = 0; 683 clear_nlink(inode);
687 mark_inode_dirty(inode); 684 mark_inode_dirty(inode);
688 iput(inode); 685 iput(inode);
689 goto out; 686 goto out;
@@ -799,9 +796,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
799 if (retval) 796 if (retval)
800 goto end_rmdir; 797 goto end_rmdir;
801 if (inode->i_nlink != 2) 798 if (inode->i_nlink != 2)
802 udf_warning(inode->i_sb, "udf_rmdir", 799 udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n",
803 "empty directory has nlink != 2 (%d)", 800 inode->i_nlink);
804 inode->i_nlink);
805 clear_nlink(inode); 801 clear_nlink(inode);
806 inode->i_size = 0; 802 inode->i_size = 0;
807 inode_dec_link_count(dir); 803 inode_dec_link_count(dir);
@@ -840,7 +836,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 if (!inode->i_nlink) { 836 if (!inode->i_nlink) {
841 udf_debug("Deleting nonexistent file (%lu), %d\n", 837 udf_debug("Deleting nonexistent file (%lu), %d\n",
842 inode->i_ino, inode->i_nlink); 838 inode->i_ino, inode->i_nlink);
843 inode->i_nlink = 1; 839 set_nlink(inode, 1);
844 } 840 }
845 retval = udf_delete_entry(dir, fi, &fibh, &cfi); 841 retval = udf_delete_entry(dir, fi, &fibh, &cfi);
846 if (retval) 842 if (retval)
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index a71090ea0e07..d6caf01a2097 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
33 struct udf_sb_info *sbi = UDF_SB(sb); 33 struct udf_sb_info *sbi = UDF_SB(sb);
34 struct udf_part_map *map; 34 struct udf_part_map *map;
35 if (partition >= sbi->s_partitions) { 35 if (partition >= sbi->s_partitions) {
36 udf_debug("block=%d, partition=%d, offset=%d: " 36 udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n",
37 "invalid partition\n", block, partition, offset); 37 block, partition, offset);
38 return 0xFFFFFFFF; 38 return 0xFFFFFFFF;
39 } 39 }
40 map = &sbi->s_partmaps[partition]; 40 map = &sbi->s_partmaps[partition];
@@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
60 vdata = &map->s_type_specific.s_virtual; 60 vdata = &map->s_type_specific.s_virtual;
61 61
62 if (block > vdata->s_num_entries) { 62 if (block > vdata->s_num_entries) {
63 udf_debug("Trying to access block beyond end of VAT " 63 udf_debug("Trying to access block beyond end of VAT (%d max %d)\n",
64 "(%d max %d)\n", block, vdata->s_num_entries); 64 block, vdata->s_num_entries);
65 return 0xFFFFFFFF; 65 return 0xFFFFFFFF;
66 } 66 }
67 67
@@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
321 /* We shouldn't mount such media... */ 321 /* We shouldn't mount such media... */
322 BUG_ON(!inode); 322 BUG_ON(!inode);
323 retblk = udf_try_read_meta(inode, block, partition, offset); 323 retblk = udf_try_read_meta(inode, block, partition, offset);
324 if (retblk == 0xFFFFFFFF) { 324 if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
325 udf_warning(sb, __func__, "error reading from METADATA, " 325 udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
326 "trying to read from MIRROR"); 326 if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
327 mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
328 mdata->s_mirror_file_loc, map->s_partition_num);
329 mdata->s_flags |= MF_MIRROR_FE_LOADED;
330 }
331
327 inode = mdata->s_mirror_fe; 332 inode = mdata->s_mirror_fe;
328 if (!inode) 333 if (!inode)
329 return 0xFFFFFFFF; 334 return 0xFFFFFFFF;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7b27b063ff6d..e185253470df 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -75,8 +75,6 @@
75 75
76#define UDF_DEFAULT_BLOCKSIZE 2048 76#define UDF_DEFAULT_BLOCKSIZE 2048
77 77
78static char error_buf[1024];
79
80/* These are the "meat" - everything else is stuffing */ 78/* These are the "meat" - everything else is stuffing */
81static int udf_fill_super(struct super_block *, void *, int); 79static int udf_fill_super(struct super_block *, void *, int);
82static void udf_put_super(struct super_block *); 80static void udf_put_super(struct super_block *);
@@ -92,8 +90,6 @@ static void udf_close_lvid(struct super_block *);
92static unsigned int udf_count_free(struct super_block *); 90static unsigned int udf_count_free(struct super_block *);
93static int udf_statfs(struct dentry *, struct kstatfs *); 91static int udf_statfs(struct dentry *, struct kstatfs *);
94static int udf_show_options(struct seq_file *, struct vfsmount *); 92static int udf_show_options(struct seq_file *, struct vfsmount *);
95static void udf_error(struct super_block *sb, const char *function,
96 const char *fmt, ...);
97 93
98struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) 94struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
99{ 95{
@@ -244,9 +240,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
244 sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), 240 sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
245 GFP_KERNEL); 241 GFP_KERNEL);
246 if (!sbi->s_partmaps) { 242 if (!sbi->s_partmaps) {
247 udf_error(sb, __func__, 243 udf_err(sb, "Unable to allocate space for %d partition maps\n",
248 "Unable to allocate space for %d partition maps", 244 count);
249 count);
250 sbi->s_partitions = 0; 245 sbi->s_partitions = 0;
251 return -ENOMEM; 246 return -ENOMEM;
252 } 247 }
@@ -550,8 +545,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
550 uopt->dmode = option & 0777; 545 uopt->dmode = option & 0777;
551 break; 546 break;
552 default: 547 default:
553 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 pr_err("bad mount option \"%s\" or missing value\n", p);
554 "or missing value\n", p);
555 return 0; 549 return 0;
556 } 550 }
557 } 551 }
@@ -645,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb)
645 udf_debug("ISO9660 Boot Record found\n"); 639 udf_debug("ISO9660 Boot Record found\n");
646 break; 640 break;
647 case 1: 641 case 1:
648 udf_debug("ISO9660 Primary Volume Descriptor " 642 udf_debug("ISO9660 Primary Volume Descriptor found\n");
649 "found\n");
650 break; 643 break;
651 case 2: 644 case 2:
652 udf_debug("ISO9660 Supplementary Volume " 645 udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
653 "Descriptor found\n");
654 break; 646 break;
655 case 3: 647 case 3:
656 udf_debug("ISO9660 Volume Partition Descriptor " 648 udf_debug("ISO9660 Volume Partition Descriptor found\n");
657 "found\n");
658 break; 649 break;
659 case 255: 650 case 255:
660 udf_debug("ISO9660 Volume Descriptor Set " 651 udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
661 "Terminator found\n");
662 break; 652 break;
663 default: 653 default:
664 udf_debug("ISO9660 VRS (%u) found\n", 654 udf_debug("ISO9660 VRS (%u) found\n",
@@ -809,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
809 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
810#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
811 struct timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
812 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
813 " %02u:%02u (%x)\n",
814 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 803 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
815 ts->minute, le16_to_cpu(ts->typeAndTimezone)); 804 ts->minute, le16_to_cpu(ts->typeAndTimezone));
816#endif 805#endif
@@ -821,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
821 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, 810 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
822 outstr->u_len > 31 ? 31 : outstr->u_len); 811 outstr->u_len > 31 ? 31 : outstr->u_len);
823 udf_debug("volIdent[] = '%s'\n", 812 udf_debug("volIdent[] = '%s'\n",
824 UDF_SB(sb)->s_volume_ident); 813 UDF_SB(sb)->s_volume_ident);
825 } 814 }
826 815
827 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) 816 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
@@ -837,64 +826,57 @@ out1:
837 return ret; 826 return ret;
838} 827}
839 828
829struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
830 u32 meta_file_loc, u32 partition_num)
831{
832 struct kernel_lb_addr addr;
833 struct inode *metadata_fe;
834
835 addr.logicalBlockNum = meta_file_loc;
836 addr.partitionReferenceNum = partition_num;
837
838 metadata_fe = udf_iget(sb, &addr);
839
840 if (metadata_fe == NULL)
841 udf_warn(sb, "metadata inode efe not found\n");
842 else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
843 udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
844 iput(metadata_fe);
845 metadata_fe = NULL;
846 }
847
848 return metadata_fe;
849}
850
840static int udf_load_metadata_files(struct super_block *sb, int partition) 851static int udf_load_metadata_files(struct super_block *sb, int partition)
841{ 852{
842 struct udf_sb_info *sbi = UDF_SB(sb); 853 struct udf_sb_info *sbi = UDF_SB(sb);
843 struct udf_part_map *map; 854 struct udf_part_map *map;
844 struct udf_meta_data *mdata; 855 struct udf_meta_data *mdata;
845 struct kernel_lb_addr addr; 856 struct kernel_lb_addr addr;
846 int fe_error = 0;
847 857
848 map = &sbi->s_partmaps[partition]; 858 map = &sbi->s_partmaps[partition];
849 mdata = &map->s_type_specific.s_metadata; 859 mdata = &map->s_type_specific.s_metadata;
850 860
851 /* metadata address */ 861 /* metadata address */
852 addr.logicalBlockNum = mdata->s_meta_file_loc;
853 addr.partitionReferenceNum = map->s_partition_num;
854
855 udf_debug("Metadata file location: block = %d part = %d\n", 862 udf_debug("Metadata file location: block = %d part = %d\n",
856 addr.logicalBlockNum, addr.partitionReferenceNum); 863 mdata->s_meta_file_loc, map->s_partition_num);
857 864
858 mdata->s_metadata_fe = udf_iget(sb, &addr); 865 mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
866 mdata->s_meta_file_loc, map->s_partition_num);
859 867
860 if (mdata->s_metadata_fe == NULL) { 868 if (mdata->s_metadata_fe == NULL) {
861 udf_warning(sb, __func__, "metadata inode efe not found, " 869 /* mirror file entry */
862 "will try mirror inode."); 870 udf_debug("Mirror metadata file location: block = %d part = %d\n",
863 fe_error = 1; 871 mdata->s_mirror_file_loc, map->s_partition_num);
864 } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
865 ICBTAG_FLAG_AD_SHORT) {
866 udf_warning(sb, __func__, "metadata inode efe does not have "
867 "short allocation descriptors!");
868 fe_error = 1;
869 iput(mdata->s_metadata_fe);
870 mdata->s_metadata_fe = NULL;
871 }
872 872
873 /* mirror file entry */ 873 mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
874 addr.logicalBlockNum = mdata->s_mirror_file_loc; 874 mdata->s_mirror_file_loc, map->s_partition_num);
875 addr.partitionReferenceNum = map->s_partition_num;
876
877 udf_debug("Mirror metadata file location: block = %d part = %d\n",
878 addr.logicalBlockNum, addr.partitionReferenceNum);
879 875
880 mdata->s_mirror_fe = udf_iget(sb, &addr); 876 if (mdata->s_mirror_fe == NULL) {
881 877 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
882 if (mdata->s_mirror_fe == NULL) {
883 if (fe_error) {
884 udf_error(sb, __func__, "mirror inode efe not found "
885 "and metadata inode is missing too, exiting...");
886 goto error_exit;
887 } else
888 udf_warning(sb, __func__, "mirror inode efe not found,"
889 " but metadata inode is OK");
890 } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
891 ICBTAG_FLAG_AD_SHORT) {
892 udf_warning(sb, __func__, "mirror inode efe does not have "
893 "short allocation descriptors!");
894 iput(mdata->s_mirror_fe);
895 mdata->s_mirror_fe = NULL;
896 if (fe_error)
897 goto error_exit; 878 goto error_exit;
879 }
898 } 880 }
899 881
900 /* 882 /*
@@ -907,18 +889,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
907 addr.partitionReferenceNum = map->s_partition_num; 889 addr.partitionReferenceNum = map->s_partition_num;
908 890
909 udf_debug("Bitmap file location: block = %d part = %d\n", 891 udf_debug("Bitmap file location: block = %d part = %d\n",
910 addr.logicalBlockNum, addr.partitionReferenceNum); 892 addr.logicalBlockNum, addr.partitionReferenceNum);
911 893
912 mdata->s_bitmap_fe = udf_iget(sb, &addr); 894 mdata->s_bitmap_fe = udf_iget(sb, &addr);
913 895
914 if (mdata->s_bitmap_fe == NULL) { 896 if (mdata->s_bitmap_fe == NULL) {
915 if (sb->s_flags & MS_RDONLY) 897 if (sb->s_flags & MS_RDONLY)
916 udf_warning(sb, __func__, "bitmap inode efe " 898 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
917 "not found but it's ok since the disc"
918 " is mounted read-only");
919 else { 899 else {
920 udf_error(sb, __func__, "bitmap inode efe not " 900 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
921 "found and attempted read-write mount");
922 goto error_exit; 901 goto error_exit;
923 } 902 }
924 } 903 }
@@ -971,9 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ 950 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
972 951
973 if (bitmap == NULL) { 952 if (bitmap == NULL) {
974 udf_error(sb, __func__, 953 udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
975 "Unable to allocate space for bitmap " 954 nr_groups);
976 "and %d buffer_head pointers", nr_groups);
977 return NULL; 955 return NULL;
978 } 956 }
979 957
@@ -1003,10 +981,9 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1003 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) 981 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
1004 map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; 982 map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
1005 983
1006 udf_debug("Partition (%d type %x) starts at physical %d, " 984 udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n",
1007 "block length %d\n", p_index, 985 p_index, map->s_partition_type,
1008 map->s_partition_type, map->s_partition_root, 986 map->s_partition_root, map->s_partition_len);
1009 map->s_partition_len);
1010 987
1011 if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && 988 if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
1012 strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) 989 strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
@@ -1023,12 +1000,12 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1023 map->s_uspace.s_table = udf_iget(sb, &loc); 1000 map->s_uspace.s_table = udf_iget(sb, &loc);
1024 if (!map->s_uspace.s_table) { 1001 if (!map->s_uspace.s_table) {
1025 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1002 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1026 p_index); 1003 p_index);
1027 return 1; 1004 return 1;
1028 } 1005 }
1029 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; 1006 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
1030 udf_debug("unallocSpaceTable (part %d) @ %ld\n", 1007 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
1031 p_index, map->s_uspace.s_table->i_ino); 1008 p_index, map->s_uspace.s_table->i_ino);
1032 } 1009 }
1033 1010
1034 if (phd->unallocSpaceBitmap.extLength) { 1011 if (phd->unallocSpaceBitmap.extLength) {
@@ -1041,8 +1018,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1041 bitmap->s_extPosition = le32_to_cpu( 1018 bitmap->s_extPosition = le32_to_cpu(
1042 phd->unallocSpaceBitmap.extPosition); 1019 phd->unallocSpaceBitmap.extPosition);
1043 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; 1020 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
1044 udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, 1021 udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
1045 bitmap->s_extPosition); 1022 p_index, bitmap->s_extPosition);
1046 } 1023 }
1047 1024
1048 if (phd->partitionIntegrityTable.extLength) 1025 if (phd->partitionIntegrityTable.extLength)
@@ -1058,13 +1035,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1058 map->s_fspace.s_table = udf_iget(sb, &loc); 1035 map->s_fspace.s_table = udf_iget(sb, &loc);
1059 if (!map->s_fspace.s_table) { 1036 if (!map->s_fspace.s_table) {
1060 udf_debug("cannot load freedSpaceTable (part %d)\n", 1037 udf_debug("cannot load freedSpaceTable (part %d)\n",
1061 p_index); 1038 p_index);
1062 return 1; 1039 return 1;
1063 } 1040 }
1064 1041
1065 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; 1042 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
1066 udf_debug("freedSpaceTable (part %d) @ %ld\n", 1043 udf_debug("freedSpaceTable (part %d) @ %ld\n",
1067 p_index, map->s_fspace.s_table->i_ino); 1044 p_index, map->s_fspace.s_table->i_ino);
1068 } 1045 }
1069 1046
1070 if (phd->freedSpaceBitmap.extLength) { 1047 if (phd->freedSpaceBitmap.extLength) {
@@ -1077,8 +1054,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1077 bitmap->s_extPosition = le32_to_cpu( 1054 bitmap->s_extPosition = le32_to_cpu(
1078 phd->freedSpaceBitmap.extPosition); 1055 phd->freedSpaceBitmap.extPosition);
1079 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; 1056 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
1080 udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, 1057 udf_debug("freedSpaceBitmap (part %d) @ %d\n",
1081 bitmap->s_extPosition); 1058 p_index, bitmap->s_extPosition);
1082 } 1059 }
1083 return 0; 1060 return 0;
1084} 1061}
@@ -1118,11 +1095,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1118 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); 1095 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
1119 if (!sbi->s_vat_inode && 1096 if (!sbi->s_vat_inode &&
1120 sbi->s_last_block != blocks - 1) { 1097 sbi->s_last_block != blocks - 1) {
1121 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" 1098 pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
1122 " last recorded block (%lu), retrying with the last " 1099 (unsigned long)sbi->s_last_block,
1123 "block of the device (%lu).\n", 1100 (unsigned long)blocks - 1);
1124 (unsigned long)sbi->s_last_block,
1125 (unsigned long)blocks - 1);
1126 udf_find_vat_block(sb, p_index, type1_index, blocks - 1); 1101 udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
1127 } 1102 }
1128 if (!sbi->s_vat_inode) 1103 if (!sbi->s_vat_inode)
@@ -1220,8 +1195,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1220 if (map->s_partition_type == UDF_METADATA_MAP25) { 1195 if (map->s_partition_type == UDF_METADATA_MAP25) {
1221 ret = udf_load_metadata_files(sb, i); 1196 ret = udf_load_metadata_files(sb, i);
1222 if (ret) { 1197 if (ret) {
1223 printk(KERN_ERR "UDF-fs: error loading MetaData " 1198 udf_err(sb, "error loading MetaData partition map %d\n",
1224 "partition map %d\n", i); 1199 i);
1225 goto out_bh; 1200 goto out_bh;
1226 } 1201 }
1227 } else { 1202 } else {
@@ -1234,9 +1209,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1234 * overwrite blocks instead of relocating them). 1209 * overwrite blocks instead of relocating them).
1235 */ 1210 */
1236 sb->s_flags |= MS_RDONLY; 1211 sb->s_flags |= MS_RDONLY;
1237 printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " 1212 pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
1238 "because writing to pseudooverwrite partition is "
1239 "not implemented.\n");
1240 } 1213 }
1241out_bh: 1214out_bh:
1242 /* In case loading failed, we handle cleanup in udf_fill_super */ 1215 /* In case loading failed, we handle cleanup in udf_fill_super */
@@ -1344,9 +1317,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1344 struct metadataPartitionMap *mdm = 1317 struct metadataPartitionMap *mdm =
1345 (struct metadataPartitionMap *) 1318 (struct metadataPartitionMap *)
1346 &(lvd->partitionMaps[offset]); 1319 &(lvd->partitionMaps[offset]);
1347 udf_debug("Parsing Logical vol part %d " 1320 udf_debug("Parsing Logical vol part %d type %d id=%s\n",
1348 "type %d id=%s\n", i, type, 1321 i, type, UDF_ID_METADATA);
1349 UDF_ID_METADATA);
1350 1322
1351 map->s_partition_type = UDF_METADATA_MAP25; 1323 map->s_partition_type = UDF_METADATA_MAP25;
1352 map->s_partition_func = udf_get_pblock_meta25; 1324 map->s_partition_func = udf_get_pblock_meta25;
@@ -1361,25 +1333,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1361 le32_to_cpu(mdm->allocUnitSize); 1333 le32_to_cpu(mdm->allocUnitSize);
1362 mdata->s_align_unit_size = 1334 mdata->s_align_unit_size =
1363 le16_to_cpu(mdm->alignUnitSize); 1335 le16_to_cpu(mdm->alignUnitSize);
1364 mdata->s_dup_md_flag = 1336 if (mdm->flags & 0x01)
1365 mdm->flags & 0x01; 1337 mdata->s_flags |= MF_DUPLICATE_MD;
1366 1338
1367 udf_debug("Metadata Ident suffix=0x%x\n", 1339 udf_debug("Metadata Ident suffix=0x%x\n",
1368 (le16_to_cpu( 1340 le16_to_cpu(*(__le16 *)
1369 ((__le16 *) 1341 mdm->partIdent.identSuffix));
1370 mdm->partIdent.identSuffix)[0])));
1371 udf_debug("Metadata part num=%d\n", 1342 udf_debug("Metadata part num=%d\n",
1372 le16_to_cpu(mdm->partitionNum)); 1343 le16_to_cpu(mdm->partitionNum));
1373 udf_debug("Metadata part alloc unit size=%d\n", 1344 udf_debug("Metadata part alloc unit size=%d\n",
1374 le32_to_cpu(mdm->allocUnitSize)); 1345 le32_to_cpu(mdm->allocUnitSize));
1375 udf_debug("Metadata file loc=%d\n", 1346 udf_debug("Metadata file loc=%d\n",
1376 le32_to_cpu(mdm->metadataFileLoc)); 1347 le32_to_cpu(mdm->metadataFileLoc));
1377 udf_debug("Mirror file loc=%d\n", 1348 udf_debug("Mirror file loc=%d\n",
1378 le32_to_cpu(mdm->metadataMirrorFileLoc)); 1349 le32_to_cpu(mdm->metadataMirrorFileLoc));
1379 udf_debug("Bitmap file loc=%d\n", 1350 udf_debug("Bitmap file loc=%d\n",
1380 le32_to_cpu(mdm->metadataBitmapFileLoc)); 1351 le32_to_cpu(mdm->metadataBitmapFileLoc));
1381 udf_debug("Duplicate Flag: %d %d\n", 1352 udf_debug("Flags: %d %d\n",
1382 mdata->s_dup_md_flag, mdm->flags); 1353 mdata->s_flags, mdm->flags);
1383 } else { 1354 } else {
1384 udf_debug("Unknown ident: %s\n", 1355 udf_debug("Unknown ident: %s\n",
1385 upm2->partIdent.ident); 1356 upm2->partIdent.ident);
@@ -1389,16 +1360,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1389 map->s_partition_num = le16_to_cpu(upm2->partitionNum); 1360 map->s_partition_num = le16_to_cpu(upm2->partitionNum);
1390 } 1361 }
1391 udf_debug("Partition (%d:%d) type %d on volume %d\n", 1362 udf_debug("Partition (%d:%d) type %d on volume %d\n",
1392 i, map->s_partition_num, type, 1363 i, map->s_partition_num, type, map->s_volumeseqnum);
1393 map->s_volumeseqnum);
1394 } 1364 }
1395 1365
1396 if (fileset) { 1366 if (fileset) {
1397 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); 1367 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1398 1368
1399 *fileset = lelb_to_cpu(la->extLocation); 1369 *fileset = lelb_to_cpu(la->extLocation);
1400 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1370 udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n",
1401 "partition=%d\n", fileset->logicalBlockNum, 1371 fileset->logicalBlockNum,
1402 fileset->partitionReferenceNum); 1372 fileset->partitionReferenceNum);
1403 } 1373 }
1404 if (lvd->integritySeqExt.extLength) 1374 if (lvd->integritySeqExt.extLength)
@@ -1478,9 +1448,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1478 1448
1479 bh = udf_read_tagged(sb, block, block, &ident); 1449 bh = udf_read_tagged(sb, block, block, &ident);
1480 if (!bh) { 1450 if (!bh) {
1481 printk(KERN_ERR "udf: Block %Lu of volume descriptor " 1451 udf_err(sb,
1482 "sequence is corrupted or we could not read " 1452 "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
1483 "it.\n", (unsigned long long)block); 1453 (unsigned long long)block);
1484 return 1; 1454 return 1;
1485 } 1455 }
1486 1456
@@ -1553,7 +1523,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1553 * in a suitable order 1523 * in a suitable order
1554 */ 1524 */
1555 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { 1525 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
1556 printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); 1526 udf_err(sb, "Primary Volume Descriptor not found!\n");
1557 return 1; 1527 return 1;
1558 } 1528 }
1559 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) 1529 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
@@ -1740,7 +1710,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1740 1710
1741 if (!sb_set_blocksize(sb, uopt->blocksize)) { 1711 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1742 if (!silent) 1712 if (!silent)
1743 printk(KERN_WARNING "UDF-fs: Bad block size\n"); 1713 udf_warn(sb, "Bad block size\n");
1744 return 0; 1714 return 0;
1745 } 1715 }
1746 sbi->s_last_block = uopt->lastblock; 1716 sbi->s_last_block = uopt->lastblock;
@@ -1749,12 +1719,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1749 nsr_off = udf_check_vsd(sb); 1719 nsr_off = udf_check_vsd(sb);
1750 if (!nsr_off) { 1720 if (!nsr_off) {
1751 if (!silent) 1721 if (!silent)
1752 printk(KERN_WARNING "UDF-fs: No VRS found\n"); 1722 udf_warn(sb, "No VRS found\n");
1753 return 0; 1723 return 0;
1754 } 1724 }
1755 if (nsr_off == -1) 1725 if (nsr_off == -1)
1756 udf_debug("Failed to read byte 32768. Assuming open " 1726 udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
1757 "disc. Skipping validity check\n");
1758 if (!sbi->s_last_block) 1727 if (!sbi->s_last_block)
1759 sbi->s_last_block = udf_get_last_block(sb); 1728 sbi->s_last_block = udf_get_last_block(sb);
1760 } else { 1729 } else {
@@ -1765,7 +1734,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1765 sbi->s_anchor = uopt->anchor; 1734 sbi->s_anchor = uopt->anchor;
1766 if (!udf_find_anchor(sb, fileset)) { 1735 if (!udf_find_anchor(sb, fileset)) {
1767 if (!silent) 1736 if (!silent)
1768 printk(KERN_WARNING "UDF-fs: No anchor found\n"); 1737 udf_warn(sb, "No anchor found\n");
1769 return 0; 1738 return 0;
1770 } 1739 }
1771 return 1; 1740 return 1;
@@ -1937,8 +1906,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1937 1906
1938 if (uopt.flags & (1 << UDF_FLAG_UTF8) && 1907 if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
1939 uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { 1908 uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
1940 udf_error(sb, "udf_read_super", 1909 udf_err(sb, "utf8 cannot be combined with iocharset\n");
1941 "utf8 cannot be combined with iocharset\n");
1942 goto error_out; 1910 goto error_out;
1943 } 1911 }
1944#ifdef CONFIG_UDF_NLS 1912#ifdef CONFIG_UDF_NLS
@@ -1987,15 +1955,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1987 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1955 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1988 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 1956 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1989 if (!silent) 1957 if (!silent)
1990 printk(KERN_NOTICE 1958 pr_notice("Rescanning with blocksize %d\n",
1991 "UDF-fs: Rescanning with blocksize " 1959 UDF_DEFAULT_BLOCKSIZE);
1992 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1993 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; 1960 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1994 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1961 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1995 } 1962 }
1996 } 1963 }
1997 if (!ret) { 1964 if (!ret) {
1998 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1965 udf_warn(sb, "No partition found (1)\n");
1999 goto error_out; 1966 goto error_out;
2000 } 1967 }
2001 1968
@@ -2010,10 +1977,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2010 le16_to_cpu(lvidiu->maxUDFWriteRev); */ 1977 le16_to_cpu(lvidiu->maxUDFWriteRev); */
2011 1978
2012 if (minUDFReadRev > UDF_MAX_READ_VERSION) { 1979 if (minUDFReadRev > UDF_MAX_READ_VERSION) {
2013 printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " 1980 udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
2014 "(max is %x)\n", 1981 le16_to_cpu(lvidiu->minUDFReadRev),
2015 le16_to_cpu(lvidiu->minUDFReadRev), 1982 UDF_MAX_READ_VERSION);
2016 UDF_MAX_READ_VERSION);
2017 goto error_out; 1983 goto error_out;
2018 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) 1984 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
2019 sb->s_flags |= MS_RDONLY; 1985 sb->s_flags |= MS_RDONLY;
@@ -2027,28 +1993,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2027 } 1993 }
2028 1994
2029 if (!sbi->s_partitions) { 1995 if (!sbi->s_partitions) {
2030 printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); 1996 udf_warn(sb, "No partition found (2)\n");
2031 goto error_out; 1997 goto error_out;
2032 } 1998 }
2033 1999
2034 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & 2000 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
2035 UDF_PART_FLAG_READ_ONLY) { 2001 UDF_PART_FLAG_READ_ONLY) {
2036 printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " 2002 pr_notice("Partition marked readonly; forcing readonly mount\n");
2037 "forcing readonly mount\n");
2038 sb->s_flags |= MS_RDONLY; 2003 sb->s_flags |= MS_RDONLY;
2039 } 2004 }
2040 2005
2041 if (udf_find_fileset(sb, &fileset, &rootdir)) { 2006 if (udf_find_fileset(sb, &fileset, &rootdir)) {
2042 printk(KERN_WARNING "UDF-fs: No fileset found\n"); 2007 udf_warn(sb, "No fileset found\n");
2043 goto error_out; 2008 goto error_out;
2044 } 2009 }
2045 2010
2046 if (!silent) { 2011 if (!silent) {
2047 struct timestamp ts; 2012 struct timestamp ts;
2048 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 2013 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
2049 udf_info("UDF: Mounting volume '%s', " 2014 udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
2050 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 2015 sbi->s_volume_ident,
2051 sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, 2016 le16_to_cpu(ts.year), ts.month, ts.day,
2052 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); 2017 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
2053 } 2018 }
2054 if (!(sb->s_flags & MS_RDONLY)) 2019 if (!(sb->s_flags & MS_RDONLY))
@@ -2059,8 +2024,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2059 /* perhaps it's not extensible enough, but for now ... */ 2024 /* perhaps it's not extensible enough, but for now ... */
2060 inode = udf_iget(sb, &rootdir); 2025 inode = udf_iget(sb, &rootdir);
2061 if (!inode) { 2026 if (!inode) {
2062 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 2027 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
2063 "partition=%d\n",
2064 rootdir.logicalBlockNum, rootdir.partitionReferenceNum); 2028 rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
2065 goto error_out; 2029 goto error_out;
2066 } 2030 }
@@ -2068,7 +2032,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2068 /* Allocate a dentry for the root inode */ 2032 /* Allocate a dentry for the root inode */
2069 sb->s_root = d_alloc_root(inode); 2033 sb->s_root = d_alloc_root(inode);
2070 if (!sb->s_root) { 2034 if (!sb->s_root) {
2071 printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); 2035 udf_err(sb, "Couldn't allocate root dentry\n");
2072 iput(inode); 2036 iput(inode);
2073 goto error_out; 2037 goto error_out;
2074 } 2038 }
@@ -2096,32 +2060,40 @@ error_out:
2096 return -EINVAL; 2060 return -EINVAL;
2097} 2061}
2098 2062
2099static void udf_error(struct super_block *sb, const char *function, 2063void _udf_err(struct super_block *sb, const char *function,
2100 const char *fmt, ...) 2064 const char *fmt, ...)
2101{ 2065{
2066 struct va_format vaf;
2102 va_list args; 2067 va_list args;
2103 2068
2104 if (!(sb->s_flags & MS_RDONLY)) { 2069 /* mark sb error */
2105 /* mark sb error */ 2070 if (!(sb->s_flags & MS_RDONLY))
2106 sb->s_dirt = 1; 2071 sb->s_dirt = 1;
2107 } 2072
2108 va_start(args, fmt); 2073 va_start(args, fmt);
2109 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2074
2075 vaf.fmt = fmt;
2076 vaf.va = &args;
2077
2078 pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);
2079
2110 va_end(args); 2080 va_end(args);
2111 printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n",
2112 sb->s_id, function, error_buf);
2113} 2081}
2114 2082
2115void udf_warning(struct super_block *sb, const char *function, 2083void _udf_warn(struct super_block *sb, const char *function,
2116 const char *fmt, ...) 2084 const char *fmt, ...)
2117{ 2085{
2086 struct va_format vaf;
2118 va_list args; 2087 va_list args;
2119 2088
2120 va_start(args, fmt); 2089 va_start(args, fmt);
2121 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2090
2091 vaf.fmt = fmt;
2092 vaf.va = &args;
2093
2094 pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);
2095
2122 va_end(args); 2096 va_end(args);
2123 printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n",
2124 sb->s_id, function, error_buf);
2125} 2097}
2126 2098
2127static void udf_put_super(struct super_block *sb) 2099static void udf_put_super(struct super_block *sb)
@@ -2213,11 +2185,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2213 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2185 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2214 2186
2215 if (!bh) { 2187 if (!bh) {
2216 printk(KERN_ERR "udf: udf_count_free failed\n"); 2188 udf_err(sb, "udf_count_free failed\n");
2217 goto out; 2189 goto out;
2218 } else if (ident != TAG_IDENT_SBD) { 2190 } else if (ident != TAG_IDENT_SBD) {
2219 brelse(bh); 2191 brelse(bh);
2220 printk(KERN_ERR "udf: udf_count_free failed\n"); 2192 udf_err(sb, "udf_count_free failed\n");
2221 goto out; 2193 goto out;
2222 } 2194 }
2223 2195
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 8424308db4b4..4b98fee8e161 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode)
95 lbcount += elen; 95 lbcount += elen;
96 if (lbcount > inode->i_size) { 96 if (lbcount > inode->i_size) {
97 if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) 97 if (lbcount - inode->i_size >= inode->i_sb->s_blocksize)
98 printk(KERN_WARNING 98 udf_warn(inode->i_sb,
99 "udf_truncate_tail_extent(): Too long " 99 "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n",
100 "extent after EOF in inode %u: i_size: " 100 (unsigned)inode->i_ino,
101 "%Ld lbcount: %Ld extent %u+%u\n", 101 (long long)inode->i_size,
102 (unsigned)inode->i_ino, 102 (long long)lbcount,
103 (long long)inode->i_size, 103 (unsigned)eloc.logicalBlockNum,
104 (long long)lbcount, 104 (unsigned)elen);
105 (unsigned)eloc.logicalBlockNum,
106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 105 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 106 epos.offset -= adsize;
109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen); 107 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 108 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 109 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 110 udf_err(inode->i_sb,
113 "Extent after EOF in inode %u.\n", 111 "Extent after EOF in inode %u\n",
114 (unsigned)inode->i_ino); 112 (unsigned)inode->i_ino);
115 break; 113 break;
116 } 114 }
117 } 115 }
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4858c191242b..5142a82e3276 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -54,13 +54,16 @@
54 54
55#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 55#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
56 56
57#define MF_DUPLICATE_MD 0x01
58#define MF_MIRROR_FE_LOADED 0x02
59
57struct udf_meta_data { 60struct udf_meta_data {
58 __u32 s_meta_file_loc; 61 __u32 s_meta_file_loc;
59 __u32 s_mirror_file_loc; 62 __u32 s_mirror_file_loc;
60 __u32 s_bitmap_file_loc; 63 __u32 s_bitmap_file_loc;
61 __u32 s_alloc_unit_size; 64 __u32 s_alloc_unit_size;
62 __u16 s_align_unit_size; 65 __u16 s_align_unit_size;
63 __u8 s_dup_md_flag; 66 int s_flags;
64 struct inode *s_metadata_fe; 67 struct inode *s_metadata_fe;
65 struct inode *s_mirror_fe; 68 struct inode *s_mirror_fe;
66 struct inode *s_bitmap_fe; 69 struct inode *s_bitmap_fe;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index dbd52d4b5eed..f34e6fc0cdaa 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,6 +1,8 @@
1#ifndef __UDF_DECL_H 1#ifndef __UDF_DECL_H
2#define __UDF_DECL_H 2#define __UDF_DECL_H
3 3
4#define pr_fmt(fmt) "UDF-fs: " fmt
5
4#include "ecma_167.h" 6#include "ecma_167.h"
5#include "osta_udf.h" 7#include "osta_udf.h"
6 8
@@ -16,23 +18,30 @@
16#define UDF_PREALLOCATE 18#define UDF_PREALLOCATE
17#define UDF_DEFAULT_PREALLOC_BLOCKS 8 19#define UDF_DEFAULT_PREALLOC_BLOCKS 8
18 20
21extern __printf(3, 4) void _udf_err(struct super_block *sb,
22 const char *function, const char *fmt, ...);
23#define udf_err(sb, fmt, ...) \
24 _udf_err(sb, __func__, fmt, ##__VA_ARGS__)
25
26extern __printf(3, 4) void _udf_warn(struct super_block *sb,
27 const char *function, const char *fmt, ...);
28#define udf_warn(sb, fmt, ...) \
29 _udf_warn(sb, __func__, fmt, ##__VA_ARGS__)
30
31#define udf_info(fmt, ...) \
32 pr_info("INFO " fmt, ##__VA_ARGS__)
33
19#undef UDFFS_DEBUG 34#undef UDFFS_DEBUG
20 35
21#ifdef UDFFS_DEBUG 36#ifdef UDFFS_DEBUG
22#define udf_debug(f, a...) \ 37#define udf_debug(fmt, ...) \
23do { \ 38 printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \
24 printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ 39 __FILE__, __LINE__, __func__, ##__VA_ARGS__)
25 __FILE__, __LINE__, __func__); \
26 printk(f, ##a); \
27} while (0)
28#else 40#else
29#define udf_debug(f, a...) /**/ 41#define udf_debug(fmt, ...) \
42 no_printk(fmt, ##__VA_ARGS__)
30#endif 43#endif
31 44
32#define udf_info(f, a...) \
33 printk(KERN_INFO "UDF-fs INFO " f, ##a);
34
35
36#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) 45#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
37#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) 46#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
38 47
@@ -112,8 +121,6 @@ struct extent_position {
112 121
113/* super.c */ 122/* super.c */
114 123
115__attribute__((format(printf, 3, 4)))
116extern void udf_warning(struct super_block *, const char *, const char *, ...);
117static inline void udf_updated_lvid(struct super_block *sb) 124static inline void udf_updated_lvid(struct super_block *sb)
118{ 125{
119 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; 126 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
@@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb)
126 UDF_SB(sb)->s_lvid_dirty = 1; 133 UDF_SB(sb)->s_lvid_dirty = 1;
127} 134}
128extern u64 lvid_get_unique_id(struct super_block *sb); 135extern u64 lvid_get_unique_id(struct super_block *sb);
136struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
137 u32 meta_file_loc, u32 partition_num);
129 138
130/* namei.c */ 139/* namei.c */
131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 140extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index b8c828c4d200..1f11483eba6a 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -34,9 +34,10 @@
34 * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm 34 * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
35 */ 35 */
36 36
37#include "udfdecl.h"
38
37#include <linux/types.h> 39#include <linux/types.h>
38#include <linux/kernel.h> 40#include <linux/kernel.h>
39#include "udfdecl.h"
40 41
41#define EPOCH_YEAR 1970 42#define EPOCH_YEAR 1970
42 43
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index d03a90b6ad69..44b815e57f94 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
114 cmp_id = ocu_i->u_cmpID; 114 cmp_id = ocu_i->u_cmpID;
115 if (cmp_id != 8 && cmp_id != 16) { 115 if (cmp_id != 8 && cmp_id != 16) {
116 memset(utf_o, 0, sizeof(struct ustr)); 116 memset(utf_o, 0, sizeof(struct ustr));
117 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 117 pr_err("unknown compression code (%d) stri=%s\n",
118 cmp_id, ocu_i->u_name); 118 cmp_id, ocu_i->u_name);
119 return 0; 119 return 0;
120 } 120 }
@@ -242,7 +242,7 @@ try_again:
242 if (utf_cnt) { 242 if (utf_cnt) {
243error_out: 243error_out:
244 ocu[++u_len] = '?'; 244 ocu[++u_len] = '?';
245 printk(KERN_DEBUG "udf: bad UTF-8 character\n"); 245 printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
246 } 246 }
247 247
248 ocu[length - 1] = (uint8_t)u_len + 1; 248 ocu[length - 1] = (uint8_t)u_len + 1;
@@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
267 cmp_id = ocu_i->u_cmpID; 267 cmp_id = ocu_i->u_cmpID;
268 if (cmp_id != 8 && cmp_id != 16) { 268 if (cmp_id != 8 && cmp_id != 16) {
269 memset(utf_o, 0, sizeof(struct ustr)); 269 memset(utf_o, 0, sizeof(struct ustr));
270 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 270 pr_err("unknown compression code (%d) stri=%s\n",
271 cmp_id, ocu_i->u_name); 271 cmp_id, ocu_i->u_name);
272 return 0; 272 return 0;
273 } 273 }
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2eabf04af3de..78a4c70d46b5 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -341,7 +341,7 @@ cg_found:
341 341
342fail_remove_inode: 342fail_remove_inode:
343 unlock_super(sb); 343 unlock_super(sb);
344 inode->i_nlink = 0; 344 clear_nlink(inode);
345 iput(inode); 345 iput(inode);
346 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
347 return ERR_PTR(err); 347 return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index b4d791a83207..879b13436fa4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
589 * Copy data to the in-core inode. 589 * Copy data to the in-core inode.
590 */ 590 */
591 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); 591 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
592 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); 592 set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
593 if (inode->i_nlink == 0) { 593 if (inode->i_nlink == 0) {
594 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 594 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
595 return -1; 595 return -1;
@@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
637 * Copy data to the in-core inode. 637 * Copy data to the in-core inode.
638 */ 638 */
639 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); 639 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
640 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); 640 set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
641 if (inode->i_nlink == 0) { 641 if (inode->i_nlink == 0) {
642 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 642 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
643 return -1; 643 return -1;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 5be2755dd715..c26f2bcec264 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
117extern const struct file_operations ufs_dir_operations; 117extern const struct file_operations ufs_dir_operations;
118 118
119/* super.c */ 119/* super.c */
120extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 120extern __printf(3, 4)
121extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 121void ufs_warning(struct super_block *, const char *, const char *, ...);
122extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); 122extern __printf(3, 4)
123void ufs_error(struct super_block *, const char *, const char *, ...);
124extern __printf(3, 4)
125void ufs_panic(struct super_block *, const char *, const char *, ...);
123 126
124/* symlink.c */ 127/* symlink.c */
125extern const struct inode_operations ufs_fast_symlink_inode_operations; 128extern const struct inode_operations ufs_fast_symlink_inode_operations;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11b2aad982d4..33b13310ee0c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -902,11 +902,11 @@ xfs_vm_writepage(
902 * random callers for direct reclaim or memcg reclaim. We explicitly 902 * random callers for direct reclaim or memcg reclaim. We explicitly
903 * allow reclaim from kswapd as the stack usage there is relatively low. 903 * allow reclaim from kswapd as the stack usage there is relatively low.
904 * 904 *
905 * This should really be done by the core VM, but until that happens 905 * This should never happen except in the case of a VM regression so
906 * filesystems like XFS, btrfs and ext4 have to take care of this 906 * warn about it.
907 * by themselves.
908 */ 907 */
909 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 908 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
909 PF_MEMALLOC))
910 goto redirty; 910 goto redirty;
911 911
912 /* 912 /*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9ba2a07b7343..23ce927973a4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1153,7 +1153,7 @@ xfs_setup_inode(
1153 hlist_add_fake(&inode->i_hash); 1153 hlist_add_fake(&inode->i_hash);
1154 1154
1155 inode->i_mode = ip->i_d.di_mode; 1155 inode->i_mode = ip->i_d.di_mode;
1156 inode->i_nlink = ip->i_d.di_nlink; 1156 set_nlink(inode, ip->i_d.di_nlink);
1157 inode->i_uid = ip->i_d.di_uid; 1157 inode->i_uid = ip->i_d.di_uid;
1158 inode->i_gid = ip->i_d.di_gid; 1158 inode->i_gid = ip->i_d.di_gid;
1159 1159
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea007672..56dc0c17f16a 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -3,31 +3,29 @@
3 3
4struct xfs_mount; 4struct xfs_mount;
5 5
6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) 6extern __printf(2, 3)
7 __attribute__ ((format (printf, 2, 3))); 7void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) 8extern __printf(2, 3)
9 __attribute__ ((format (printf, 2, 3))); 9void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
10extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, 10extern __printf(3, 4)
11 const char *fmt, ...) 11void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
12 __attribute__ ((format (printf, 3, 4))); 12extern __printf(2, 3)
13extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) 13void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
14 __attribute__ ((format (printf, 2, 3))); 14extern __printf(2, 3)
15extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) 15void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
16 __attribute__ ((format (printf, 2, 3))); 16extern __printf(2, 3)
17extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) 17void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
18 __attribute__ ((format (printf, 2, 3))); 18extern __printf(2, 3)
19extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) 19void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
20 __attribute__ ((format (printf, 2, 3))); 20extern __printf(2, 3)
21extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) 21void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
22 __attribute__ ((format (printf, 2, 3)));
23 22
24#ifdef DEBUG 23#ifdef DEBUG
25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 24extern __printf(2, 3)
26 __attribute__ ((format (printf, 2, 3))); 25void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
27#else 26#else
28static inline void 27static inline __printf(2, 3)
29__attribute__ ((format (printf, 2, 3))) 28void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
31{ 29{
32} 30}
33#endif 31#endif