104 files changed, 2745 insertions, 1637 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
 config 9P_FS
-        tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+        tristate "Plan 9 Resource Sharing Support (9P2000)"
-        depends on INET && NET_9P && EXPERIMENTAL
+        depends on INET && NET_9P
        help
          If you say Y here, you will get experimental support for
          Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
          If unsure, say N.
 if 9P_FS
 config 9P_FSCACHE
        bool "Enable 9P client caching support (EXPERIMENTAL)"
        depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                if (IS_ERR(inode_fid)) {
                        err = PTR_ERR(inode_fid);
                        mutex_unlock(&v9inode->v_mutex);
-                        goto error;
+                        goto err_clunk_old_fid;
                }
                v9inode->writeback_fid = (void *) inode_fid;
        }
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
        /* Since we are opening a file, assign the open fid to the file */
        filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
        if (IS_ERR(filp)) {
-                p9_client_clunk(ofid);
+                err = PTR_ERR(filp);
-                return PTR_ERR(filp);
+                goto err_clunk_old_fid;
        }
        filp->private_data = ofid;
 #ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
        return 0;
 error:
-        if (ofid)
-                p9_client_clunk(ofid);
        if (fid)
                p9_client_clunk(fid);
+err_clunk_old_fid:
+        if (ofid)
+                p9_client_clunk(ofid);
        return err;
 }
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..979992dcb386 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -121,9 +121,25 @@ config TMPFS
          See <file:Documentation/filesystems/tmpfs.txt> for details.
+config TMPFS_XATTR
+        bool "Tmpfs extended attributes"
+        depends on TMPFS
+        default n
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).
+          Currently this enables support for the trusted.* and
+          security.* namespaces.
+          If unsure, say N.
+          You need this for POSIX ACL support on tmpfs.
 config TMPFS_POSIX_ACL
        bool "Tmpfs POSIX Access Control Lists"
-        depends on TMPFS
+        depends on TMPFS_XATTR
        select GENERIC_ACL
        help
          POSIX Access Control Lists (ACLs) support permissions for users and
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
        int res;
        char buf[16];
+        memset(&bprm, 0, sizeof(bprm));
        /* Create the file name */
        sprintf(buf, "/lib/lib%d.so", id);
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
        if (!bprm.cred)
                goto out;
+        /* We don't really care about recalculating credentials at this point
+         * as we're past the point of no return and are dealing with shared
+         * libraries.
+         */
+        bprm.cred_prepared = 1;
        res = prepare_binprm(&bprm);
        if (!IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bf9c7a720371..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
        res = __blkdev_get(bdev, mode, 0);
        if (whole) {
+                struct gendisk *disk = whole->bd_disk;
                /* finish claiming */
                mutex_lock(&bdev->bd_mutex);
                spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
                spin_unlock(&bdev_lock);
                /*
-                 * Block event polling for write claims.  Any write
+                 * Block event polling for write claims if requested.  Any
-                 * holder makes the write_holder state stick until all
+                 * write holder makes the write_holder state stick until
-                 * are released.  This is good enough and tracking
+                 * all are released.  This is good enough and tracking
-                 * individual writeable reference is too fragile given
+                 * individual writeable reference is too fragile given the
-                 * the way @mode is used in blkdev_get/put().
+                 * way @mode is used in blkdev_get/put().
                 */
-                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+                    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
                        bdev->bd_write_holder = true;
-                        disk_block_events(bdev->bd_disk);
+                        disk_block_events(disk);
                }
                mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
                op->payload_len = cpu_to_le32(len);
                req->r_request->hdr.data_len = cpu_to_le32(len);
-                ceph_osdc_start_request(&fsc->client->osdc, req, true);
+                rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+                BUG_ON(rc);
                req = NULL;
                /* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
 out:
        if (req)
                ceph_osdc_put_request(req);
-        if (rc > 0)
-                rc = 0;  /* vfs expects us to return 0 */
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
        return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
                list_add_tail(&cap->session_caps, &session->s_caps);
                session->s_nr_caps++;
                spin_unlock(&session->s_cap_lock);
-        }
+        } else if (new_cap)
+                ceph_put_cap(mdsc, new_cap);
        if (!ci->i_snap_realm) {
                /*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
                              struct ceph_mds_session *session,
                              int *open_target_sessions)
 {
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
        unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
                         * export targets, so that we get the matching IMPORT
                         */
                        *open_target_sessions = 1;
+                        /*
+                         * we can't flush dirty caps that we've seen the
+                         * EXPORT but no IMPORT for
+                         */
+                        spin_lock(&mdsc->cap_dirty_lock);
+                        if (!list_empty(&ci->i_dirty_item)) {
+                                dout(" moving %p to cap_dirty_migrating\n",
+                                     inode);
+                                list_move(&ci->i_dirty_item,
+                                          &mdsc->cap_dirty_migrating);
+                        }
+                        spin_unlock(&mdsc->cap_dirty_lock);
                }
                __ceph_remove_cap(cap);
        }
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
                ci->i_cap_exporting_issued = 0;
                ci->i_cap_exporting_mseq = 0;
                ci->i_cap_exporting_mds = -1;
+                spin_lock(&mdsc->cap_dirty_lock);
+                if (!list_empty(&ci->i_dirty_item)) {
+                        dout(" moving %p back to cap_dirty\n", inode);
+                        list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+                }
+                spin_unlock(&mdsc->cap_dirty_lock);
        } else {
                dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
                     inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 */
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-        struct ceph_inode_info *ci, *nci = NULL;
+        struct ceph_inode_info *ci;
-        struct inode *inode, *ninode = NULL;
+        struct inode *inode;
-        struct list_head *p, *n;
        dout("flush_dirty_caps\n");
        spin_lock(&mdsc->cap_dirty_lock);
-        list_for_each_safe(p, n, &mdsc->cap_dirty) {
+        while (!list_empty(&mdsc->cap_dirty)) {
-                if (nci) {
+                ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
-                        ci = nci;
+                                      i_dirty_item);
-                        inode = ninode;
+                inode = igrab(&ci->vfs_inode);
-                        ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+                dout("flush_dirty_caps %p\n", inode);
-                        dout("flush_dirty_caps inode %p (was next inode)\n",
-                             inode);
-                } else {
-                        ci = list_entry(p, struct ceph_inode_info,
-                                        i_dirty_item);
-                        inode = igrab(&ci->vfs_inode);
-                        BUG_ON(!inode);
-                        dout("flush_dirty_caps inode %p\n", inode);
-                }
-                if (n != &mdsc->cap_dirty) {
-                        nci = list_entry(n, struct ceph_inode_info,
-                                         i_dirty_item);
-                        ninode = igrab(&nci->vfs_inode);
-                        BUG_ON(!ninode);
-                        nci->i_ceph_flags |= CEPH_I_NOFLUSH;
-                        dout("flush_dirty_caps next inode %p, noflush\n",
-                             ninode);
-                } else {
-                        nci = NULL;
-                        ninode = NULL;
-                }
                spin_unlock(&mdsc->cap_dirty_lock);
                if (inode) {
                        ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
                spin_lock(&mdsc->cap_dirty_lock);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
+        dout("flush_dirty_caps done\n");
 }
 /*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
        rinfo = &fi->last_readdir->r_reply_info;
        dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
             rinfo->dir_nr, off, fi->offset);
-        while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+        while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
                u64 pos = ceph_make_fpos(frag, off);
                struct ceph_mds_reply_inode *in =
                        rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int left;
+        const int bufsize = 1024;
        if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                return -EISDIR;
        if (!cf->dir_info) {
-                cf->dir_info = kmalloc(1024, GFP_NOFS);
+                cf->dir_info = kmalloc(bufsize, GFP_NOFS);
                if (!cf->dir_info)
                        return -ENOMEM;
                cf->dir_info_len =
-                        sprintf(cf->dir_info,
+                        snprintf(cf->dir_info, bufsize,
                                "entries:   %20lld\n"
                                " files:    %20lld\n"
                                " subdirs:  %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 static struct dentry *__fh_to_dentry(struct super_block *sb,
                                     struct ceph_nfs_fh *fh)
 {
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
        struct inode *inode;
        struct dentry *dentry;
        struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
        vino.ino = fh->ino;
        vino.snap = CEPH_NOSNAP;
        inode = ceph_find_inode(sb, vino);
-        if (!inode)
+        if (!inode) {
-                return ERR_PTR(-ESTALE);
+                struct ceph_mds_request *req;
+                req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+                                               USE_ANY_MDS);
+                if (IS_ERR(req))
+                        return ERR_CAST(req);
+                req->r_ino1 = vino;
+                req->r_num_caps = 1;
+                err = ceph_mdsc_do_request(mdsc, NULL, req);
+                inode = req->r_target_inode;
+                if (inode)
+                        igrab(inode);
+                ceph_mdsc_put_request(req);
+                if (!inode)
+                        return ERR_PTR(-ESTALE);
+        }
        dentry = d_obtain_alias(inode);
        if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
                snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
                req->r_num_caps = 1;
                err = ceph_mdsc_do_request(mdsc, NULL, req);
+                inode = req->r_target_inode;
+                if (inode)
+                        igrab(inode);
                ceph_mdsc_put_request(req);
-                inode = ceph_find_inode(sb, vino);
                if (!inode)
                        return ERR_PTR(err ? err : -ESTALE);
        }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
        if (dir) {
                struct ceph_inode_info *ci = ceph_inode(dir);
+                ihold(dir);
                spin_lock(&ci->i_unsafe_lock);
                req->r_unsafe_dir = dir;
                list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_dir_item);
                spin_unlock(&ci->i_unsafe_lock);
+                iput(req->r_unsafe_dir);
+                req->r_unsafe_dir = NULL;
        }
        ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 {
        struct super_block *sb = mdsc->fsc->sb;
        struct inode *inode;
-        struct ceph_inode_info *ci;
        struct dentry *parent, *dentry;
        struct ceph_dentry_info *di;
        int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
                dout("handle_lease no inode %llx\n", vino.ino);
                goto release;
        }
-        ci = ceph_inode(inode);
        /* dentry */
        parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        spin_lock_init(&mdsc->snap_flush_lock);
        mdsc->cap_flush_seq = 0;
        INIT_LIST_HEAD(&mdsc->cap_dirty);
+        INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
        mdsc->num_cap_flushing = 0;
        spin_lock_init(&mdsc->cap_dirty_lock);
        init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
        u64               cap_flush_seq;
        struct list_head  cap_dirty;        /* inodes with dirty caps */
+        struct list_head  cap_dirty_migrating; /* ...that are migration... */
        int               num_cap_flushing; /* # caps we are flushing */
        spinlock_t        cap_dirty_lock;   /* protects above items */
        wait_queue_head_t cap_flushing_wq;
diff --git a/fs/dcache.c b/fs/dcache.c
index 18b2a1f10ed8..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1220,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
 EXPORT_SYMBOL(shrink_dcache_parent);
 /*
- * Scan `nr' dentries and return the number which remain.
+ * Scan `sc->nr_to_scan' dentries and return the number which remain.
 *
 * We need to avoid reentering the filesystem if the caller is performing a
 * GFP_NOFS allocation attempt.  One example deadlock is:
@@ -1231,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 *
 * In this case we return -1 to tell the caller that we baled.
 */
-static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink,
+                                struct shrink_control *sc)
 {
+        int nr = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
        unsigned int cl_log_debug;
        unsigned int cl_protocol;
        unsigned int cl_timewarn_cs;
+        unsigned int cl_waitwarn_us;
 };
 enum {
@@ -114,6 +115,7 @@ enum {
        CLUSTER_ATTR_LOG_DEBUG,
        CLUSTER_ATTR_PROTOCOL,
        CLUSTER_ATTR_TIMEWARN_CS,
+        CLUSTER_ATTR_WAITWARN_US,
 };
 struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
 CLUSTER_ATTR(protocol, 0);
 CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
 static struct configfs_attribute *cluster_attrs[] = {
        [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
        [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
        [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
        [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+        [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
        NULL,
 };
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
        cl->cl_log_debug = dlm_config.ci_log_debug;
        cl->cl_protocol = dlm_config.ci_protocol;
        cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+        cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
        space_list = &sps->ss_group;
        comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_LOG_DEBUG          0
 #define DEFAULT_PROTOCOL           0
 #define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US        0
 struct dlm_config_info dlm_config = {
        .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
        .ci_scan_secs = DEFAULT_SCAN_SECS,
        .ci_log_debug = DEFAULT_LOG_DEBUG,
        .ci_protocol = DEFAULT_PROTOCOL,
-        .ci_timewarn_cs = DEFAULT_TIMEWARN_CS
+        .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+        .ci_waitwarn_us = DEFAULT_WAITWARN_US
 };
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
        int ci_log_debug;
        int ci_protocol;
        int ci_timewarn_cs;
+        int ci_waitwarn_us;
 };
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
 #define DLM_IFL_WATCH_TIMEWARN  0x00400000
 #define DLM_IFL_TIMEOUT_CANCEL  0x00800000
 #define DLM_IFL_DEADLOCK_CANCEL 0x01000000
+#define DLM_IFL_STUB_MS         0x02000000 /* magic number for m_flags */
 #define DLM_IFL_USER            0x00000001
 #define DLM_IFL_ORPHAN          0x00000002
@@ -245,6 +246,7 @@ struct dlm_lkb {
        int8_t                  lkb_wait_type;  /* type of reply waiting for */
        int8_t                  lkb_wait_count;
+        int                     lkb_wait_nodeid; /* for debugging */
        struct list_head        lkb_idtbl_list; /* lockspace lkbtbl */
        struct list_head        lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
        struct list_head        lkb_ownqueue;   /* list of locks for a process */
        struct list_head        lkb_time_list;
        ktime_t                 lkb_timestamp;
+        ktime_t                 lkb_wait_time;
        unsigned long           lkb_timeout_cs;
        struct dlm_callback     lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
        return -1;
 }
+/*
+ * Look up nodeid in the warned[] table of num_nodes slots, where a zero
+ * slot means "unused".  Slots are examined in order: reaching an unused
+ * slot first records nodeid there and returns 0; reaching a slot that
+ * already holds nodeid returns 1.  If every slot is in use and none
+ * matches, nothing is recorded and 0 is returned.
+ */
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+        int idx = 0;
+        while (idx < num_nodes) {
+                int seen = warned[idx];
+                if (seen == 0) {
+                        /* first free slot: remember this node */
+                        warned[idx] = nodeid;
+                        return 0;
+                }
+                if (seen == nodeid)
+                        return 1;
+                idx++;
+        }
+        return 0;
+}
+/*
+ * Walk ls->ls_waiters and report lkbs that have been waiting for a reply
+ * for at least dlm_config.ci_waitwarn_us microseconds.  Returns at once
+ * when ci_waitwarn_us is zero.
+ *
+ * An lkb counted as expired has its lkb_wait_time reset to zero, and
+ * entries with a zero lkb_wait_time are skipped, so the same wait is not
+ * counted again by a later scan.  nodeid_warned() is used to suppress a
+ * second "waitwarn" message for a node already reported during this scan.
+ * The list is walked under ls->ls_waiters_mutex.
+ */
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb;
+        ktime_t zero = ktime_set(0, 0);
+        s64 us;
+        s64 debug_maxus = 0;
+        u32 debug_scanned = 0;
+        u32 debug_expired = 0;
+        int num_nodes = 0;
+        int *warned = NULL;
+        if (!dlm_config.ci_waitwarn_us)
+                return;
+        mutex_lock(&ls->ls_waiters_mutex);
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                /* zero wait time: already reported, nothing to measure */
+                if (ktime_equal(lkb->lkb_wait_time, zero))
+                        continue;
+                debug_scanned++;
+                us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+                if (us < dlm_config.ci_waitwarn_us)
+                        continue;
+                lkb->lkb_wait_time = zero;
+                debug_expired++;
+                if (us > debug_maxus)
+                        debug_maxus = us;
+                if (!num_nodes) {
+                        /*
+                         * Allocate the per-scan table lazily, on the first
+                         * expired waiter.  kcalloc() takes (n, size, flags),
+                         * returns zeroed memory and checks n * size for
+                         * overflow.
+                         */
+                        num_nodes = ls->ls_num_nodes;
+                        warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
+                }
+                /* no table: keep counting but skip the per-node message */
+                if (!warned)
+                        continue;
+                if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+                        continue;
+                log_error(ls, "waitwarn %x %lld %d us check connection to "
+                          "node %d", lkb->lkb_id, (long long)us,
+                          dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+        /* kfree(NULL) is a no-op, so no guard is needed */
+        kfree(warned);
+        if (debug_expired)
+                log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+                          debug_scanned, debug_expired,
+                          dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
 /* add/remove lkb from global waiters list of lkb's waiting for
   a reply from a remote node */
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 {
        struct dlm_ls *ls = lkb->lkb_resource->res_ls;
        int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
        lkb->lkb_wait_count++;
        lkb->lkb_wait_type = mstype;
+        lkb->lkb_wait_time = ktime_get();
+        lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
        hold_lkb(lkb);
        list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
        struct dlm_ls *ls = lkb->lkb_resource->res_ls;
        int error;
-        if (ms != &ls->ls_stub_ms)
+        if (ms->m_flags != DLM_IFL_STUB_MS)
                mutex_lock(&ls->ls_waiters_mutex);
        error = _remove_from_waiters(lkb, ms->m_type, ms);
-        if (ms != &ls->ls_stub_ms)
+        if (ms->m_flags != DLM_IFL_STUB_MS)
                mutex_unlock(&ls->ls_waiters_mutex);
        return error;
 }
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
        list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
                lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
        mutex_unlock(&ls->ls_timeout_mutex);
+        if (!dlm_config.ci_waitwarn_us)
+                return;
+        mutex_lock(&ls->ls_waiters_mutex);
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                if (ktime_to_us(lkb->lkb_wait_time))
+                        lkb->lkb_wait_time = ktime_get();
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
 }
 /* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
   compatible with other granted locks */
-static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_demoted(struct dlm_lkb *lkb)
 {
-        if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
-                log_print("munge_demoted %x invalid reply type %d",
-                          lkb->lkb_id, ms->m_type);
-                return;
-        }
        if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
                log_print("munge_demoted %x invalid modes gr %d rq %d",
                          lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
        struct dlm_mhandle *mh;
        int to_nodeid, error;
-        error = add_to_waiters(lkb, mstype);
+        to_nodeid = r->res_nodeid;
+        error = add_to_waiters(lkb, mstype, to_nodeid);
        if (error)
                return error;
-        to_nodeid = r->res_nodeid;
        error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
        if (error)
                goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
        /* down conversions go without a reply from the master */
        if (!error && down_conversion(lkb)) {
                remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+                r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
                r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
                r->res_ls->ls_stub_ms.m_result = 0;
-                r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
                __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
        }
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
        struct dlm_mhandle *mh;
        int to_nodeid, error;
-        error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+        to_nodeid = dlm_dir_nodeid(r);
+        error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
        if (error)
                return error;
-        to_nodeid = dlm_dir_nodeid(r);
        error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
        if (error)
                goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
+        if (ms->m_flags == DLM_IFL_STUB_MS)
+                return;
        lkb->lkb_sbflags = ms->m_sbflags;
        lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
                         (ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
                /* convert was queued on remote master */
                receive_flags_reply(lkb, ms);
                if (is_demoted(lkb))
-                        munge_demoted(lkb, ms);
+                        munge_demoted(lkb);
                del_lkb(r, lkb);
                add_lkb(r, lkb, DLM_LKSTS_CONVERT);
                add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
                /* convert was granted on remote master */
                receive_flags_reply(lkb, ms);
                if (is_demoted(lkb))
-                        munge_demoted(lkb, ms);
+                        munge_demoted(lkb);
                grant_lock_pc(r, lkb, ms);
                queue_cast(r, lkb, 0);
                break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
        dlm_put_lockspace(ls);
 }
-static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                   struct dlm_message *ms_stub)
 {
        if (middle_conversion(lkb)) {
                hold_lkb(lkb);
-                ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
+                memset(ms_stub, 0, sizeof(struct dlm_message));
-                ls->ls_stub_ms.m_result = -EINPROGRESS;
+                ms_stub->m_flags = DLM_IFL_STUB_MS;
-                ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
-                ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
+                ms_stub->m_result = -EINPROGRESS;
-                _receive_convert_reply(lkb, &ls->ls_stub_ms);
+                ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                _receive_convert_reply(lkb, ms_stub);
                /* Same special case as in receive_rcom_lock_args() */
                lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb, *safe;
+        struct dlm_message *ms_stub;
        int wait_type, stub_unlock_result, stub_cancel_result;
+        /* kmalloc() takes (size, flags); the stub is filled in before use */
+        ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
+        if (!ms_stub) {
+                log_error(ls, "dlm_recover_waiters_pre no mem");
+                return;
+        }
        mutex_lock(&ls->ls_waiters_mutex);
        list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
-                log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
-                          lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+                /* exclude debug messages about unlocks because there can be so
+                   many and they aren't very interesting */
+                if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+                        log_debug(ls, "recover_waiter %x nodeid %d "
+                                  "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
+                                  lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+                }
                /* all outstanding lookups, regardless of destination  will be
                   resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                        break;
                case DLM_MSG_CONVERT:
-                        recover_convert_waiter(ls, lkb);
+                        recover_convert_waiter(ls, lkb, ms_stub);
                        break;
                case DLM_MSG_UNLOCK:
                        hold_lkb(lkb);
-                        ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
+                        memset(ms_stub, 0, sizeof(struct dlm_message));
-                        ls->ls_stub_ms.m_result = stub_unlock_result;
+                        ms_stub->m_flags = DLM_IFL_STUB_MS;
-                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                        ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
-                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
+                        ms_stub->m_result = stub_unlock_result;
-                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+                        ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                        _receive_unlock_reply(lkb, ms_stub);
                        dlm_put_lkb(lkb);
                        break;
                case DLM_MSG_CANCEL:
                        hold_lkb(lkb);
-                        ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
+                        memset(ms_stub, 0, sizeof(struct dlm_message));
-                        ls->ls_stub_ms.m_result = stub_cancel_result;
+                        ms_stub->m_flags = DLM_IFL_STUB_MS;
-                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                        ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
-                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
+                        ms_stub->m_result = stub_cancel_result;
-                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+                        ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                        _receive_cancel_reply(lkb, ms_stub);
                        dlm_put_lkb(lkb);
                        break;
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                schedule();
        }
        mutex_unlock(&ls->ls_waiters_mutex);
+        kfree(ms_stub);
 }
 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
                ou = is_overlap_unlock(lkb);
                err = 0;
-                log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+                log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
-                          lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+                          lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
                /* At this point we assume that we won't get a reply to any
                   previous op or overlap op on this lock.  First, do a big
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
 void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
 void dlm_scan_timeout(struct dlm_ls *ls);
 void dlm_adjust_timeouts(struct dlm_ls *ls);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
 static int dlm_scand(void *data)
 {
        struct dlm_ls *ls;
-        int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
        while (!kthread_should_stop()) {
                ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
                                ls->ls_scan_time = jiffies;
                                dlm_scan_rsbs(ls);
                                dlm_scan_timeout(ls);
+                                dlm_scan_waiters(ls);
                                dlm_unlock_recovery(ls);
                        } else {
                                ls->ls_scan_time += HZ;
                        }
-                } else {
+                        continue;
-                        schedule_timeout_interruptible(timeout_jiffies);
                }
+                schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
        }
        return 0;
 }
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
        wake_up(&send_wq);
 }
+/* If a process was killed while waiting for the only plock on a file,
+   locks_remove_posix will not see any lock on the file so it won't
+   send an unlock-close to us to pass on to userspace to clean up the
+   abandoned waiter.  So, we have to insert the unlock-close when the
+   lock call is interrupted. */
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+                            struct file *file, struct file_lock *fl)
+{
+        struct plock_op *op;
+        op = kzalloc(sizeof(*op), GFP_NOFS);
+        if (!op)
+                return;
+        op->info.optype         = DLM_PLOCK_OP_UNLOCK;
+        op->info.pid            = fl->fl_pid;
+        op->info.fsid           = ls->ls_global_id;
+        op->info.number         = number;
+        op->info.start          = 0;
+        op->info.end            = OFFSET_MAX;
+        if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+                op->info.owner  = (__u64) fl->fl_pid;
+        else
+                op->info.owner  = (__u64)(long) fl->fl_owner;
+        op->info.flags |= DLM_PLOCK_FL_CLOSE;
+        send_op(op);
+}
 int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
                   int cmd, struct file_lock *fl)
 {
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        send_op(op);
-        if (xop->callback == NULL)
+        if (xop->callback == NULL) {
-                wait_event(recv_wq, (op->done != 0));
+                rv = wait_event_killable(recv_wq, (op->done != 0));
-        else {
+                if (rv == -ERESTARTSYS) {
+                        log_debug(ls, "dlm_posix_lock: wait killed %llx",
+                                  (unsigned long long)number);
+                        spin_lock(&ops_lock);
+                        list_del(&op->list);
+                        spin_unlock(&ops_lock);
+                        kfree(xop);
+                        do_unlock_close(ls, number, file, fl);
+                        goto out;
+                }
+        } else {
                rv = FILE_LOCK_DEFERRED;
                goto out;
        }
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        else
                op->info.owner  = (__u64)(long) fl->fl_owner;
+        if (fl->fl_flags & FL_CLOSE) {
+                op->info.flags |= DLM_PLOCK_FL_CLOSE;
+                send_op(op);
+                rv = 0;
+                goto out;
+        }
        send_op(op);
        wait_event(recv_wq, (op->done != 0));
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
        spin_lock(&ops_lock);
        if (!list_empty(&send_list)) {
                op = list_entry(send_list.next, struct plock_op, list);
-                list_move(&op->list, &recv_list);
+                if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+                        list_del(&op->list);
+                else
+                        list_move(&op->list, &recv_list);
                memcpy(&info, &op->info, sizeof(info));
        }
        spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
        if (!op)
                return -EAGAIN;
+        /* there is no need to get a reply from userspace for unlocks
+           that were generated by the vfs cleaning up for a close
+           (the process did not make an unlock call). */
+        if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+                kfree(op);
        if (copy_to_user(u, &info, sizeof(info)))
                return -EFAULT;
        return sizeof(info);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 out_sig:
        sigprocmask(SIG_SETMASK, &tmpsig, NULL);
-        recalc_sigpending();
 out_free:
        kfree(kbuf);
        return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 static void drop_slab(void)
 {
        int nr_objects;
+        struct shrink_control shrink = {
+                .gfp_mask = GFP_KERNEL,
+        };
        do {
-                nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+                nr_objects = shrink_slab(&shrink, 1000, 1000);
        } while (nr_objects > 10);
 }
diff --git a/fs/exec.c b/fs/exec.c
index c016896dcbb2..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 #ifdef CONFIG_STACK_GROWSUP
        if (write) {
-                ret = expand_stack_downwards(bprm->vma, pos);
+                ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        BUG_ON(new_start > new_end);
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                return -ENOMEM;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
-                free_pgd_range(tlb, new_end, old_end, new_end,
+                free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        } else {
                /*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others its just a little faster.
                 */
-                free_pgd_range(tlb, old_start, old_end, new_end,
+                free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        }
-        tlb_finish_mmu(tlb, new_end, old_end);
+        tlb_finish_mmu(&tlb, new_end, old_end);
        /*
         * Shrink the vma to just the new range.  Always succeeds.
@@ -1051,6 +1051,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
        task_unlock(tsk);
        return buf;
 }
+EXPORT_SYMBOL_GPL(get_task_comm);
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                brelse(bh);
                if (!sb_set_blocksize(sb, blocksize)) {
-                        ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+                        ext2_msg(sb, KERN_ERR,
+                                "error: bad blocksize %d", blocksize);
                        goto failed_sbi;
                }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        frame->at = entries;
        frame->bh = bh;
        bh = bh2;
+        /*
+         * Mark buffers dirty here so that if do_split() fails we write a
+         * consistent set of buffers to disk.
+         */
+        ext3_journal_dirty_metadata(handle, frame->bh);
+        ext3_journal_dirty_metadata(handle, bh);
        de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-        dx_release (frames);
+        if (!de) {
-        if (!(de))
+                ext3_mark_inode_dirty(handle, dir);
+                dx_release(frames);
                return retval;
+        }
+        dx_release(frames);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
        handle_t *handle;
        struct inode * inode;
        int l, err, retries = 0;
+        int credits;
        l = strlen(symname)+1;
        if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
        dquot_initialize(dir);
+        if (l > EXT3_N_BLOCKS * 4) {
+                /*
+                 * For non-fast symlinks, we just allocate inode and put it on
+                 * orphan list in the first transaction => we need bitmap,
+                 * group descriptor, sb, inode block, quota blocks.
+                 */
+                credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+        } else {
+                /*
+                 * Fast symlink. We have to add entry to directory
+                 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+                 * allocate new inode (bitmap, group descriptor, inode block,
+                 * quota blocks, sb is already counted in previous macros).
+                 */
+                credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                          EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+                          EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+        }
 retry:
-        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+        handle = ext3_journal_start(dir, credits);
-                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -2211,21 +2237,45 @@ retry:
        if (IS_ERR(inode))
                goto out_stop;
-        if (l > sizeof (EXT3_I(inode)->i_data)) {
+        if (l > EXT3_N_BLOCKS * 4) {
                inode->i_op = &ext3_symlink_inode_operations;
                ext3_set_aops(inode);
                /*
-                 * page_symlink() calls into ext3_prepare/commit_write.
+                 * We cannot call page_symlink() with transaction started
-                 * We have a transaction open.  All is sweetness.  It also sets
+                 * because it calls into ext3_write_begin() which acquires page
-                 * i_size in generic_commit_write().
+                 * lock which ranks below transaction start (and it can also
+                 * wait for journal commit if we are running out of space). So
+                 * we have to stop transaction now and restart it when symlink
+                 * contents is written. 
+                 *
+                 * To keep fs consistent in case of crash, we have to put inode
+                 * to orphan list in the mean time.
                 */
+                drop_nlink(inode);
+                err = ext3_orphan_add(handle, inode);
+                ext3_journal_stop(handle);
+                if (err)
+                        goto err_drop_inode;
                err = __page_symlink(inode, symname, l, 1);
+                if (err)
+                        goto err_drop_inode;
+                /*
+                 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+                 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+                 */
+                handle = ext3_journal_start(dir,
+                                EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                                EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+                if (IS_ERR(handle)) {
+                        err = PTR_ERR(handle);
+                        goto err_drop_inode;
+                }
+                inc_nlink(inode);
+                err = ext3_orphan_del(handle, inode);
                if (err) {
+                        ext3_journal_stop(handle);
                        drop_nlink(inode);
-                        unlock_new_inode(inode);
+                        goto err_drop_inode;
-                        ext3_mark_inode_dirty(handle, inode);
-                        iput (inode);
-                        goto out_stop;
                }
        } else {
                inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
+err_drop_inode:
+        unlock_new_inode(inode);
+        iput(inode);
+        return err;
 }
 static int ext3_link (struct dentry * old_dentry,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index ae8200f84e39..1cc7038e273d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
                        spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
                        tmp = fat_cache_alloc(inode);
+                        if (!tmp) {
+                                spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+                                MSDOS_I(inode)->nr_caches--;
+                                spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+                                return;
+                        }
                        spin_lock(&MSDOS_I(inode)->cache_lru_lock);
                        cache = fat_cache_merge(inode, new);
                        if (cache != NULL) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ee42b9e0b16a..4ad64732cbce 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -98,7 +98,7 @@ next:
        *bh = sb_bread(sb, phys);
        if (*bh == NULL) {
-                printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
+                fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed",
                       (llu)phys);
                /* skip this block */
                *pos = (iblock + 1) << sb->s_blocksize_bits;
@@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
 * but ignore that right now.
 * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
 */
-static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
+static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
-                       int uni_xlate, struct nls_table *nls)
+                       const wchar_t *uni, int len, struct nls_table *nls)
 {
+        int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
        const wchar_t *ip;
        wchar_t ec;
        unsigned char *op;
@@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
        }
        if (unlikely(*ip)) {
-                printk(KERN_WARNING "FAT: filename was truncated while "
+                fat_msg(sb, KERN_WARNING, "filename was truncated while "
-                       "converting.");
+                        "converting.");
        }
        *op = 0;
        return (op - ascii);
 }
-static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
+static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
                                unsigned char *buf, int size)
 {
+        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        if (sbi->options.utf8)
                return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
                                UTF16_HOST_ENDIAN, buf, size);
        else
-                return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
+                return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
-                                   sbi->nls_io);
 }
 static inline int
@@ -419,7 +420,7 @@ parse_record:
                /* Compare shortname */
                bufuname[last_u] = 0x0000;
-                len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+                len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
                if (fat_name_match(sbi, name, name_len, bufname, len))
                        goto found;
@@ -428,7 +429,7 @@ parse_record:
                        int size = PATH_MAX - FAT_MAX_UNI_SIZE;
                        /* Compare longname */
-                        len = fat_uni_to_x8(sbi, unicode, longname, size);
+                        len = fat_uni_to_x8(sb, unicode, longname, size);
                        if (fat_name_match(sbi, name, name_len, longname, len))
                                goto found;
                }
@@ -545,7 +546,7 @@ parse_record:
                if (nr_slots) {
                        void *longname = unicode + FAT_MAX_UNI_CHARS;
                        int size = PATH_MAX - FAT_MAX_UNI_SIZE;
-                        int len = fat_uni_to_x8(sbi, unicode, longname, size);
+                        int len = fat_uni_to_x8(sb, unicode, longname, size);
                        fill_name = longname;
                        fill_len = len;
@@ -621,7 +622,7 @@ parse_record:
        if (isvfat) {
                bufuname[j] = 0x0000;
-                i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+                i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
        }
        if (nr_slots) {
                /* hack for fat_ioctl_filldir() */
@@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
 int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 {
+        struct super_block *sb = dir->i_sb;
        struct msdos_dir_entry *de;
        struct buffer_head *bh;
        int err = 0, nr_slots;
@@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
                 */
                err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
                if (err) {
-                        printk(KERN_WARNING
+                        fat_msg(sb, KERN_WARNING,
-                               "FAT: Couldn't remove the long name slots\n");
+                               "Couldn't remove the long name slots");
                }
        }
@@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
                if (sbi->fat_bits != 32)
                        goto error;
        } else if (MSDOS_I(dir)->i_start == 0) {
-                printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n",
+                fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
                       MSDOS_I(dir)->i_pos);
                err = -EIO;
                goto error;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index f50408901f7e..8276cc282dec 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb,
                        struct msdos_dir_entry *de, loff_t i_pos);
 extern int fat_sync_inode(struct inode *inode);
 extern int fat_fill_super(struct super_block *sb, void *data, int silent,
-                        const struct inode_operations *fs_dir_inode_ops,
+                          int isvfat, void (*setup)(struct super_block *));
-                        int isvfat, void (*setup)(struct super_block *));
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
 /* fat/misc.c */
 extern void
-__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+        __attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(sb, fmt, args...)          \
+        __fat_fs_error(sb, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(sb, fmt, args...) \
+        __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
        __attribute__ ((format (printf, 3, 4))) __cold;
-#define fat_fs_error(s, fmt, args...)           \
-        __fat_fs_error(s, 1, fmt , ## args)
-#define fat_fs_error_ratelimit(s, fmt, args...) \
-        __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index b47d2c9f4fa1..2e81ac0df7e2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
        brelse(bhs[0]);
 err:
-        printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
+        fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
        return -EIO;
 }
@@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
        fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
        fatent->bhs[0] = sb_bread(sb, blocknr);
        if (!fatent->bhs[0]) {
-                printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
+                fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
                       (llu)blocknr);
                return -EIO;
        }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8d68690bdcf1..cb8d8391ac0b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_bavail = sbi->free_clusters;
        buf->f_fsid.val[0] = (u32)id;
        buf->f_fsid.val[1] = (u32)(id >> 32);
-        buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
+        buf->f_namelen =
+                (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
        return 0;
 }
@@ -619,8 +620,8 @@ retry:
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
        if (!bh) {
-                printk(KERN_ERR "FAT: unable to read inode block "
+                fat_msg(sb, KERN_ERR, "unable to read inode block "
-                       "for updating (i_pos %lld)\n", i_pos);
+                       "for updating (i_pos %lld)", i_pos);
                return -EIO;
        }
        spin_lock(&sbi->inode_hash_lock);
@@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = {
        {Opt_err, NULL}
 };
-static int parse_options(char *options, int is_vfat, int silent, int *debug,
+static int parse_options(struct super_block *sb, char *options, int is_vfat,
-                         struct fat_mount_options *opts)
+                         int silent, int *debug, struct fat_mount_options *opts)
 {
        char *p;
        substring_t args[MAX_OPT_ARGS];
@@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
                /* obsolete mount options */
                case Opt_obsolate:
-                        printk(KERN_INFO "FAT: \"%s\" option is obsolete, "
+                        fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
-                               "not supported now\n", p);
+                               "not supported now", p);
                        break;
                /* unknown option */
                default:
                        if (!silent) {
-                                printk(KERN_ERR
+                                fat_msg(sb, KERN_ERR,
-                                       "FAT: Unrecognized mount option \"%s\" "
+                                       "Unrecognized mount option \"%s\" "
-                                       "or missing value\n", p);
+                                       "or missing value", p);
                        }
                        return -EINVAL;
                }
@@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 out:
        /* UTF-8 doesn't provide FAT semantics */
        if (!strcmp(opts->iocharset, "utf8")) {
-                printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
+                fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
                       " for FAT filesystems, filesystem will be "
-                       "case sensitive!\n");
+                       "case sensitive!");
        }
@@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode)
 /*
 * Read the super block of an MS-DOS FS.
 */
-int fat_fill_super(struct super_block *sb, void *data, int silent,
+int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
-                   const struct inode_operations *fs_dir_inode_ops, int isvfat,
                   void (*setup)(struct super_block *))
 {
        struct inode *root_inode = NULL, *fat_inode = NULL;
@@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sb->s_magic = MSDOS_SUPER_MAGIC;
        sb->s_op = &fat_sops;
        sb->s_export_op = &fat_export_ops;
-        sbi->dir_ops = fs_dir_inode_ops;
        ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
                             DEFAULT_RATELIMIT_BURST);
-        error = parse_options(data, isvfat, silent, &debug, &sbi->options);
+        error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
        if (error)
                goto out_fail;
@@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sb_min_blocksize(sb, 512);
        bh = sb_bread(sb, 0);
        if (bh == NULL) {
-                printk(KERN_ERR "FAT: unable to read boot sector\n");
+                fat_msg(sb, KERN_ERR, "unable to read boot sector");
                goto out_fail;
        }
        b = (struct fat_boot_sector *) bh->b_data;
        if (!b->reserved) {
                if (!silent)
-                        printk(KERN_ERR "FAT: bogus number of reserved sectors\n");
+                        fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
                brelse(bh);
                goto out_invalid;
        }
        if (!b->fats) {
                if (!silent)
-                        printk(KERN_ERR "FAT: bogus number of FAT structure\n");
+                        fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
                brelse(bh);
                goto out_invalid;
        }
@@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        media = b->media;
        if (!fat_valid_media(media)) {
                if (!silent)
-                        printk(KERN_ERR "FAT: invalid media value (0x%02x)\n",
+                        fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
                               media);
                brelse(bh);
                goto out_invalid;
@@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
            || (logical_sector_size < 512)
            || (logical_sector_size > 4096)) {
                if (!silent)
-                        printk(KERN_ERR "FAT: bogus logical sector size %u\n",
+                        fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
                               logical_sector_size);
                brelse(bh);
                goto out_invalid;
@@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sbi->sec_per_clus = b->sec_per_clus;
        if (!is_power_of_2(sbi->sec_per_clus)) {
                if (!silent)
-                        printk(KERN_ERR "FAT: bogus sectors per cluster %u\n",
+                        fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
                               sbi->sec_per_clus);
                brelse(bh);
                goto out_invalid;
        }
        if (logical_sector_size < sb->s_blocksize) {
-                printk(KERN_ERR "FAT: logical sector size too small for device"
+                fat_msg(sb, KERN_ERR, "logical sector size too small for device"
-                       " (logical sector size = %u)\n", logical_sector_size);
+                       " (logical sector size = %u)", logical_sector_size);
                brelse(bh);
                goto out_fail;
        }
@@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
                brelse(bh);
                if (!sb_set_blocksize(sb, logical_sector_size)) {
-                        printk(KERN_ERR "FAT: unable to set blocksize %u\n",
+                        fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
                               logical_sector_size);
                        goto out_fail;
                }
                bh = sb_bread(sb, 0);
                if (bh == NULL) {
-                        printk(KERN_ERR "FAT: unable to read boot sector"
+                        fat_msg(sb, KERN_ERR, "unable to read boot sector"
-                               " (logical sector size = %lu)\n",
+                               " (logical sector size = %lu)",
                               sb->s_blocksize);
                        goto out_fail;
                }
@@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
                fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
                if (fsinfo_bh == NULL) {
-                        printk(KERN_ERR "FAT: bread failed, FSINFO block"
+                        fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
-                               " (sector = %lu)\n", sbi->fsinfo_sector);
+                               " (sector = %lu)", sbi->fsinfo_sector);
                        brelse(bh);
                        goto out_fail;
                }
                fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
                if (!IS_FSINFO(fsinfo)) {
-                        printk(KERN_WARNING "FAT: Invalid FSINFO signature: "
+                        fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
-                               "0x%08x, 0x%08x (sector = %lu)\n",
+                               "0x%08x, 0x%08x (sector = %lu)",
                               le32_to_cpu(fsinfo->signature1),
                               le32_to_cpu(fsinfo->signature2),
                               sbi->fsinfo_sector);
@@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
        if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
                if (!silent)
-                        printk(KERN_ERR "FAT: bogus directroy-entries per block"
+                        fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
-                               " (%u)\n", sbi->dir_entries);
+                               " (%u)", sbi->dir_entries);
                brelse(bh);
                goto out_invalid;
        }
@@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
        if (total_clusters > MAX_FAT(sb)) {
                if (!silent)
-                        printk(KERN_ERR "FAT: count of clusters too big (%u)\n",
+                        fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
                               total_clusters);
                brelse(bh);
                goto out_invalid;
@@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sprintf(buf, "cp%d", sbi->options.codepage);
        sbi->nls_disk = load_nls(buf);
        if (!sbi->nls_disk) {
-                printk(KERN_ERR "FAT: codepage %s not found\n", buf);
+                fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
                goto out_fail;
        }
@@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        if (sbi->options.isvfat) {
                sbi->nls_io = load_nls(sbi->options.iocharset);
                if (!sbi->nls_io) {
-                        printk(KERN_ERR "FAT: IO charset %s not found\n",
+                        fat_msg(sb, KERN_ERR, "IO charset %s not found",
                               sbi->options.iocharset);
                        goto out_fail;
                }
@@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        insert_inode_hash(root_inode);
        sb->s_root = d_alloc_root(root_inode);
        if (!sb->s_root) {
-                printk(KERN_ERR "FAT: get root inode failed\n");
+                fat_msg(sb, KERN_ERR, "get root inode failed");
                goto out_fail;
        }
@@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 out_invalid:
        error = -EINVAL;
        if (!silent)
-                printk(KERN_INFO "VFS: Can't find a valid FAT filesystem"
+                fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
-                       " on dev %s.\n", sb->s_id);
 out_fail:
        if (fat_inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 970e682ea754..6d93360ca0cc 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,30 +20,46 @@
 * In case the file system is remounted read-only, it can be made writable
 * again by remounting it.
 */
-void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 {
-        struct fat_mount_options *opts = &MSDOS_SB(s)->options;
+        struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
        va_list args;
+        struct va_format vaf;
        if (report) {
-                printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
-                printk(KERN_ERR "    ");
                va_start(args, fmt);
-                vprintk(fmt, args);
+                vaf.fmt = fmt;
+                vaf.va = &args;
+                printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
                va_end(args);
-                printk("\n");
        }
        if (opts->errors == FAT_ERRORS_PANIC)
-                panic("FAT: fs panic from previous error\n");
+                panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
-        else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
+        else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
-                s->s_flags |= MS_RDONLY;
+                sb->s_flags |= MS_RDONLY;
-                printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
+                printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
+                                "set read-only\n", sb->s_id);
        }
 }
 EXPORT_SYMBOL_GPL(__fat_fs_error);
+/**
+ * fat_msg() - print preformatted FAT-specific messages. Everything that is
+ * not fat_fs_error() should be fat_msg().
+ */
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+        struct va_format vaf;
+        va_list args;
+        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
+        printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+        va_end(args);
+}
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
 int fat_clusters_flush(struct super_block *sb)
@@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb)
        bh = sb_bread(sb, sbi->fsinfo_sector);
        if (bh == NULL) {
-                printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
+                fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
                return -EIO;
        }
        fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
        /* Sanity check */
        if (!IS_FSINFO(fsinfo)) {
-                printk(KERN_ERR "FAT: Invalid FSINFO signature: "
+                fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
-                       "0x%08x, 0x%08x (sector = %lu)\n",
+                       "0x%08x, 0x%08x (sector = %lu)",
                       le32_to_cpu(fsinfo->signature1),
                       le32_to_cpu(fsinfo->signature2),
                       sbi->fsinfo_sector);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 711499040eb6..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -659,14 +659,14 @@ static const struct inode_operations msdos_dir_inode_operations = {
 static void setup(struct super_block *sb)
 {
+        MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
        sb->s_d_op = &msdos_dentry_operations;
        sb->s_flags |= MS_NOATIME;
 }
 static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
-        return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
+        return fat_fill_super(sb, data, silent, 0, setup);
-                             0, setup);
 }
 static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index adae3fb7451a..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1065,6 +1065,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
 static void setup(struct super_block *sb)
 {
+        MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
        if (MSDOS_SB(sb)->options.name_check != 's')
                sb->s_d_op = &vfat_ci_dentry_ops;
        else
@@ -1073,8 +1074,7 @@ static void setup(struct super_block *sb)
 static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
-        return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
+        return fat_fill_super(sb, data, silent, 1, setup);
-                             1, setup);
 }
 static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
        _enter("{OBJ%x OP%x,%u}",
               op->object->debug_id, op->debug_id, atomic_read(&op->usage));
-        fscache_set_op_state(op, "EnQ");
        ASSERT(list_empty(&op->pend_link));
        ASSERT(op->processor != NULL);
        ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
                           struct fscache_operation *op)
 {
-        fscache_set_op_state(op, "Run");
        object->n_in_progress++;
        if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
                wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
        _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
-        fscache_set_op_state(op, "SubmitX");
        spin_lock(&object->lock);
        ASSERTCMP(object->n_ops, >=, object->n_in_progress);
        ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
        ASSERTCMP(atomic_read(&op->usage), >, 0);
-        fscache_set_op_state(op, "Submit");
        spin_lock(&object->lock);
        ASSERTCMP(object->n_ops, >=, object->n_in_progress);
        ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
        if (!atomic_dec_and_test(&op->usage))
                return;
-        fscache_set_op_state(op, "Put");
        _debug("PUT OP");
        if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
                BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
        fscache_stat(&fscache_n_attr_changed_calls);
        if (fscache_object_is_active(object)) {
-                fscache_set_op_state(op, "CallFS");
                fscache_stat(&fscache_n_cop_attr_changed);
                ret = object->cache->ops->attr_changed(object);
                fscache_stat_d(&fscache_n_cop_attr_changed);
-                fscache_set_op_state(op, "Done");
                if (ret < 0)
                        fscache_abort_object(object);
        }
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
        fscache_operation_init(op, fscache_attr_changed_op, NULL);
        op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
-        fscache_set_op_name(op, "Attr");
        spin_lock(&cookie->lock);
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
        op->context     = context;
        op->start_time  = jiffies;
        INIT_LIST_HEAD(&op->to_do);
-        fscache_set_op_name(&op->op, "Retr");
        return op;
 }
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
                _leave(" = -ENOMEM");
                return -ENOMEM;
        }
-        fscache_set_op_name(&op->op, "RetrRA1");
        spin_lock(&cookie->lock);
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        op = fscache_alloc_retrieval(mapping, end_io_func, context);
        if (!op)
                return -ENOMEM;
-        fscache_set_op_name(&op->op, "RetrRAN");
        spin_lock(&cookie->lock);
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
        op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
        if (!op)
                return -ENOMEM;
-        fscache_set_op_name(&op->op, "RetrAL1");
        spin_lock(&cookie->lock);
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
        _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
-        fscache_set_op_state(&op->op, "GetPage");
        spin_lock(&object->lock);
        cookie = object->cookie;
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
        spin_unlock(&cookie->stores_lock);
        spin_unlock(&object->lock);
-        fscache_set_op_state(&op->op, "Store");
        fscache_stat(&fscache_n_store_pages);
        fscache_stat(&fscache_n_cop_write_page);
        ret = object->cache->ops->write_page(op, page);
        fscache_stat_d(&fscache_n_cop_write_page);
-        fscache_set_op_state(&op->op, "EndWrite");
        fscache_end_page_write(object, page);
        if (ret < 0) {
-                fscache_set_op_state(&op->op, "Abort");
                fscache_abort_object(object);
        } else {
                fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        fscache_operation_init(&op->op, fscache_write_op,
                               fscache_release_write_op);
        op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
-        fscache_set_op_name(&op->op, "Write1");
        ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
        if (ret < 0)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a2a6abbccc07..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1346,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
-static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+                                    struct shrink_control *sc)
 {
        struct gfs2_glock *gl;
        int may_demote;
        int nr_skipped = 0;
+        int nr = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        LIST_HEAD(skipped);
        if (nr == 0)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
 {
        struct gfs2_quota_data *qd;
        struct gfs2_sbd *sdp;
+        int nr_to_scan = sc->nr_to_scan;
-        if (nr == 0)
+        if (nr_to_scan == 0)
                goto out;
-        if (!(gfp_mask & __GFP_FS))
+        if (!(sc->gfp_mask & __GFP_FS))
                return -1;
        spin_lock(&qd_lru_lock);
-        while (nr && !list_empty(&qd_lru_list)) {
+        while (nr_to_scan && !list_empty(&qd_lru_list)) {
                qd = list_entry(qd_lru_list.next,
                                struct gfs2_quota_data, qd_reclaim);
                sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
                spin_unlock(&qd_lru_lock);
                kmem_cache_free(gfs2_quotad_cachep, qd);
                spin_lock(&qd_lru_lock);
-                nr--;
+                nr_to_scan--;
        }
        spin_unlock(&qd_lru_lock);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
 struct gfs2_inode;
 struct gfs2_sbd;
+struct shrink_control;
 #define NO_QUOTA_CHANGE ((u32)-1)
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
        return ret;
 }
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
+                                 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..e7a035781b7d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
        pgoff = offset >> PAGE_SHIFT;
        i_size_write(inode, offset);
-        spin_lock(&mapping->i_mmap_lock);
+        mutex_lock(&mapping->i_mmap_mutex);
        if (!prio_tree_empty(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->i_mmap_mutex);
        truncate_hugepages(inode, offset);
        return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 05f4fa521325..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -326,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
        memset(mapping, 0, sizeof(*mapping));
        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
        spin_lock_init(&mapping->tree_lock);
-        spin_lock_init(&mapping->i_mmap_lock);
+        mutex_init(&mapping->i_mmap_mutex);
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-        mutex_init(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(address_space_init_once);
@@ -752,8 +751,12 @@ static void prune_icache(int nr_to_scan)
 * This function is passed the number of inodes to scan, and it returns the
 * total number of remaining possibly-reclaimable inodes.
 */
-static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink,
+                                struct shrink_control *sc)
 {
+        int nr = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        if (nr) {
                /*
                 * Nasty deadlock avoidance.  We may hold various FS locks,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
         * all outstanding updates to complete.
         */
-#ifdef COMMIT_STATS
-        spin_lock(&journal->j_list_lock);
-        summarise_journal_usage(journal);
-        spin_unlock(&journal->j_list_lock);
-#endif
        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
-                /* Wake up any transactions which were waiting for this
+                /*
-                   IO to complete */
+                 * Wake up any transactions which were waiting for this
+                 * IO to complete. The barrier must be here so that changes
+                 * by journal_file_buffer() take effect before wake_up_bit()
+                 * does the waitqueue check.
+                 */
+                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
 int __log_start_commit(journal_t *journal, tid_t target)
 {
        /*
-         * Are we already doing a recent enough commit?
+         * The only transaction we can possibly wait upon is the
+         * currently running transaction (if it exists).  Otherwise,
+         * the target tid must be an old one.
         */
-        if (!tid_geq(journal->j_commit_request, target)) {
+        if (journal->j_running_transaction &&
+            journal->j_running_transaction->t_tid == target) {
                /*
                 * We want a new commit: OK, mark the request and wakeup the
                 * commit thread.  We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
                          journal->j_commit_sequence);
                wake_up(&journal->j_wait_commit);
                return 1;
-        }
+        } else if (!tid_geq(journal->j_commit_request, target))
+                /* This should never happen, but if it does, preserve
+                   the evidence before kjournald goes into a loop and
+                   increments j_commit_sequence beyond all recognition. */
+                WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+                    journal->j_commit_request, journal->j_commit_sequence,
+                    target, journal->j_running_transaction ?
+                    journal->j_running_transaction->t_tid : 0);
        return 0;
 }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
 * This function is visible to journal users (like ext3fs), so is not
 * called with the journal already locked.
 *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
 */
 handle_t *journal_start(journal_t *journal, int nblocks)
 {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..29148a81c783 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * all outstanding updates to complete.
         */
-#ifdef COMMIT_STATS
-        spin_lock(&journal->j_list_lock);
-        summarise_journal_usage(journal);
-        spin_unlock(&journal->j_list_lock);
-#endif
        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
 * What the mbcache registers as to get shrunk dynamically.
 */
-static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink,
+                              struct shrink_control *sc);
 static struct shrinker mb_cache_shrinker = {
        .shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
 * gets low.
 *
 * @shrink: (ignored)
- * @nr_to_scan: Number of objects to scan
+ * @sc: shrink_control passed from reclaim
- * @gfp_mask: (ignored)
 *
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 {
        LIST_HEAD(free_list);
        struct mb_cache *cache;
        struct mb_cache_entry *entry, *tmp;
        int count = 0;
+        int nr_to_scan = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        mb_debug("trying to free %d entries", nr_to_scan);
        spin_lock(&mb_cache_spinlock);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 #endif
        struct ncp_entry_info finfo;
-        data.wdog_pid = NULL;
+        memset(&data, 0, sizeof(data));
        server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
        if (!server)
                return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                                struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
                                data.flags = md->flags;
-                                data.int_flags = 0;
                                data.mounted_uid = md->mounted_uid;
                                data.wdog_pid = find_get_pid(md->wdog_pid);
                                data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                                data.file_mode = md->file_mode;
                                data.dir_mode = md->dir_mode;
                                data.info_fd = -1;
-                                data.mounted_vol[0] = 0;
                        }
                        break;
                default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
        }
 }
-int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+                              struct shrink_control *sc)
 {
        LIST_HEAD(head);
        struct nfs_inode *nfsi, *next;
        struct nfs_access_entry *cache;
+        int nr_to_scan = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-                                        int nr_to_scan, gfp_t gfp_mask);
+                                        struct shrink_control *sc);
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
-        return sprintf(buf, "%u\n", p->discard_alignment);
+        struct gendisk *disk = dev_to_disk(dev);
+        return sprintf(buf, "%u\n",
+                        queue_limit_discard_alignment(&disk->queue->limits,
+                                                        p->start_sect));
 }
 ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset =
                queue_limit_alignment_offset(&disk->queue->limits, start);
-        p->discard_alignment =
-                queue_limit_discard_alignment(&disk->queue->limits, start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..3763b436e69d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
+struct proc_maps_private {
+        struct pid *pid;
+        struct task_struct *task;
+#ifdef CONFIG_MMU
+        struct vm_area_struct *tail_vma;
+#endif
+};
 void proc_init_inodecache(void);
 static inline struct pid *proc_pid(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..2c9db29ea358 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 #ifdef CONFIG_NUMA
-extern int show_numa_map(struct seq_file *m, void *v);
+/*
+ * Page statistics for a single VMA.  show_numa_map() zeroes the structure
+ * for every VMA it prints and gather_stats() updates it once per page.
+ *
+ * @vma:          VMA being walked; gather_pte_stats() passes it to
+ *                vm_normal_page().
+ * @pages:        every page accounted by gather_stats().
+ * @anon:         pages for which PageAnon() is true.
+ * @active:       pages for which PageActive() or PageUnevictable() is true.
+ * @writeback:    pages for which PageWriteback() is true.
+ * @mapcount_max: largest page_mapcount() seen so far.
+ * @dirty:        pages with a dirty PTE or for which PageDirty() is true.
+ * @swapcache:    pages for which PageSwapCache() is true.
+ * @node:         page count per NUMA node, indexed by page_to_nid().
+ */
+struct numa_maps {
+        struct vm_area_struct *vma;
+        unsigned long pages;
+        unsigned long anon;
+        unsigned long active;
+        unsigned long writeback;
+        unsigned long mapcount_max;
+        unsigned long dirty;
+        unsigned long swapcache;
+        unsigned long node[MAX_NUMNODES];
+};
+/*
+ * seq_file private data for numa_maps: the generic maps iterator state plus
+ * the statistics scratch area reused for every VMA.  Allocated in
+ * numa_maps_open() and stored in seq_file->private.
+ *
+ * NOTE(review): proc_maps is presumably kept as the first member because
+ * the shared m_start() callback (used by proc_pid_numa_maps_op) treats
+ * m->private as a struct proc_maps_private - confirm before reordering.
+ */
+struct numa_maps_private {
+        struct proc_maps_private proc_maps;
+        struct numa_maps md;
+};
+/*
+ * Account one page in @md.
+ *
+ * @page:      page to account.
+ * @md:        statistics being accumulated for the current VMA.
+ * @pte_dirty: non-zero if the PTE mapping @page is dirty; the page is then
+ *             counted as dirty even when PageDirty() is not set.
+ *
+ * Every call bumps md->pages and the per-node counter of the page's node;
+ * the remaining counters depend on the page flags tested below.
+ */
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
+{
+        int count = page_mapcount(page);
+        md->pages++;
+        if (pte_dirty || PageDirty(page))
+                md->dirty++;
+        if (PageSwapCache(page))
+                md->swapcache++;
+        /* Unevictable pages are reported together with the active ones. */
+        if (PageActive(page) || PageUnevictable(page))
+                md->active++;
+        if (PageWriteback(page))
+                md->writeback++;
+        if (PageAnon(page))
+                md->anon++;
+        /* Track the highest map count seen in this VMA. */
+        if (count > md->mapcount_max)
+                md->mapcount_max = count;
+        md->node[page_to_nid(page)]++;
+}
+/*
+ * Page-walk pmd_entry callback (installed by show_numa_map()): account every
+ * eligible page mapped by the PTEs of @pmd in the range [@addr, @end).
+ *
+ * walk->private must point to the struct numa_maps being filled; its ->vma
+ * is used for vm_normal_page().  The PTE lock returned by
+ * pte_offset_map_lock() is held for the whole loop.  Always returns 0.
+ */
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+                unsigned long end, struct mm_walk *walk)
+{
+        struct numa_maps *md;
+        spinlock_t *ptl;
+        pte_t *orig_pte;
+        pte_t *pte;
+        md = walk->private;
+        orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+        /*
+         * Note: "continue" in a do-while jumps to the loop condition, so
+         * pte and addr still advance for the entries skipped below.
+         */
+        do {
+                struct page *page;
+                int nid;
+                /* Skip entries that do not map a present page. */
+                if (!pte_present(*pte))
+                        continue;
+                page = vm_normal_page(md->vma, addr, *pte);
+                if (!page)
+                        continue;
+                if (PageReserved(page))
+                        continue;
+                /* Only count pages on nodes in the N_HIGH_MEMORY state. */
+                nid = page_to_nid(page);
+                if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+                        continue;
+                gather_stats(page, md, pte_dirty(*pte));
+        } while (pte++, addr += PAGE_SIZE, addr != end);
+        /* pte has been advanced; unmap/unlock with the saved first PTE. */
+        pte_unmap_unlock(orig_pte, ptl);
+        return 0;
+}
+/*
+ * Page-walk hugetlb_entry callback (installed by show_numa_map()).
+ *
+ * With CONFIG_HUGETLB_PAGE the page behind @pte is accounted through
+ * gather_stats() unless the entry is empty; @hmask, @addr and @end are not
+ * used.  Without CONFIG_HUGETLB_PAGE the callback is a stub.  Both variants
+ * always return 0.
+ */
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+                unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+        struct numa_maps *md;
+        struct page *page;
+        if (pte_none(*pte))
+                return 0;
+        page = pte_page(*pte);
+        if (!page)
+                return 0;
+        md = walk->private;
+        gather_stats(page, md, pte_dirty(*pte));
+        return 0;
+}
+#else
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+                unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+        return 0;
+}
+#endif
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ *
+ * @m: seq_file being filled; m->private is the struct numa_maps_private
+ *     set up by numa_maps_open().
+ * @v: the VMA to describe.
+ *
+ * Emits one line per VMA: start address, memory policy string, an optional
+ * " file=<path>", " heap" or " stack" tag, and then the page counters
+ * gathered by walking the VMA's page tables.  A VMA without an mm produces
+ * no output.  Always returns 0.
+ *
+ * NOTE(review): presumably installed as the ->show callback of
+ * proc_pid_numa_maps_op; that initializer line is outside this hunk.
+ */
+static int show_numa_map(struct seq_file *m, void *v)
+{
+        struct numa_maps_private *numa_priv = m->private;
+        struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+        struct vm_area_struct *vma = v;
+        struct numa_maps *md = &numa_priv->md;
+        struct file *file = vma->vm_file;
+        struct mm_struct *mm = vma->vm_mm;
+        struct mm_walk walk = {};
+        struct mempolicy *pol;
+        int n;
+        char buffer[50];
+        if (!mm)
+                return 0;
+        /* Ensure we start with an empty set of numa_maps statistics. */
+        memset(md, 0, sizeof(*md));
+        md->vma = vma;
+        walk.hugetlb_entry = gather_hugetbl_stats;
+        walk.pmd_entry = gather_pte_stats;
+        walk.private = md;
+        walk.mm = mm;
+        /* The policy is only needed long enough to format it as text. */
+        pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+        mpol_to_str(buffer, sizeof(buffer), pol, 0);
+        mpol_cond_put(pol);
+        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+        /*
+         * Tag the mapping: file-backed, overlapping [start_brk, brk] (heap)
+         * or containing start_stack (stack).  At most one tag is printed.
+         */
+        if (file) {
+                seq_printf(m, " file=");
+                seq_path(m, &file->f_path, "\n\t= ");
+        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+                seq_printf(m, " heap");
+        } else if (vma->vm_start <= mm->start_stack &&
+                        vma->vm_end >= mm->start_stack) {
+                seq_printf(m, " stack");
+        }
+        walk_page_range(vma->vm_start, vma->vm_end, &walk);
+        /* Nothing was accounted: finish the line without any counters. */
+        if (!md->pages)
+                goto out;
+        if (md->anon)
+                seq_printf(m, " anon=%lu", md->anon);
+        if (md->dirty)
+                seq_printf(m, " dirty=%lu", md->dirty);
+        /* "mapped" is omitted when it would just repeat anon= or dirty=. */
+        if (md->pages != md->anon && md->pages != md->dirty)
+                seq_printf(m, " mapped=%lu", md->pages);
+        if (md->mapcount_max > 1)
+                seq_printf(m, " mapmax=%lu", md->mapcount_max);
+        if (md->swapcache)
+                seq_printf(m, " swapcache=%lu", md->swapcache);
+        /* Shown only if some page is inactive; never for hugetlb VMAs. */
+        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+                seq_printf(m, " active=%lu", md->active);
+        if (md->writeback)
+                seq_printf(m, " writeback=%lu", md->writeback);
+        for_each_node_state(n, N_HIGH_MEMORY)
+                if (md->node[n])
+                        seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+        seq_putc(m, '\n');
+        /*
+         * Record where iteration got to: this VMA's start address, or 0 for
+         * the tail VMA.  NOTE(review): m->count == m->size presumably means
+         * the seq_file buffer overflowed and this record will be emitted
+         * again, hence no update in that case - confirm against seq_file.
+         */
+        if (m->count < m->size)
+                m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+        return 0;
+}
 static const struct seq_operations proc_pid_numa_maps_op = {
        .start  = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
+/*
+ * Open handler for numa_maps: allocate the zeroed per-open
+ * struct numa_maps_private, record the inode's pid in it and attach it to
+ * the seq_file created by seq_open().
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
+ * seq_open() (the private data is freed again in that case).
+ *
+ * NOTE(review): the private data is presumably freed on close by
+ * seq_release_private(), which proc_numa_maps_operations uses as its
+ * ->release - confirm.
+ */
 static int numa_maps_open(struct inode *inode, struct file *file)
 {
-        return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+        struct numa_maps_private *priv;
+        int ret = -ENOMEM;
+        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+        if (priv) {
+                priv->proc_maps.pid = proc_pid(inode);
+                ret = seq_open(file, &proc_pid_numa_maps_op);
+                if (!ret) {
+                        /* seq_open() left the seq_file in private_data. */
+                        struct seq_file *m = file->private_data;
+                        m->private = priv;
+                } else {
+                        kfree(priv);
+                }
+        }
+        return ret;
 }
 const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif
+#endif /* CONFIG_NUMA */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
 * This is called from kswapd when we think we need some
 * more memory
 */
-static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink,
+                                 struct shrink_control *sc)
 {
+        int nr = sc->nr_to_scan;
        if (nr) {
                spin_lock(&dq_list_lock);
                prune_dqcache(nr);
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
 };
+/*
+ * Notify readers of @pipe that data may be available: wake interruptible
+ * sleepers on pipe->wait (only if the wait queue is non-empty) and send
+ * SIGIO/POLL_IN to the pipe's fasync readers.
+ *
+ * Factored out of three identical open-coded sequences in this file.
+ * NOTE(review): the smp_mb() presumably orders the caller's pipe updates
+ * before the lockless waitqueue_active() test - confirm.
+ */
+static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+        smp_mb();
+        if (waitqueue_active(&pipe->wait))
+                wake_up_interruptible(&pipe->wait);
+        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
 /**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:       pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
        pipe_unlock(pipe);
-        if (do_wakeup) {
+        if (do_wakeup)
-                smp_mb();
+                wakeup_pipe_readers(pipe);
-                if (waitqueue_active(&pipe->wait))
-                        wake_up_interruptible(&pipe->wait);
-                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-        }
        while (page_nr < spd_pages)
                spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
-        if (ret > 0) {
+        if (ret > 0)
-                smp_mb();
+                wakeup_pipe_readers(opipe);
-                if (waitqueue_active(&opipe->wait))
-                        wake_up_interruptible(&opipe->wait);
-                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
-        }
        if (input_wakeup)
                wakeup_pipe_writers(ipipe);
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
-        if (ret > 0) {
+        if (ret > 0)
-                smp_mb();
+                wakeup_pipe_readers(opipe);
-                if (waitqueue_active(&opipe->wait))
-                        wake_up_interruptible(&opipe->wait);
-                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
-        }
        return ret;
 }
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 8c4fc1425b3e..f67acbdda5e8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,16 +22,24 @@
 #include <linux/anon_inodes.h>
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
+/*
+ * Per-timerfd state.  Fields added for cancel-on-clock-set support:
+ *
+ * @moffs:        snapshot of ktime_get_monotonic_offset(), taken at
+ *                timerfd_create() time and refreshed by timerfd_canceled();
+ *                timerfd_clock_was_set() overwrites it with KTIME_MAX to
+ *                mark the timer as canceled.
+ * @rcu:          RCU head handed to kfree_rcu() in timerfd_release().
+ * @clist:        link in the global cancel_list.
+ * @might_cancel: true while the context is on cancel_list.
+ */
 struct timerfd_ctx {
         struct hrtimer tmr;
         ktime_t tintv;
+        ktime_t moffs;
         wait_queue_head_t wqh;
         u64 ticks;
         int expired;
         int clockid;
+        struct rcu_head rcu;
+        struct list_head clist;
+        bool might_cancel;
 };
+/*
+ * Contexts that asked to be canceled when the clock is set.  Additions and
+ * removals are done with the _rcu list helpers under cancel_lock;
+ * timerfd_clock_was_set() walks the list under rcu_read_lock() only.
+ */
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
 /*
 * This gets called when the timer event triggers. We set the "expired"
 * flag, but we do not re-arm the timer (in case it's necessary,
@@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
        return HRTIMER_NORESTART;
 }
+/*
+ * Called when the clock was set to cancel the timers in the cancel
+ * list.
+ *
+ * Walks cancel_list under rcu_read_lock().  Each context that still has
+ * might_cancel set and whose saved monotonic offset differs from the
+ * current one is marked canceled (moffs = KTIME_MAX) and its waiters are
+ * woken, both under ctx->wqh.lock with interrupts disabled.  The wakeup
+ * itself does not return an error; the woken side detects the mark through
+ * timerfd_canceled().
+ */
+void timerfd_clock_was_set(void)
+{
+        ktime_t moffs = ktime_get_monotonic_offset();
+        struct timerfd_ctx *ctx;
+        unsigned long flags;
+        rcu_read_lock();
+        list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+                /* Unlocked check; set/cleared by the setup/remove helpers. */
+                if (!ctx->might_cancel)
+                        continue;
+                spin_lock_irqsave(&ctx->wqh.lock, flags);
+                if (ctx->moffs.tv64 != moffs.tv64) {
+                        ctx->moffs.tv64 = KTIME_MAX;
+                        wake_up_locked(&ctx->wqh);
+                }
+                spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+        }
+        rcu_read_unlock();
+}
+/*
+ * Take @ctx off cancel_list if it is on it (might_cancel set); otherwise do
+ * nothing.  The unlink uses list_del_rcu() under cancel_lock because
+ * timerfd_clock_was_set() may be walking the list under RCU concurrently.
+ * Used by timerfd_release() and timerfd_setup_cancel().
+ */
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+        if (ctx->might_cancel) {
+                ctx->might_cancel = false;
+                spin_lock(&cancel_lock);
+                list_del_rcu(&ctx->clist);
+                spin_unlock(&cancel_lock);
+        }
+}
+/*
+ * Report whether @ctx was canceled by a clock change.
+ *
+ * Returns true only if the context is cancelable and
+ * timerfd_clock_was_set() marked it (moffs == KTIME_MAX).  In that case
+ * moffs is re-sampled from the current monotonic offset, which clears the
+ * mark, so each cancellation is reported once.
+ *
+ * NOTE(review): both callers in this patch (timerfd_setup(), timerfd_read())
+ * appear to run with ctx->wqh.lock held - confirm.
+ */
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+        if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
+                return false;
+        ctx->moffs = ktime_get_monotonic_offset();
+        return true;
+}
+/*
+ * Bring @ctx's cancel_list membership in line with the timerfd_settime()
+ * @flags.  A context is cancelable only when all of the following hold:
+ * the clock is CLOCK_REALTIME, TFD_TIMER_ABSTIME is set and
+ * TFD_TIMER_CANCEL_ON_SET is set.  Such a context is added to cancel_list
+ * (once); any other flag combination removes a previously added context.
+ *
+ * NOTE(review): might_cancel is tested and set outside cancel_lock, so this
+ * function by itself does not stop two concurrent callers on the same ctx
+ * from both adding it to the list.  Confirm that callers serialize.
+ */
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+        if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
+            (flags & TFD_TIMER_CANCEL_ON_SET)) {
+                if (!ctx->might_cancel) {
+                        ctx->might_cancel = true;
+                        spin_lock(&cancel_lock);
+                        list_add_rcu(&ctx->clist, &cancel_list);
+                        spin_unlock(&cancel_lock);
+                }
+        } else if (ctx->might_cancel) {
+                timerfd_remove_cancel(ctx);
+        }
+}
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
        ktime_t remaining;
@@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
        return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
-static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
+static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
-                          const struct itimerspec *ktmr)
+                         const struct itimerspec *ktmr)
 {
        enum hrtimer_mode htmode;
        ktime_t texp;
+        int clockid = ctx->clockid;
        htmode = (flags & TFD_TIMER_ABSTIME) ?
                HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
@@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
        ctx->expired = 0;
        ctx->ticks = 0;
        ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-        hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
+        hrtimer_init(&ctx->tmr, clockid, htmode);
        hrtimer_set_expires(&ctx->tmr, texp);
        ctx->tmr.function = timerfd_tmrproc;
-        if (texp.tv64 != 0)
+        if (texp.tv64 != 0) {
                hrtimer_start(&ctx->tmr, texp, htmode);
+                if (timerfd_canceled(ctx))
+                        return -ECANCELED;
+        }
+        return 0;
 }
+/*
+ * Release handler: unlink the context from cancel_list, stop its hrtimer
+ * and free it.  The free is deferred with kfree_rcu() because
+ * timerfd_clock_was_set() walks cancel_list under rcu_read_lock() and may
+ * still hold a pointer to the context after it has been unlinked.
+ * Always returns 0.
+ */
 static int timerfd_release(struct inode *inode, struct file *file)
 {
         struct timerfd_ctx *ctx = file->private_data;
+        timerfd_remove_cancel(ctx);
         hrtimer_cancel(&ctx->tmr);
-        kfree(ctx);
+        kfree_rcu(ctx, rcu);
         return 0;
 }
@@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
                res = -EAGAIN;
        else
                res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
+        /*
+         * If clock has changed, we do not care about the
+         * ticks and we do not rearm the timer. Userspace must
+         * reevaluate anyway.
+         */
+        if (timerfd_canceled(ctx)) {
+                ctx->ticks = 0;
+                ctx->expired = 0;
+                res = -ECANCELED;
+        }
        if (ctx->ticks) {
                ticks = ctx->ticks;
                if (ctx->expired && ctx->tintv.tv64) {
                        /*
                         * If tintv.tv64 != 0, this is a periodic timer that
@@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        init_waitqueue_head(&ctx->wqh);
        ctx->clockid = clockid;
        hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+        ctx->moffs = ktime_get_monotonic_offset();
        ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
                               O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
        struct file *file;
        struct timerfd_ctx *ctx;
        struct itimerspec ktmr, kotmr;
+        int ret;
        if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
                return -EFAULT;
@@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
                return PTR_ERR(file);
        ctx = file->private_data;
+        timerfd_setup_cancel(ctx, flags);
        /*
         * We need to stop the existing timer before reprogramming
         * it to the new values.
@@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
        /*
         * Re-program the timer to the new value ...
         */
-        timerfd_setup(ctx, flags, &ktmr);
+        ret = timerfd_setup(ctx, flags, &ktmr);
        spin_unlock_irq(&ctx->wqh.lock);
        fput(file);
        if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
                return -EFAULT;
-        return 0;
+        return ret;
 }
 SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
        long long liab;
        spin_lock(&c->space_lock);
-        liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+        liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
        spin_unlock(&c->space_lock);
        return liab;
 }
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
        int idx_lebs;
        long long idx_size;
-        idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+        idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
        /* And make sure we have thrice the index size of space reserved */
        idx_size += idx_size << 1;
        /*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
 * budgeted index space to the size of the current index, multiplies this by 3,
 * and makes sure this does not exceed the amount of free LEBs.
 *
- * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
 *    be large, because UBIFS does not do any index consolidation as long as
 *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
 *    will contain a lot of dirt.
- * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
- *    the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ *    the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
 *
 * This function returns zero in case of success, and %-ENOSPC in case of
 * failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
               c->lst.taken_empty_lebs;
        if (unlikely(rsvd_idx_lebs > lebs)) {
                dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
-                         "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+                         "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
                         rsvd_idx_lebs);
                return -ENOSPC;
        }
        available = ubifs_calc_available(c, min_idx_lebs);
-        outstanding = c->budg_data_growth + c->budg_dd_growth;
+        outstanding = c->bi.data_growth + c->bi.dd_growth;
        if (unlikely(available < outstanding)) {
                dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
        if (available - outstanding <= c->rp_size && !can_use_rp(c))
                return -ENOSPC;
-        c->min_idx_lebs = min_idx_lebs;
+        c->bi.min_idx_lebs = min_idx_lebs;
        return 0;
 }
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
 {
        int data_growth;
-        data_growth = req->new_ino  ? c->inode_budget : 0;
+        data_growth = req->new_ino  ? c->bi.inode_budget : 0;
        if (req->new_page)
-                data_growth += c->page_budget;
+                data_growth += c->bi.page_budget;
        if (req->new_dent)
-                data_growth += c->dent_budget;
+                data_growth += c->bi.dent_budget;
        data_growth += req->new_ino_d;
        return data_growth;
 }
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
 {
        int dd_growth;
-        dd_growth = req->dirtied_page ? c->page_budget : 0;
+        dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
        if (req->dirtied_ino)
-                dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+                dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
        if (req->mod_dent)
-                dd_growth += c->dent_budget;
+                dd_growth += c->bi.dent_budget;
        dd_growth += req->dirtied_ino_d;
        return dd_growth;
 }
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 again:
        spin_lock(&c->space_lock);
-        ubifs_assert(c->budg_idx_growth >= 0);
+        ubifs_assert(c->bi.idx_growth >= 0);
-        ubifs_assert(c->budg_data_growth >= 0);
+        ubifs_assert(c->bi.data_growth >= 0);
-        ubifs_assert(c->budg_dd_growth >= 0);
+        ubifs_assert(c->bi.dd_growth >= 0);
-        if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+        if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
                dbg_budg("no space");
                spin_unlock(&c->space_lock);
                return -ENOSPC;
        }
-        c->budg_idx_growth += idx_growth;
+        c->bi.idx_growth += idx_growth;
-        c->budg_data_growth += data_growth;
+        c->bi.data_growth += data_growth;
-        c->budg_dd_growth += dd_growth;
+        c->bi.dd_growth += dd_growth;
        err = do_budget_space(c);
        if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
        }
        /* Restore the old values */
-        c->budg_idx_growth -= idx_growth;
+        c->bi.idx_growth -= idx_growth;
-        c->budg_data_growth -= data_growth;
+        c->bi.data_growth -= data_growth;
-        c->budg_dd_growth -= dd_growth;
+        c->bi.dd_growth -= dd_growth;
        spin_unlock(&c->space_lock);
        if (req->fast) {
@@ -506,9 +506,9 @@ again:
                        goto again;
                }
                dbg_budg("FS is full, -ENOSPC");
-                c->nospace = 1;
+                c->bi.nospace = 1;
                if (can_use_rp(c) || c->rp_size == 0)
-                        c->nospace_rp = 1;
+                        c->bi.nospace_rp = 1;
                smp_wmb();
        } else
                ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
 * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
 * since the index changes (which were budgeted for in @req->idx_growth) will
 * only be written to the media on commit, this function moves the index budget
- * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
- * zeroed by the commit operation.
+ * by the commit operation.
 */
 void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
        if (!req->data_growth && !req->dd_growth)
                return;
-        c->nospace = c->nospace_rp = 0;
+        c->bi.nospace = c->bi.nospace_rp = 0;
        smp_wmb();
        spin_lock(&c->space_lock);
-        c->budg_idx_growth -= req->idx_growth;
+        c->bi.idx_growth -= req->idx_growth;
-        c->budg_uncommitted_idx += req->idx_growth;
+        c->bi.uncommitted_idx += req->idx_growth;
-        c->budg_data_growth -= req->data_growth;
+        c->bi.data_growth -= req->data_growth;
-        c->budg_dd_growth -= req->dd_growth;
+        c->bi.dd_growth -= req->dd_growth;
-        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-        ubifs_assert(c->budg_idx_growth >= 0);
+        ubifs_assert(c->bi.idx_growth >= 0);
-        ubifs_assert(c->budg_data_growth >= 0);
+        ubifs_assert(c->bi.data_growth >= 0);
-        ubifs_assert(c->budg_dd_growth >= 0);
+        ubifs_assert(c->bi.dd_growth >= 0);
-        ubifs_assert(c->min_idx_lebs < c->main_lebs);
+        ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
-        ubifs_assert(!(c->budg_idx_growth & 7));
+        ubifs_assert(!(c->bi.idx_growth & 7));
-        ubifs_assert(!(c->budg_data_growth & 7));
+        ubifs_assert(!(c->bi.data_growth & 7));
-        ubifs_assert(!(c->budg_dd_growth & 7));
+        ubifs_assert(!(c->bi.dd_growth & 7));
        spin_unlock(&c->space_lock);
 }
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 {
        spin_lock(&c->space_lock);
        /* Release the index growth reservation */
-        c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+        c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
        /* Release the data growth reservation */
-        c->budg_data_growth -= c->page_budget;
+        c->bi.data_growth -= c->bi.page_budget;
        /* Increase the dirty data growth reservation instead */
-        c->budg_dd_growth += c->page_budget;
+        c->bi.dd_growth += c->bi.page_budget;
        /* And re-calculate the indexing space reservation */
-        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        spin_unlock(&c->space_lock);
 }
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
        memset(&req, 0, sizeof(struct ubifs_budget_req));
        /* The "no space" flags will be cleared because dd_growth is > 0 */
-        req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+        req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
        ubifs_release_budget(c, &req);
 }
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
        int rsvd_idx_lebs, lebs;
        long long available, outstanding, free;
-        ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+        ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-        outstanding = c->budg_data_growth + c->budg_dd_growth;
+        outstanding = c->bi.data_growth + c->bi.dd_growth;
-        available = ubifs_calc_available(c, c->min_idx_lebs);
+        available = ubifs_calc_available(c, c->bi.min_idx_lebs);
        /*
         * When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
         * Note, the calculations below are similar to what we have in
         * 'do_budget_space()', so refer there for comments.
         */
-        if (c->min_idx_lebs > c->lst.idx_lebs)
+        if (c->bi.min_idx_lebs > c->lst.idx_lebs)
-                rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+                rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
        else
                rsvd_idx_lebs = 0;
        lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
        c->mst_node->root_len    = cpu_to_le32(zroot.len);
        c->mst_node->ihead_lnum  = cpu_to_le32(c->ihead_lnum);
        c->mst_node->ihead_offs  = cpu_to_le32(c->ihead_offs);
-        c->mst_node->index_size  = cpu_to_le64(c->old_idx_sz);
+        c->mst_node->index_size  = cpu_to_le64(c->bi.old_idx_sz);
        c->mst_node->lpt_lnum    = cpu_to_le32(c->lpt_lnum);
        c->mst_node->lpt_offs    = cpu_to_le32(c->lpt_offs);
        c->mst_node->nhead_lnum  = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
-#include <linux/slab.h>
 #ifdef CONFIG_UBIFS_FS_DEBUG
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
 static char dbg_key_buf0[128];
 static char dbg_key_buf1[128];
-unsigned int ubifs_msg_flags;
 unsigned int ubifs_chk_flags;
 unsigned int ubifs_tst_flags;
-module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
 MODULE_PARM_DESC(debug_chks, "Debug check flags");
 MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
                printk(KERN_DEBUG "\tflags          %#x\n", sup_flags);
                printk(KERN_DEBUG "\t  big_lpt      %u\n",
                       !!(sup_flags & UBIFS_FLG_BIGLPT));
+                printk(KERN_DEBUG "\t  space_fixup  %u\n",
+                       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
                printk(KERN_DEBUG "\tmin_io_size    %u\n",
                       le32_to_cpu(sup->min_io_size));
                printk(KERN_DEBUG "\tleb_size       %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
        spin_unlock(&dbg_lock);
 }
-void dbg_dump_budg(struct ubifs_info *c)
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
 {
        int i;
        struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
        struct ubifs_gced_idx_leb *idx_gc;
        long long available, outstanding, free;
-        ubifs_assert(spin_is_locked(&c->space_lock));
+        spin_lock(&c->space_lock);
        spin_lock(&dbg_lock);
-        printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
+        printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
-               "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
+               "total budget sum %lld\n", current->pid,
-               c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
+               bi->data_growth + bi->dd_growth,
-        printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
+               bi->data_growth + bi->dd_growth + bi->idx_growth);
-               "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
+        printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
-               c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
+               "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
-               c->freeable_cnt);
+               bi->idx_growth);
-        printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
+        printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
-               "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
+               "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
-               c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+               bi->uncommitted_idx);
+        printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+               bi->page_budget, bi->inode_budget, bi->dent_budget);
+        printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+               bi->nospace, bi->nospace_rp);
+        printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+               c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+        if (bi != &c->bi)
+                /*
+                 * If we are dumping saved budgeting data, do not print
+                 * additional information which is about the current state, not
+                 * the old one which corresponded to the saved budgeting data.
+                 */
+                goto out_unlock;
+        printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+               c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
        printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
               "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
               atomic_long_read(&c->dirty_zn_cnt),
               atomic_long_read(&c->clean_zn_cnt));
-        printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
-               c->dark_wm, c->dead_wm, c->max_idx_node_sz);
        printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
               c->gc_lnum, c->ihead_lnum);
        /* If we are in R/O mode, journal heads do not exist */
        if (c->jheads)
                for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
        printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
        /* Print budgeting predictions */
-        available = ubifs_calc_available(c, c->min_idx_lebs);
+        available = ubifs_calc_available(c, c->bi.min_idx_lebs);
-        outstanding = c->budg_data_growth + c->budg_dd_growth;
+        outstanding = c->bi.data_growth + c->bi.dd_growth;
        free = ubifs_get_free_space_nolock(c);
        printk(KERN_DEBUG "Budgeting predictions:\n");
        printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
               available, outstanding, free);
+out_unlock:
        spin_unlock(&dbg_lock);
+        spin_unlock(&c->space_lock);
 }
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
                if (bud->lnum == lp->lnum) {
                        int head = 0;
                        for (i = 0; i < c->jhead_cnt; i++) {
-                                if (lp->lnum == c->jheads[i].wbuf.lnum) {
+                                /*
+                                 * Note, if we are in R/O mode or in the middle
+                                 * of mounting/re-mounting, the write-buffers do
+                                 * not exist.
+                                 */
+                                if (c->jheads &&
+                                    lp->lnum == c->jheads[i].wbuf.lnum) {
                                        printk(KERN_CONT ", jhead %s",
                                               dbg_jhead(i));
                                        head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
        spin_lock(&c->space_lock);
        memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+        memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+        d->saved_idx_gc_cnt = c->idx_gc_cnt;
        /*
         * We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
 out:
        ubifs_msg("saved lprops statistics dump");
        dbg_dump_lstats(&d->saved_lst);
-        ubifs_get_lp_stats(c, &lst);
+        ubifs_msg("saved budgeting info dump");
+        dbg_dump_budg(c, &d->saved_bi);
+        ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
        ubifs_msg("current lprops statistics dump");
+        ubifs_get_lp_stats(c, &lst);
        dbg_dump_lstats(&lst);
+        ubifs_msg("current budgeting info dump");
-        spin_lock(&c->space_lock);
+        dbg_dump_budg(c, &c->bi);
-        dbg_dump_budg(c);
-        spin_unlock(&c->space_lock);
        dump_stack();
        return -EINVAL;
 }
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
        struct rb_node **p, *parent = NULL;
        struct fsck_inode *fscki;
        ino_t inum = key_inum_flash(c, &ino->key);
+        struct inode *inode;
+        struct ubifs_inode *ui;
        p = &fsckd->inodes.rb_node;
        while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
        if (!fscki)
                return ERR_PTR(-ENOMEM);
+        inode = ilookup(c->vfs_sb, inum);
        fscki->inum = inum;
-        fscki->nlink = le32_to_cpu(ino->nlink);
+        /*
-        fscki->size = le64_to_cpu(ino->size);
+         * If the inode is present in the VFS inode cache, use it instead of
-        fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+         * the on-flash inode which might be out-of-date. E.g., the size might
-        fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+         * be out-of-date. If we do not do this, the following may happen, for
-        fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+         * example:
-        fscki->mode = le32_to_cpu(ino->mode);
+         *   1. A power cut happens
+         *   2. We mount the file-system R/O, the replay process fixes up the
+         *      inode size in the VFS cache, but not on-flash.
+         *   3. 'check_leaf()' fails because it hits a data node beyond inode
+         *      size.
+         */
+        if (!inode) {
+                fscki->nlink = le32_to_cpu(ino->nlink);
+                fscki->size = le64_to_cpu(ino->size);
+                fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+                fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+                fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+                fscki->mode = le32_to_cpu(ino->mode);
+        } else {
+                ui = ubifs_inode(inode);
+                fscki->nlink = inode->i_nlink;
+                fscki->size = inode->i_size;
+                fscki->xattr_cnt = ui->xattr_cnt;
+                fscki->xattr_sz = ui->xattr_size;
+                fscki->xattr_nms = ui->xattr_names;
+                fscki->mode = inode->i_mode;
+                iput(inode);
+        }
        if (S_ISDIR(fscki->mode)) {
                fscki->calc_sz = UBIFS_INO_NODE_SZ;
                fscki->calc_cnt = 2;
        }
        rb_link_node(&fscki->rb, parent, p);
        rb_insert_color(&fscki->rb, &fsckd->inodes);
        return fscki;
 }
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
                hashb = key_block(c, &sb->key);
                if (hasha > hashb) {
-                        ubifs_err("larger hash %u goes before %u", hasha, hashb);
+                        ubifs_err("larger hash %u goes before %u",
+                                  hasha, hashb);
                        goto error_dump;
                }
        }
@@ -2437,14 +2491,12 @@ error_dump:
        return 0;
 }
-static int invocation_cnt;
 int dbg_force_in_the_gaps(void)
 {
-        if (!dbg_force_in_the_gaps_enabled)
+        if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
                return 0;
-        /* Force in-the-gaps every 8th commit */
-        return !((invocation_cnt++) & 0x7);
+        return !(random32() & 7);
 }
 /* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
                 int len, int check)
 {
        if (in_failure_mode(desc))
-                return -EIO;
+                return -EROFS;
        return ubi_leb_read(desc, lnum, buf, offset, len, check);
 }
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
        int err, failing;
        if (in_failure_mode(desc))
-                return -EIO;
+                return -EROFS;
        failing = do_fail(desc, lnum, 1);
        if (failing)
                cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
        if (err)
                return err;
        if (failing)
-                return -EIO;
+                return -EROFS;
        return 0;
 }
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
        int err;
        if (do_fail(desc, lnum, 1))
-                return -EIO;
+                return -EROFS;
        err = ubi_leb_change(desc, lnum, buf, len, dtype);
        if (err)
                return err;
        if (do_fail(desc, lnum, 1))
-                return -EIO;
+                return -EROFS;
        return 0;
 }
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
        int err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        err = ubi_leb_erase(desc, lnum);
        if (err)
                return err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        return 0;
 }
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
        int err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        err = ubi_leb_unmap(desc, lnum);
        if (err)
                return err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        return 0;
 }
 int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
 {
        if (in_failure_mode(desc))
-                return -EIO;
+                return -EROFS;
        return ubi_is_mapped(desc, lnum);
 }
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
        int err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        err = ubi_leb_map(desc, lnum, dtype);
        if (err)
                return err;
        if (do_fail(desc, lnum, 0))
-                return -EIO;
+                return -EROFS;
        return 0;
 }
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
 static int open_debugfs_file(struct inode *inode, struct file *file)
 {
        file->private_data = inode->i_private;
-        return 0;
+        return nonseekable_open(inode, file);
 }
 static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
        if (file->f_path.dentry == d->dfs_dump_lprops)
                dbg_dump_lprops(c);
-        else if (file->f_path.dentry == d->dfs_dump_budg) {
+        else if (file->f_path.dentry == d->dfs_dump_budg)
-                spin_lock(&c->space_lock);
+                dbg_dump_budg(c, &c->bi);
-                dbg_dump_budg(c);
+        else if (file->f_path.dentry == d->dfs_dump_tnc) {
-                spin_unlock(&c->space_lock);
-        } else if (file->f_path.dentry == d->dfs_dump_tnc) {
                mutex_lock(&c->tnc_mutex);
                dbg_dump_tnc(c);
                mutex_unlock(&c->tnc_mutex);
        } else
                return -EINVAL;
-        *ppos += count;
        return count;
 }
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
        .open = open_debugfs_file,
        .write = write_debugfs_file,
        .owner = THIS_MODULE,
-        .llseek = default_llseek,
+        .llseek = no_llseek,
 };
 /**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 #ifdef CONFIG_UBIFS_FS_DEBUG
+#include <linux/random.h>
 /**
 * ubifs_debug_info - per-FS debugging information.
 * @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 * @new_ihead_offs: used by debugging to check @c->ihead_offs
 *
 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
- * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ * @saved_bi: saved budgeting information
+ * @saved_free: saved amount of free space
+ * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
 *
- * dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
- * dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dir: direntry object of the file-system debugfs directory
- * dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
- * dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
- * dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
 */
 struct ubifs_debug_info {
        struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
        int new_ihead_offs;
        struct ubifs_lp_stats saved_lst;
+        struct ubifs_budg_info saved_bi;
        long long saved_free;
+        int saved_idx_gc_cnt;
        char dfs_dir_name[100];
        struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
        }                                                                      \
 } while (0)
-#define dbg_dump_stack() do {                                                  \
+#define dbg_dump_stack() dump_stack()
-        if (!dbg_failure_mode)                                                 \
-                dump_stack();                                                  \
-} while (0)
-/* Generic debugging messages */
-#define dbg_msg(fmt, ...) do {                                                 \
-        spin_lock(&dbg_lock);                                                  \
-        printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid,   \
-               __func__, ##__VA_ARGS__);                                       \
-        spin_unlock(&dbg_lock);                                                \
-} while (0)
-#define dbg_do_msg(typ, fmt, ...) do {                                         \
-        if (ubifs_msg_flags & typ)                                             \
-                dbg_msg(fmt, ##__VA_ARGS__);                                   \
-} while (0)
 #define dbg_err(fmt, ...) do {                                                 \
        spin_lock(&dbg_lock);                                                  \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
 #define DBGKEY(key) dbg_key_str0(c, (key))
 #define DBGKEY1(key) dbg_key_str1(c, (key))
-/* General messages */
+#define ubifs_dbg_msg(type, fmt, ...) do {                        \
-#define dbg_gen(fmt, ...)   dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+        spin_lock(&dbg_lock);                                     \
+        pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
+        spin_unlock(&dbg_lock);                                   \
+} while (0)
+/* Just a debugging message not related to any specific UBIFS subsystem */
+#define dbg_msg(fmt, ...)   ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+/* General messages */
+#define dbg_gen(fmt, ...)   ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
 /* Additional journal messages */
-#define dbg_jnl(fmt, ...)   dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
 /* Additional TNC messages */
-#define dbg_tnc(fmt, ...)   dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
 /* Additional lprops messages */
-#define dbg_lp(fmt, ...)    dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
 /* Additional LEB find messages */
-#define dbg_find(fmt, ...)  dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
 /* Additional mount messages */
-#define dbg_mnt(fmt, ...)   dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
 /* Additional I/O messages */
-#define dbg_io(fmt, ...)    dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
 /* Additional commit messages */
-#define dbg_cmt(fmt, ...)   dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
 /* Additional budgeting messages */
-#define dbg_budg(fmt, ...)  dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
 /* Additional log messages */
-#define dbg_log(fmt, ...)   dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
 /* Additional gc messages */
-#define dbg_gc(fmt, ...)    dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
 /* Additional scan messages */
-#define dbg_scan(fmt, ...)  dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
 /* Additional recovery messages */
-#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
-/*
- * Debugging message type flags.
- *
- * UBIFS_MSG_GEN: general messages
- * UBIFS_MSG_JNL: journal messages
- * UBIFS_MSG_MNT: mount messages
- * UBIFS_MSG_CMT: commit messages
- * UBIFS_MSG_FIND: LEB find messages
- * UBIFS_MSG_BUDG: budgeting messages
- * UBIFS_MSG_GC: garbage collection messages
- * UBIFS_MSG_TNC: TNC messages
- * UBIFS_MSG_LP: lprops messages
- * UBIFS_MSG_IO: I/O messages
- * UBIFS_MSG_LOG: log messages
- * UBIFS_MSG_SCAN: scan messages
- * UBIFS_MSG_RCVRY: recovery messages
- */
-enum {
-        UBIFS_MSG_GEN   = 0x1,
-        UBIFS_MSG_JNL   = 0x2,
-        UBIFS_MSG_MNT   = 0x4,
-        UBIFS_MSG_CMT   = 0x8,
-        UBIFS_MSG_FIND  = 0x10,
-        UBIFS_MSG_BUDG  = 0x20,
-        UBIFS_MSG_GC    = 0x40,
-        UBIFS_MSG_TNC   = 0x80,
-        UBIFS_MSG_LP    = 0x100,
-        UBIFS_MSG_IO    = 0x200,
-        UBIFS_MSG_LOG   = 0x400,
-        UBIFS_MSG_SCAN  = 0x800,
-        UBIFS_MSG_RCVRY = 0x1000,
-};
 /*
 * Debugging check flags.
@@ -233,11 +186,9 @@ enum {
 /*
 * Special testing flags.
 *
- * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
 * UBIFS_TST_RCVRY: failure mode for recovery testing
 */
 enum {
-        UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
        UBIFS_TST_RCVRY             = 0x4,
 };
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
                       int offs);
 void dbg_dump_budget_req(const struct ubifs_budget_req *req);
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
 void dbg_dump_lprops(struct ubifs_info *c);
 void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
 int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 /* Force the use of in-the-gaps method for testing */
+static inline int dbg_force_in_the_gaps_enabled(void)
-#define dbg_force_in_the_gaps_enabled \
+{
-        (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
+        return ubifs_chk_flags & UBIFS_CHK_GEN;
+}
 int dbg_force_in_the_gaps(void);
 /* Failure mode for recovery testing */
 #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
 #ifndef UBIFS_DBG_PRESERVE_UBI
 #define ubi_leb_read   dbg_leb_read
 #define ubi_leb_write  dbg_leb_write
 #define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
 #define ubi_leb_unmap  dbg_leb_unmap
 #define ubi_is_mapped  dbg_is_mapped
 #define ubi_leb_map    dbg_leb_map
 #endif
 int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
                       __func__, __LINE__, current->pid);                      \
 } while (0)
-#define dbg_err(fmt, ...)   do {                                               \
+#define dbg_err(fmt, ...)   do {                   \
-        if (0)                                                                 \
+        if (0)                                     \
-                ubifs_err(fmt, ##__VA_ARGS__);                                 \
+                ubifs_err(fmt, ##__VA_ARGS__);     \
 } while (0)
-#define dbg_msg(fmt, ...) do {                                                 \
+#define ubifs_dbg_msg(fmt, ...) do {               \
-        if (0)                                                                 \
+        if (0)                                     \
-                printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n",         \
+                pr_debug(fmt "\n", ##__VA_ARGS__); \
-                       current->pid, __func__, ##__VA_ARGS__);                 \
 } while (0)
 #define dbg_dump_stack()
 #define ubifs_assert_cmt_locked(c)
-#define dbg_gen(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
 dbg_dump_budget_req(const struct ubifs_budget_req *req)           { return; }
 static inline void
 dbg_dump_lstats(const struct ubifs_lp_stats *lst)                 { return; }
-static inline void dbg_dump_budg(struct ubifs_info *c)            { return; }
+static inline void
+dbg_dump_budg(struct ubifs_info *c,
+              const struct ubifs_budg_info *bi)                   { return; }
 static inline void dbg_dump_lprop(const struct ubifs_info *c,
                                  const struct ubifs_lprops *lp)  { return; }
 static inline void dbg_dump_lprops(struct ubifs_info *c)          { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
                              struct list_head *head)             { return 0; }
 static inline int dbg_force_in_the_gaps(void)                     { return 0; }
-#define dbg_force_in_the_gaps_enabled 0
+#define dbg_force_in_the_gaps_enabled() 0
-#define dbg_failure_mode              0
+#define dbg_failure_mode                0
 static inline int dbg_debugfs_init(void)                          { return 0; }
 static inline void dbg_debugfs_exit(void)                         { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
                ubifs_release_budget(c, &req);
        else {
                /* We've deleted something - clean the "no space" flags */
-                c->nospace = c->nospace_rp = 0;
+                c->bi.nospace = c->bi.nospace_rp = 0;
                smp_wmb();
        }
        return 0;
@@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
                ubifs_release_budget(c, &req);
        else {
                /* We've deleted something - clean the "no space" flags */
-                c->nospace = c->nospace_rp = 0;
+                c->bi.nospace = c->bi.nospace_rp = 0;
                smp_wmb();
        }
        return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
 */
 static void release_existing_page_budget(struct ubifs_info *c)
 {
-        struct ubifs_budget_req req = { .dd_growth = c->page_budget};
+        struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
        ubifs_release_budget(c, &req);
 }
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
 * the page locked, and it locks @ui_mutex. However, write-back does take inode
 * @i_mutex, which means other VFS operations may be run on this inode at the
 * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size,
- * drops the truncated pages. And while dropping the pages, it takes the page
+ * then drops the truncated pages. And while dropping the pages, it takes the
- * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
+ * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
- * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
+ * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
- * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ * This means that @inode->i_size is changed while @ui_mutex is unlocked.
 *
 * XXX(truncate): with the new truncate sequence this is not true anymore,
 * and the calls to truncate_setsize can be move around freely.  They should
@@ -1189,7 +1189,7 @@ out_budg:
        if (budgeted)
                ubifs_release_budget(c, &req);
        else {
-                c->nospace = c->nospace_rp = 0;
+                c->bi.nospace = c->bi.nospace_rp = 0;
                smp_wmb();
        }
        return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
        dbg_gen("syncing inode %lu", inode->i_ino);
-        if (inode->i_sb->s_flags & MS_RDONLY)
+        if (c->ro_mount)
+                /*
+                 * For some really strange reasons VFS does not filter out
+                 * 'fsync()' for R/O mounted file-systems as per 2.6.39.
+                 */
                return 0;
        /*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 }
 /*
- * mmap()d file has taken write protection fault and is being made
+ * mmap()d file has taken write protection fault and is being made writable.
- * writable. UBIFS must ensure page is budgeted for.
+ * UBIFS must ensure page is budgeted for.
 */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
+                                 struct vm_fault *vmf)
 {
        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        int err;
-        /* 'generic_file_mmap()' takes care of NOMMU case */
        err = generic_file_mmap(file, vma);
        if (err)
                return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                 * But if the index takes fewer LEBs than it is reserved for it,
                 * this function must avoid picking those reserved LEBs.
                 */
-                if (c->min_idx_lebs >= c->lst.idx_lebs) {
+                if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
-                        rsvd_idx_lebs = c->min_idx_lebs -  c->lst.idx_lebs;
+                        rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
                        exclude_index = 1;
                }
                spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                        pick_free = 0;
        } else {
                spin_lock(&c->space_lock);
-                exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+                exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
                spin_unlock(&c->space_lock);
        }
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
        /* Check if there are enough empty LEBs for commit */
        spin_lock(&c->space_lock);
-        if (c->min_idx_lebs > c->lst.idx_lebs)
+        if (c->bi.min_idx_lebs > c->lst.idx_lebs)
-                rsvd_idx_lebs = c->min_idx_lebs -  c->lst.idx_lebs;
+                rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
        else
                rsvd_idx_lebs = 0;
        lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
        if (err)
                return err;
+        err = ubifs_wbuf_sync_nolock(wbuf);
+        if (err)
+                return err;
        err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
        if (err)
                return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
 * This function compares data nodes @a and @b. Returns %1 if @a has greater
 * inode or block number, and %-1 otherwise.
 */
-int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
        ino_t inuma, inumb;
        struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 * first and sorted by length in descending order. Directory entry nodes go
 * after inode nodes and are sorted in ascending hash valuer order.
 */
-int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int nondata_nodes_cmp(void *priv, struct list_head *a,
+                             struct list_head *b)
 {
        ino_t inuma, inumb;
        struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
        ubifs_assert(c->gc_lnum != lnum);
        ubifs_assert(wbuf->lnum != lnum);
+        if (lp->free + lp->dirty == c->leb_size) {
+                /* Special case - a free LEB  */
+                dbg_gc("LEB %d is free, return it", lp->lnum);
+                ubifs_assert(!(lp->flags & LPROPS_INDEX));
+                if (lp->free != c->leb_size) {
+                        /*
+                         * Write buffers must be sync'd before unmapping
+                         * freeable LEBs, because one of them may contain data
+                         * which obsoletes something in 'lp->lnum'.
+                         */
+                        err = gc_sync_wbufs(c);
+                        if (err)
+                                return err;
+                        err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
+                                                  0, 0, 0, 0);
+                        if (err)
+                                return err;
+                }
+                err = ubifs_leb_unmap(c, lp->lnum);
+                if (err)
+                        return err;
+                if (c->gc_lnum == -1) {
+                        c->gc_lnum = lnum;
+                        return LEB_RETAINED;
+                }
+                return LEB_FREED;
+        }
        /*
         * We scan the entire LEB even though we only really need to scan up to
         * (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
                       "(min. space %d)", lp.lnum, lp.free, lp.dirty,
                       lp.free + lp.dirty, min_space);
-                if (lp.free + lp.dirty == c->leb_size) {
-                        /* An empty LEB was returned */
-                        dbg_gc("LEB %d is free, return it", lp.lnum);
-                        /*
-                         * ubifs_find_dirty_leb() doesn't return freeable index
-                         * LEBs.
-                         */
-                        ubifs_assert(!(lp.flags & LPROPS_INDEX));
-                        if (lp.free != c->leb_size) {
-                                /*
-                                 * Write buffers must be sync'd before
-                                 * unmapping freeable LEBs, because one of them
-                                 * may contain data which obsoletes something
-                                 * in 'lp.pnum'.
-                                 */
-                                ret = gc_sync_wbufs(c);
-                                if (ret)
-                                        goto out;
-                                ret = ubifs_change_one_lp(c, lp.lnum,
-                                                          c->leb_size, 0, 0, 0,
-                                                          0);
-                                if (ret)
-                                        goto out;
-                        }
-                        ret = ubifs_leb_unmap(c, lp.lnum);
-                        if (ret)
-                                goto out;
-                        ret = lp.lnum;
-                        break;
-                }
                space_before = c->leb_size - wbuf->offs - wbuf->used;
                if (wbuf->lnum == -1)
                        space_before = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
        ubifs_assert(wbuf->size % c->min_io_size == 0);
        ubifs_assert(!c->ro_media && !c->ro_mount);
        if (c->leb_size - wbuf->offs >= c->max_write_size)
-                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
        if (c->ro_error)
                return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 * @dtype: data type
 *
 * This function targets the write-buffer to logical eraseblock @lnum:@offs.
- * The write-buffer is synchronized if it is not empty. Returns zero in case of
+ * The write-buffer has to be empty. Returns zero in case of success and a
- * success and a negative error code in case of failure.
+ * negative error code in case of failure.
 */
 int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
                           int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
        ubifs_assert(offs >= 0 && offs <= c->leb_size);
        ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
        ubifs_assert(lnum != wbuf->lnum);
+        ubifs_assert(wbuf->used == 0);
-        if (wbuf->used > 0) {
-                int err = ubifs_wbuf_sync_nolock(wbuf);
-                if (err)
-                        return err;
-        }
        spin_lock(&wbuf->lock);
        wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
 int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 {
        struct ubifs_info *c = wbuf->c;
-        int err, written, n, aligned_len = ALIGN(len, 8), offs;
+        int err, written, n, aligned_len = ALIGN(len, 8);
        dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
               dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
        ubifs_assert(!c->ro_media && !c->ro_mount);
        if (c->leb_size - wbuf->offs >= c->max_write_size)
-                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
        if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
                err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
                goto exit;
        }
-        offs = wbuf->offs;
        written = 0;
        if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
                if (err)
                        goto out;
-                offs += wbuf->size;
+                wbuf->offs += wbuf->size;
                len -= wbuf->avail;
                aligned_len -= wbuf->avail;
                written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
                if (err)
                        goto out;
-                offs += wbuf->size;
+                wbuf->offs += wbuf->size;
                len -= wbuf->size;
                aligned_len -= wbuf->size;
                written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        n = aligned_len >> c->max_write_shift;
        if (n) {
                n <<= c->max_write_shift;
-                dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
+                dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
-                err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
+                       wbuf->offs);
-                                    wbuf->dtype);
+                err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
+                                    wbuf->offs, n, wbuf->dtype);
                if (err)
                        goto out;
-                offs += n;
+                wbuf->offs += n;
                aligned_len -= n;
                len -= n;
                written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
                 */
                memcpy(wbuf->buf, buf + written, len);
-        wbuf->offs = offs;
        if (c->leb_size - wbuf->offs >= c->max_write_size)
                wbuf->size = c->max_write_size;
        else
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
         * LEB with some empty space.
         */
        lnum = ubifs_find_free_space(c, len, &offs, squeeze);
-        if (lnum >= 0) {
+        if (lnum >= 0)
-                /* Found an LEB, add it to the journal head */
-                err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
-                if (err)
-                        goto out_return;
-                /* A new bud was successfully allocated and added to the log */
                goto out;
-        }
        err = lnum;
        if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
                return 0;
        }
-        err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
-        if (err)
-                goto out_return;
        offs = 0;
 out:
+        /*
+         * Make sure we synchronize the write-buffer before we add the new bud
+         * to the log. Otherwise we may have a power cut after the log
+         * reference node for the last bud (@lnum) is written but before the
+         * write-buffer data are written to the next-to-last bud
+         * (@wbuf->lnum). And the effect would be that the recovery would see
+         * that there is corruption in the next-to-last bud.
+         */
+        err = ubifs_wbuf_sync_nolock(wbuf);
+        if (err)
+                goto out_return;
+        err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+        if (err)
+                goto out_return;
        err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
        if (err)
                goto out_unlock;
@@ -380,10 +385,8 @@ out:
        if (err == -ENOSPC) {
                /* This are some budgeting problems, print useful information */
                down_write(&c->commit_sem);
-                spin_lock(&c->space_lock);
                dbg_dump_stack();
-                dbg_dump_budg(c);
+                dbg_dump_budg(c, &c->bi);
-                spin_unlock(&c->space_lock);
                dbg_dump_lprops(c);
                cmt_retries = dbg_check_lprops(c);
                up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
 }
 /**
- * next_log_lnum - switch to the next log LEB.
- * @c: UBIFS file-system description object
- * @lnum: current log LEB
- */
-static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
-{
-        lnum += 1;
-        if (lnum > c->log_last)
-                lnum = UBIFS_LOG_LNUM;
-        return lnum;
-}
-/**
 * empty_log_bytes - calculate amount of empty space in the log.
 * @c: UBIFS file-system description object
 */
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
        ref->jhead = cpu_to_le32(jhead);
        if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
-                c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+                c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
                c->lhead_offs = 0;
        }
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
        /* Switch to the next log LEB */
        if (c->lhead_offs) {
-                c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+                c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
                c->lhead_offs = 0;
        }
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
        c->lhead_offs += len;
        if (c->lhead_offs == c->leb_size) {
-                c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+                c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
                c->lhead_offs = 0;
        }
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
        }
        mutex_lock(&c->log_mutex);
        for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
-             lnum = next_log_lnum(c, lnum)) {
+             lnum = ubifs_next_log_lnum(c, lnum)) {
                dbg_log("unmap log LEB %d", lnum);
                err = ubifs_leb_unmap(c, lnum);
                if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
                err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
                if (err)
                        return err;
-                *lnum = next_log_lnum(c, *lnum);
+                *lnum = ubifs_next_log_lnum(c, *lnum);
                *offs = 0;
        }
        memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
                ubifs_scan_destroy(sleb);
                if (lnum == c->lhead_lnum)
                        break;
-                lnum = next_log_lnum(c, lnum);
+                lnum = ubifs_next_log_lnum(c, lnum);
        }
        if (offs) {
                int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
        /* Unmap remaining LEBs */
        lnum = write_lnum;
        do {
-                lnum = next_log_lnum(c, lnum);
+                lnum = ubifs_next_log_lnum(c, lnum);
                err = ubifs_leb_unmap(c, lnum);
                if (err)
                        return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
 }
 /**
- * struct scan_check_data - data provided to scan callback function.
- * @lst: LEB properties statistics
- * @err: error code
- */
-struct scan_check_data {
-        struct ubifs_lp_stats lst;
-        int err;
-};
-/**
 * scan_check_cb - scan callback.
 * @c: the UBIFS file-system description object
 * @lp: LEB properties to scan
 * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @lst: lprops statistics to update
 *
 * This function returns a code that indicates whether the scan should continue
 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
 */
 static int scan_check_cb(struct ubifs_info *c,
                         const struct ubifs_lprops *lp, int in_tree,
-                         struct scan_check_data *data)
+                         struct ubifs_lp_stats *lst)
 {
        struct ubifs_scan_leb *sleb;
        struct ubifs_scan_node *snod;
-        struct ubifs_lp_stats *lst = &data->lst;
        int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
        void *buf = NULL;
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
                if (cat != (lp->flags & LPROPS_CAT_MASK)) {
                        ubifs_err("bad LEB category %d expected %d",
                                  (lp->flags & LPROPS_CAT_MASK), cat);
-                        goto out;
+                        return -EINVAL;
                }
        }
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
                        }
                        if (!found) {
                                ubifs_err("bad LPT list (category %d)", cat);
-                                goto out;
+                                return -EINVAL;
                        }
                }
        }
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
                if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
                    lp != heap->arr[lp->hpos]) {
                        ubifs_err("bad LPT heap (category %d)", cat);
-                        goto out;
+                        return -EINVAL;
                }
        }
        buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
-        if (!buf) {
+        if (!buf)
-                ubifs_err("cannot allocate memory to scan LEB %d", lnum);
+                return -ENOMEM;
-                goto out;
+        /*
+         * After an unclean unmount, empty and freeable LEBs
+         * may contain garbage - do not scan them.
+         */
+        if (lp->free == c->leb_size) {
+                lst->empty_lebs += 1;
+                lst->total_free += c->leb_size;
+                lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+                return LPT_SCAN_CONTINUE;
+        }
+        if (lp->free + lp->dirty == c->leb_size &&
+            !(lp->flags & LPROPS_INDEX)) {
+                lst->total_free  += lp->free;
+                lst->total_dirty += lp->dirty;
+                lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
+                return LPT_SCAN_CONTINUE;
        }
        sleb = ubifs_scan(c, lnum, 0, buf, 0);
        if (IS_ERR(sleb)) {
-                /*
+                ret = PTR_ERR(sleb);
-                 * After an unclean unmount, empty and freeable LEBs
+                if (ret == -EUCLEAN) {
-                 * may contain garbage.
+                        dbg_dump_lprops(c);
-                 */
+                        dbg_dump_budg(c, &c->bi);
-                if (lp->free == c->leb_size) {
-                        ubifs_err("scan errors were in empty LEB "
-                                  "- continuing checking");
-                        lst->empty_lebs += 1;
-                        lst->total_free += c->leb_size;
-                        lst->total_dark += ubifs_calc_dark(c, c->leb_size);
-                        ret = LPT_SCAN_CONTINUE;
-                        goto exit;
-                }
-                if (lp->free + lp->dirty == c->leb_size &&
-                    !(lp->flags & LPROPS_INDEX)) {
-                        ubifs_err("scan errors were in freeable LEB "
-                                  "- continuing checking");
-                        lst->total_free  += lp->free;
-                        lst->total_dirty += lp->dirty;
-                        lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
-                        ret = LPT_SCAN_CONTINUE;
-                        goto exit;
                }
-                data->err = PTR_ERR(sleb);
+                goto out;
-                ret = LPT_SCAN_STOP;
-                goto exit;
        }
        is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
        }
        ubifs_scan_destroy(sleb);
-        ret = LPT_SCAN_CONTINUE;
-exit:
        vfree(buf);
-        return ret;
+        return LPT_SCAN_CONTINUE;
 out_print:
        ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
        dbg_dump_leb(c, lnum);
 out_destroy:
        ubifs_scan_destroy(sleb);
+        ret = -EINVAL;
 out:
        vfree(buf);
-        data->err = -EINVAL;
+        return ret;
-        return LPT_SCAN_STOP;
 }
 /**
@@ -1278,8 +1260,7 @@ out:
 int dbg_check_lprops(struct ubifs_info *c)
 {
        int i, err;
-        struct scan_check_data data;
+        struct ubifs_lp_stats lst;
-        struct ubifs_lp_stats *lst = &data.lst;
        if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
                return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
                        return err;
        }
-        memset(lst, 0, sizeof(struct ubifs_lp_stats));
+        memset(&lst, 0, sizeof(struct ubifs_lp_stats));
-        data.err = 0;
        err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
                                    (ubifs_lpt_scan_callback)scan_check_cb,
-                                    &data);
+                                    &lst);
        if (err && err != -ENOSPC)
                goto out;
-        if (data.err) {
-                err = data.err;
-                goto out;
-        }
-        if (lst->empty_lebs != c->lst.empty_lebs ||
+        if (lst.empty_lebs != c->lst.empty_lebs ||
-            lst->idx_lebs != c->lst.idx_lebs ||
+            lst.idx_lebs != c->lst.idx_lebs ||
-            lst->total_free != c->lst.total_free ||
+            lst.total_free != c->lst.total_free ||
-            lst->total_dirty != c->lst.total_dirty ||
+            lst.total_dirty != c->lst.total_dirty ||
-            lst->total_used != c->lst.total_used) {
+            lst.total_used != c->lst.total_used) {
                ubifs_err("bad overall accounting");
                ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
                          "total_free %lld, total_dirty %lld, total_used %lld",
-                          lst->empty_lebs, lst->idx_lebs, lst->total_free,
+                          lst.empty_lebs, lst.idx_lebs, lst.total_free,
-                          lst->total_dirty, lst->total_used);
+                          lst.total_dirty, lst.total_used);
                ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
                          "total_free %lld, total_dirty %lld, total_used %lld",
                          c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
                goto out;
        }
-        if (lst->total_dead != c->lst.total_dead ||
+        if (lst.total_dead != c->lst.total_dead ||
-            lst->total_dark != c->lst.total_dark) {
+            lst.total_dark != c->lst.total_dark) {
                ubifs_err("bad dead/dark space accounting");
                ubifs_err("calculated: total_dead %lld, total_dark %lld",
-                          lst->total_dead, lst->total_dark);
+                          lst.total_dead, lst.total_dark);
                ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
                          c->lst.total_dead, c->lst.total_dark);
                err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
 #include <linux/slab.h>
 #include "ubifs.h"
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_populate_lsave(struct ubifs_info *c);
+#else
+#define dbg_populate_lsave(c) 0
+#endif
 /**
 * first_dirty_cnode - find first dirty cnode.
 * @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
                        if (nnode->nbranch[iip].lnum)
                                break;
                }
-       } while (iip >= UBIFS_LPT_FANOUT);
+        } while (iip >= UBIFS_LPT_FANOUT);
        /* Go right */
        nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
                c->lpt_drty_flgs |= LSAVE_DIRTY;
                ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
        }
+        if (dbg_populate_lsave(c))
+                return;
        list_for_each_entry(lprops, &c->empty_list, list) {
                c->lsave[cnt++] = lprops->lnum;
                if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
               current->pid);
 }
+/**
+ * dbg_populate_lsave - debugging version of 'populate_lsave()'
+ * @c: UBIFS file-system description object
+ *
+ * This is a debugging version for 'populate_lsave()' which populates lsave
+ * with random LEBs instead of useful LEBs, which is good for test coverage.
+ * Returns zero if lsave has not been populated (this debugging feature is
+ * disabled) and non-zero if lsave has been populated.
+ */
+static int dbg_populate_lsave(struct ubifs_info *c)
+{
+        struct ubifs_lprops *lprops;
+        struct ubifs_lpt_heap *heap;
+        int i;
+        if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+                return 0;
+        if (random32() & 3)
+                return 0;
+        for (i = 0; i < c->lsave_cnt; i++)
+                c->lsave[i] = c->main_first;
+        list_for_each_entry(lprops, &c->empty_list, list)
+                c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+        list_for_each_entry(lprops, &c->freeable_list, list)
+                c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+        list_for_each_entry(lprops, &c->frdi_idx_list, list)
+                c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+        heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+        for (i = 0; i < heap->cnt; i++)
+                c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+        heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+        for (i = 0; i < heap->cnt; i++)
+                c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+        heap = &c->lpt_heap[LPROPS_FREE - 1];
+        for (i = 0; i < heap->cnt; i++)
+                c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+        return 1;
+}
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
        }
        main_sz = (long long)c->main_lebs * c->leb_size;
-        if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+        if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
                err = 9;
                goto out;
        }
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
        }
        if (c->lst.total_dead + c->lst.total_dark +
-            c->lst.total_used + c->old_idx_sz > main_sz) {
+            c->lst.total_used + c->bi.old_idx_sz > main_sz) {
                err = 21;
                goto out;
        }
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
        c->gc_lnum         = le32_to_cpu(c->mst_node->gc_lnum);
        c->ihead_lnum      = le32_to_cpu(c->mst_node->ihead_lnum);
        c->ihead_offs      = le32_to_cpu(c->mst_node->ihead_offs);
-        c->old_idx_sz      = le64_to_cpu(c->mst_node->index_size);
+        c->bi.old_idx_sz   = le64_to_cpu(c->mst_node->index_size);
        c->lpt_lnum        = le32_to_cpu(c->mst_node->lpt_lnum);
        c->lpt_offs        = le32_to_cpu(c->mst_node->lpt_offs);
        c->nhead_lnum      = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
        c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
        c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);
-        c->calc_idx_sz = c->old_idx_sz;
+        c->calc_idx_sz = c->bi.old_idx_sz;
        if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
                c->no_orphs = 1;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
        mutex_unlock(&c->lp_mutex);
 }
+/**
+ * ubifs_next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * This helper function returns the log LEB number which goes next after LEB
+ * 'lnum'.
+ */
+static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+        lnum += 1;
+        if (lnum > c->log_last)
+                lnum = UBIFS_LOG_LNUM;
+        return lnum;
+}
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
                sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
                if (IS_ERR(sleb)) {
                        if (PTR_ERR(sleb) == -EUCLEAN)
-                                sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+                                sleb = ubifs_recover_leb(c, lnum, 0,
+                                                         c->sbuf, 0);
                        if (IS_ERR(sleb)) {
                                err = PTR_ERR(sleb);
                                break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 /**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
 * @sleb: scanned LEB information
 * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if the whole group of nodes has to be dropped
 *
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
 */
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 {
        int dropped = 0;
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
                kfree(snod);
                sleb->nodes_cnt -= 1;
                dropped = 1;
+                if (!grouped)
+                        break;
        }
        return dropped;
 }
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                                         int offs, void *sbuf, int grouped)
 {
-        int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
+        int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
-        int empty_chkd = 0, start = offs;
        struct ubifs_scan_leb *sleb;
        void *buf = sbuf + offs;
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
        if (IS_ERR(sleb))
                return sleb;
-        if (sleb->ecc)
+        ubifs_assert(len >= 8);
-                need_clean = 1;
        while (len >= 8) {
-                int ret;
                dbg_scan("look at LEB %d:%d (%d bytes left)",
                         lnum, offs, len);
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                 * Scan quietly until there is an error from which we cannot
                 * recover
                 */
-                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
                if (ret == SCANNED_A_NODE) {
                        /* A valid node, and not a padding node */
                        struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                        offs += node_len;
                        buf += node_len;
                        len -= node_len;
-                        continue;
+                } else if (ret > 0) {
-                }
-                if (ret > 0) {
                        /* Padding bytes or a valid padding node */
                        offs += ret;
                        buf += ret;
                        len -= ret;
-                        continue;
+                } else if (ret == SCANNED_EMPTY_SPACE ||
-                }
+                           ret == SCANNED_GARBAGE     ||
+                           ret == SCANNED_A_BAD_PAD_NODE ||
-                if (ret == SCANNED_EMPTY_SPACE) {
+                           ret == SCANNED_A_CORRUPT_NODE) {
-                        if (!is_empty(buf, len)) {
+                        dbg_rcvry("found corruption - %d", ret);
-                                if (!is_last_write(c, buf, offs))
-                                        break;
-                                clean_buf(c, &buf, lnum, &offs, &len);
-                                need_clean = 1;
-                        }
-                        empty_chkd = 1;
                        break;
-                }
+                } else {
+                        dbg_err("unexpected return value %d", ret);
-                if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
-                        if (is_last_write(c, buf, offs)) {
-                                clean_buf(c, &buf, lnum, &offs, &len);
-                                need_clean = 1;
-                                empty_chkd = 1;
-                                break;
-                        }
-                if (ret == SCANNED_A_CORRUPT_NODE)
-                        if (no_more_nodes(c, buf, len, lnum, offs)) {
-                                clean_buf(c, &buf, lnum, &offs, &len);
-                                need_clean = 1;
-                                empty_chkd = 1;
-                                break;
-                        }
-                if (quiet) {
-                        /* Redo the last scan but noisily */
-                        quiet = 0;
-                        continue;
-                }
-                switch (ret) {
-                case SCANNED_GARBAGE:
-                        dbg_err("garbage");
-                        goto corrupted;
-                case SCANNED_A_CORRUPT_NODE:
-                case SCANNED_A_BAD_PAD_NODE:
-                        dbg_err("bad node");
-                        goto corrupted;
-                default:
-                        dbg_err("unknown");
                        err = -EINVAL;
                        goto error;
                }
        }
-        if (!empty_chkd && !is_empty(buf, len)) {
+        if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
-                if (is_last_write(c, buf, offs)) {
+                if (!is_last_write(c, buf, offs))
-                        clean_buf(c, &buf, lnum, &offs, &len);
+                        goto corrupted_rescan;
-                        need_clean = 1;
+        } else if (ret == SCANNED_A_CORRUPT_NODE) {
-                } else {
+                if (!no_more_nodes(c, buf, len, lnum, offs))
+                        goto corrupted_rescan;
+        } else if (!is_empty(buf, len)) {
+                if (!is_last_write(c, buf, offs)) {
                        int corruption = first_non_ff(buf, len);
                        /*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                }
        }
-        /* Drop nodes from incomplete group */
+        min_io_unit = round_down(offs, c->min_io_size);
-        if (grouped && drop_incomplete_group(sleb, &offs)) {
+        if (grouped)
-                buf = sbuf + offs;
+                /*
-                len = c->leb_size - offs;
+                 * If nodes are grouped, always drop the incomplete group at
-                clean_buf(c, &buf, lnum, &offs, &len);
+                 * the end.
-                need_clean = 1;
+                 */
-        }
+                drop_last_node(sleb, &offs, 1);
-        if (offs % c->min_io_size) {
+        /*
-                clean_buf(c, &buf, lnum, &offs, &len);
+         * While we are in the middle of the same min. I/O unit keep dropping
-                need_clean = 1;
+         * nodes. So basically, what we want is to make sure that the last min.
-        }
+         * I/O unit where we saw the corruption is dropped completely with all
+         * the uncorrupted nodes which may possibly sit there.
+         *
+         * In other words, let's name the min. I/O unit where the corruption
+         * starts B, and the previous min. I/O unit A. The below code tries to
+         * deal with a situation when half of B contains valid nodes or the end
+         * of a valid node, and the second half of B contains corrupted data or
+         * garbage. This means that UBIFS had been writing to B just before the
+         * power cut happened. I do not know how realistic is this scenario
+         * that half of the min. I/O unit had been written successfully and the
+         * other half not, but this is possible in our 'failure mode emulation'
+         * infrastructure at least.
+         *
+         * So what is the problem, why we need to drop those nodes? Why can't
+         * we just clean-up the second half of B by putting a padding node
+         * there? We can, and this works fine with one exception which was
+         * reproduced with power cut emulation testing and happens extremely
+         * rarely. The description follows, but it is worth noting that that is
+         * only about the GC head, so we could do this trick only if the bud
+         * belongs to the GC head, but it does not seem to be worth an
+         * additional "if" statement.
+         *
+         * So, imagine the file-system is full, we run GC which is moving valid
+         * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+         * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+         * and will try to continue. Imagine that LEB X is currently the
+         * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+         * same as amount of free space in LEB X.
+         *
+         * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+         * are here trying to recover LEB Y which is the GC head LEB. We find
+         * the min. I/O unit B as described above. Then we clean-up LEB Y by
+         * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+         * fails, because it cannot find a dirty LEB which could be GC'd into
+         * LEB Y! Even LEB X does not match because the amount of valid nodes
+         * there does not fit the free space in LEB Y any more! And this is
+         * because of the padding node which we added to LEB Y. The
+         * user-visible effect of this which I once observed and analysed is
+         * that we cannot mount the file-system with -ENOSPC error.
+         *
+         * So obviously, to make sure that situation does not happen we should
+         * free min. I/O unit B in LEB Y completely and the last used min. I/O
+         * unit in LEB Y should be A. This is basically what the below code
+         * tries to do.
+         */
+        while (min_io_unit == round_down(offs, c->min_io_size) &&
+               min_io_unit != offs &&
+               drop_last_node(sleb, &offs, grouped));
+        buf = sbuf + offs;
+        len = c->leb_size - offs;
+        clean_buf(c, &buf, lnum, &offs, &len);
        ubifs_end_scan(c, sleb, lnum, offs);
-        if (need_clean) {
+        err = fix_unclean_leb(c, sleb, start);
-                err = fix_unclean_leb(c, sleb, start);
+        if (err)
-                if (err)
+                goto error;
-                        goto error;
-        }
        return sleb;
+corrupted_rescan:
+        /* Re-scan the corrupted data with verbose messages */
+        dbg_err("corruptio %d", ret);
+        ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
        ubifs_scanned_corruption(c, lnum, offs, buf);
        err = -EUCLEAN;
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
 }
 /**
+ * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
+ * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int grab_empty_leb(struct ubifs_info *c)
+{
+        int lnum, err;
+        /*
+         * Note, it is very important to first search for an empty LEB and then
+         * run the commit, not vice-versa. The reason is that there might be
+         * only one empty LEB at the moment, the one which has been the
+         * @c->gc_lnum just before the power cut happened. During the regular
+         * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
+         * one but GC can grab it. But at this moment this single empty LEB is
+         * not marked as taken, so if we run commit - what happens? Right, the
+         * commit will grab it and write the index there. Remember that the
+         * index always expands as long as there is free space, and it only
+         * starts consolidating when we run out of space.
+         *
+         * IOW, if we run commit now, we might not be able to find a free LEB
+         * after this.
+         */
+        lnum = ubifs_find_free_leb_for_idx(c);
+        if (lnum < 0) {
+                dbg_err("could not find an empty LEB");
+                dbg_dump_lprops(c);
+                dbg_dump_budg(c, &c->bi);
+                return lnum;
+        }
+        /* Reset the index flag */
+        err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+                                  LPROPS_INDEX, 0);
+        if (err)
+                return err;
+        c->gc_lnum = lnum;
+        dbg_rcvry("found empty LEB %d, run commit", lnum);
+        return ubifs_run_commit(c);
+}
+/**
 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
 * @c: UBIFS file-system description object
 *
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 {
        struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
        struct ubifs_lprops lp;
-        int lnum, err;
+        int err;
+        dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
        c->gc_lnum = -1;
-        if (wbuf->lnum == -1) {
+        if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
-                dbg_rcvry("no GC head LEB");
+                return grab_empty_leb(c);
-                goto find_free;
-        }
-        /*
-         * See whether the used space in the dirtiest LEB fits in the GC head
-         * LEB.
-         */
-        if (wbuf->offs == c->leb_size) {
-                dbg_rcvry("no room in GC head LEB");
-                goto find_free;
-        }
        err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
        if (err) {
-                /*
+                if (err != -ENOSPC)
-                 * There are no dirty or empty LEBs subject to here being
-                 * enough for the index. Try to use
-                 * 'ubifs_find_free_leb_for_idx()', which will return any empty
-                 * LEBs (ignoring index requirements). If the index then
-                 * doesn't have enough LEBs the recovery commit will fail -
-                 * which is the  same result anyway i.e. recovery fails. So
-                 * there is no problem ignoring index  requirements and just
-                 * grabbing a free LEB since we have already established there
-                 * is not a dirty LEB we could have used instead.
-                 */
-                if (err == -ENOSPC) {
-                        dbg_rcvry("could not find a dirty LEB");
-                        goto find_free;
-                }
-                return err;
-        }
-        ubifs_assert(!(lp.flags & LPROPS_INDEX));
-        lnum = lp.lnum;
-        if (lp.free + lp.dirty == c->leb_size) {
-                /* An empty LEB was returned */
-                if (lp.free != c->leb_size) {
-                        err = ubifs_change_one_lp(c, lnum, c->leb_size,
-                                                  0, 0, 0, 0);
-                        if (err)
-                                return err;
-                }
-                err = ubifs_leb_unmap(c, lnum);
-                if (err)
                        return err;
-                c->gc_lnum = lnum;
-                dbg_rcvry("allocated LEB %d for GC", lnum);
+                dbg_rcvry("could not find a dirty LEB");
-                /* Run the commit */
+                return grab_empty_leb(c);
-                dbg_rcvry("committing");
-                return ubifs_run_commit(c);
-        }
-        /*
-         * There was no empty LEB so the used space in the dirtiest LEB must fit
-         * in the GC head LEB.
-         */
-        if (lp.free + lp.dirty < wbuf->offs) {
-                dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
-                          lnum, wbuf->lnum, wbuf->offs);
-                err = ubifs_return_leb(c, lnum);
-                if (err)
-                        return err;
-                goto find_free;
        }
+        ubifs_assert(!(lp.flags & LPROPS_INDEX));
+        ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
        /*
         * We run the commit before garbage collection otherwise subsequent
         * mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
        err = ubifs_run_commit(c);
        if (err)
                return err;
-        /*
-         * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
+        dbg_rcvry("GC'ing LEB %d", lp.lnum);
-         * - use locking to keep 'ubifs_assert()' happy.
-         */
-        dbg_rcvry("GC'ing LEB %d", lnum);
        mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
        err = ubifs_garbage_collect_leb(c, &lp);
        if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
                        err = -EINVAL;
                return err;
        }
-        if (err != LEB_RETAINED) {
-                dbg_err("GC returned %d", err);
+        ubifs_assert(err == LEB_RETAINED);
+        if (err != LEB_RETAINED)
                return -EINVAL;
-        }
        err = ubifs_leb_unmap(c, c->gc_lnum);
        if (err)
                return err;
-        dbg_rcvry("allocated LEB %d for GC", lnum);
-        return 0;
-find_free:
+        dbg_rcvry("allocated LEB %d for GC", lp.lnum);
-        /*
+        return 0;
-         * There is no GC head LEB or the free space in the GC head LEB is too
-         * small, or there are not dirty LEBs. Allocate gc_lnum by calling
-         * 'ubifs_find_free_leb_for_idx()' so GC is not run.
-         */
-        lnum = ubifs_find_free_leb_for_idx(c);
-        if (lnum < 0) {
-                dbg_err("could not find an empty LEB");
-                return lnum;
-        }
-        /* And reset the index flag */
-        err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
-                                  LPROPS_INDEX, 0);
-        if (err)
-                return err;
-        c->gc_lnum = lnum;
-        dbg_rcvry("allocated LEB %d for GC", lnum);
-        /* Run the commit */
-        dbg_rcvry("committing");
-        return ubifs_run_commit(c);
 }
 /**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
        err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
        if (err)
                goto out;
-        dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+        dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
                  (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
        return 0;
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
                                e->i_size = le64_to_cpu(ino->size);
                        }
                }
                if (e->exists && e->i_size < e->d_size) {
-                        if (!e->inode && c->ro_mount) {
+                        if (c->ro_mount) {
                                /* Fix the inode size and pin it in memory */
                                struct inode *inode;
+                                struct ubifs_inode *ui;
+                                ubifs_assert(!e->inode);
                                inode = ubifs_iget(c->vfs_sb, e->inum);
                                if (IS_ERR(inode))
                                        return PTR_ERR(inode);
+                                ui = ubifs_inode(inode);
                                if (inode->i_size < e->d_size) {
                                        dbg_rcvry("ino %lu size %lld -> %lld",
                                                  (unsigned long)e->inum,
-                                                  e->d_size, inode->i_size);
+                                                  inode->i_size, e->d_size);
                                        inode->i_size = e->d_size;
-                                        ubifs_inode(inode)->ui_size = e->d_size;
+                                        ui->ui_size = e->d_size;
+                                        ui->synced_i_size = e->d_size;
                                        e->inode = inode;
                                        this = rb_next(this);
                                        continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
                                        iput(e->inode);
                        }
                }
                this = rb_next(this);
                rb_erase(&e->rb, &c->size_tree);
                kfree(e);
        }
        return 0;
 }
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
 */
 #include "ubifs.h"
+#include <linux/list_sort.h>
-/*
- * Replay flags.
- *
- * REPLAY_DELETION: node was deleted
- * REPLAY_REF: node is a reference node
- */
-enum {
-        REPLAY_DELETION = 1,
-        REPLAY_REF = 2,
-};
 /**
- * struct replay_entry - replay tree entry.
+ * struct replay_entry - replay list entry.
 * @lnum: logical eraseblock number of the node
 * @offs: node offset
 * @len: node length
+ * @deletion: non-zero if this entry corresponds to a node deletion
 * @sqnum: node sequence number
- * @flags: replay flags
+ * @list: links the replay list
- * @rb: links the replay tree
 * @key: node key
 * @nm: directory entry name
 * @old_size: truncation old size
 * @new_size: truncation new size
- * @free: amount of free space in a bud
- * @dirty: amount of dirty space in a bud from padding and deletion nodes
- * @jhead: journal head number of the bud
 *
- * UBIFS journal replay must compare node sequence numbers, which means it must
+ * The replay process first scans all buds and builds the replay list, then
- * build a tree of node information to insert into the TNC.
+ * sorts the replay list in nodes sequence number order, and then inserts all
+ * the replay entries to the TNC.
 */
 struct replay_entry {
        int lnum;
        int offs;
        int len;
+        unsigned int deletion:1;
        unsigned long long sqnum;
-        int flags;
+        struct list_head list;
-        struct rb_node rb;
        union ubifs_key key;
        union {
                struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
                        loff_t old_size;
                        loff_t new_size;
                };
-                struct {
-                        int free;
-                        int dirty;
-                        int jhead;
-                };
        };
 };
@@ -90,57 +73,64 @@ struct replay_entry {
 * struct bud_entry - entry in the list of buds to replay.
 * @list: next bud in the list
 * @bud: bud description object
- * @free: free bytes in the bud
 * @sqnum: reference node sequence number
+ * @free: free bytes in the bud
+ * @dirty: dirty bytes in the bud
 */
 struct bud_entry {
        struct list_head list;
        struct ubifs_bud *bud;
-        int free;
        unsigned long long sqnum;
+        int free;
+        int dirty;
 };
 /**
 * set_bud_lprops - set free and dirty space used by a bud.
 * @c: UBIFS file-system description object
- * @r: replay entry of bud
+ * @b: bud entry which describes the bud
+ *
+ * This function makes sure the LEB properties of bud @b are set correctly
+ * after the replay. Returns zero in case of success and a negative error code
+ * in case of failure.
 */
-static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
 {
        const struct ubifs_lprops *lp;
        int err = 0, dirty;
        ubifs_get_lprops(c);
-        lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+        lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
        if (IS_ERR(lp)) {
                err = PTR_ERR(lp);
                goto out;
        }
        dirty = lp->dirty;
-        if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+        if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
                /*
                 * The LEB was added to the journal with a starting offset of
                 * zero which means the LEB must have been empty. The LEB
-                 * property values should be lp->free == c->leb_size and
+                 * property values should be @lp->free == @c->leb_size and
-                 * lp->dirty == 0, but that is not the case. The reason is that
+                 * @lp->dirty == 0, but that is not the case. The reason is that
-                 * the LEB was garbage collected. The garbage collector resets
+                 * the LEB had been garbage collected before it became the bud,
-                 * the free and dirty space without recording it anywhere except
+                 * and there was no commit in between. The garbage collector
-                 * lprops, so if there is not a commit then lprops does not have
+                 * resets the free and dirty space without recording it
-                 * that information next time the file system is mounted.
+                 * anywhere except lprops, so if there was no commit then
+                 * lprops does not have that information.
                 *
                 * We do not need to adjust free space because the scan has told
                 * us the exact value which is recorded in the replay entry as
-                 * r->free.
+                 * @b->free.
                 *
                 * However we do need to subtract from the dirty space the
                 * amount of space that the garbage collector reclaimed, which
                 * is the whole LEB minus the amount of space that was free.
                 */
-                dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+                dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
                        lp->free, lp->dirty);
-                dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+                dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
                        lp->free, lp->dirty);
                dirty -= c->leb_size - lp->free;
                /*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
                 */
                if (dirty != 0)
                        dbg_msg("LEB %d lp: %d free %d dirty "
-                                "replay: %d free %d dirty", r->lnum, lp->free,
+                                "replay: %d free %d dirty", b->bud->lnum,
-                                lp->dirty, r->free, r->dirty);
+                                lp->free, lp->dirty, b->free, b->dirty);
        }
-        lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+        lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
                             lp->flags | LPROPS_TAKEN, 0);
        if (IS_ERR(lp)) {
                err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
        }
        /* Make sure the journal head points to the latest bud */
-        err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
+        err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
-                                     c->leb_size - r->free, UBI_SHORTTERM);
+                                     b->bud->lnum, c->leb_size - b->free,
+                                     UBI_SHORTTERM);
 out:
        ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
 }
 /**
+ * set_buds_lprops - set free and dirty space for all replayed buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets LEB properties for all replayed buds. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int set_buds_lprops(struct ubifs_info *c)
+{
+        struct bud_entry *b;
+        int err;
+        list_for_each_entry(b, &c->replay_buds, list) {
+                err = set_bud_lprops(c, b);
+                if (err)
+                        return err;
+        }
+        return 0;
+}
+/**
 * trun_remove_range - apply a replay entry for a truncation to the TNC.
 * @c: UBIFS file-system description object
 * @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
 */
 static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 {
-        int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+        int err;
-        dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
+        dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
-                r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+                r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
        /* Set c->replay_sqnum to help deal with dangling branches. */
        c->replay_sqnum = r->sqnum;
-        if (r->flags & REPLAY_REF)
+        if (is_hash_key(c, &r->key)) {
-                err = set_bud_lprops(c, r);
+                if (r->deletion)
-        else if (is_hash_key(c, &r->key)) {
-                if (deletion)
                        err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
                else
                        err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
                                               r->len, &r->nm);
        } else {
-                if (deletion)
+                if (r->deletion)
                        switch (key_type(c, &r->key)) {
                        case UBIFS_INO_KEY:
                        {
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
                        return err;
                if (c->need_recovery)
-                        err = ubifs_recover_size_accum(c, &r->key, deletion,
+                        err = ubifs_recover_size_accum(c, &r->key, r->deletion,
                                                       r->new_size);
        }
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 }
 /**
- * destroy_replay_tree - destroy the replay.
+ * replay_entries_cmp - compare 2 replay entries.
- * @c: UBIFS file-system description object
+ * @priv: UBIFS file-system description object
+ * @a: first replay entry
+ * @b: second replay entry
 *
- * Destroy the replay tree.
+ * This is a comparison function for 'list_sort()' which compares 2 replay
+ * entries @a and @b by comparing their sequence numbers. Returns %1 if @a has
+ * greater sequence number and %-1 otherwise.
 */
-static void destroy_replay_tree(struct ubifs_info *c)
+static int replay_entries_cmp(void *priv, struct list_head *a,
+                              struct list_head *b)
 {
-        struct rb_node *this = c->replay_tree.rb_node;
+        struct replay_entry *ra, *rb;
-        struct replay_entry *r;
+        cond_resched();
-        while (this) {
+        if (a == b)
-                if (this->rb_left) {
+                return 0;
-                        this = this->rb_left;
-                        continue;
+        ra = list_entry(a, struct replay_entry, list);
-                } else if (this->rb_right) {
+        rb = list_entry(b, struct replay_entry, list);
-                        this = this->rb_right;
+        ubifs_assert(ra->sqnum != rb->sqnum);
-                        continue;
+        if (ra->sqnum > rb->sqnum)
-                }
+                return 1;
-                r = rb_entry(this, struct replay_entry, rb);
+        return -1;
-                this = rb_parent(this);
-                if (this) {
-                        if (this->rb_left == &r->rb)
-                                this->rb_left = NULL;
-                        else
-                                this->rb_right = NULL;
-                }
-                if (is_hash_key(c, &r->key))
-                        kfree(r->nm.name);
-                kfree(r);
-        }
-        c->replay_tree = RB_ROOT;
 }
 /**
- * apply_replay_tree - apply the replay tree to the TNC.
+ * apply_replay_list - apply the replay list to the TNC.
 * @c: UBIFS file-system description object
 *
- * Apply the replay tree.
+ * Apply all entries in the replay list to the TNC. Returns zero in case of
- * Returns zero in case of success and a negative error code in case of
+ * success and a negative error code in case of failure.
- * failure.
 */
-static int apply_replay_tree(struct ubifs_info *c)
+static int apply_replay_list(struct ubifs_info *c)
 {
-        struct rb_node *this = rb_first(&c->replay_tree);
+        struct replay_entry *r;
+        int err;
-        while (this) {
+        list_sort(c, &c->replay_list, &replay_entries_cmp);
-                struct replay_entry *r;
-                int err;
+        list_for_each_entry(r, &c->replay_list, list) {
                cond_resched();
-                r = rb_entry(this, struct replay_entry, rb);
                err = apply_replay_entry(c, r);
                if (err)
                        return err;
-                this = rb_next(this);
        }
        return 0;
 }
 /**
- * insert_node - insert a node to the replay tree.
+ * destroy_replay_list - destroy the replay list.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay list.
+ */
+static void destroy_replay_list(struct ubifs_info *c)
+{
+        struct replay_entry *r, *tmp;
+        list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
+                if (is_hash_key(c, &r->key))
+                        kfree(r->nm.name);
+                list_del(&r->list);
+                kfree(r);
+        }
+}
+/**
+ * insert_node - insert a node to the replay list
 * @c: UBIFS file-system description object
 * @lnum: node logical eraseblock number
 * @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
 * @old_size: truncation old size
 * @new_size: truncation new size
 *
- * This function inserts a scanned non-direntry node to the replay tree. The
+ * This function inserts a scanned non-direntry node to the replay list. The
- * replay tree is an RB-tree containing @struct replay_entry elements which are
+ * replay list contains @struct replay_entry elements, and we sort this list in
- * indexed by the sequence number. The replay tree is applied at the very end
+ * sequence number order before applying it. The replay list is applied at the
- * of the replay process. Since the tree is sorted in sequence number order,
+ * very end of the replay process. Since the list is sorted in sequence number
- * the older modifications are applied first. This function returns zero in
+ * order, the older modifications are applied first. This function returns zero
- * case of success and a negative error code in case of failure.
+ * in case of success and a negative error code in case of failure.
 */
 static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
                       union ubifs_key *key, unsigned long long sqnum,
                       int deletion, int *used, loff_t old_size,
                       loff_t new_size)
 {
-        struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
        struct replay_entry *r;
+        dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
        if (key_inum(c, key) >= c->highest_inum)
                c->highest_inum = key_inum(c, key);
-        dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
-        while (*p) {
-                parent = *p;
-                r = rb_entry(parent, struct replay_entry, rb);
-                if (sqnum < r->sqnum) {
-                        p = &(*p)->rb_left;
-                        continue;
-                } else if (sqnum > r->sqnum) {
-                        p = &(*p)->rb_right;
-                        continue;
-                }
-                ubifs_err("duplicate sqnum in replay");
-                return -EINVAL;
-        }
        r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
        if (!r)
                return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
        r->lnum = lnum;
        r->offs = offs;
        r->len = len;
+        r->deletion = !!deletion;
        r->sqnum = sqnum;
-        r->flags = (deletion ? REPLAY_DELETION : 0);
+        key_copy(c, key, &r->key);
        r->old_size = old_size;
        r->new_size = new_size;
-        key_copy(c, key, &r->key);
-        rb_link_node(&r->rb, parent, p);
+        list_add_tail(&r->list, &c->replay_list);
-        rb_insert_color(&r->rb, &c->replay_tree);
        return 0;
 }
 /**
- * insert_dent - insert a directory entry node into the replay tree.
+ * insert_dent - insert a directory entry node into the replay list.
 * @c: UBIFS file-system description object
 * @lnum: node logical eraseblock number
 * @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
 * @deletion: non-zero if this is a deletion
 * @used: number of bytes in use in a LEB
 *
- * This function inserts a scanned directory entry node to the replay tree.
+ * This function inserts a scanned directory entry node or an extended
- * Returns zero in case of success and a negative error code in case of
+ * attribute entry to the replay list. Returns zero in case of success and a
- * failure.
+ * negative error code in case of failure.
- *
- * This function is also used for extended attribute entries because they are
- * implemented as directory entry nodes.
 */
 static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
                       union ubifs_key *key, const char *name, int nlen,
                       unsigned long long sqnum, int deletion, int *used)
 {
-        struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
        struct replay_entry *r;
        char *nbuf;
+        dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
        if (key_inum(c, key) >= c->highest_inum)
                c->highest_inum = key_inum(c, key);
-        dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
-        while (*p) {
-                parent = *p;
-                r = rb_entry(parent, struct replay_entry, rb);
-                if (sqnum < r->sqnum) {
-                        p = &(*p)->rb_left;
-                        continue;
-                }
-                if (sqnum > r->sqnum) {
-                        p = &(*p)->rb_right;
-                        continue;
-                }
-                ubifs_err("duplicate sqnum in replay");
-                return -EINVAL;
-        }
        r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
        if (!r)
                return -ENOMEM;
        nbuf = kmalloc(nlen + 1, GFP_KERNEL);
        if (!nbuf) {
                kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
        r->lnum = lnum;
        r->offs = offs;
        r->len = len;
+        r->deletion = !!deletion;
        r->sqnum = sqnum;
+        key_copy(c, key, &r->key);
        r->nm.len = nlen;
        memcpy(nbuf, name, nlen);
        nbuf[nlen] = '\0';
        r->nm.name = nbuf;
-        r->flags = (deletion ? REPLAY_DELETION : 0);
-        key_copy(c, key, &r->key);
-        ubifs_assert(!*p);
+        list_add_tail(&r->list, &c->replay_list);
-        rb_link_node(&r->rb, parent, p);
-        rb_insert_color(&r->rb, &c->replay_tree);
        return 0;
 }
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
 }
 /**
+ * is_last_bud - check if the bud is the last in the journal head.
+ * @c: UBIFS file-system description object
+ * @bud: bud description object
+ *
+ * This function checks if bud @bud is the last bud in its journal head. This
+ * information is then used by 'replay_bud()' to decide whether the bud can
+ * have corruptions or not. Indeed, only last buds can be corrupted by power
+ * cuts. Returns %1 if this is the last bud, and %0 if not.
+ */
+static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+        struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+        struct ubifs_bud *next;
+        uint32_t data;
+        int err;
+        if (list_is_last(&bud->list, &jh->buds_list))
+                return 1;
+        /*
+         * The following is a quirk to make sure we work correctly with UBIFS
+         * images used with older UBIFS.
+         *
+         * Normally, the last bud will be the last in the journal head's list
+         * of bud. However, there is one exception if the UBIFS image belongs
+         * to older UBIFS. This is fairly unlikely: one would need to use old
+         * UBIFS, then have a power cut exactly at the right point, and then
+         * try to mount this image with new UBIFS.
+         *
+         * The exception is: it is possible to have 2 buds A and B, A goes
+         * before B, and B is the last, bud B contains no data, and bud A is
+         * corrupted at the end. The reason is that in older versions when the
+         * journal code switched the next bud (from A to B), it first added a
+         * log reference node for the new bud (B), and only after this it
+         * synchronized the write-buffer of current bud (A). But later this was
+         * changed and UBIFS started to always synchronize the write-buffer of
+         * the bud (A) before writing the log reference for the new bud (B).
+         *
+         * But because older UBIFS always synchronized A's write-buffer before
+         * writing to B, we can recognize this exceptional situation by
+         * checking the contents of bud B - if it is empty, then A can be
+         * treated as the last and we can recover it.
+         *
+         * TODO: remove this piece of code in a couple of years (today it is
+         * 16.05.2011).
+         */
+        next = list_entry(bud->list.next, struct ubifs_bud, list);
+        if (!list_is_last(&next->list, &jh->buds_list))
+                return 0;
+        err = ubi_read(c->ubi, next->lnum, (char *)&data,
+                       next->start, 4);
+        if (err)
+                return 0;
+        return data == 0xFFFFFFFF;
+}
+/**
 * replay_bud - replay a bud logical eraseblock.
 * @c: UBIFS file-system description object
- * @lnum: bud logical eraseblock number to replay
+ * @b: bud entry which describes the bud
- * @offs: bud start offset
- * @jhead: journal head to which this bud belongs
- * @free: amount of free space in the bud is returned here
- * @dirty: amount of dirty space from padding and deletion nodes is returned
- * here
 *
- * This function returns zero in case of success and a negative error code in
+ * This function replays bud @b, recovers it if needed, and adds all nodes
- * case of failure.
+ * from this bud to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
 */
-static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
-                      int *free, int *dirty)
 {
-        int err = 0, used = 0;
+        int is_last = is_last_bud(c, b->bud);
+        int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
        struct ubifs_scan_leb *sleb;
        struct ubifs_scan_node *snod;
-        struct ubifs_bud *bud;
-        dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
+        dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
-        if (c->need_recovery)
+                lnum, b->bud->jhead, offs, is_last);
-                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+        if (c->need_recovery && is_last)
+                /*
+                 * Recover only last LEBs in the journal heads, because power
+                 * cuts may cause corruptions only in these LEBs, because only
+                 * these LEBs could possibly be written to at the power cut
+                 * time.
+                 */
+                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+                                         b->bud->jhead != GCHD);
        else
                sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
        if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
                        goto out;
        }
-        bud = ubifs_search_bud(c, lnum);
+        ubifs_assert(ubifs_search_bud(c, lnum));
-        if (!bud)
-                BUG();
        ubifs_assert(sleb->endpt - offs >= used);
        ubifs_assert(sleb->endpt % c->min_io_size == 0);
-        *dirty = sleb->endpt - offs - used;
+        b->dirty = sleb->endpt - offs - used;
-        *free = c->leb_size - sleb->endpt;
+        b->free = c->leb_size - sleb->endpt;
+        dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
 out:
        ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
 }
 /**
- * insert_ref_node - insert a reference node to the replay tree.
- * @c: UBIFS file-system description object
- * @lnum: node logical eraseblock number
- * @offs: node offset
- * @sqnum: sequence number
- * @free: amount of free space in bud
- * @dirty: amount of dirty space from padding and deletion nodes
- * @jhead: journal head number for the bud
- *
- * This function inserts a reference node to the replay tree and returns zero
- * in case of success or a negative error code in case of failure.
- */
-static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
-                           unsigned long long sqnum, int free, int dirty,
-                           int jhead)
-{
-        struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
-        struct replay_entry *r;
-        dbg_mnt("add ref LEB %d:%d", lnum, offs);
-        while (*p) {
-                parent = *p;
-                r = rb_entry(parent, struct replay_entry, rb);
-                if (sqnum < r->sqnum) {
-                        p = &(*p)->rb_left;
-                        continue;
-                } else if (sqnum > r->sqnum) {
-                        p = &(*p)->rb_right;
-                        continue;
-                }
-                ubifs_err("duplicate sqnum in replay tree");
-                return -EINVAL;
-        }
-        r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
-        if (!r)
-                return -ENOMEM;
-        r->lnum = lnum;
-        r->offs = offs;
-        r->sqnum = sqnum;
-        r->flags = REPLAY_REF;
-        r->free = free;
-        r->dirty = dirty;
-        r->jhead = jhead;
-        rb_link_node(&r->rb, parent, p);
-        rb_insert_color(&r->rb, &c->replay_tree);
-        return 0;
-}
-/**
 * replay_buds - replay all buds.
 * @c: UBIFS file-system description object
 *
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
 static int replay_buds(struct ubifs_info *c)
 {
        struct bud_entry *b;
-        int err, uninitialized_var(free), uninitialized_var(dirty);
+        int err;
+        unsigned long long prev_sqnum = 0;
        list_for_each_entry(b, &c->replay_buds, list) {
-                err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
+                err = replay_bud(c, b);
-                                 &free, &dirty);
-                if (err)
-                        return err;
-                err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
-                                      free, dirty, b->bud->jhead);
                if (err)
                        return err;
+                ubifs_assert(b->sqnum > prev_sqnum);
+                prev_sqnum = b->sqnum;
        }
        return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
        if (err)
                goto out;
-        err = apply_replay_tree(c);
+        err = apply_replay_list(c);
+        if (err)
+                goto out;
+        err = set_buds_lprops(c);
        if (err)
                goto out;
        /*
-         * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
+         * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
-         * to roughly estimate index growth. Things like @c->min_idx_lebs
+         * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
         * depend on it. This means we have to initialize it to make sure
         * budgeting works properly.
         */
-        c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+        c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
-        c->budg_uncommitted_idx *= c->max_idx_node_sz;
+        c->bi.uncommitted_idx *= c->max_idx_node_sz;
        ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
        dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
                "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
                (unsigned long)c->highest_inum);
 out:
-        destroy_replay_tree(c);
+        destroy_replay_list(c);
        destroy_bud_list(c);
        c->replaying = 0;
        return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
 * @c: UBIFS file-system description object
 *
 * This function returns a pointer to the superblock node or a negative error
- * code.
+ * code. Note, the user of this function is responsible for kfree()'ing the
+ * returned superblock buffer.
 */
 struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
 {
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
        c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
        memcpy(&c->uuid, &sup->uuid, 16);
        c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+        c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
        /* Automatically increase file system size to the maximum size */
        c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
        kfree(sup);
        return err;
 }
+/**
+ * fixup_leb - fixup/unmap an LEB containing free space.
+ * @c: UBIFS file-system description object
+ * @lnum: the LEB number to fix up
+ * @len: number of used bytes in LEB (starting at offset 0)
+ *
+ * This function reads the contents of the given LEB number @lnum, then fixes
+ * it up, so that empty min. I/O units in the end of LEB are actually erased on
+ * flash (rather than being just all-0xff real data). If the LEB is completely
+ * empty, it is simply unmapped.
+ */
+static int fixup_leb(struct ubifs_info *c, int lnum, int len)
+{
+        int err;
+        ubifs_assert(len >= 0);
+        ubifs_assert(len % c->min_io_size == 0);
+        ubifs_assert(len < c->leb_size);
+        if (len == 0) {
+                dbg_mnt("unmap empty LEB %d", lnum);
+                return ubi_leb_unmap(c->ubi, lnum);
+        }
+        dbg_mnt("fixup LEB %d, data len %d", lnum, len);
+        err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
+        if (err)
+                return err;
+        return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+}
+/**
+ * fixup_free_space - find & remap all LEBs containing free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks through all LEBs in the filesystem and fixes up those
+ * containing free/empty space.
+ */
+static int fixup_free_space(struct ubifs_info *c)
+{
+        int lnum, err = 0;
+        struct ubifs_lprops *lprops;
+        ubifs_get_lprops(c);
+        /* Fixup LEBs in the master area */
+        for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
+                err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
+                if (err)
+                        goto out;
+        }
+        /* Unmap unused log LEBs */
+        lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+        while (lnum != c->ltail_lnum) {
+                err = fixup_leb(c, lnum, 0);
+                if (err)
+                        goto out;
+                lnum = ubifs_next_log_lnum(c, lnum);
+        }
+        /* Fixup the current log head */
+        err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+        if (err)
+                goto out;
+        /* Fixup LEBs in the LPT area */
+        for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+                int free = c->ltab[lnum - c->lpt_first].free;
+                if (free > 0) {
+                        err = fixup_leb(c, lnum, c->leb_size - free);
+                        if (err)
+                                goto out;
+                }
+        }
+        /* Unmap LEBs in the orphans area */
+        for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+                err = fixup_leb(c, lnum, 0);
+                if (err)
+                        goto out;
+        }
+        /* Fixup LEBs in the main area */
+        for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+                lprops = ubifs_lpt_lookup(c, lnum);
+                if (IS_ERR(lprops)) {
+                        err = PTR_ERR(lprops);
+                        goto out;
+                }
+                if (lprops->free > 0) {
+                        err = fixup_leb(c, lnum, c->leb_size - lprops->free);
+                        if (err)
+                                goto out;
+                }
+        }
+out:
+        ubifs_release_lprops(c);
+        return err;
+}
+/**
+ * ubifs_fixup_free_space - find & fix all LEBs with free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function fixes up LEBs containing free space on first mount, if the
+ * appropriate flag was set when the FS was created. Each LEB with one or more
+ * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure
+ * the free space is actually erased. E.g., this is necessary for some NAND
+ * chips, since the free space may have been programmed like real "0xff" data
+ * (generating a non-0xff ECC), causing future writes to the not-really-erased
+ * NAND pages to behave badly. After the space is fixed up, the superblock flag
+ * is cleared, so that this is skipped for all future mounts.
+ */
+int ubifs_fixup_free_space(struct ubifs_info *c)
+{
+        int err;
+        struct ubifs_sb_node *sup;
+        ubifs_assert(c->space_fixup);
+        ubifs_assert(!c->ro_mount);
+        ubifs_msg("start fixing up free space");
+        err = fixup_free_space(c);
+        if (err)
+                return err;
+        sup = ubifs_read_sb_node(c);
+        if (IS_ERR(sup))
+                return PTR_ERR(sup);
+        /* Free-space fixup is no longer required */
+        c->space_fixup = 0;
+        sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
+        err = ubifs_write_sb_node(c, sup);
+        kfree(sup);
+        if (err)
+                return err;
+        ubifs_msg("free space fixup complete");
+        return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
                ubifs_release_dirty_inode_budget(c, ui);
        else {
                /* We've deleted something - clean the "no space" flags */
-                c->nospace = c->nospace_rp = 0;
+                c->bi.nospace = c->bi.nospace_rp = 0;
                smp_wmb();
        }
 done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
         * be compressed and direntries are of the maximum size.
         *
         * Note, data, which may be stored in inodes is budgeted separately, so
-         * it is not included into 'c->inode_budget'.
+         * it is not included into 'c->bi.inode_budget'.
         */
-        c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+        c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
-        c->inode_budget = UBIFS_INO_NODE_SZ;
+        c->bi.inode_budget = UBIFS_INO_NODE_SZ;
-        c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+        c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
        /*
         * When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
 {
        long long tmp64;
-        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        c->report_rp_size = ubifs_reported_space(c, c->rp_size);
        /*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
 {
        ubifs_assert(c->dark_wm > 0);
        if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
-                ubifs_err("insufficient free space to mount in read/write mode");
+                ubifs_err("insufficient free space to mount in R/W mode");
-                dbg_dump_budg(c);
+                dbg_dump_budg(c, &c->bi);
                dbg_dump_lprops(c);
                return -ENOSPC;
        }
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                goto out_lpt;
-        err = dbg_check_idx_size(c, c->old_idx_sz);
+        err = dbg_check_idx_size(c, c->bi.old_idx_sz);
        if (err)
                goto out_lpt;
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
                goto out_journal;
        /* Calculate 'min_idx_lebs' after journal replay */
-        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
        if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
        } else
                ubifs_assert(c->lst.taken_empty_lebs > 0);
+        if (!c->ro_mount && c->space_fixup) {
+                err = ubifs_fixup_free_space(c);
+                if (err)
+                        goto out_infos;
+        }
        err = dbg_check_filesystem(c);
        if (err)
                goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
                c->main_lebs, c->main_first, c->leb_cnt - 1);
        dbg_msg("index LEBs:          %d", c->lst.idx_lebs);
        dbg_msg("total index bytes:   %lld (%lld KiB, %lld MiB)",
-                c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+                c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
+                c->bi.old_idx_sz >> 20);
        dbg_msg("key hash type:       %d", c->key_hash_type);
        dbg_msg("tree fanout:         %d", c->fanout);
        dbg_msg("reserved GC LEB:     %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
        dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
                UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
        dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
-                UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+                UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
                UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
        dbg_msg("dead watermark:      %d", c->dead_wm);
        dbg_msg("dark watermark:      %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
                }
                sup->leb_cnt = cpu_to_le32(c->leb_cnt);
                err = ubifs_write_sb_node(c, sup);
+                kfree(sup);
                if (err)
                        goto out;
        }
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
                 */
                err = dbg_check_space_info(c);
        }
+        if (c->space_fixup) {
+                err = ubifs_fixup_free_space(c);
+                if (err)
+                        goto out;
+        }
        mutex_unlock(&c->umount_mutex);
        return err;
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
         * to write them back because of I/O errors.
         */
        if (!c->ro_error) {
-                ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
+                ubifs_assert(c->bi.idx_growth == 0);
-                ubifs_assert(c->budg_idx_growth == 0);
+                ubifs_assert(c->bi.dd_growth == 0);
-                ubifs_assert(c->budg_dd_growth == 0);
+                ubifs_assert(c->bi.data_growth == 0);
-                ubifs_assert(c->budg_data_growth == 0);
        }
        /*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
                if (err) {
                        /* Ensure the znode is dirtied */
                        if (znode->cnext || !ubifs_zn_dirty(znode)) {
-                                    znode = dirty_cow_bottom_up(c, znode);
+                                znode = dirty_cow_bottom_up(c, znode);
-                                    if (IS_ERR(znode)) {
+                                if (IS_ERR(znode)) {
-                                            err = PTR_ERR(znode);
+                                        err = PTR_ERR(znode);
-                                            goto out_unlock;
+                                        goto out_unlock;
-                                    }
+                                }
                        }
                        err = tnc_delete(c, znode, n);
                }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
                                c->gap_lebs = NULL;
                                return err;
                        }
-                        if (!dbg_force_in_the_gaps_enabled) {
+                        if (!dbg_force_in_the_gaps_enabled()) {
                                /*
                                 * Do not print scary warnings if the debugging
                                 * option which forces in-the-gaps is enabled.
                                 */
-                                ubifs_err("out of space");
+                                ubifs_warn("out of space");
-                                spin_lock(&c->space_lock);
+                                dbg_dump_budg(c, &c->bi);
-                                dbg_dump_budg(c);
-                                spin_unlock(&c->space_lock);
                                dbg_dump_lprops(c);
                        }
                        /* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
        spin_lock(&c->space_lock);
        /*
         * Although we have not finished committing yet, update size of the
-         * committed index ('c->old_idx_sz') and zero out the index growth
+         * committed index ('c->bi.old_idx_sz') and zero out the index growth
         * budget. It is OK to do this now, because we've reserved all the
         * space which is needed to commit the index, and it is save for the
         * budgeting subsystem to assume the index is already committed,
         * even though it is not.
         */
-        ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+        ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-        c->old_idx_sz = c->calc_idx_sz;
+        c->bi.old_idx_sz = c->calc_idx_sz;
-        c->budg_uncommitted_idx = 0;
+        c->bi.uncommitted_idx = 0;
-        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        spin_unlock(&c->space_lock);
        mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
 * Superblock flags.
 *
 * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
 */
 enum {
        UBIFS_FLG_BIGLPT = 0x02,
+        UBIFS_FLG_SPACE_FIXUP = 0x04,
 };
 /**
@@ -434,7 +436,7 @@ struct ubifs_ch {
        __u8 node_type;
        __u8 group_type;
        __u8 padding[2];
-} __attribute__ ((packed));
+} __packed;
 /**
 * union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
 union ubifs_dev_desc {
        __le32 new;
        __le64 huge;
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
        __le16 compr_type;
        __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
        __u8 data[];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
        __le16 nlen;
        __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
        __u8 name[];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
        __le16 compr_type;
        __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
        __u8 data[];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
        __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
        __le64 old_size;
        __le64 new_size;
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
 struct ubifs_pad_node {
        struct ubifs_ch ch;
        __le32 pad_len;
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
        __u8 uuid[16];
        __le32 ro_compat_version;
        __u8 padding2[3968];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
        __le32 idx_lebs;
        __le32 leb_cnt;
        __u8 padding[344];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
        __le32 offs;
        __le32 jhead;
        __u8 padding[28];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
        __le32 offs;
        __le32 len;
        __u8 key[];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
        __le16 child_cnt;
        __le16 level;
        __u8 branches[];
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
 struct ubifs_cs_node {
        struct ubifs_ch ch;
        __le64 cmt_no;
-} __attribute__ ((packed));
+} __packed;
 /**
 * struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
        struct ubifs_ch ch;
        __le64 cmt_no;
        __le64 inos[];
-} __attribute__ ((packed));
+} __packed;
 #endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
 * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
- * with 'ubifs_writepage()' (see file.c). All the other inode fields are
+ * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
- * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
 * could consider to rework locking and base it on "shadow" fields.
 */
 struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
        unsigned int compr_type:2;
 };
+/**
+ * struct ubifs_budg_info - UBIFS budgeting information.
+ * @idx_growth: amount of bytes budgeted for index growth
+ * @data_growth: amount of bytes budgeted for cached data
+ * @dd_growth: amount of bytes budgeted for cached data that will make
+ *             other data dirty
+ * @uncommitted_idx: amount of bytes that were budgeted for growth of the
+ *                   index, but which still have to be taken into account
+ *                   because the index has not been committed so far
+ * @old_idx_sz: size of index on flash
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ *           optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ *              pool is full
+ * @page_budget: budget for a page (constant, never changed after mount)
+ * @inode_budget: budget for an inode (constant, never changed after mount)
+ * @dent_budget: budget for a directory entry (constant, never changed after
+ *               mount)
+ */
+struct ubifs_budg_info {
+        long long idx_growth;
+        long long data_growth;
+        long long dd_growth;
+        long long uncommitted_idx;
+        unsigned long long old_idx_sz;
+        int min_idx_lebs;
+        unsigned int nospace:1;
+        unsigned int nospace_rp:1;
+        int page_budget;
+        int inode_budget;
+        int dent_budget;
+};
 struct ubifs_debug_info;
 /**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
 *
 * @big_lpt: flag that LPT is too big to write whole during commit
+ * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
 *                   recovery)
 * @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
 * @dirty_zn_cnt: number of dirty znodes
 * @clean_zn_cnt: number of clean znodes
 *
- * @budg_idx_growth: amount of bytes budgeted for index growth
+ * @space_lock: protects @bi and @lst
- * @budg_data_growth: amount of bytes budgeted for cached data
+ * @lst: lprops statistics
- * @budg_dd_growth: amount of bytes budgeted for cached data that will make
+ * @bi: budgeting information
- *                  other data dirty
- * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
- *                        but which still have to be taken into account because
- *                        the index has not been committed so far
- * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
- *              @nospace, and @nospace_rp;
- * @min_idx_lebs: minimum number of LEBs required for the index
- * @old_idx_sz: size of index on flash
 * @calc_idx_sz: temporary variable which is used to calculate new index size
 *               (contains accurate new index size at end of TNC commit start)
- * @lst: lprops statistics
- * @nospace: non-zero if the file-system does not have flash space (used as
- *           optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- *              pool is full
- *
- * @page_budget: budget for a page
- * @inode_budget: budget for an inode
- * @dent_budget: budget for a directory entry
 *
 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
- * I/O unit
+ *                 I/O unit
 * @mst_node_alsz: master node aligned size
 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
 * @replaying: %1 during journal replay
 * @mounting: %1 while mounting
 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
- * @replay_tree: temporary tree used during journal replay
 * @replay_list: temporary list used during journal replay
 * @replay_buds: list of buds to replay
 * @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
        wait_queue_head_t cmt_wq;
        unsigned int big_lpt:1;
+        unsigned int space_fixup:1;
        unsigned int no_chk_data_crc:1;
        unsigned int bulk_read:1;
        unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
        atomic_long_t dirty_zn_cnt;
        atomic_long_t clean_zn_cnt;
-        long long budg_idx_growth;
-        long long budg_data_growth;
-        long long budg_dd_growth;
-        long long budg_uncommitted_idx;
        spinlock_t space_lock;
-        int min_idx_lebs;
-        unsigned long long old_idx_sz;
-        unsigned long long calc_idx_sz;
        struct ubifs_lp_stats lst;
-        unsigned int nospace:1;
+        struct ubifs_budg_info bi;
-        unsigned int nospace_rp:1;
+        unsigned long long calc_idx_sz;
-        int page_budget;
-        int inode_budget;
-        int dent_budget;
        int ref_node_alsz;
        int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
        unsigned int replaying:1;
        unsigned int mounting:1;
        unsigned int remounting_rw:1;
-        struct rb_root replay_tree;
        struct list_head replay_list;
        struct list_head replay_buds;
        unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
 int ubifs_read_superblock(struct ubifs_info *c);
 struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
 int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+int ubifs_fixup_free_space(struct ubifs_info *c);
 /* replay.c */
 int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
        SECURITY_XATTR,
 };
-static const struct inode_operations none_inode_operations;
+static const struct inode_operations empty_iops;
-static const struct file_operations none_file_operations;
+static const struct file_operations empty_fops;
 /**
 * create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
        /* Re-define all operations to be "nothing" */
        inode->i_mapping->a_ops = &empty_aops;
-        inode->i_op = &none_inode_operations;
+        inode->i_op = &empty_iops;
-        inode->i_fop = &none_file_operations;
+        inode->i_fop = &empty_fops;
        inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
        ui = ubifs_inode(inode);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2cfe2e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
-#include <linux/list_sort.h>
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
        return bp;
 }
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to its empty state.
+ */
+void
+xfs_buf_set_empty(
+        struct xfs_buf          *bp,
+        size_t                  len)
+{
+        if (bp->b_pages)
+                _xfs_buf_free_pages(bp);
+        bp->b_pages = NULL;
+        bp->b_page_count = 0;
+        bp->b_addr = NULL;
+        bp->b_file_offset = 0;
+        bp->b_buffer_length = bp->b_count_desired = len;
+        bp->b_bn = XFS_BUF_DADDR_NULL;
+        bp->b_flags &= ~XBF_MAPPED;
+}
 static inline struct page *
 mem_to_page(
        void                    *addr)
@@ -1402,12 +1422,12 @@ restart:
 int
 xfs_buftarg_shrink(
        struct shrinker         *shrink,
-        int                     nr_to_scan,
+        struct shrink_control   *sc)
-        gfp_t                   mask)
 {
        struct xfs_buftarg      *btp = container_of(shrink,
                                        struct xfs_buftarg, bt_shrinker);
        struct xfs_buf          *bp;
+        int nr_to_scan = sc->nr_to_scan;
        LIST_HEAD(dispose);
        if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c4512645..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
                                xfs_buf_flags_t);
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
 extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486dfa5520..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
        case XFS_IOC_RESVSP_32:
        case XFS_IOC_UNRESVSP_32:
        case XFS_IOC_RESVSP64_32:
-        case XFS_IOC_UNRESVSP64_32: {
+        case XFS_IOC_UNRESVSP64_32:
+        case XFS_IOC_ZERO_RANGE_32: {
                struct xfs_flock64      bf;
                if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b605792a99..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
 #define XFS_IOC_UNRESVSP_32     _IOW('X', 41, struct compat_xfs_flock64)
 #define XFS_IOC_RESVSP64_32     _IOW('X', 42, struct compat_xfs_flock64)
 #define XFS_IOC_UNRESVSP64_32   _IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32   _IOW('X', 57, struct compat_xfs_flock64)
 typedef struct compat_xfs_fsop_geom_v1 {
        __u32           blocksize;      /* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9cbfe78..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -70,6 +70,7 @@
 #include <linux/ctype.h>
 #include <linux/writeback.h>
 #include <linux/capability.h>
+#include <linux/list_sort.h>
 #include <asm/page.h>
 #include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cceb678d..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -41,23 +41,6 @@ __xfs_printk(
        printk("%sXFS: %pV\n", level, vaf);
 }
-void xfs_printk(
-        const char              *level,
-        const struct xfs_mount  *mp,
-        const char              *fmt, ...)
-{
-        struct va_format        vaf;
-        va_list                 args;
-        va_start(args, fmt);
-        vaf.fmt = fmt;
-        vaf.va = &args;
-        __xfs_printk(level, mp, &vaf);
-        va_end(args);
-}
 #define define_xfs_printk_level(func, kern_level)               \
 void func(const struct xfs_mount *mp, const char *fmt, ...)     \
 {                                                               \
@@ -95,8 +78,7 @@ xfs_alert_tag(
        int                     do_panic = 0;
        if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-                xfs_printk(KERN_ALERT, mp,
+                xfs_alert(mp, "Transforming an alert into a BUG.");
-                        "XFS: Transforming an alert into a BUG.");
                do_panic = 1;
        }
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1b6c4e..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,9 +3,6 @@
 struct xfs_mount;
-extern void xfs_printk(const char *level, const struct xfs_mount *mp,
-                      const char *fmt, ...)
-        __attribute__ ((format (printf, 3, 4)));
 extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3)));
 extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
 extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3)));
 #else
-static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 {
 }
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d02299..b0aa59e51fd0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1787,10 +1787,6 @@ init_xfs_fs(void)
        if (error)
                goto out_cleanup_procfs;
-        error = xfs_init_workqueues();
-        if (error)
-                goto out_sysctl_unregister;
        vfs_initquota();
        error = register_filesystem(&xfs_fs_type);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a48122d..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
        error = xfs_iflush(ip, flags);
+        /*
+         * We don't want to try again on non-blocking flushes that can't run
+         * again immediately. If an inode really must be written, then that's
+         * what the SYNC_WAIT flag is for.
+         */
+        if (error == EAGAIN) {
+                ASSERT(!(flags & SYNC_WAIT));
+                error = 0;
+        }
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
@@ -1022,13 +1032,14 @@ xfs_reclaim_inodes(
 static int
 xfs_reclaim_inode_shrink(
        struct shrinker *shrink,
-        int             nr_to_scan,
+        struct shrink_control *sc)
-        gfp_t           gfp_mask)
 {
        struct xfs_mount *mp;
        struct xfs_perag *pag;
        xfs_agnumber_t  ag;
        int             reclaimable;
+        int nr_to_scan = sc->nr_to_scan;
+        gfp_t gfp_mask = sc->gfp_mask;
        mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
        if (nr_to_scan) {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb479075..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
 );
-#define XFS_BUSY_SYNC \
+DECLARE_EVENT_CLASS(xfs_busy_class,
-        { 0,    "async" }, \
-        { 1,    "sync" }
-TRACE_EVENT(xfs_alloc_busy,
-        TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
-                 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
-        TP_ARGS(trans, agno, agbno, len, sync),
-        TP_STRUCT__entry(
-                __field(dev_t, dev)
-                __field(struct xfs_trans *, tp)
-                __field(int, tid)
-                __field(xfs_agnumber_t, agno)
-                __field(xfs_agblock_t, agbno)
-                __field(xfs_extlen_t, len)
-                __field(int, sync)
-        ),
-        TP_fast_assign(
-                __entry->dev = trans->t_mountp->m_super->s_dev;
-                __entry->tp = trans;
-                __entry->tid = trans->t_ticket->t_tid;
-                __entry->agno = agno;
-                __entry->agbno = agbno;
-                __entry->len = len;
-                __entry->sync = sync;
-        ),
-        TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
-                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __entry->tp,
-                  __entry->tid,
-                  __entry->agno,
-                  __entry->agbno,
-                  __entry->len,
-                  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
-);
-TRACE_EVENT(xfs_alloc_unbusy,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
                 xfs_agblock_t agbno, xfs_extlen_t len),
        TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
                  __entry->agbno,
                  __entry->len)
 );
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                 xfs_agblock_t agbno, xfs_extlen_t len), \
+        TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
-#define XFS_BUSY_STATES \
+TRACE_EVENT(xfs_alloc_busy_trim,
-        { 0,    "missing" }, \
-        { 1,    "found" }
-TRACE_EVENT(xfs_alloc_busysearch,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                 xfs_agblock_t agbno, xfs_extlen_t len, int found),
+                 xfs_agblock_t agbno, xfs_extlen_t len,
-        TP_ARGS(mp, agno, agbno, len, found),
+                 xfs_agblock_t tbno, xfs_extlen_t tlen),
+        TP_ARGS(mp, agno, agbno, len, tbno, tlen),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_agnumber_t, agno)
                __field(xfs_agblock_t, agbno)
                __field(xfs_extlen_t, len)
-                __field(int, found)
+                __field(xfs_agblock_t, tbno)
+                __field(xfs_extlen_t, tlen)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->agno = agno;
                __entry->agbno = agbno;
                __entry->len = len;
-                __entry->found = found;
+                __entry->tbno = tbno;
+                __entry->tlen = tlen;
        ),
-        TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+        TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->agbno,
                  __entry->len,
-                  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+                  __entry->tbno,
+                  __entry->tlen)
 );
 TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
                  __entry->wasfromfl,
                  __entry->isfl,
                  __entry->userdata,
-                  __entry->firstblock)
+                  (unsigned long long)__entry->firstblock)
 )
 #define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 STATIC int      xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int      xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int      xfs_qm_shake(struct shrinker *, int, gfp_t);
+STATIC int      xfs_qm_shake(struct shrinker *, struct shrink_control *);
 static struct shrinker xfs_qm_shaker = {
        .shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
 STATIC int
 xfs_qm_shake(
        struct shrinker *shrink,
-        int             nr_to_scan,
+        struct shrink_control *sc)
-        gfp_t           gfp_mask)
 {
        int     ndqused, nfree, n;
+        gfp_t gfp_mask = sc->gfp_mask;
        if (!kmem_shake_allow(gfp_mask))
                return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc17f2d..da0a561ffba2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,6 @@ struct xfs_busy_extent {
        xfs_agnumber_t  agno;
        xfs_agblock_t   bno;
        xfs_extlen_t    length;
-        xlog_tid_t      tid;            /* transaction that created this */
 };
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d752eab..acdced86413c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
 #define XFSA_FIXUP_BNO_OK       1
 #define XFSA_FIXUP_CNT_OK       2
-/*
- * Prototypes for per-ag allocation routines
- */
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-        xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+                xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
-/*
+                xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
- * Internal functions.
- */
 /*
 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
        xfs_extlen_t    *reslen)        /* result length */
 {
        xfs_agblock_t   bno;
-        xfs_extlen_t    diff;
        xfs_extlen_t    len;
-        if (args->alignment > 1 && foundlen >= args->minlen) {
+        /* Trim busy sections out of found extent */
-                bno = roundup(foundbno, args->alignment);
+        xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
-                diff = bno - foundbno;
-                len = diff >= foundlen ? 0 : foundlen - diff;
+        if (args->alignment > 1 && len >= args->minlen) {
+                xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
+                xfs_extlen_t    diff = aligned_bno - bno;
+                *resbno = aligned_bno;
+                *reslen = diff >= len ? 0 : len - diff;
        } else {
-                bno = foundbno;
+                *resbno = bno;
-                len = foundlen;
+                *reslen = len;
        }
-        *resbno = bno;
-        *reslen = len;
 }
 /*
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
                return 1;
        agf = XFS_BUF_TO_AGF(args->agbp);
        diff = be32_to_cpu(agf->agf_freeblks)
-                + be32_to_cpu(agf->agf_flcount)
                - args->len - args->minleft;
        if (diff >= 0)
                return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
                if (error)
                        return error;
-                /*
+                ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
-                 * Search the busylist for these blocks and mark the
+                                              args->agbno, args->len));
-                 * transaction as synchronous if blocks are found. This
-                 * avoids the need to block due to a synchronous log
-                 * force to ensure correct ordering as the synchronous
-                 * transaction will guarantee that for us.
-                 */
-                if (xfs_alloc_busy_search(args->mp, args->agno,
-                                        args->agbno, args->len))
-                        xfs_trans_set_sync(args->tp);
        }
        if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
 {
        xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
        xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
-        xfs_agblock_t   end;    /* end of allocated extent */
        int             error;
        xfs_agblock_t   fbno;   /* start block of found extent */
-        xfs_agblock_t   fend;   /* end block of found extent */
        xfs_extlen_t    flen;   /* length of found extent */
+        xfs_agblock_t   tbno;   /* start block of trimmed extent */
+        xfs_extlen_t    tlen;   /* length of trimmed extent */
+        xfs_agblock_t   tend;   /* end block of trimmed extent */
+        xfs_agblock_t   end;    /* end of allocated extent */
        int             i;      /* success/failure of operation */
-        xfs_agblock_t   maxend; /* end of maximal extent */
-        xfs_agblock_t   minend; /* end of minimal extent */
        xfs_extlen_t    rlen;   /* length of returned extent */
        ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        ASSERT(fbno <= args->agbno);
-        minend = args->agbno + args->minlen;
-        maxend = args->agbno + args->maxlen;
-        fend = fbno + flen;
        /*
-         * Give up if the freespace isn't long enough for the minimum request.
+         * Check for overlapping busy extents.
+         */
+        xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+        /*
+         * Give up if the start of the extent is busy, or the freespace isn't
+         * long enough for the minimum request.
         */
-        if (fend < minend)
+        if (tbno > args->agbno)
+                goto not_found;
+        if (tlen < args->minlen)
+                goto not_found;
+        tend = tbno + tlen;
+        if (tend < args->agbno + args->minlen)
                goto not_found;
        /*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
         *
         * Fix the length according to mod and prod if given.
         */
-        end = XFS_AGBLOCK_MIN(fend, maxend);
+        end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
        args->len = end - args->agbno;
        xfs_alloc_fix_len(args);
        if (!xfs_alloc_fix_minleft(args))
                goto not_found;
        rlen = args->len;
-        ASSERT(args->agbno + rlen <= fend);
+        ASSERT(args->agbno + rlen <= tend);
        end = args->agbno + rlen;
        /*
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
        struct xfs_btree_cur    **scur, /* searching cursor */
        xfs_agblock_t           gdiff,  /* difference for search comparison */
        xfs_agblock_t           *sbno,  /* extent found by search */
-        xfs_extlen_t            *slen,
+        xfs_extlen_t            *slen,  /* extent length */
-        xfs_extlen_t            *slena, /* aligned length */
+        xfs_agblock_t           *sbnoa, /* aligned extent found by search */
+        xfs_extlen_t            *slena, /* aligned extent length */
        int                     dir)    /* 0 = search right, 1 = search left */
 {
-        xfs_agblock_t           bno;
        xfs_agblock_t           new;
        xfs_agblock_t           sdiff;
        int                     error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
                if (error)
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
+                xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
                /*
                 * The good extent is closer than this one.
                 */
                if (!dir) {
-                        if (bno >= args->agbno + gdiff)
+                        if (*sbnoa >= args->agbno + gdiff)
                                goto out_use_good;
                } else {
-                        if (bno <= args->agbno - gdiff)
+                        if (*sbnoa <= args->agbno - gdiff)
                                goto out_use_good;
                }
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
                        xfs_alloc_fix_len(args);
                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                                       args->alignment, *sbno,
+                                                       args->alignment, *sbnoa,
-                                                       *slen, &new);
+                                                       *slena, &new);
                        /*
                         * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   gtbnoa;         /* aligned ... */
        xfs_extlen_t    gtdiff;         /* difference to right side entry */
        xfs_extlen_t    gtlen;          /* length of right side entry */
-        xfs_extlen_t    gtlena = 0;     /* aligned ... */
+        xfs_extlen_t    gtlena;         /* aligned ... */
        xfs_agblock_t   gtnew;          /* useful start bno of right side */
        int             error;          /* error code */
        int             i;              /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   ltbnoa;         /* aligned ... */
        xfs_extlen_t    ltdiff;         /* difference to left side entry */
        xfs_extlen_t    ltlen;          /* length of left side entry */
-        xfs_extlen_t    ltlena = 0;     /* aligned ... */
+        xfs_extlen_t    ltlena;         /* aligned ... */
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
        xfs_extlen_t    rlen;           /* length of returned extent */
+        int             forced = 0;
 #if defined(DEBUG) && defined(__KERNEL__)
        /*
         * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
        dofirst = random32() & 1;
 #endif
+restart:
+        bno_cur_lt = NULL;
+        bno_cur_gt = NULL;
+        ltlen = 0;
+        gtlena = 0;
+        ltlena = 0;
        /*
         * Get a cursor for the by-size btree.
         */
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                args->agno, XFS_BTNUM_CNT);
-        ltlen = 0;
-        bno_cur_lt = bno_cur_gt = NULL;
        /*
         * See if there are any free extents as big as maxlen.
         */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
                        goto error0;
                if (i == 0 || ltlen == 0) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        trace_xfs_alloc_near_noentry(args);
                        return 0;
                }
                ASSERT(i == 1);
        }
        args->wasfromfl = 0;
        /*
         * First algorithm.
         * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
                        if (args->len < blen)
                                continue;
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                args->alignment, ltbno, ltlen, &ltnew);
+                                args->alignment, ltbnoa, ltlena, &ltnew);
                        if (ltnew != NULLAGBLOCK &&
                            (args->len > blen || ltdiff < bdiff)) {
                                bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                args->alignment, ltbno, ltlen, &ltnew);
+                                args->alignment, ltbnoa, ltlena, &ltnew);
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_lt, &bno_cur_gt,
-                                                ltdiff, &gtbno, &gtlen, &gtlena,
+                                                ltdiff, &gtbno, &gtlen,
+                                                &gtbnoa, &gtlena,
                                                0 /* search right */);
                } else {
                        ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                args->alignment, gtbno, gtlen, &gtnew);
+                                args->alignment, gtbnoa, gtlena, &gtnew);
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_gt, &bno_cur_lt,
-                                                gtdiff, &ltbno, &ltlen, &ltlena,
+                                                gtdiff, &ltbno, &ltlen,
+                                                &ltbnoa, &ltlena,
                                                1 /* search left */);
                }
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
         * If we couldn't get anything, give up.
         */
        if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+                if (!forced++) {
+                        trace_xfs_alloc_near_busy(args);
+                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        goto restart;
+                }
                trace_xfs_alloc_size_neither(args);
                args->agbno = NULLAGBLOCK;
                return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
                return 0;
        }
        rlen = args->len;
-        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
+        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-                ltlen, &ltnew);
+                                     ltbnoa, ltlena, &ltnew);
        ASSERT(ltnew >= ltbno);
-        ASSERT(ltnew + rlen <= ltbno + ltlen);
+        ASSERT(ltnew + rlen <= ltbnoa + ltlena);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
        args->agbno = ltnew;
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
                        ltnew, rlen, XFSA_FIXUP_BNO_OK)))
                goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
        int             i;              /* temp status variable */
        xfs_agblock_t   rbno;           /* returned block number */
        xfs_extlen_t    rlen;           /* length of returned extent */
+        int             forced = 0;
+restart:
        /*
         * Allocate and initialize a cursor for the by-size btree.
         */
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                args->agno, XFS_BTNUM_CNT);
        bno_cur = NULL;
        /*
         * Look for an entry >= maxlen+alignment-1 blocks.
         */
        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
                        args->maxlen + args->alignment - 1, &i)))
                goto error0;
        /*
-         * If none, then pick up the last entry in the tree unless the
+         * If none or we have busy extents that we cannot allocate from, then
-         * tree is empty.
+         * we have to settle for a smaller extent. In the case that there are
+         * no large extents, this will return the last entry in the tree unless
+         * the tree is empty. In the case that there are only busy large
+         * extents, this will return the largest small extent unless there
+         * are no smaller extents available.
         */
-        if (!i) {
+        if (!i || forced > 1) {
-                if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
+                error = xfs_alloc_ag_vextent_small(args, cnt_cur,
-                                &flen, &i)))
+                                                   &fbno, &flen, &i);
+                if (error)
                        goto error0;
                if (i == 0 || flen == 0) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
                        return 0;
                }
                ASSERT(i == 1);
+                xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+        } else {
+                /*
+                 * Search for a non-busy extent that is large enough.
+                 * If we are at low space, don't check, or if we fall off
+                 * the end of the btree, turn off the busy check and
+                 * restart.
+                 */
+                for (;;) {
+                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+                        if (error)
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        xfs_alloc_compute_aligned(args, fbno, flen,
+                                                  &rbno, &rlen);
+                        if (rlen >= args->maxlen)
+                                break;
+                        error = xfs_btree_increment(cnt_cur, 0, &i);
+                        if (error)
+                                goto error0;
+                        if (i == 0) {
+                                /*
+                                 * Our only valid extents must have been busy.
+                                 * Make it unbusy by forcing the log out and
+                                 * retrying. If we've been here before, forcing
+                                 * the log isn't making the extents available,
+                                 * which means they have probably been freed in
+                                 * this transaction.  In that case, we have to
+                                 * give up on them and we'll attempt a minlen
+                                 * allocation the next time around.
+                                 */
+                                xfs_btree_del_cursor(cnt_cur,
+                                                     XFS_BTREE_NOERROR);
+                                trace_xfs_alloc_size_busy(args);
+                                if (!forced++)
+                                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                                goto restart;
+                        }
+                }
        }
-        /*
-         * There's a freespace as big as maxlen+alignment-1, get it.
-         */
-        else {
-                if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
-                        goto error0;
-                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-        }
        /*
         * In the first case above, we got the last entry in the
         * by-size btree.  Now we check to see if the space hits maxlen
         * once aligned; if not, we search left for something better.
         * This can't happen in the second case above.
         */
-        xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
        XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
                        (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
         * Fix up the length.
         */
        args->len = rlen;
-        xfs_alloc_fix_len(args);
+        if (rlen < args->minlen) {
-        if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
+                if (!forced++) {
-                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-                trace_xfs_alloc_size_nominleft(args);
+                        trace_xfs_alloc_size_busy(args);
-                args->agbno = NULLAGBLOCK;
+                        xfs_log_force(args->mp, XFS_LOG_SYNC);
-                return 0;
+                        goto restart;
+                }
+                goto out_nominleft;
        }
+        xfs_alloc_fix_len(args);
+        if (!xfs_alloc_fix_minleft(args))
+                goto out_nominleft;
        rlen = args->len;
        XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
        /*
@@ -1287,6 +1350,12 @@ error0:
        if (bno_cur)
                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
        return error;
+out_nominleft:
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        trace_xfs_alloc_size_nominleft(args);
+        args->agbno = NULLAGBLOCK;
+        return 0;
 }
 /*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
                if (error)
                        goto error0;
                if (fbno != NULLAGBLOCK) {
+                        xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+                                             args->userdata);
                        if (args->userdata) {
                                xfs_buf_t       *bp;
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
        trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
-        /*
-         * Since blocks move to the free list without the coordination
-         * used in xfs_bmap_finish, we can't allow block to be available
-         * for reallocation and non-transaction writing (user data)
-         * until we know that the transaction that moved it to the free
-         * list is permanently on disk.  We track the blocks by declaring
-         * these blocks as "busy"; the busy list is maintained on a per-ag
-         * basis and each transaction records which entries should be removed
-         * when the iclog commits to disk.  If a busy block is allocated,
-         * the iclog is pushed up to the LSN that freed the block.
-         */
-        xfs_alloc_busy_insert(tp, agno, bno, len);
        return 0;
 error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
        xfs_alloc_log_agf(tp, agbp, logflags);
        *bnop = bno;
-        /*
-         * As blocks are freed, they are added to the per-ag busy list and
-         * remain there until the freeing transaction is committed to disk.
-         * Now that we have allocated blocks, this list must be searched to see
-         * if a block is being reused.  If one is, then the freeing transaction
-         * must be pushed to disk before this transaction.
-         *
-         * We do this by setting the current transaction to a sync transaction
-         * which guarantees that the freeing transaction is on disk before this
-         * transaction. This is done instead of a synchronous log force here so
-         * that we don't sit and wait with the AGF locked in the transaction
-         * during the log force.
-         */
-        if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
-                xfs_trans_set_sync(tp);
        return 0;
 }
@@ -2423,105 +2468,13 @@ xfs_free_extent(
        }
        error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+        if (!error)
+                xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
 error0:
        xfs_perag_put(args.pag);
        return error;
 }
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk.  If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * there can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block. The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle. This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again. In
- * this case, the second extxpnet free is what triggers the duplicate and as
- * such the transaction IDs should match. Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous. This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions. In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing.  When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree. If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log.  Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- *      Thread 1        Thread 2        Thread 3        xfslogd
- *      xact alloc
- *      free X
- *      mark busy
- *      commit xact
- *      free xact
- *                      xact alloc
- *                      alloc X
- *                      busy search
- *                      mark xact sync
- *                      commit xact
- *                      free xact
- *                      force log
- *                      checkpoint starts
- *                      ....
- *                                      xact alloc
- *                                      free X
- *                                      mark busy
- *                                      finds match
- *                                      *** KABOOM! ***
- *                                      ....
- *                                                      log IO completes
- *                                                      unbusy X
- *                      checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken. Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transaction, as delayed logging results in
- * the busy extent and transaction lifecycles being different. i.e. the busy
- * extent is active for a lot longer than the transaction.  Hence the
- * transaction structure can be freed and reallocated, then mark the same
- * extent busy again in the new transaction. In this case the new transaction
- * will have a different tid but can have the same address, and hence we need
- * to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence. This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint. If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
 void
 xfs_alloc_busy_insert(
        struct xfs_trans        *tp,
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert(
        struct xfs_busy_extent  *busyp;
        struct xfs_perag        *pag;
        struct rb_node          **rbp;
-        struct rb_node          *parent;
+        struct rb_node          *parent = NULL;
-        int                     match;
        new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
        if (!new) {
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert(
                 * block, make this a synchronous transaction to insure that
                 * the block is not reused before this transaction commits.
                 */
-                trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+                trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
                xfs_trans_set_sync(tp);
                return;
        }
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert(
        new->agno = agno;
        new->bno = bno;
        new->length = len;
-        new->tid = xfs_log_get_trans_ident(tp);
        INIT_LIST_HEAD(&new->list);
        /* trace before insert to be able to see failed inserts */
-        trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+        trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
        pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
-        parent = NULL;
+        while (*rbp) {
-        busyp = NULL;
-        match = 0;
-        while (*rbp && match >= 0) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
                if (new->bno < busyp->bno) {
-                        /* may overlap, but exact start block is lower */
                        rbp = &(*rbp)->rb_left;
-                        if (new->bno + new->length > busyp->bno)
+                        ASSERT(new->bno + new->length <= busyp->bno);
-                                match = busyp->tid == new->tid ? 1 : -1;
                } else if (new->bno > busyp->bno) {
-                        /* may overlap, but exact start block is higher */
                        rbp = &(*rbp)->rb_right;
-                        if (bno < busyp->bno + busyp->length)
+                        ASSERT(bno >= busyp->bno + busyp->length);
-                                match = busyp->tid == new->tid ? 1 : -1;
                } else {
-                        match = busyp->tid == new->tid ? 1 : -1;
+                        ASSERT(0);
-                        break;
                }
        }
-        if (match < 0) {
-                /* overlap marked busy in different transaction */
-                spin_unlock(&pag->pagb_lock);
-                xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
-                goto restart;
-        }
-        if (match > 0) {
-                /*
-                 * overlap marked busy in same transaction. Update if exact
-                 * start block match, otherwise combine the busy extents into
-                 * a single range.
-                 */
-                if (busyp->bno == new->bno) {
-                        busyp->length = max(busyp->length, new->length);
-                        spin_unlock(&pag->pagb_lock);
-                        ASSERT(tp->t_flags & XFS_TRANS_SYNC);
-                        xfs_perag_put(pag);
-                        kmem_free(new);
-                        return;
-                }
-                rb_erase(&busyp->rb_node, &pag->pagb_tree);
-                new->length = max(busyp->bno + busyp->length,
-                                        new->bno + new->length) -
-                                min(busyp->bno, new->bno);
-                new->bno = min(busyp->bno, new->bno);
-        } else
-                busyp = NULL;
        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2532,6 @@ restart:
        list_add(&new->list, &tp->t_busy);
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
-        kmem_free(busyp);
 }
 /*
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search(
                }
        }
        spin_unlock(&pag->pagb_lock);
-        trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
        xfs_perag_put(pag);
        return match;
 }
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent.  If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation.  We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Must be called with pag->pagb_lock held.  When the log has to be forced
+ * the lock is dropped around the force and retaken before returning.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+        struct xfs_mount        *mp,
+        struct xfs_perag        *pag,
+        struct xfs_busy_extent  *busyp,
+        xfs_agblock_t           fbno,
+        xfs_extlen_t            flen,
+        bool                    userdata)
+{
+        xfs_agblock_t           fend = fbno + flen;
+        xfs_agblock_t           bbno = busyp->bno;
+        xfs_agblock_t           bend = bbno + busyp->length;
+        /*
+         * If there is a busy extent overlapping a user allocation, we have
+         * no choice but to force the log and retry the search.
+         *
+         * Fortunately this does not happen during normal operation, but
+         * only if the filesystem is very low on space and has to dip into
+         * the AGFL for normal allocations.
+         */
+        if (userdata)
+                goto out_force_log;
+        if (bbno < fbno && bend > fend) {
+                /*
+                 * Case 1:
+                 *    bbno           bend
+                 *    +BBBBBBBBBBBBBBBBB+
+                 *        +---------+
+                 *        fbno   fend
+                 */
+                /*
+                 * We would have to split the busy extent to be able to track
+                 * it correctly, which we cannot do because we would have to
+                 * modify the list of busy extents attached to the transaction
+                 * or CIL context, which is immutable.
+                 *
+                 * Force out the log to clear the busy extent and retry the
+                 * search.
+                 */
+                goto out_force_log;
+        } else if (bbno >= fbno && bend <= fend) {
+                /*
+                 * Case 2:
+                 *    bbno           bend
+                 *    +BBBBBBBBBBBBBBBBB+
+                 *    +-----------------+
+                 *    fbno           fend
+                 *
+                 * Case 3:
+                 *    bbno           bend
+                 *    +BBBBBBBBBBBBBBBBB+
+                 *    +--------------------------+
+                 *    fbno                    fend
+                 *
+                 * Case 4:
+                 *             bbno           bend
+                 *             +BBBBBBBBBBBBBBBBB+
+                 *    +--------------------------+
+                 *    fbno                    fend
+                 *
+                 * Case 5:
+                 *             bbno           bend
+                 *             +BBBBBBBBBBBBBBBBB+
+                 *    +-----------------------------------+
+                 *    fbno                             fend
+                 *
+                 */
+                /*
+                 * The busy extent is fully covered by the extent we are
+                 * allocating, and can simply be removed from the rbtree.
+                 * However we cannot remove it from the immutable list
+                 * tracking busy extents in the transaction or CIL context,
+                 * so set the length to zero to mark it invalid.
+                 *
+                 * We also need to restart the busy extent search from the
+                 * tree root, because erasing the node can rearrange the
+                 * tree topology.
+                 */
+                rb_erase(&busyp->rb_node, &pag->pagb_tree);
+                busyp->length = 0;
+                return false;
+        } else if (fend < bend) {
+                /*
+                 * Case 6:
+                 *              bbno           bend
+                 *             +BBBBBBBBBBBBBBBBB+
+                 *             +---------+
+                 *             fbno   fend
+                 *
+                 * Case 7:
+                 *             bbno           bend
+                 *             +BBBBBBBBBBBBBBBBB+
+                 *    +------------------+
+                 *    fbno            fend
+                 *
+                 * Only the tail [fend, bend) stays busy.  The length has to
+                 * shrink together with the start block moving up, otherwise
+                 * the busy range would extend past the original bend.
+                 */
+                busyp->bno = fend;
+                busyp->length = bend - fend;
+        } else if (bbno < fbno) {
+                /*
+                 * Case 8:
+                 *    bbno           bend
+                 *    +BBBBBBBBBBBBBBBBB+
+                 *        +-------------+
+                 *        fbno       fend
+                 *
+                 * Case 9:
+                 *    bbno           bend
+                 *    +BBBBBBBBBBBBBBBBB+
+                 *        +----------------------+
+                 *        fbno                fend
+                 *
+                 * Only the head [bbno, fbno) stays busy.
+                 */
+                busyp->length = fbno - busyp->bno;
+        } else {
+                ASSERT(0);
+        }
+        trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+        return true;
+out_force_log:
+        /* The log force may block, so it is issued without the spinlock. */
+        spin_unlock(&pag->pagb_lock);
+        xfs_log_force(mp, XFS_LOG_SYNC);
+        trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+        spin_lock(&pag->pagb_lock);
+        return false;
+}
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ *
+ * Walks the per-AG busy extent rbtree under pag->pagb_lock and passes every
+ * busy extent that overlaps [fbno, fbno + flen) to
+ * xfs_alloc_busy_update_extent().  Whenever that helper returns false the
+ * walk starts over from the tree root.
+ */
 void
-xfs_alloc_busy_clear(
+xfs_alloc_busy_reuse(
        struct xfs_mount        *mp,
-        struct xfs_busy_extent  *busyp)
+        xfs_agnumber_t          agno,
+        xfs_agblock_t           fbno,
+        xfs_extlen_t            flen,
+        bool                    userdata)
 {
        struct xfs_perag        *pag;
+        struct rb_node          *rbp;
-        trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+        ASSERT(flen > 0);
-                                                busyp->length);
-        ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+        pag = xfs_perag_get(mp, agno);
-                                                busyp->length) == 1);
+        spin_lock(&pag->pagb_lock);
+restart:
+        rbp = pag->pagb_tree.rb_node;
+        while (rbp) {
+                struct xfs_busy_extent *busyp =
+                        rb_entry(rbp, struct xfs_busy_extent, rb_node);
+                xfs_agblock_t   bbno = busyp->bno;
+                xfs_agblock_t   bend = bbno + busyp->length;
-        list_del_init(&busyp->list);
+                /* No overlap with this node: keep descending the tree. */
+                if (fbno + flen <= bbno) {
+                        rbp = rbp->rb_left;
+                        continue;
+                } else if (fbno >= bend) {
+                        rbp = rbp->rb_right;
+                        continue;
+                }
+                /*
+                 * Overlap found.  A false return means the helper erased the
+                 * node or dropped the lock to force the log, so the walk has
+                 * to begin again at the root.
+                 */
-        pag = xfs_perag_get(mp, busyp->agno);
+                if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
-        spin_lock(&pag->pagb_lock);
+                                                  userdata))
-        rb_erase(&busyp->rb_node, &pag->pagb_tree);
+                        goto restart;
+        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
+}
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy.  If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+        struct xfs_alloc_arg    *args,
+        xfs_agblock_t           bno,
+        xfs_extlen_t            len,
+        xfs_agblock_t           *rbno,
+        xfs_extlen_t            *rlen)
+{
+        xfs_agblock_t           fbno;
+        xfs_extlen_t            flen;
+        struct rb_node          *rbp;
+        ASSERT(len > 0);
+        spin_lock(&args->pag->pagb_lock);
+restart:
+        fbno = bno;
+        flen = len;
+        rbp = args->pag->pagb_tree.rb_node;
+        while (rbp && flen >= args->minlen) {
+                struct xfs_busy_extent *busyp =
+                        rb_entry(rbp, struct xfs_busy_extent, rb_node);
+                xfs_agblock_t   fend = fbno + flen;
+                xfs_agblock_t   bbno = busyp->bno;
+                xfs_agblock_t   bend = bbno + busyp->length;
+                if (fend <= bbno) {
+                        rbp = rbp->rb_left;
+                        continue;
+                } else if (fbno >= bend) {
+                        rbp = rbp->rb_right;
+                        continue;
+                }
+                /*
+                 * If this is a metadata allocation, try to reuse the busy
+                 * extent instead of trimming the allocation.
+                 */
+                if (!args->userdata) {
+                        if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+                                                          busyp, fbno, flen,
+                                                          false))
+                                goto restart;
+                        continue;
+                }
+                if (bbno <= fbno) {
+                        /* start overlap */
+                        /*
+                         * Case 1:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *        +---------+
+                         *        fbno   fend
+                         *
+                         * Case 2:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *    +-------------+
+                         *    fbno       fend
+                         *
+                         * Case 3:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *        +-------------+
+                         *        fbno       fend
+                         *
+                         * Case 4:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *    +-----------------+
+                         *    fbno           fend
+                         *
+                         * No unbusy region in extent, return failure.
+                         */
+                        if (fend <= bend)
+                                goto fail;
+                        /*
+                         * Case 5:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *        +----------------------+
+                         *        fbno                fend
+                         *
+                         * Case 6:
+                         *    bbno           bend
+                         *    +BBBBBBBBBBBBBBBBB+
+                         *    +--------------------------+
+                         *    fbno                    fend
+                         *
+                         * Needs to be trimmed to:
+                         *                       +-------+
+                         *                       fbno fend
+                         */
+                        fbno = bend;
+                } else if (bend >= fend) {
+                        /* end overlap */
+                        /*
+                         * Case 7:
+                         *             bbno           bend
+                         *             +BBBBBBBBBBBBBBBBB+
+                         *    +------------------+
+                         *    fbno            fend
+                         *
+                         * Case 8:
+                         *             bbno           bend
+                         *             +BBBBBBBBBBBBBBBBB+
+                         *    +--------------------------+
+                         *    fbno                    fend
+                         *
+                         * Needs to be trimmed to:
+                         *    +-------+
+                         *    fbno fend
+                         */
+                        fend = bbno;
+                } else {
+                        /* middle overlap */
+                        /*
+                         * Case 9:
+                         *             bbno           bend
+                         *             +BBBBBBBBBBBBBBBBB+
+                         *    +-----------------------------------+
+                         *    fbno                             fend
+                         *
+                         * Can be trimmed to:
+                         *    +-------+        OR         +-------+
+                         *    fbno fend                   fbno fend
+                         *
+                         * Backward allocation leads to significant
+                         * fragmentation of directories, which degrades
+                         * directory performance, therefore we always want to
+                         * choose the option that produces forward allocation
+                         * patterns.
+                         * Preferring the lower bno extent will make the next
+                         * request use "fend" as the start of the next
+                         * allocation;  if the segment is no longer busy at
+                         * that point, we'll get a contiguous allocation, but
+                         * even if it is still busy, we will get a forward
+                         * allocation.
+                         * We try to avoid choosing the segment at "bend",
+                         * because that can lead to the next allocation
+                         * taking the segment at "fbno", which would be a
+                         * backward allocation.  We only use the segment at
+                         * "bend" if it is much larger than the current
+                         * requested size, because in that case there's a
+                         * good chance subsequent allocations will be
+                         * contiguous.
+                         */
+                        if (bbno - fbno >= args->maxlen) {
+                                /* left candidate fits perfectly */
+                                fend = bbno;
+                        } else if (fend - bend >= args->maxlen * 4) {
+                                /* right candidate has enough free space */
+                                fbno = bend;
+                        } else if (bbno - fbno >= args->minlen) {
+                                /* left candidate fits minimum requirement */
+                                fend = bbno;
+                        } else {
+                                goto fail;
+                        }
+                }
+                flen = fend - fbno;
+        }
+        spin_unlock(&args->pag->pagb_lock);
+        if (fbno != bno || flen != len) {
+                trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+                                          fbno, flen);
+        }
+        *rbno = fbno;
+        *rlen = flen;
+        return;
+fail:
+        /*
+         * Return a zero extent length as failure indication.  All callers
+         * re-check if the trimmed extent satisfies the minlen requirement.
+         */
+        spin_unlock(&args->pag->pagb_lock);
+        trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+        *rbno = fbno;
+        *rlen = 0;
+}
+static void
+xfs_alloc_busy_clear_one(
+        struct xfs_mount        *mp,    /* only passed to the tracepoint */
+        struct xfs_perag        *pag,   /* perag whose pagb_tree holds busyp */
+        struct xfs_busy_extent  *busyp) /* entry to unlink and free */
+{
+        if (busyp->length) {    /* zero-length entries skip the rbtree removal */
+                trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+                                                busyp->length);
+                rb_erase(&busyp->rb_node, &pag->pagb_tree);
+        }
+        list_del_init(&busyp->list);    /* always taken off its list */
         kmem_free(busyp);      /* busyp must not be touched after this */
 }
+void
+xfs_alloc_busy_clear(
+        struct xfs_mount        *mp,
+        struct list_head        *list)  /* busy extents linked through ->list */
+{
+        struct xfs_busy_extent  *busyp, *n;
+        struct xfs_perag        *pag = NULL;    /* perag whose pagb_lock is held */
+        xfs_agnumber_t          agno = NULLAGNUMBER;    /* AG number of pag */
+        list_for_each_entry_safe(busyp, n, list, list) {
+                if (busyp->agno != agno) {      /* entry is in a different AG */
+                        if (pag) {              /* let go of the previous AG */
+                                spin_unlock(&pag->pagb_lock);
+                                xfs_perag_put(pag);
+                        }
+                        pag = xfs_perag_get(mp, busyp->agno);
+                        spin_lock(&pag->pagb_lock);
+                        agno = busyp->agno;
+                }
+                xfs_alloc_busy_clear_one(mp, pag, busyp); /* frees busyp */
+        }
+        if (pag) {      /* release the last AG; NULL if the list was empty */
+                spin_unlock(&pag->pagb_lock);
+                xfs_perag_put(pag);
+        }
+}
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ *
+ * Returns a negative value, zero or a positive value when the AG number of
+ * "a" is lower than, equal to or higher than that of "b".  The AG numbers
+ * are compared explicitly rather than subtracted, so the sign of the result
+ * cannot be corrupted when the difference does not fit in an int.
+ */
+int
+xfs_busy_extent_ag_cmp(
+        void                    *priv,
+        struct list_head        *a,
+        struct list_head        *b)
+{
+        xfs_agnumber_t          agno_a, agno_b;
+        agno_a = container_of(a, struct xfs_busy_extent, list)->agno;
+        agno_b = container_of(b, struct xfs_busy_extent, list)->agno;
+        if (agno_a < agno_b)
+                return -1;
+        if (agno_a > agno_b)
+                return 1;
+        return 0;
+}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index d0b3bc72005b..240ad288f2f9 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -140,11 +140,24 @@ xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
        xfs_agblock_t bno, xfs_extlen_t len);
 void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
 int
 xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
        xfs_agblock_t bno, xfs_extlen_t len);
+void
+xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+        xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+int
+xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+static inline void xfs_alloc_busy_sort(struct list_head *list)
+{
+        list_sort(NULL, list, xfs_busy_extent_ag_cmp);  /* order entries by AG */
+}
 #endif  /* __KERNEL__ */
 /*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3916925e2584..8b469d53599f 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
                return 0;
        }
+        xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
        xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
@@ -118,17 +120,6 @@ xfs_allocbt_free_block(
        if (error)
                return error;
-        /*
-         * Since blocks move to the free list without the coordination used in
-         * xfs_bmap_finish, we can't allow block to be available for
-         * reallocation and non-transaction writing (user data) until we know
-         * that the transaction that moved it to the free list is permanently
-         * on disk. We track the blocks by declaring these blocks as "busy";
-         * the busy list is maintained on a per-ag basis and each transaction
-         * records which entries should be removed when the iclog commits to
-         * disk. If a busy block is allocated, the iclog is pushed up to the
-         * LSN that freed the block.
-         */
        xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
        xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index be628677c288..9a84a85c03b1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
        xfs_inode_t     *tip,   /* tmp inode */
        xfs_swapext_t   *sxp)
 {
-        xfs_mount_t     *mp;
+        xfs_mount_t     *mp = ip->i_mount;
        xfs_trans_t     *tp;
        xfs_bstat_t     *sbp = &sxp->sx_stat;
        xfs_ifork_t     *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
        int             taforkblks = 0;
        __uint64_t      tmp;
-        mp = ip->i_mount;
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
                error = XFS_ERROR(ENOMEM);
                goto out;
        }
-        sbp = &sxp->sx_stat;
        /*
         * we have to do two separate lock calls here to keep lockdep
         * happy. If we try to get all the locks in one call, lock will
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d11ce613d692..c8e3349c287c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1354,7 +1354,7 @@ xfs_itruncate_start(
                return 0;
        }
        last_byte = xfs_file_last_byte(ip);
-        trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
+        trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
        if (last_byte > toss_start) {
                if (flags & XFS_ITRUNC_DEFINITE) {
                        xfs_tosspages(ip, toss_start,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 576fdfe81d60..09983a3344a5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -970,7 +970,6 @@ xfs_iflush_abort(
 {
        xfs_inode_log_item_t    *iip = ip->i_itemp;
-        iip = ip->i_itemp;
        if (iip) {
                struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b612ce4520ae..211930246f20 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
        xlog_cil_destroy(log);
+        /*
+         * always need to ensure that the extra buffer does not point to memory
+         * owned by another log buffer before we free it.
+         */
+        xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+        xfs_buf_free(log->l_xbuf);
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
                xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
        }
        spinlock_destroy(&log->l_icloglock);
-        xfs_buf_free(log->l_xbuf);
        log->l_mp->m_log = NULL;
        kmem_free(log);
 }       /* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
        return ticket;
 }
-xlog_tid_t
-xfs_log_get_trans_ident(
-        struct xfs_trans        *tp)
-{
-        return tp->t_ticket->t_tid;
-}
 /*
 * Allocate and initialise a new log ticket.
 */
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3bd3291ef8d2..78c9039994af 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,8 +189,6 @@ void	  xlog_iodone(struct xfs_buf *);
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void      xfs_log_ticket_put(struct xlog_ticket *ticket);
-xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
 void    xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
                                struct xfs_log_vec *log_vector,
                                xfs_lsn_t *commit_lsn, int flags);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ca59be08977..7d56e88a3f0e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -361,13 +361,12 @@ xlog_cil_committed(
        int     abort)
 {
        struct xfs_cil_ctx      *ctx = args;
-        struct xfs_busy_extent  *busyp, *n;
        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
                                        ctx->start_lsn, abort);
-        list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+        xfs_alloc_busy_sort(&ctx->busy_extents);
-                xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+        xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
        spin_lock(&ctx->cil->xc_cil_lock);
        list_del(&ctx->committing);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 5864850e9e34..2d3b6a498d63 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
                                           shutdown */
 #define XLOG_TAIL_WARN          0x10    /* log tail verify warning issued */
+typedef __uint32_t xlog_tid_t;
 #ifdef __KERNEL__
 /*
 * Below are states for covering allocation transactions.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5cc464a17c93..04142caedb2b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -205,6 +205,35 @@ xlog_bread(
 }
 /*
+ * Read at an offset into the buffer. Returns with the buffer in its original
+ * state regardless of the result of the read.
+ *
+ * The buffer's current pointer and length are saved, the buffer is pointed at
+ * "offset" for BBTOB(nbblks) bytes while the read runs, and the saved values
+ * are then put back.  A read error is reported in preference to an error from
+ * restoring the pointer.
+ */
+STATIC int
+xlog_bread_offset(
+        xlog_t          *log,
+        xfs_daddr_t     blk_no,         /* block to read from */
+        int             nbblks,         /* blocks to read */
+        xfs_buf_t       *bp,
+        xfs_caddr_t     offset)         /* address given to XFS_BUF_SET_PTR */
+{
+        xfs_caddr_t     orig_offset = XFS_BUF_PTR(bp);
+        int             orig_len = bp->b_buffer_length;
+        int             error, error2;
+        error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+        if (error)
+                return error;
+        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+        /* must reset buffer pointer even on error */
+        error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+        if (error)              /* the read failure wins over error2 */
+                return error;
+        return error2;
+}
+/*
 * Write out the buffer at the given block for the given number of blocks.
 * The buffer is kept locked across the write and is returned locked.
 * This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
                 */
                ealign = round_down(end_block, sectbb);
                if (j == 0 && (start_block + endcount > ealign)) {
-                        offset = XFS_BUF_PTR(bp);
+                        offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
-                        balign = BBTOB(ealign - start_block);
+                        error = xlog_bread_offset(log, ealign, sectbb,
-                        error = XFS_BUF_SET_PTR(bp, offset + balign,
+                                                        bp, offset);
-                                                BBTOB(sectbb));
                        if (error)
                                break;
-                        error = xlog_bread_noalign(log, ealign, sectbb, bp);
-                        if (error)
-                                break;
-                        error = XFS_BUF_SET_PTR(bp, offset, bufblks);
-                        if (error)
-                                break;
                }
                offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
                                 *   - order is important.
                                 */
                                wrapped_hblks = hblks - split_hblks;
-                                error = XFS_BUF_SET_PTR(hbp,
+                                error = xlog_bread_offset(log, 0,
-                                                offset + BBTOB(split_hblks),
+                                                wrapped_hblks, hbp,
-                                                BBTOB(hblks - split_hblks));
+                                                offset + BBTOB(split_hblks));
-                                if (error)
-                                        goto bread_err2;
-                                error = xlog_bread_noalign(log, 0,
-                                                           wrapped_hblks, hbp);
-                                if (error)
-                                        goto bread_err2;
-                                error = XFS_BUF_SET_PTR(hbp, offset,
-                                                        BBTOB(hblks));
                                if (error)
                                        goto bread_err2;
                        }
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
                                  *   _first_, then the log start (LR header end)
                                  *   - order is important.
                                  */
-                                error = XFS_BUF_SET_PTR(dbp,
+                                error = xlog_bread_offset(log, 0,
-                                                offset + BBTOB(split_bblks),
+                                                bblks - split_bblks, dbp,
-                                                BBTOB(bblks - split_bblks));
+                                                offset + BBTOB(split_bblks));
-                                if (error)
-                                        goto bread_err2;
-                                error = xlog_bread_noalign(log, wrapped_hblks,
-                                                bblks - split_bblks,
-                                                dbp);
-                                if (error)
-                                        goto bread_err2;
-                                error = XFS_BUF_SET_PTR(dbp, offset, h_size);
                                if (error)
                                        goto bread_err2;
                        }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb3f9a7b24ed..b49b82363d20 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
        uint                    nmsb,
        int                     rsvd)
 {
-        xfs_mod_sb_t            *msbp = &msb[0];
+        xfs_mod_sb_t            *msbp;
        int                     error = 0;
        /*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
         * changes will be atomic.
         */
        spin_lock(&mp->m_sb_lock);
-        for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
+        for (msbp = msb; msbp < (msb + nmsb); msbp++) {
                ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
                       msbp->msb_field > XFS_SBS_FDBLOCKS);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 76922793f64f..d1f24858ccc4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
 xfs_trans_free(
        struct xfs_trans        *tp)
 {
-        struct xfs_busy_extent  *busyp, *n;
+        xfs_alloc_busy_sort(&tp->t_busy);
+        xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
-        list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
-                xfs_alloc_busy_clear(tp->t_mountp, busyp);
        atomic_dec(&tp->t_mountp->m_active_trans);
        xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 26d1867d8156..65584b55607d 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef __uint32_t      xfs_dablk_t;    /* dir/attr block number (in file) */
 typedef __uint32_t      xfs_dahash_t;   /* dir/attr hash value */
-typedef __uint32_t      xlog_tid_t;     /* transaction ID type */
 /*
 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
 * Disk based types: