diff options
Diffstat (limited to 'fs')
47 files changed, 3158 insertions, 1047 deletions
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 397d3057d33..1bffbe0ed77 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c | |||
| @@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs) | |||
| 820 | int res; | 820 | int res; |
| 821 | char buf[16]; | 821 | char buf[16]; |
| 822 | 822 | ||
| 823 | memset(&bprm, 0, sizeof(bprm)); | ||
| 824 | |||
| 823 | /* Create the file name */ | 825 | /* Create the file name */ |
| 824 | sprintf(buf, "/lib/lib%d.so", id); | 826 | sprintf(buf, "/lib/lib%d.so", id); |
| 825 | 827 | ||
| @@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs) | |||
| 835 | if (!bprm.cred) | 837 | if (!bprm.cred) |
| 836 | goto out; | 838 | goto out; |
| 837 | 839 | ||
| 840 | /* We don't really care about recalculating credentials at this point | ||
| 841 | * as we're past the point of no return and are dealing with shared | ||
| 842 | * libraries. | ||
| 843 | */ | ||
| 844 | bprm.cred_prepared = 1; | ||
| 845 | |||
| 838 | res = prepare_binprm(&bprm); | 846 | res = prepare_binprm(&bprm); |
| 839 | 847 | ||
| 840 | if (!IS_ERR_VALUE(res)) | 848 | if (!IS_ERR_VALUE(res)) |
diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 0d329ff8ed4..9b026ea8baa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c | |||
| @@ -100,6 +100,7 @@ struct dlm_cluster { | |||
| 100 | unsigned int cl_log_debug; | 100 | unsigned int cl_log_debug; |
| 101 | unsigned int cl_protocol; | 101 | unsigned int cl_protocol; |
| 102 | unsigned int cl_timewarn_cs; | 102 | unsigned int cl_timewarn_cs; |
| 103 | unsigned int cl_waitwarn_us; | ||
| 103 | }; | 104 | }; |
| 104 | 105 | ||
| 105 | enum { | 106 | enum { |
| @@ -114,6 +115,7 @@ enum { | |||
| 114 | CLUSTER_ATTR_LOG_DEBUG, | 115 | CLUSTER_ATTR_LOG_DEBUG, |
| 115 | CLUSTER_ATTR_PROTOCOL, | 116 | CLUSTER_ATTR_PROTOCOL, |
| 116 | CLUSTER_ATTR_TIMEWARN_CS, | 117 | CLUSTER_ATTR_TIMEWARN_CS, |
| 118 | CLUSTER_ATTR_WAITWARN_US, | ||
| 117 | }; | 119 | }; |
| 118 | 120 | ||
| 119 | struct cluster_attribute { | 121 | struct cluster_attribute { |
| @@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1); | |||
| 166 | CLUSTER_ATTR(log_debug, 0); | 168 | CLUSTER_ATTR(log_debug, 0); |
| 167 | CLUSTER_ATTR(protocol, 0); | 169 | CLUSTER_ATTR(protocol, 0); |
| 168 | CLUSTER_ATTR(timewarn_cs, 1); | 170 | CLUSTER_ATTR(timewarn_cs, 1); |
| 171 | CLUSTER_ATTR(waitwarn_us, 0); | ||
| 169 | 172 | ||
| 170 | static struct configfs_attribute *cluster_attrs[] = { | 173 | static struct configfs_attribute *cluster_attrs[] = { |
| 171 | [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, | 174 | [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, |
| @@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = { | |||
| 179 | [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, | 182 | [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, |
| 180 | [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, | 183 | [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, |
| 181 | [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, | 184 | [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, |
| 185 | [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, | ||
| 182 | NULL, | 186 | NULL, |
| 183 | }; | 187 | }; |
| 184 | 188 | ||
| @@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g, | |||
| 439 | cl->cl_log_debug = dlm_config.ci_log_debug; | 443 | cl->cl_log_debug = dlm_config.ci_log_debug; |
| 440 | cl->cl_protocol = dlm_config.ci_protocol; | 444 | cl->cl_protocol = dlm_config.ci_protocol; |
| 441 | cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; | 445 | cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; |
| 446 | cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; | ||
| 442 | 447 | ||
| 443 | space_list = &sps->ss_group; | 448 | space_list = &sps->ss_group; |
| 444 | comm_list = &cms->cs_group; | 449 | comm_list = &cms->cs_group; |
| @@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) | |||
| 986 | #define DEFAULT_LOG_DEBUG 0 | 991 | #define DEFAULT_LOG_DEBUG 0 |
| 987 | #define DEFAULT_PROTOCOL 0 | 992 | #define DEFAULT_PROTOCOL 0 |
| 988 | #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ | 993 | #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ |
| 994 | #define DEFAULT_WAITWARN_US 0 | ||
| 989 | 995 | ||
| 990 | struct dlm_config_info dlm_config = { | 996 | struct dlm_config_info dlm_config = { |
| 991 | .ci_tcp_port = DEFAULT_TCP_PORT, | 997 | .ci_tcp_port = DEFAULT_TCP_PORT, |
| @@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = { | |||
| 998 | .ci_scan_secs = DEFAULT_SCAN_SECS, | 1004 | .ci_scan_secs = DEFAULT_SCAN_SECS, |
| 999 | .ci_log_debug = DEFAULT_LOG_DEBUG, | 1005 | .ci_log_debug = DEFAULT_LOG_DEBUG, |
| 1000 | .ci_protocol = DEFAULT_PROTOCOL, | 1006 | .ci_protocol = DEFAULT_PROTOCOL, |
| 1001 | .ci_timewarn_cs = DEFAULT_TIMEWARN_CS | 1007 | .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, |
| 1008 | .ci_waitwarn_us = DEFAULT_WAITWARN_US | ||
| 1002 | }; | 1009 | }; |
| 1003 | 1010 | ||
diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 4f1d6fce58c..dd0ce24d5a8 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h | |||
| @@ -28,6 +28,7 @@ struct dlm_config_info { | |||
| 28 | int ci_log_debug; | 28 | int ci_log_debug; |
| 29 | int ci_protocol; | 29 | int ci_protocol; |
| 30 | int ci_timewarn_cs; | 30 | int ci_timewarn_cs; |
| 31 | int ci_waitwarn_us; | ||
| 31 | }; | 32 | }; |
| 32 | 33 | ||
| 33 | extern struct dlm_config_info dlm_config; | 34 | extern struct dlm_config_info dlm_config; |
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index b9420491301..0262451eb9c 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h | |||
| @@ -209,6 +209,7 @@ struct dlm_args { | |||
| 209 | #define DLM_IFL_WATCH_TIMEWARN 0x00400000 | 209 | #define DLM_IFL_WATCH_TIMEWARN 0x00400000 |
| 210 | #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 | 210 | #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 |
| 211 | #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 | 211 | #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 |
| 212 | #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ | ||
| 212 | #define DLM_IFL_USER 0x00000001 | 213 | #define DLM_IFL_USER 0x00000001 |
| 213 | #define DLM_IFL_ORPHAN 0x00000002 | 214 | #define DLM_IFL_ORPHAN 0x00000002 |
| 214 | 215 | ||
| @@ -245,6 +246,7 @@ struct dlm_lkb { | |||
| 245 | 246 | ||
| 246 | int8_t lkb_wait_type; /* type of reply waiting for */ | 247 | int8_t lkb_wait_type; /* type of reply waiting for */ |
| 247 | int8_t lkb_wait_count; | 248 | int8_t lkb_wait_count; |
| 249 | int lkb_wait_nodeid; /* for debugging */ | ||
| 248 | 250 | ||
| 249 | struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ | 251 | struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ |
| 250 | struct list_head lkb_statequeue; /* rsb g/c/w list */ | 252 | struct list_head lkb_statequeue; /* rsb g/c/w list */ |
| @@ -254,6 +256,7 @@ struct dlm_lkb { | |||
| 254 | struct list_head lkb_ownqueue; /* list of locks for a process */ | 256 | struct list_head lkb_ownqueue; /* list of locks for a process */ |
| 255 | struct list_head lkb_time_list; | 257 | struct list_head lkb_time_list; |
| 256 | ktime_t lkb_timestamp; | 258 | ktime_t lkb_timestamp; |
| 259 | ktime_t lkb_wait_time; | ||
| 257 | unsigned long lkb_timeout_cs; | 260 | unsigned long lkb_timeout_cs; |
| 258 | 261 | ||
| 259 | struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; | 262 | struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; |
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 56d6bfcc1e4..f71d0b5abd9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
| @@ -799,10 +799,84 @@ static int msg_reply_type(int mstype) | |||
| 799 | return -1; | 799 | return -1; |
| 800 | } | 800 | } |
| 801 | 801 | ||
| 802 | static int nodeid_warned(int nodeid, int num_nodes, int *warned) | ||
| 803 | { | ||
| 804 | int i; | ||
| 805 | |||
| 806 | for (i = 0; i < num_nodes; i++) { | ||
| 807 | if (!warned[i]) { | ||
| 808 | warned[i] = nodeid; | ||
| 809 | return 0; | ||
| 810 | } | ||
| 811 | if (warned[i] == nodeid) | ||
| 812 | return 1; | ||
| 813 | } | ||
| 814 | return 0; | ||
| 815 | } | ||
| 816 | |||
| 817 | void dlm_scan_waiters(struct dlm_ls *ls) | ||
| 818 | { | ||
| 819 | struct dlm_lkb *lkb; | ||
| 820 | ktime_t zero = ktime_set(0, 0); | ||
| 821 | s64 us; | ||
| 822 | s64 debug_maxus = 0; | ||
| 823 | u32 debug_scanned = 0; | ||
| 824 | u32 debug_expired = 0; | ||
| 825 | int num_nodes = 0; | ||
| 826 | int *warned = NULL; | ||
| 827 | |||
| 828 | if (!dlm_config.ci_waitwarn_us) | ||
| 829 | return; | ||
| 830 | |||
| 831 | mutex_lock(&ls->ls_waiters_mutex); | ||
| 832 | |||
| 833 | list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { | ||
| 834 | if (ktime_equal(lkb->lkb_wait_time, zero)) | ||
| 835 | continue; | ||
| 836 | |||
| 837 | debug_scanned++; | ||
| 838 | |||
| 839 | us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); | ||
| 840 | |||
| 841 | if (us < dlm_config.ci_waitwarn_us) | ||
| 842 | continue; | ||
| 843 | |||
| 844 | lkb->lkb_wait_time = zero; | ||
| 845 | |||
| 846 | debug_expired++; | ||
| 847 | if (us > debug_maxus) | ||
| 848 | debug_maxus = us; | ||
| 849 | |||
| 850 | if (!num_nodes) { | ||
| 851 | num_nodes = ls->ls_num_nodes; | ||
| 852 | warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); | ||
| 853 | if (warned) | ||
| 854 | memset(warned, 0, num_nodes * sizeof(int)); | ||
| 855 | } | ||
| 856 | if (!warned) | ||
| 857 | continue; | ||
| 858 | if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) | ||
| 859 | continue; | ||
| 860 | |||
| 861 | log_error(ls, "waitwarn %x %lld %d us check connection to " | ||
| 862 | "node %d", lkb->lkb_id, (long long)us, | ||
| 863 | dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); | ||
| 864 | } | ||
| 865 | mutex_unlock(&ls->ls_waiters_mutex); | ||
| 866 | |||
| 867 | if (warned) | ||
| 868 | kfree(warned); | ||
| 869 | |||
| 870 | if (debug_expired) | ||
| 871 | log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", | ||
| 872 | debug_scanned, debug_expired, | ||
| 873 | dlm_config.ci_waitwarn_us, (long long)debug_maxus); | ||
| 874 | } | ||
| 875 | |||
| 802 | /* add/remove lkb from global waiters list of lkb's waiting for | 876 | /* add/remove lkb from global waiters list of lkb's waiting for |
| 803 | a reply from a remote node */ | 877 | a reply from a remote node */ |
| 804 | 878 | ||
| 805 | static int add_to_waiters(struct dlm_lkb *lkb, int mstype) | 879 | static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) |
| 806 | { | 880 | { |
| 807 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; | 881 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; |
| 808 | int error = 0; | 882 | int error = 0; |
| @@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype) | |||
| 842 | 916 | ||
| 843 | lkb->lkb_wait_count++; | 917 | lkb->lkb_wait_count++; |
| 844 | lkb->lkb_wait_type = mstype; | 918 | lkb->lkb_wait_type = mstype; |
| 919 | lkb->lkb_wait_time = ktime_get(); | ||
| 920 | lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ | ||
| 845 | hold_lkb(lkb); | 921 | hold_lkb(lkb); |
| 846 | list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); | 922 | list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); |
| 847 | out: | 923 | out: |
| @@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
| 961 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; | 1037 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; |
| 962 | int error; | 1038 | int error; |
| 963 | 1039 | ||
| 964 | if (ms != &ls->ls_stub_ms) | 1040 | if (ms->m_flags != DLM_IFL_STUB_MS) |
| 965 | mutex_lock(&ls->ls_waiters_mutex); | 1041 | mutex_lock(&ls->ls_waiters_mutex); |
| 966 | error = _remove_from_waiters(lkb, ms->m_type, ms); | 1042 | error = _remove_from_waiters(lkb, ms->m_type, ms); |
| 967 | if (ms != &ls->ls_stub_ms) | 1043 | if (ms->m_flags != DLM_IFL_STUB_MS) |
| 968 | mutex_unlock(&ls->ls_waiters_mutex); | 1044 | mutex_unlock(&ls->ls_waiters_mutex); |
| 969 | return error; | 1045 | return error; |
| 970 | } | 1046 | } |
| @@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) | |||
| 1157 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) | 1233 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) |
| 1158 | lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); | 1234 | lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); |
| 1159 | mutex_unlock(&ls->ls_timeout_mutex); | 1235 | mutex_unlock(&ls->ls_timeout_mutex); |
| 1236 | |||
| 1237 | if (!dlm_config.ci_waitwarn_us) | ||
| 1238 | return; | ||
| 1239 | |||
| 1240 | mutex_lock(&ls->ls_waiters_mutex); | ||
| 1241 | list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { | ||
| 1242 | if (ktime_to_us(lkb->lkb_wait_time)) | ||
| 1243 | lkb->lkb_wait_time = ktime_get(); | ||
| 1244 | } | ||
| 1245 | mutex_unlock(&ls->ls_waiters_mutex); | ||
| 1160 | } | 1246 | } |
| 1161 | 1247 | ||
| 1162 | /* lkb is master or local copy */ | 1248 | /* lkb is master or local copy */ |
| @@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 1376 | ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become | 1462 | ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become |
| 1377 | compatible with other granted locks */ | 1463 | compatible with other granted locks */ |
| 1378 | 1464 | ||
| 1379 | static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms) | 1465 | static void munge_demoted(struct dlm_lkb *lkb) |
| 1380 | { | 1466 | { |
| 1381 | if (ms->m_type != DLM_MSG_CONVERT_REPLY) { | ||
| 1382 | log_print("munge_demoted %x invalid reply type %d", | ||
| 1383 | lkb->lkb_id, ms->m_type); | ||
| 1384 | return; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { | 1467 | if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { |
| 1388 | log_print("munge_demoted %x invalid modes gr %d rq %d", | 1468 | log_print("munge_demoted %x invalid modes gr %d rq %d", |
| 1389 | lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); | 1469 | lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); |
| @@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) | |||
| 2844 | struct dlm_mhandle *mh; | 2924 | struct dlm_mhandle *mh; |
| 2845 | int to_nodeid, error; | 2925 | int to_nodeid, error; |
| 2846 | 2926 | ||
| 2847 | error = add_to_waiters(lkb, mstype); | 2927 | to_nodeid = r->res_nodeid; |
| 2928 | |||
| 2929 | error = add_to_waiters(lkb, mstype, to_nodeid); | ||
| 2848 | if (error) | 2930 | if (error) |
| 2849 | return error; | 2931 | return error; |
| 2850 | 2932 | ||
| 2851 | to_nodeid = r->res_nodeid; | ||
| 2852 | |||
| 2853 | error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); | 2933 | error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); |
| 2854 | if (error) | 2934 | if (error) |
| 2855 | goto fail; | 2935 | goto fail; |
| @@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 2880 | /* down conversions go without a reply from the master */ | 2960 | /* down conversions go without a reply from the master */ |
| 2881 | if (!error && down_conversion(lkb)) { | 2961 | if (!error && down_conversion(lkb)) { |
| 2882 | remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); | 2962 | remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); |
| 2963 | r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; | ||
| 2883 | r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; | 2964 | r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; |
| 2884 | r->res_ls->ls_stub_ms.m_result = 0; | 2965 | r->res_ls->ls_stub_ms.m_result = 0; |
| 2885 | r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags; | ||
| 2886 | __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); | 2966 | __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); |
| 2887 | } | 2967 | } |
| 2888 | 2968 | ||
| @@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
| 2951 | struct dlm_mhandle *mh; | 3031 | struct dlm_mhandle *mh; |
| 2952 | int to_nodeid, error; | 3032 | int to_nodeid, error; |
| 2953 | 3033 | ||
| 2954 | error = add_to_waiters(lkb, DLM_MSG_LOOKUP); | 3034 | to_nodeid = dlm_dir_nodeid(r); |
| 3035 | |||
| 3036 | error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); | ||
| 2955 | if (error) | 3037 | if (error) |
| 2956 | return error; | 3038 | return error; |
| 2957 | 3039 | ||
| 2958 | to_nodeid = dlm_dir_nodeid(r); | ||
| 2959 | |||
| 2960 | error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); | 3040 | error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); |
| 2961 | if (error) | 3041 | if (error) |
| 2962 | goto fail; | 3042 | goto fail; |
| @@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
| 3070 | 3150 | ||
| 3071 | static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) | 3151 | static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) |
| 3072 | { | 3152 | { |
| 3153 | if (ms->m_flags == DLM_IFL_STUB_MS) | ||
| 3154 | return; | ||
| 3155 | |||
| 3073 | lkb->lkb_sbflags = ms->m_sbflags; | 3156 | lkb->lkb_sbflags = ms->m_sbflags; |
| 3074 | lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | | 3157 | lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | |
| 3075 | (ms->m_flags & 0x0000FFFF); | 3158 | (ms->m_flags & 0x0000FFFF); |
| @@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, | |||
| 3612 | /* convert was queued on remote master */ | 3695 | /* convert was queued on remote master */ |
| 3613 | receive_flags_reply(lkb, ms); | 3696 | receive_flags_reply(lkb, ms); |
| 3614 | if (is_demoted(lkb)) | 3697 | if (is_demoted(lkb)) |
| 3615 | munge_demoted(lkb, ms); | 3698 | munge_demoted(lkb); |
| 3616 | del_lkb(r, lkb); | 3699 | del_lkb(r, lkb); |
| 3617 | add_lkb(r, lkb, DLM_LKSTS_CONVERT); | 3700 | add_lkb(r, lkb, DLM_LKSTS_CONVERT); |
| 3618 | add_timeout(lkb); | 3701 | add_timeout(lkb); |
| @@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, | |||
| 3622 | /* convert was granted on remote master */ | 3705 | /* convert was granted on remote master */ |
| 3623 | receive_flags_reply(lkb, ms); | 3706 | receive_flags_reply(lkb, ms); |
| 3624 | if (is_demoted(lkb)) | 3707 | if (is_demoted(lkb)) |
| 3625 | munge_demoted(lkb, ms); | 3708 | munge_demoted(lkb); |
| 3626 | grant_lock_pc(r, lkb, ms); | 3709 | grant_lock_pc(r, lkb, ms); |
| 3627 | queue_cast(r, lkb, 0); | 3710 | queue_cast(r, lkb, 0); |
| 3628 | break; | 3711 | break; |
| @@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) | |||
| 3996 | dlm_put_lockspace(ls); | 4079 | dlm_put_lockspace(ls); |
| 3997 | } | 4080 | } |
| 3998 | 4081 | ||
| 3999 | static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) | 4082 | static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, |
| 4083 | struct dlm_message *ms_stub) | ||
| 4000 | { | 4084 | { |
| 4001 | if (middle_conversion(lkb)) { | 4085 | if (middle_conversion(lkb)) { |
| 4002 | hold_lkb(lkb); | 4086 | hold_lkb(lkb); |
| 4003 | ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; | 4087 | memset(ms_stub, 0, sizeof(struct dlm_message)); |
| 4004 | ls->ls_stub_ms.m_result = -EINPROGRESS; | 4088 | ms_stub->m_flags = DLM_IFL_STUB_MS; |
| 4005 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 4089 | ms_stub->m_type = DLM_MSG_CONVERT_REPLY; |
| 4006 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | 4090 | ms_stub->m_result = -EINPROGRESS; |
| 4007 | _receive_convert_reply(lkb, &ls->ls_stub_ms); | 4091 | ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; |
| 4092 | _receive_convert_reply(lkb, ms_stub); | ||
| 4008 | 4093 | ||
| 4009 | /* Same special case as in receive_rcom_lock_args() */ | 4094 | /* Same special case as in receive_rcom_lock_args() */ |
| 4010 | lkb->lkb_grmode = DLM_LOCK_IV; | 4095 | lkb->lkb_grmode = DLM_LOCK_IV; |
| @@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) | |||
| 4045 | void dlm_recover_waiters_pre(struct dlm_ls *ls) | 4130 | void dlm_recover_waiters_pre(struct dlm_ls *ls) |
| 4046 | { | 4131 | { |
| 4047 | struct dlm_lkb *lkb, *safe; | 4132 | struct dlm_lkb *lkb, *safe; |
| 4133 | struct dlm_message *ms_stub; | ||
| 4048 | int wait_type, stub_unlock_result, stub_cancel_result; | 4134 | int wait_type, stub_unlock_result, stub_cancel_result; |
| 4049 | 4135 | ||
| 4136 | ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message)); | ||
| 4137 | if (!ms_stub) { | ||
| 4138 | log_error(ls, "dlm_recover_waiters_pre no mem"); | ||
| 4139 | return; | ||
| 4140 | } | ||
| 4141 | |||
| 4050 | mutex_lock(&ls->ls_waiters_mutex); | 4142 | mutex_lock(&ls->ls_waiters_mutex); |
| 4051 | 4143 | ||
| 4052 | list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { | 4144 | list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { |
| 4053 | log_debug(ls, "pre recover waiter lkid %x type %d flags %x", | 4145 | |
| 4054 | lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags); | 4146 | /* exclude debug messages about unlocks because there can be so |
| 4147 | many and they aren't very interesting */ | ||
| 4148 | |||
| 4149 | if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { | ||
| 4150 | log_debug(ls, "recover_waiter %x nodeid %d " | ||
| 4151 | "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, | ||
| 4152 | lkb->lkb_wait_type, lkb->lkb_wait_nodeid); | ||
| 4153 | } | ||
| 4055 | 4154 | ||
| 4056 | /* all outstanding lookups, regardless of destination will be | 4155 | /* all outstanding lookups, regardless of destination will be |
| 4057 | resent after recovery is done */ | 4156 | resent after recovery is done */ |
| @@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
| 4097 | break; | 4196 | break; |
| 4098 | 4197 | ||
| 4099 | case DLM_MSG_CONVERT: | 4198 | case DLM_MSG_CONVERT: |
| 4100 | recover_convert_waiter(ls, lkb); | 4199 | recover_convert_waiter(ls, lkb, ms_stub); |
| 4101 | break; | 4200 | break; |
| 4102 | 4201 | ||
| 4103 | case DLM_MSG_UNLOCK: | 4202 | case DLM_MSG_UNLOCK: |
| 4104 | hold_lkb(lkb); | 4203 | hold_lkb(lkb); |
| 4105 | ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; | 4204 | memset(ms_stub, 0, sizeof(struct dlm_message)); |
| 4106 | ls->ls_stub_ms.m_result = stub_unlock_result; | 4205 | ms_stub->m_flags = DLM_IFL_STUB_MS; |
| 4107 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 4206 | ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; |
| 4108 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | 4207 | ms_stub->m_result = stub_unlock_result; |
| 4109 | _receive_unlock_reply(lkb, &ls->ls_stub_ms); | 4208 | ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; |
| 4209 | _receive_unlock_reply(lkb, ms_stub); | ||
| 4110 | dlm_put_lkb(lkb); | 4210 | dlm_put_lkb(lkb); |
| 4111 | break; | 4211 | break; |
| 4112 | 4212 | ||
| 4113 | case DLM_MSG_CANCEL: | 4213 | case DLM_MSG_CANCEL: |
| 4114 | hold_lkb(lkb); | 4214 | hold_lkb(lkb); |
| 4115 | ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; | 4215 | memset(ms_stub, 0, sizeof(struct dlm_message)); |
| 4116 | ls->ls_stub_ms.m_result = stub_cancel_result; | 4216 | ms_stub->m_flags = DLM_IFL_STUB_MS; |
| 4117 | ls->ls_stub_ms.m_flags = lkb->lkb_flags; | 4217 | ms_stub->m_type = DLM_MSG_CANCEL_REPLY; |
| 4118 | ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; | 4218 | ms_stub->m_result = stub_cancel_result; |
| 4119 | _receive_cancel_reply(lkb, &ls->ls_stub_ms); | 4219 | ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; |
| 4220 | _receive_cancel_reply(lkb, ms_stub); | ||
| 4120 | dlm_put_lkb(lkb); | 4221 | dlm_put_lkb(lkb); |
| 4121 | break; | 4222 | break; |
| 4122 | 4223 | ||
| @@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
| 4127 | schedule(); | 4228 | schedule(); |
| 4128 | } | 4229 | } |
| 4129 | mutex_unlock(&ls->ls_waiters_mutex); | 4230 | mutex_unlock(&ls->ls_waiters_mutex); |
| 4231 | kfree(ms_stub); | ||
| 4130 | } | 4232 | } |
| 4131 | 4233 | ||
| 4132 | static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) | 4234 | static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) |
| @@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) | |||
| 4191 | ou = is_overlap_unlock(lkb); | 4293 | ou = is_overlap_unlock(lkb); |
| 4192 | err = 0; | 4294 | err = 0; |
| 4193 | 4295 | ||
| 4194 | log_debug(ls, "recover_waiters_post %x type %d flags %x %s", | 4296 | log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", |
| 4195 | lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); | 4297 | lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); |
| 4196 | 4298 | ||
| 4197 | /* At this point we assume that we won't get a reply to any | 4299 | /* At this point we assume that we won't get a reply to any |
| 4198 | previous op or overlap op on this lock. First, do a big | 4300 | previous op or overlap op on this lock. First, do a big |
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 88e93c80cc2..265017a7c3e 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h | |||
| @@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb); | |||
| 24 | void dlm_scan_rsbs(struct dlm_ls *ls); | 24 | void dlm_scan_rsbs(struct dlm_ls *ls); |
| 25 | int dlm_lock_recovery_try(struct dlm_ls *ls); | 25 | int dlm_lock_recovery_try(struct dlm_ls *ls); |
| 26 | void dlm_unlock_recovery(struct dlm_ls *ls); | 26 | void dlm_unlock_recovery(struct dlm_ls *ls); |
| 27 | void dlm_scan_waiters(struct dlm_ls *ls); | ||
| 27 | void dlm_scan_timeout(struct dlm_ls *ls); | 28 | void dlm_scan_timeout(struct dlm_ls *ls); |
| 28 | void dlm_adjust_timeouts(struct dlm_ls *ls); | 29 | void dlm_adjust_timeouts(struct dlm_ls *ls); |
| 29 | 30 | ||
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f994a7dfda8..14cbf409975 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c | |||
| @@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void) | |||
| 243 | static int dlm_scand(void *data) | 243 | static int dlm_scand(void *data) |
| 244 | { | 244 | { |
| 245 | struct dlm_ls *ls; | 245 | struct dlm_ls *ls; |
| 246 | int timeout_jiffies = dlm_config.ci_scan_secs * HZ; | ||
| 247 | 246 | ||
| 248 | while (!kthread_should_stop()) { | 247 | while (!kthread_should_stop()) { |
| 249 | ls = find_ls_to_scan(); | 248 | ls = find_ls_to_scan(); |
| @@ -252,13 +251,14 @@ static int dlm_scand(void *data) | |||
| 252 | ls->ls_scan_time = jiffies; | 251 | ls->ls_scan_time = jiffies; |
| 253 | dlm_scan_rsbs(ls); | 252 | dlm_scan_rsbs(ls); |
| 254 | dlm_scan_timeout(ls); | 253 | dlm_scan_timeout(ls); |
| 254 | dlm_scan_waiters(ls); | ||
| 255 | dlm_unlock_recovery(ls); | 255 | dlm_unlock_recovery(ls); |
| 256 | } else { | 256 | } else { |
| 257 | ls->ls_scan_time += HZ; | 257 | ls->ls_scan_time += HZ; |
| 258 | } | 258 | } |
| 259 | } else { | 259 | continue; |
| 260 | schedule_timeout_interruptible(timeout_jiffies); | ||
| 261 | } | 260 | } |
| 261 | schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); | ||
| 262 | } | 262 | } |
| 263 | return 0; | 263 | return 0; |
| 264 | } | 264 | } |
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 30d8b85febb..e2b87800436 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c | |||
| @@ -71,6 +71,36 @@ static void send_op(struct plock_op *op) | |||
| 71 | wake_up(&send_wq); | 71 | wake_up(&send_wq); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /* If a process was killed while waiting for the only plock on a file, | ||
| 75 | locks_remove_posix will not see any lock on the file so it won't | ||
| 76 | send an unlock-close to us to pass on to userspace to clean up the | ||
| 77 | abandoned waiter. So, we have to insert the unlock-close when the | ||
| 78 | lock call is interrupted. */ | ||
| 79 | |||
| 80 | static void do_unlock_close(struct dlm_ls *ls, u64 number, | ||
| 81 | struct file *file, struct file_lock *fl) | ||
| 82 | { | ||
| 83 | struct plock_op *op; | ||
| 84 | |||
| 85 | op = kzalloc(sizeof(*op), GFP_NOFS); | ||
| 86 | if (!op) | ||
| 87 | return; | ||
| 88 | |||
| 89 | op->info.optype = DLM_PLOCK_OP_UNLOCK; | ||
| 90 | op->info.pid = fl->fl_pid; | ||
| 91 | op->info.fsid = ls->ls_global_id; | ||
| 92 | op->info.number = number; | ||
| 93 | op->info.start = 0; | ||
| 94 | op->info.end = OFFSET_MAX; | ||
| 95 | if (fl->fl_lmops && fl->fl_lmops->fl_grant) | ||
| 96 | op->info.owner = (__u64) fl->fl_pid; | ||
| 97 | else | ||
| 98 | op->info.owner = (__u64)(long) fl->fl_owner; | ||
| 99 | |||
| 100 | op->info.flags |= DLM_PLOCK_FL_CLOSE; | ||
| 101 | send_op(op); | ||
| 102 | } | ||
| 103 | |||
| 74 | int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, | 104 | int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
| 75 | int cmd, struct file_lock *fl) | 105 | int cmd, struct file_lock *fl) |
| 76 | { | 106 | { |
| @@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, | |||
| 114 | 144 | ||
| 115 | send_op(op); | 145 | send_op(op); |
| 116 | 146 | ||
| 117 | if (xop->callback == NULL) | 147 | if (xop->callback == NULL) { |
| 118 | wait_event(recv_wq, (op->done != 0)); | 148 | rv = wait_event_killable(recv_wq, (op->done != 0)); |
| 119 | else { | 149 | if (rv == -ERESTARTSYS) { |
| 150 | log_debug(ls, "dlm_posix_lock: wait killed %llx", | ||
| 151 | (unsigned long long)number); | ||
| 152 | spin_lock(&ops_lock); | ||
| 153 | list_del(&op->list); | ||
| 154 | spin_unlock(&ops_lock); | ||
| 155 | kfree(xop); | ||
| 156 | do_unlock_close(ls, number, file, fl); | ||
| 157 | goto out; | ||
| 158 | } | ||
| 159 | } else { | ||
| 120 | rv = FILE_LOCK_DEFERRED; | 160 | rv = FILE_LOCK_DEFERRED; |
| 121 | goto out; | 161 | goto out; |
| 122 | } | 162 | } |
| @@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, | |||
| 233 | else | 273 | else |
| 234 | op->info.owner = (__u64)(long) fl->fl_owner; | 274 | op->info.owner = (__u64)(long) fl->fl_owner; |
| 235 | 275 | ||
| 276 | if (fl->fl_flags & FL_CLOSE) { | ||
| 277 | op->info.flags |= DLM_PLOCK_FL_CLOSE; | ||
| 278 | send_op(op); | ||
| 279 | rv = 0; | ||
| 280 | goto out; | ||
| 281 | } | ||
| 282 | |||
| 236 | send_op(op); | 283 | send_op(op); |
| 237 | wait_event(recv_wq, (op->done != 0)); | 284 | wait_event(recv_wq, (op->done != 0)); |
| 238 | 285 | ||
| @@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, | |||
| 334 | spin_lock(&ops_lock); | 381 | spin_lock(&ops_lock); |
| 335 | if (!list_empty(&send_list)) { | 382 | if (!list_empty(&send_list)) { |
| 336 | op = list_entry(send_list.next, struct plock_op, list); | 383 | op = list_entry(send_list.next, struct plock_op, list); |
| 337 | list_move(&op->list, &recv_list); | 384 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) |
| 385 | list_del(&op->list); | ||
| 386 | else | ||
| 387 | list_move(&op->list, &recv_list); | ||
| 338 | memcpy(&info, &op->info, sizeof(info)); | 388 | memcpy(&info, &op->info, sizeof(info)); |
| 339 | } | 389 | } |
| 340 | spin_unlock(&ops_lock); | 390 | spin_unlock(&ops_lock); |
| @@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, | |||
| 342 | if (!op) | 392 | if (!op) |
| 343 | return -EAGAIN; | 393 | return -EAGAIN; |
| 344 | 394 | ||
| 395 | /* there is no need to get a reply from userspace for unlocks | ||
| 396 | that were generated by the vfs cleaning up for a close | ||
| 397 | (the process did not make an unlock call). */ | ||
| 398 | |||
| 399 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) | ||
| 400 | kfree(op); | ||
| 401 | |||
| 345 | if (copy_to_user(u, &info, sizeof(info))) | 402 | if (copy_to_user(u, &info, sizeof(info))) |
| 346 | return -EFAULT; | 403 | return -EFAULT; |
| 347 | return sizeof(info); | 404 | return sizeof(info); |
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d5ab3fe7c19..e96bf3e9be8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
| @@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, | |||
| 611 | 611 | ||
| 612 | out_sig: | 612 | out_sig: |
| 613 | sigprocmask(SIG_SETMASK, &tmpsig, NULL); | 613 | sigprocmask(SIG_SETMASK, &tmpsig, NULL); |
| 614 | recalc_sigpending(); | ||
| 615 | out_free: | 614 | out_free: |
| 616 | kfree(kbuf); | 615 | kfree(kbuf); |
| 617 | return error; | 616 | return error; |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0a78dae7e2c..1dd62ed35b8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
| @@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 898 | brelse(bh); | 898 | brelse(bh); |
| 899 | 899 | ||
| 900 | if (!sb_set_blocksize(sb, blocksize)) { | 900 | if (!sb_set_blocksize(sb, blocksize)) { |
| 901 | ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); | 901 | ext2_msg(sb, KERN_ERR, |
| 902 | "error: bad blocksize %d", blocksize); | ||
| 902 | goto failed_sbi; | 903 | goto failed_sbi; |
| 903 | } | 904 | } |
| 904 | 905 | ||
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 32f3b869585..34b6d9bfc48 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
| @@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
| 1416 | frame->at = entries; | 1416 | frame->at = entries; |
| 1417 | frame->bh = bh; | 1417 | frame->bh = bh; |
| 1418 | bh = bh2; | 1418 | bh = bh2; |
| 1419 | /* | ||
| 1420 | * Mark buffers dirty here so that if do_split() fails we write a | ||
| 1421 | * consistent set of buffers to disk. | ||
| 1422 | */ | ||
| 1423 | ext3_journal_dirty_metadata(handle, frame->bh); | ||
| 1424 | ext3_journal_dirty_metadata(handle, bh); | ||
| 1419 | de = do_split(handle,dir, &bh, frame, &hinfo, &retval); | 1425 | de = do_split(handle,dir, &bh, frame, &hinfo, &retval); |
| 1420 | dx_release (frames); | 1426 | if (!de) { |
| 1421 | if (!(de)) | 1427 | ext3_mark_inode_dirty(handle, dir); |
| 1428 | dx_release(frames); | ||
| 1422 | return retval; | 1429 | return retval; |
| 1430 | } | ||
| 1431 | dx_release(frames); | ||
| 1423 | 1432 | ||
| 1424 | return add_dirent_to_buf(handle, dentry, inode, de, bh); | 1433 | return add_dirent_to_buf(handle, dentry, inode, de, bh); |
| 1425 | } | 1434 | } |
| @@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir, | |||
| 2189 | handle_t *handle; | 2198 | handle_t *handle; |
| 2190 | struct inode * inode; | 2199 | struct inode * inode; |
| 2191 | int l, err, retries = 0; | 2200 | int l, err, retries = 0; |
| 2201 | int credits; | ||
| 2192 | 2202 | ||
| 2193 | l = strlen(symname)+1; | 2203 | l = strlen(symname)+1; |
| 2194 | if (l > dir->i_sb->s_blocksize) | 2204 | if (l > dir->i_sb->s_blocksize) |
| @@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir, | |||
| 2196 | 2206 | ||
| 2197 | dquot_initialize(dir); | 2207 | dquot_initialize(dir); |
| 2198 | 2208 | ||
| 2209 | if (l > EXT3_N_BLOCKS * 4) { | ||
| 2210 | /* | ||
| 2211 | * For non-fast symlinks, we just allocate inode and put it on | ||
| 2212 | * orphan list in the first transaction => we need bitmap, | ||
| 2213 | * group descriptor, sb, inode block, quota blocks. | ||
| 2214 | */ | ||
| 2215 | credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | ||
| 2216 | } else { | ||
| 2217 | /* | ||
| 2218 | * Fast symlink. We have to add entry to directory | ||
| 2219 | * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), | ||
| 2220 | * allocate new inode (bitmap, group descriptor, inode block, | ||
| 2221 | * quota blocks, sb is already counted in previous macros). | ||
| 2222 | */ | ||
| 2223 | credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
| 2224 | EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
| 2225 | EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | ||
| 2226 | } | ||
| 2199 | retry: | 2227 | retry: |
| 2200 | handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + | 2228 | handle = ext3_journal_start(dir, credits); |
| 2201 | EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + | ||
| 2202 | EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
| 2203 | if (IS_ERR(handle)) | 2229 | if (IS_ERR(handle)) |
| 2204 | return PTR_ERR(handle); | 2230 | return PTR_ERR(handle); |
| 2205 | 2231 | ||
| @@ -2211,21 +2237,45 @@ retry: | |||
| 2211 | if (IS_ERR(inode)) | 2237 | if (IS_ERR(inode)) |
| 2212 | goto out_stop; | 2238 | goto out_stop; |
| 2213 | 2239 | ||
| 2214 | if (l > sizeof (EXT3_I(inode)->i_data)) { | 2240 | if (l > EXT3_N_BLOCKS * 4) { |
| 2215 | inode->i_op = &ext3_symlink_inode_operations; | 2241 | inode->i_op = &ext3_symlink_inode_operations; |
| 2216 | ext3_set_aops(inode); | 2242 | ext3_set_aops(inode); |
| 2217 | /* | 2243 | /* |
| 2218 | * page_symlink() calls into ext3_prepare/commit_write. | 2244 | * We cannot call page_symlink() with transaction started |
| 2219 | * We have a transaction open. All is sweetness. It also sets | 2245 | * because it calls into ext3_write_begin() which acquires page |
| 2220 | * i_size in generic_commit_write(). | 2246 | * lock which ranks below transaction start (and it can also |
| 2247 | * wait for journal commit if we are running out of space). So | ||
| 2248 | * we have to stop transaction now and restart it when symlink | ||
| 2249 | * contents is written. | ||
| 2250 | * | ||
| 2251 | * To keep fs consistent in case of crash, we have to put inode | ||
| 2252 | * to orphan list in the mean time. | ||
| 2221 | */ | 2253 | */ |
| 2254 | drop_nlink(inode); | ||
| 2255 | err = ext3_orphan_add(handle, inode); | ||
| 2256 | ext3_journal_stop(handle); | ||
| 2257 | if (err) | ||
| 2258 | goto err_drop_inode; | ||
| 2222 | err = __page_symlink(inode, symname, l, 1); | 2259 | err = __page_symlink(inode, symname, l, 1); |
| 2260 | if (err) | ||
| 2261 | goto err_drop_inode; | ||
| 2262 | /* | ||
| 2263 | * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS | ||
| 2264 | * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified | ||
| 2265 | */ | ||
| 2266 | handle = ext3_journal_start(dir, | ||
| 2267 | EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + | ||
| 2268 | EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); | ||
| 2269 | if (IS_ERR(handle)) { | ||
| 2270 | err = PTR_ERR(handle); | ||
| 2271 | goto err_drop_inode; | ||
| 2272 | } | ||
| 2273 | inc_nlink(inode); | ||
| 2274 | err = ext3_orphan_del(handle, inode); | ||
| 2223 | if (err) { | 2275 | if (err) { |
| 2276 | ext3_journal_stop(handle); | ||
| 2224 | drop_nlink(inode); | 2277 | drop_nlink(inode); |
| 2225 | unlock_new_inode(inode); | 2278 | goto err_drop_inode; |
| 2226 | ext3_mark_inode_dirty(handle, inode); | ||
| 2227 | iput (inode); | ||
| 2228 | goto out_stop; | ||
| 2229 | } | 2279 | } |
| 2230 | } else { | 2280 | } else { |
| 2231 | inode->i_op = &ext3_fast_symlink_inode_operations; | 2281 | inode->i_op = &ext3_fast_symlink_inode_operations; |
| @@ -2239,6 +2289,10 @@ out_stop: | |||
| 2239 | if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) | 2289 | if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) |
| 2240 | goto retry; | 2290 | goto retry; |
| 2241 | return err; | 2291 | return err; |
| 2292 | err_drop_inode: | ||
| 2293 | unlock_new_inode(inode); | ||
| 2294 | iput(inode); | ||
| 2295 | return err; | ||
| 2242 | } | 2296 | } |
| 2243 | 2297 | ||
| 2244 | static int ext3_link (struct dentry * old_dentry, | 2298 | static int ext3_link (struct dentry * old_dentry, |
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 69b18045946..72ffa974b0b 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c | |||
| @@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal) | |||
| 302 | * all outstanding updates to complete. | 302 | * all outstanding updates to complete. |
| 303 | */ | 303 | */ |
| 304 | 304 | ||
| 305 | #ifdef COMMIT_STATS | ||
| 306 | spin_lock(&journal->j_list_lock); | ||
| 307 | summarise_journal_usage(journal); | ||
| 308 | spin_unlock(&journal->j_list_lock); | ||
| 309 | #endif | ||
| 310 | |||
| 311 | /* Do we need to erase the effects of a prior journal_flush? */ | 305 | /* Do we need to erase the effects of a prior journal_flush? */ |
| 312 | if (journal->j_flags & JFS_FLUSHED) { | 306 | if (journal->j_flags & JFS_FLUSHED) { |
| 313 | jbd_debug(3, "super block updated\n"); | 307 | jbd_debug(3, "super block updated\n"); |
| @@ -722,8 +716,13 @@ wait_for_iobuf: | |||
| 722 | required. */ | 716 | required. */ |
| 723 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 717 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
| 724 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | 718 | journal_file_buffer(jh, commit_transaction, BJ_Forget); |
| 725 | /* Wake up any transactions which were waiting for this | 719 | /* |
| 726 | IO to complete */ | 720 | * Wake up any transactions which were waiting for this |
| 721 | * IO to complete. The barrier must be here so that changes | ||
| 722 | * by journal_file_buffer() take effect before wake_up_bit() | ||
| 723 | * does the waitqueue check. | ||
| 724 | */ | ||
| 725 | smp_mb(); | ||
| 727 | wake_up_bit(&bh->b_state, BH_Unshadow); | 726 | wake_up_bit(&bh->b_state, BH_Unshadow); |
| 728 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 727 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
| 729 | __brelse(bh); | 728 | __brelse(bh); |
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b3713afaaa9..e2d4285fbe9 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c | |||
| @@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal) | |||
| 437 | int __log_start_commit(journal_t *journal, tid_t target) | 437 | int __log_start_commit(journal_t *journal, tid_t target) |
| 438 | { | 438 | { |
| 439 | /* | 439 | /* |
| 440 | * Are we already doing a recent enough commit? | 440 | * The only transaction we can possibly wait upon is the |
| 441 | * currently running transaction (if it exists). Otherwise, | ||
| 442 | * the target tid must be an old one. | ||
| 441 | */ | 443 | */ |
| 442 | if (!tid_geq(journal->j_commit_request, target)) { | 444 | if (journal->j_running_transaction && |
| 445 | journal->j_running_transaction->t_tid == target) { | ||
| 443 | /* | 446 | /* |
| 444 | * We want a new commit: OK, mark the request and wakeup the | 447 | * We want a new commit: OK, mark the request and wakeup the |
| 445 | * commit thread. We do _not_ do the commit ourselves. | 448 | * commit thread. We do _not_ do the commit ourselves. |
| @@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target) | |||
| 451 | journal->j_commit_sequence); | 454 | journal->j_commit_sequence); |
| 452 | wake_up(&journal->j_wait_commit); | 455 | wake_up(&journal->j_wait_commit); |
| 453 | return 1; | 456 | return 1; |
| 454 | } | 457 | } else if (!tid_geq(journal->j_commit_request, target)) |
| 458 | /* This should never happen, but if it does, preserve | ||
| 459 | the evidence before kjournald goes into a loop and | ||
| 460 | increments j_commit_sequence beyond all recognition. */ | ||
| 461 | WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", | ||
| 462 | journal->j_commit_request, journal->j_commit_sequence, | ||
| 463 | target, journal->j_running_transaction ? | ||
| 464 | journal->j_running_transaction->t_tid : 0); | ||
| 455 | return 0; | 465 | return 0; |
| 456 | } | 466 | } |
| 457 | 467 | ||
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d2319651b..f7ee81a065d 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
| @@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks) | |||
| 266 | * This function is visible to journal users (like ext3fs), so is not | 266 | * This function is visible to journal users (like ext3fs), so is not |
| 267 | * called with the journal already locked. | 267 | * called with the journal already locked. |
| 268 | * | 268 | * |
| 269 | * Return a pointer to a newly allocated handle, or NULL on failure | 269 | * Return a pointer to a newly allocated handle, or an ERR_PTR() value |
| 270 | * on failure. | ||
| 270 | */ | 271 | */ |
| 271 | handle_t *journal_start(journal_t *journal, int nblocks) | 272 | handle_t *journal_start(journal_t *journal, int nblocks) |
| 272 | { | 273 | { |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b2..29148a81c78 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 338 | * all outstanding updates to complete. | 338 | * all outstanding updates to complete. |
| 339 | */ | 339 | */ |
| 340 | 340 | ||
| 341 | #ifdef COMMIT_STATS | ||
| 342 | spin_lock(&journal->j_list_lock); | ||
| 343 | summarise_journal_usage(journal); | ||
| 344 | spin_unlock(&journal->j_list_lock); | ||
| 345 | #endif | ||
| 346 | |||
| 347 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ | 341 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ |
| 348 | if (journal->j_flags & JBD2_FLUSHED) { | 342 | if (journal->j_flags & JBD2_FLUSHED) { |
| 349 | jbd_debug(3, "super block updated\n"); | 343 | jbd_debug(3, "super block updated\n"); |
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index d8a0313e99e..f17e58b3298 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile | |||
| @@ -30,6 +30,7 @@ ocfs2-objs := \ | |||
| 30 | namei.o \ | 30 | namei.o \ |
| 31 | refcounttree.o \ | 31 | refcounttree.o \ |
| 32 | reservations.o \ | 32 | reservations.o \ |
| 33 | move_extents.o \ | ||
| 33 | resize.o \ | 34 | resize.o \ |
| 34 | slot_map.o \ | 35 | slot_map.o \ |
| 35 | suballoc.o \ | 36 | suballoc.o \ |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 312a28f433a..bc91072b721 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
| @@ -22,6 +22,11 @@ | |||
| 22 | #include "ioctl.h" | 22 | #include "ioctl.h" |
| 23 | #include "resize.h" | 23 | #include "resize.h" |
| 24 | #include "refcounttree.h" | 24 | #include "refcounttree.h" |
| 25 | #include "sysfile.h" | ||
| 26 | #include "dir.h" | ||
| 27 | #include "buffer_head_io.h" | ||
| 28 | #include "suballoc.h" | ||
| 29 | #include "move_extents.h" | ||
| 25 | 30 | ||
| 26 | #include <linux/ext2_fs.h> | 31 | #include <linux/ext2_fs.h> |
| 27 | 32 | ||
| @@ -35,31 +40,27 @@ | |||
| 35 | * be -EFAULT. The error will be returned from the ioctl(2) call. It's | 40 | * be -EFAULT. The error will be returned from the ioctl(2) call. It's |
| 36 | * just a best-effort to tell userspace that this request caused the error. | 41 | * just a best-effort to tell userspace that this request caused the error. |
| 37 | */ | 42 | */ |
| 38 | static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, | 43 | static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, |
| 39 | struct ocfs2_info_request __user *req) | 44 | struct ocfs2_info_request __user *req) |
| 40 | { | 45 | { |
| 41 | kreq->ir_flags |= OCFS2_INFO_FL_ERROR; | 46 | kreq->ir_flags |= OCFS2_INFO_FL_ERROR; |
| 42 | (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); | 47 | (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); |
| 43 | } | 48 | } |
| 44 | 49 | ||
| 45 | #define o2info_set_request_error(a, b) \ | 50 | static inline void o2info_set_request_filled(struct ocfs2_info_request *req) |
| 46 | __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) | ||
| 47 | |||
| 48 | static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) | ||
| 49 | { | 51 | { |
| 50 | req->ir_flags |= OCFS2_INFO_FL_FILLED; | 52 | req->ir_flags |= OCFS2_INFO_FL_FILLED; |
| 51 | } | 53 | } |
| 52 | 54 | ||
| 53 | #define o2info_set_request_filled(a) \ | 55 | static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) |
| 54 | __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) | ||
| 55 | |||
| 56 | static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) | ||
| 57 | { | 56 | { |
| 58 | req->ir_flags &= ~OCFS2_INFO_FL_FILLED; | 57 | req->ir_flags &= ~OCFS2_INFO_FL_FILLED; |
| 59 | } | 58 | } |
| 60 | 59 | ||
| 61 | #define o2info_clear_request_filled(a) \ | 60 | static inline int o2info_coherent(struct ocfs2_info_request *req) |
| 62 | __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) | 61 | { |
| 62 | return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); | ||
| 63 | } | ||
| 63 | 64 | ||
| 64 | static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) | 65 | static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) |
| 65 | { | 66 | { |
| @@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, | |||
| 153 | 154 | ||
| 154 | oib.ib_blocksize = inode->i_sb->s_blocksize; | 155 | oib.ib_blocksize = inode->i_sb->s_blocksize; |
| 155 | 156 | ||
| 156 | o2info_set_request_filled(oib); | 157 | o2info_set_request_filled(&oib.ib_req); |
| 157 | 158 | ||
| 158 | if (o2info_to_user(oib, req)) | 159 | if (o2info_to_user(oib, req)) |
| 159 | goto bail; | 160 | goto bail; |
| @@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, | |||
| 161 | status = 0; | 162 | status = 0; |
| 162 | bail: | 163 | bail: |
| 163 | if (status) | 164 | if (status) |
| 164 | o2info_set_request_error(oib, req); | 165 | o2info_set_request_error(&oib.ib_req, req); |
| 165 | 166 | ||
| 166 | return status; | 167 | return status; |
| 167 | } | 168 | } |
| @@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, | |||
| 178 | 179 | ||
| 179 | oic.ic_clustersize = osb->s_clustersize; | 180 | oic.ic_clustersize = osb->s_clustersize; |
| 180 | 181 | ||
| 181 | o2info_set_request_filled(oic); | 182 | o2info_set_request_filled(&oic.ic_req); |
| 182 | 183 | ||
| 183 | if (o2info_to_user(oic, req)) | 184 | if (o2info_to_user(oic, req)) |
| 184 | goto bail; | 185 | goto bail; |
| @@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, | |||
| 186 | status = 0; | 187 | status = 0; |
| 187 | bail: | 188 | bail: |
| 188 | if (status) | 189 | if (status) |
| 189 | o2info_set_request_error(oic, req); | 190 | o2info_set_request_error(&oic.ic_req, req); |
| 190 | 191 | ||
| 191 | return status; | 192 | return status; |
| 192 | } | 193 | } |
| @@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, | |||
| 203 | 204 | ||
| 204 | oim.im_max_slots = osb->max_slots; | 205 | oim.im_max_slots = osb->max_slots; |
| 205 | 206 | ||
| 206 | o2info_set_request_filled(oim); | 207 | o2info_set_request_filled(&oim.im_req); |
| 207 | 208 | ||
| 208 | if (o2info_to_user(oim, req)) | 209 | if (o2info_to_user(oim, req)) |
| 209 | goto bail; | 210 | goto bail; |
| @@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, | |||
| 211 | status = 0; | 212 | status = 0; |
| 212 | bail: | 213 | bail: |
| 213 | if (status) | 214 | if (status) |
| 214 | o2info_set_request_error(oim, req); | 215 | o2info_set_request_error(&oim.im_req, req); |
| 215 | 216 | ||
| 216 | return status; | 217 | return status; |
| 217 | } | 218 | } |
| @@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode, | |||
| 228 | 229 | ||
| 229 | memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); | 230 | memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); |
| 230 | 231 | ||
| 231 | o2info_set_request_filled(oil); | 232 | o2info_set_request_filled(&oil.il_req); |
| 232 | 233 | ||
| 233 | if (o2info_to_user(oil, req)) | 234 | if (o2info_to_user(oil, req)) |
| 234 | goto bail; | 235 | goto bail; |
| @@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode, | |||
| 236 | status = 0; | 237 | status = 0; |
| 237 | bail: | 238 | bail: |
| 238 | if (status) | 239 | if (status) |
| 239 | o2info_set_request_error(oil, req); | 240 | o2info_set_request_error(&oil.il_req, req); |
| 240 | 241 | ||
| 241 | return status; | 242 | return status; |
| 242 | } | 243 | } |
| @@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, | |||
| 253 | 254 | ||
| 254 | memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); | 255 | memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); |
| 255 | 256 | ||
| 256 | o2info_set_request_filled(oiu); | 257 | o2info_set_request_filled(&oiu.iu_req); |
| 257 | 258 | ||
| 258 | if (o2info_to_user(oiu, req)) | 259 | if (o2info_to_user(oiu, req)) |
| 259 | goto bail; | 260 | goto bail; |
| @@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, | |||
| 261 | status = 0; | 262 | status = 0; |
| 262 | bail: | 263 | bail: |
| 263 | if (status) | 264 | if (status) |
| 264 | o2info_set_request_error(oiu, req); | 265 | o2info_set_request_error(&oiu.iu_req, req); |
| 265 | 266 | ||
| 266 | return status; | 267 | return status; |
| 267 | } | 268 | } |
| @@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, | |||
| 280 | oif.if_incompat_features = osb->s_feature_incompat; | 281 | oif.if_incompat_features = osb->s_feature_incompat; |
| 281 | oif.if_ro_compat_features = osb->s_feature_ro_compat; | 282 | oif.if_ro_compat_features = osb->s_feature_ro_compat; |
| 282 | 283 | ||
| 283 | o2info_set_request_filled(oif); | 284 | o2info_set_request_filled(&oif.if_req); |
| 284 | 285 | ||
| 285 | if (o2info_to_user(oif, req)) | 286 | if (o2info_to_user(oif, req)) |
| 286 | goto bail; | 287 | goto bail; |
| @@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, | |||
| 288 | status = 0; | 289 | status = 0; |
| 289 | bail: | 290 | bail: |
| 290 | if (status) | 291 | if (status) |
| 291 | o2info_set_request_error(oif, req); | 292 | o2info_set_request_error(&oif.if_req, req); |
| 292 | 293 | ||
| 293 | return status; | 294 | return status; |
| 294 | } | 295 | } |
| @@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, | |||
| 305 | 306 | ||
| 306 | oij.ij_journal_size = osb->journal->j_inode->i_size; | 307 | oij.ij_journal_size = osb->journal->j_inode->i_size; |
| 307 | 308 | ||
| 308 | o2info_set_request_filled(oij); | 309 | o2info_set_request_filled(&oij.ij_req); |
| 309 | 310 | ||
| 310 | if (o2info_to_user(oij, req)) | 311 | if (o2info_to_user(oij, req)) |
| 311 | goto bail; | 312 | goto bail; |
| @@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode, | |||
| 313 | status = 0; | 314 | status = 0; |
| 314 | bail: | 315 | bail: |
| 315 | if (status) | 316 | if (status) |
| 316 | o2info_set_request_error(oij, req); | 317 | o2info_set_request_error(&oij.ij_req, req); |
| 318 | |||
| 319 | return status; | ||
| 320 | } | ||
| 321 | |||
| 322 | int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, | ||
| 323 | struct inode *inode_alloc, u64 blkno, | ||
| 324 | struct ocfs2_info_freeinode *fi, u32 slot) | ||
| 325 | { | ||
| 326 | int status = 0, unlock = 0; | ||
| 327 | |||
| 328 | struct buffer_head *bh = NULL; | ||
| 329 | struct ocfs2_dinode *dinode_alloc = NULL; | ||
| 330 | |||
| 331 | if (inode_alloc) | ||
| 332 | mutex_lock(&inode_alloc->i_mutex); | ||
| 333 | |||
| 334 | if (o2info_coherent(&fi->ifi_req)) { | ||
| 335 | status = ocfs2_inode_lock(inode_alloc, &bh, 0); | ||
| 336 | if (status < 0) { | ||
| 337 | mlog_errno(status); | ||
| 338 | goto bail; | ||
| 339 | } | ||
| 340 | unlock = 1; | ||
| 341 | } else { | ||
| 342 | status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); | ||
| 343 | if (status < 0) { | ||
| 344 | mlog_errno(status); | ||
| 345 | goto bail; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | |||
| 349 | dinode_alloc = (struct ocfs2_dinode *)bh->b_data; | ||
| 350 | |||
| 351 | fi->ifi_stat[slot].lfi_total = | ||
| 352 | le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); | ||
| 353 | fi->ifi_stat[slot].lfi_free = | ||
| 354 | le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - | ||
| 355 | le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); | ||
| 356 | |||
| 357 | bail: | ||
| 358 | if (unlock) | ||
| 359 | ocfs2_inode_unlock(inode_alloc, 0); | ||
| 360 | |||
| 361 | if (inode_alloc) | ||
| 362 | mutex_unlock(&inode_alloc->i_mutex); | ||
| 363 | |||
| 364 | brelse(bh); | ||
| 365 | |||
| 366 | return status; | ||
| 367 | } | ||
| 368 | |||
| 369 | int ocfs2_info_handle_freeinode(struct inode *inode, | ||
| 370 | struct ocfs2_info_request __user *req) | ||
| 371 | { | ||
| 372 | u32 i; | ||
| 373 | u64 blkno = -1; | ||
| 374 | char namebuf[40]; | ||
| 375 | int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; | ||
| 376 | struct ocfs2_info_freeinode *oifi = NULL; | ||
| 377 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 378 | struct inode *inode_alloc = NULL; | ||
| 379 | |||
| 380 | oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); | ||
| 381 | if (!oifi) { | ||
| 382 | status = -ENOMEM; | ||
| 383 | mlog_errno(status); | ||
| 384 | goto bail; | ||
| 385 | } | ||
| 386 | |||
| 387 | if (o2info_from_user(*oifi, req)) | ||
| 388 | goto bail; | ||
| 389 | |||
| 390 | oifi->ifi_slotnum = osb->max_slots; | ||
| 391 | |||
| 392 | for (i = 0; i < oifi->ifi_slotnum; i++) { | ||
| 393 | if (o2info_coherent(&oifi->ifi_req)) { | ||
| 394 | inode_alloc = ocfs2_get_system_file_inode(osb, type, i); | ||
| 395 | if (!inode_alloc) { | ||
| 396 | mlog(ML_ERROR, "unable to get alloc inode in " | ||
| 397 | "slot %u\n", i); | ||
| 398 | status = -EIO; | ||
| 399 | goto bail; | ||
| 400 | } | ||
| 401 | } else { | ||
| 402 | ocfs2_sprintf_system_inode_name(namebuf, | ||
| 403 | sizeof(namebuf), | ||
| 404 | type, i); | ||
| 405 | status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, | ||
| 406 | namebuf, | ||
| 407 | strlen(namebuf), | ||
| 408 | &blkno); | ||
| 409 | if (status < 0) { | ||
| 410 | status = -ENOENT; | ||
| 411 | goto bail; | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); | ||
| 416 | if (status < 0) | ||
| 417 | goto bail; | ||
| 418 | |||
| 419 | iput(inode_alloc); | ||
| 420 | inode_alloc = NULL; | ||
| 421 | } | ||
| 422 | |||
| 423 | o2info_set_request_filled(&oifi->ifi_req); | ||
| 424 | |||
| 425 | if (o2info_to_user(*oifi, req)) | ||
| 426 | goto bail; | ||
| 427 | |||
| 428 | status = 0; | ||
| 429 | bail: | ||
| 430 | if (status) | ||
| 431 | o2info_set_request_error(&oifi->ifi_req, req); | ||
| 432 | |||
| 433 | kfree(oifi); | ||
| 434 | |||
| 435 | return status; | ||
| 436 | } | ||
| 437 | |||
| 438 | static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, | ||
| 439 | unsigned int chunksize) | ||
| 440 | { | ||
| 441 | int index; | ||
| 442 | |||
| 443 | index = __ilog2_u32(chunksize); | ||
| 444 | if (index >= OCFS2_INFO_MAX_HIST) | ||
| 445 | index = OCFS2_INFO_MAX_HIST - 1; | ||
| 446 | |||
| 447 | hist->fc_chunks[index]++; | ||
| 448 | hist->fc_clusters[index] += chunksize; | ||
| 449 | } | ||
| 450 | |||
| 451 | static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, | ||
| 452 | unsigned int chunksize) | ||
| 453 | { | ||
| 454 | if (chunksize > stats->ffs_max) | ||
| 455 | stats->ffs_max = chunksize; | ||
| 456 | |||
| 457 | if (chunksize < stats->ffs_min) | ||
| 458 | stats->ffs_min = chunksize; | ||
| 459 | |||
| 460 | stats->ffs_avg += chunksize; | ||
| 461 | stats->ffs_free_chunks_real++; | ||
| 462 | } | ||
| 463 | |||
| 464 | void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, | ||
| 465 | unsigned int chunksize) | ||
| 466 | { | ||
| 467 | o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); | ||
| 468 | o2ffg_update_stats(&(ffg->iff_ffs), chunksize); | ||
| 469 | } | ||
| 470 | |||
| 471 | int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, | ||
| 472 | struct inode *gb_inode, | ||
| 473 | struct ocfs2_dinode *gb_dinode, | ||
| 474 | struct ocfs2_chain_rec *rec, | ||
| 475 | struct ocfs2_info_freefrag *ffg, | ||
| 476 | u32 chunks_in_group) | ||
| 477 | { | ||
| 478 | int status = 0, used; | ||
| 479 | u64 blkno; | ||
| 480 | |||
| 481 | struct buffer_head *bh = NULL; | ||
| 482 | struct ocfs2_group_desc *bg = NULL; | ||
| 483 | |||
| 484 | unsigned int max_bits, num_clusters; | ||
| 485 | unsigned int offset = 0, cluster, chunk; | ||
| 486 | unsigned int chunk_free, last_chunksize = 0; | ||
| 487 | |||
| 488 | if (!le32_to_cpu(rec->c_free)) | ||
| 489 | goto bail; | ||
| 490 | |||
| 491 | do { | ||
| 492 | if (!bg) | ||
| 493 | blkno = le64_to_cpu(rec->c_blkno); | ||
| 494 | else | ||
| 495 | blkno = le64_to_cpu(bg->bg_next_group); | ||
| 496 | |||
| 497 | if (bh) { | ||
| 498 | brelse(bh); | ||
| 499 | bh = NULL; | ||
| 500 | } | ||
| 501 | |||
| 502 | if (o2info_coherent(&ffg->iff_req)) | ||
| 503 | status = ocfs2_read_group_descriptor(gb_inode, | ||
| 504 | gb_dinode, | ||
| 505 | blkno, &bh); | ||
| 506 | else | ||
| 507 | status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); | ||
| 508 | |||
| 509 | if (status < 0) { | ||
| 510 | mlog(ML_ERROR, "Can't read the group descriptor # " | ||
| 511 | "%llu from device.", (unsigned long long)blkno); | ||
| 512 | status = -EIO; | ||
| 513 | goto bail; | ||
| 514 | } | ||
| 515 | |||
| 516 | bg = (struct ocfs2_group_desc *)bh->b_data; | ||
| 517 | |||
| 518 | if (!le16_to_cpu(bg->bg_free_bits_count)) | ||
| 519 | continue; | ||
| 520 | |||
| 521 | max_bits = le16_to_cpu(bg->bg_bits); | ||
| 522 | offset = 0; | ||
| 523 | |||
| 524 | for (chunk = 0; chunk < chunks_in_group; chunk++) { | ||
| 525 | /* | ||
| 526 | * last chunk may be not an entire one. | ||
| 527 | */ | ||
| 528 | if ((offset + ffg->iff_chunksize) > max_bits) | ||
| 529 | num_clusters = max_bits - offset; | ||
| 530 | else | ||
| 531 | num_clusters = ffg->iff_chunksize; | ||
| 532 | |||
| 533 | chunk_free = 0; | ||
| 534 | for (cluster = 0; cluster < num_clusters; cluster++) { | ||
| 535 | used = ocfs2_test_bit(offset, | ||
| 536 | (unsigned long *)bg->bg_bitmap); | ||
| 537 | /* | ||
| 538 | * - chunk_free counts free clusters in #N chunk. | ||
| 539 | * - last_chunksize records the size(in) clusters | ||
| 540 | * for the last real free chunk being counted. | ||
| 541 | */ | ||
| 542 | if (!used) { | ||
| 543 | last_chunksize++; | ||
| 544 | chunk_free++; | ||
| 545 | } | ||
| 546 | |||
| 547 | if (used && last_chunksize) { | ||
| 548 | ocfs2_info_update_ffg(ffg, | ||
| 549 | last_chunksize); | ||
| 550 | last_chunksize = 0; | ||
| 551 | } | ||
| 552 | |||
| 553 | offset++; | ||
| 554 | } | ||
| 555 | |||
| 556 | if (chunk_free == ffg->iff_chunksize) | ||
| 557 | ffg->iff_ffs.ffs_free_chunks++; | ||
| 558 | } | ||
| 559 | |||
| 560 | /* | ||
| 561 | * need to update the info for last free chunk. | ||
| 562 | */ | ||
| 563 | if (last_chunksize) | ||
| 564 | ocfs2_info_update_ffg(ffg, last_chunksize); | ||
| 565 | |||
| 566 | } while (le64_to_cpu(bg->bg_next_group)); | ||
| 567 | |||
| 568 | bail: | ||
| 569 | brelse(bh); | ||
| 570 | |||
| 571 | return status; | ||
| 572 | } | ||
| 573 | |||
| 574 | int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, | ||
| 575 | struct inode *gb_inode, u64 blkno, | ||
| 576 | struct ocfs2_info_freefrag *ffg) | ||
| 577 | { | ||
| 578 | u32 chunks_in_group; | ||
| 579 | int status = 0, unlock = 0, i; | ||
| 580 | |||
| 581 | struct buffer_head *bh = NULL; | ||
| 582 | struct ocfs2_chain_list *cl = NULL; | ||
| 583 | struct ocfs2_chain_rec *rec = NULL; | ||
| 584 | struct ocfs2_dinode *gb_dinode = NULL; | ||
| 585 | |||
| 586 | if (gb_inode) | ||
| 587 | mutex_lock(&gb_inode->i_mutex); | ||
| 588 | |||
| 589 | if (o2info_coherent(&ffg->iff_req)) { | ||
| 590 | status = ocfs2_inode_lock(gb_inode, &bh, 0); | ||
| 591 | if (status < 0) { | ||
| 592 | mlog_errno(status); | ||
| 593 | goto bail; | ||
| 594 | } | ||
| 595 | unlock = 1; | ||
| 596 | } else { | ||
| 597 | status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); | ||
| 598 | if (status < 0) { | ||
| 599 | mlog_errno(status); | ||
| 600 | goto bail; | ||
| 601 | } | ||
| 602 | } | ||
| 603 | |||
| 604 | gb_dinode = (struct ocfs2_dinode *)bh->b_data; | ||
| 605 | cl = &(gb_dinode->id2.i_chain); | ||
| 606 | |||
| 607 | /* | ||
| 608 | * Chunksize(in) clusters from userspace should be | ||
| 609 | * less than clusters in a group. | ||
| 610 | */ | ||
| 611 | if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { | ||
| 612 | status = -EINVAL; | ||
| 613 | goto bail; | ||
| 614 | } | ||
| 615 | |||
| 616 | memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); | ||
| 617 | |||
| 618 | ffg->iff_ffs.ffs_min = ~0U; | ||
| 619 | ffg->iff_ffs.ffs_clusters = | ||
| 620 | le32_to_cpu(gb_dinode->id1.bitmap1.i_total); | ||
| 621 | ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - | ||
| 622 | le32_to_cpu(gb_dinode->id1.bitmap1.i_used); | ||
| 623 | |||
| 624 | chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; | ||
| 625 | |||
| 626 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { | ||
| 627 | rec = &(cl->cl_recs[i]); | ||
| 628 | status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, | ||
| 629 | gb_dinode, | ||
| 630 | rec, ffg, | ||
| 631 | chunks_in_group); | ||
| 632 | if (status) | ||
| 633 | goto bail; | ||
| 634 | } | ||
| 635 | |||
| 636 | if (ffg->iff_ffs.ffs_free_chunks_real) | ||
| 637 | ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / | ||
| 638 | ffg->iff_ffs.ffs_free_chunks_real); | ||
| 639 | bail: | ||
| 640 | if (unlock) | ||
| 641 | ocfs2_inode_unlock(gb_inode, 0); | ||
| 642 | |||
| 643 | if (gb_inode) | ||
| 644 | mutex_unlock(&gb_inode->i_mutex); | ||
| 645 | |||
| 646 | if (gb_inode) | ||
| 647 | iput(gb_inode); | ||
| 648 | |||
| 649 | brelse(bh); | ||
| 650 | |||
| 651 | return status; | ||
| 652 | } | ||
| 653 | |||
| 654 | int ocfs2_info_handle_freefrag(struct inode *inode, | ||
| 655 | struct ocfs2_info_request __user *req) | ||
| 656 | { | ||
| 657 | u64 blkno = -1; | ||
| 658 | char namebuf[40]; | ||
| 659 | int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; | ||
| 660 | |||
| 661 | struct ocfs2_info_freefrag *oiff; | ||
| 662 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 663 | struct inode *gb_inode = NULL; | ||
| 664 | |||
| 665 | oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); | ||
| 666 | if (!oiff) { | ||
| 667 | status = -ENOMEM; | ||
| 668 | mlog_errno(status); | ||
| 669 | goto bail; | ||
| 670 | } | ||
| 671 | |||
| 672 | if (o2info_from_user(*oiff, req)) | ||
| 673 | goto bail; | ||
| 674 | /* | ||
| 675 | * chunksize from userspace should be power of 2. | ||
| 676 | */ | ||
| 677 | if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || | ||
| 678 | (!oiff->iff_chunksize)) { | ||
| 679 | status = -EINVAL; | ||
| 680 | goto bail; | ||
| 681 | } | ||
| 682 | |||
| 683 | if (o2info_coherent(&oiff->iff_req)) { | ||
| 684 | gb_inode = ocfs2_get_system_file_inode(osb, type, | ||
| 685 | OCFS2_INVALID_SLOT); | ||
| 686 | if (!gb_inode) { | ||
| 687 | mlog(ML_ERROR, "unable to get global_bitmap inode\n"); | ||
| 688 | status = -EIO; | ||
| 689 | goto bail; | ||
| 690 | } | ||
| 691 | } else { | ||
| 692 | ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, | ||
| 693 | OCFS2_INVALID_SLOT); | ||
| 694 | status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, | ||
| 695 | namebuf, | ||
| 696 | strlen(namebuf), | ||
| 697 | &blkno); | ||
| 698 | if (status < 0) { | ||
| 699 | status = -ENOENT; | ||
| 700 | goto bail; | ||
| 701 | } | ||
| 702 | } | ||
| 703 | |||
| 704 | status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); | ||
| 705 | if (status < 0) | ||
| 706 | goto bail; | ||
| 707 | |||
| 708 | o2info_set_request_filled(&oiff->iff_req); | ||
| 709 | |||
| 710 | if (o2info_to_user(*oiff, req)) | ||
| 711 | goto bail; | ||
| 712 | |||
| 713 | status = 0; | ||
| 714 | bail: | ||
| 715 | if (status) | ||
| 716 | o2info_set_request_error(&oiff->iff_req, req); | ||
| 717 | |||
| 718 | kfree(oiff); | ||
| 317 | 719 | ||
| 318 | return status; | 720 | return status; |
| 319 | } | 721 | } |
| @@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, | |||
| 327 | if (o2info_from_user(oir, req)) | 729 | if (o2info_from_user(oir, req)) |
| 328 | goto bail; | 730 | goto bail; |
| 329 | 731 | ||
| 330 | o2info_clear_request_filled(oir); | 732 | o2info_clear_request_filled(&oir); |
| 331 | 733 | ||
| 332 | if (o2info_to_user(oir, req)) | 734 | if (o2info_to_user(oir, req)) |
| 333 | goto bail; | 735 | goto bail; |
| @@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, | |||
| 335 | status = 0; | 737 | status = 0; |
| 336 | bail: | 738 | bail: |
| 337 | if (status) | 739 | if (status) |
| 338 | o2info_set_request_error(oir, req); | 740 | o2info_set_request_error(&oir, req); |
| 339 | 741 | ||
| 340 | return status; | 742 | return status; |
| 341 | } | 743 | } |
| @@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode, | |||
| 389 | if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) | 791 | if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) |
| 390 | status = ocfs2_info_handle_journal_size(inode, req); | 792 | status = ocfs2_info_handle_journal_size(inode, req); |
| 391 | break; | 793 | break; |
| 794 | case OCFS2_INFO_FREEINODE: | ||
| 795 | if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) | ||
| 796 | status = ocfs2_info_handle_freeinode(inode, req); | ||
| 797 | break; | ||
| 798 | case OCFS2_INFO_FREEFRAG: | ||
| 799 | if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) | ||
| 800 | status = ocfs2_info_handle_freefrag(inode, req); | ||
| 801 | break; | ||
| 392 | default: | 802 | default: |
| 393 | status = ocfs2_info_handle_unknown(inode, req); | 803 | status = ocfs2_info_handle_unknown(inode, req); |
| 394 | break; | 804 | break; |
| @@ -565,6 +975,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
| 565 | 975 | ||
| 566 | return 0; | 976 | return 0; |
| 567 | } | 977 | } |
| 978 | case OCFS2_IOC_MOVE_EXT: | ||
| 979 | return ocfs2_ioctl_move_extents(filp, (void __user *)arg); | ||
| 568 | default: | 980 | default: |
| 569 | return -ENOTTY; | 981 | return -ENOTTY; |
| 570 | } | 982 | } |
| @@ -608,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
| 608 | return -EFAULT; | 1020 | return -EFAULT; |
| 609 | 1021 | ||
| 610 | return ocfs2_info_handle(inode, &info, 1); | 1022 | return ocfs2_info_handle(inode, &info, 1); |
| 1023 | case OCFS2_IOC_MOVE_EXT: | ||
| 1024 | break; | ||
| 611 | default: | 1025 | default: |
| 612 | return -ENOIOCTLCMD; | 1026 | return -ENOIOCTLCMD; |
| 613 | } | 1027 | } |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c new file mode 100644 index 00000000000..4c5488468c1 --- /dev/null +++ b/fs/ocfs2/move_extents.c | |||
| @@ -0,0 +1,1153 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * move_extents.c | ||
| 5 | * | ||
| 6 | * Copyright (C) 2011 Oracle. All rights reserved. | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public | ||
| 10 | * License version 2 as published by the Free Software Foundation. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 15 | * General Public License for more details. | ||
| 16 | */ | ||
| 17 | #include <linux/fs.h> | ||
| 18 | #include <linux/types.h> | ||
| 19 | #include <linux/mount.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | |||
| 22 | #include <cluster/masklog.h> | ||
| 23 | |||
| 24 | #include "ocfs2.h" | ||
| 25 | #include "ocfs2_ioctl.h" | ||
| 26 | |||
| 27 | #include "alloc.h" | ||
| 28 | #include "aops.h" | ||
| 29 | #include "dlmglue.h" | ||
| 30 | #include "extent_map.h" | ||
| 31 | #include "inode.h" | ||
| 32 | #include "journal.h" | ||
| 33 | #include "suballoc.h" | ||
| 34 | #include "uptodate.h" | ||
| 35 | #include "super.h" | ||
| 36 | #include "dir.h" | ||
| 37 | #include "buffer_head_io.h" | ||
| 38 | #include "sysfile.h" | ||
| 39 | #include "suballoc.h" | ||
| 40 | #include "refcounttree.h" | ||
| 41 | #include "move_extents.h" | ||
| 42 | |||
| 43 | struct ocfs2_move_extents_context { | ||
| 44 | struct inode *inode; | ||
| 45 | struct file *file; | ||
| 46 | int auto_defrag; | ||
| 47 | int partial; | ||
| 48 | int credits; | ||
| 49 | u32 new_phys_cpos; | ||
| 50 | u32 clusters_moved; | ||
| 51 | u64 refcount_loc; | ||
| 52 | struct ocfs2_move_extents *range; | ||
| 53 | struct ocfs2_extent_tree et; | ||
| 54 | struct ocfs2_alloc_context *meta_ac; | ||
| 55 | struct ocfs2_alloc_context *data_ac; | ||
| 56 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
| 57 | }; | ||
| 58 | |||
| 59 | static int __ocfs2_move_extent(handle_t *handle, | ||
| 60 | struct ocfs2_move_extents_context *context, | ||
| 61 | u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, | ||
| 62 | int ext_flags) | ||
| 63 | { | ||
| 64 | int ret = 0, index; | ||
| 65 | struct inode *inode = context->inode; | ||
| 66 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 67 | struct ocfs2_extent_rec *rec, replace_rec; | ||
| 68 | struct ocfs2_path *path = NULL; | ||
| 69 | struct ocfs2_extent_list *el; | ||
| 70 | u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); | ||
| 71 | u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); | ||
| 72 | |||
| 73 | ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, | ||
| 74 | p_cpos, new_p_cpos, len); | ||
| 75 | if (ret) { | ||
| 76 | mlog_errno(ret); | ||
| 77 | goto out; | ||
| 78 | } | ||
| 79 | |||
| 80 | memset(&replace_rec, 0, sizeof(replace_rec)); | ||
| 81 | replace_rec.e_cpos = cpu_to_le32(cpos); | ||
| 82 | replace_rec.e_leaf_clusters = cpu_to_le16(len); | ||
| 83 | replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, | ||
| 84 | new_p_cpos)); | ||
| 85 | |||
| 86 | path = ocfs2_new_path_from_et(&context->et); | ||
| 87 | if (!path) { | ||
| 88 | ret = -ENOMEM; | ||
| 89 | mlog_errno(ret); | ||
| 90 | goto out; | ||
| 91 | } | ||
| 92 | |||
| 93 | ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); | ||
| 94 | if (ret) { | ||
| 95 | mlog_errno(ret); | ||
| 96 | goto out; | ||
| 97 | } | ||
| 98 | |||
| 99 | el = path_leaf_el(path); | ||
| 100 | |||
| 101 | index = ocfs2_search_extent_list(el, cpos); | ||
| 102 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | ||
| 103 | ocfs2_error(inode->i_sb, | ||
| 104 | "Inode %llu has an extent at cpos %u which can no " | ||
| 105 | "longer be found.\n", | ||
| 106 | (unsigned long long)ino, cpos); | ||
| 107 | ret = -EROFS; | ||
| 108 | goto out; | ||
| 109 | } | ||
| 110 | |||
| 111 | rec = &el->l_recs[index]; | ||
| 112 | |||
| 113 | BUG_ON(ext_flags != rec->e_flags); | ||
| 114 | /* | ||
| 115 | * after moving/defragging to new location, the extent is not going | ||
| 116 | * to be refcounted anymore. | ||
| 117 | */ | ||
| 118 | replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; | ||
| 119 | |||
| 120 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), | ||
| 121 | context->et.et_root_bh, | ||
| 122 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 123 | if (ret) { | ||
| 124 | mlog_errno(ret); | ||
| 125 | goto out; | ||
| 126 | } | ||
| 127 | |||
| 128 | ret = ocfs2_split_extent(handle, &context->et, path, index, | ||
| 129 | &replace_rec, context->meta_ac, | ||
| 130 | &context->dealloc); | ||
| 131 | if (ret) { | ||
| 132 | mlog_errno(ret); | ||
| 133 | goto out; | ||
| 134 | } | ||
| 135 | |||
| 136 | ocfs2_journal_dirty(handle, context->et.et_root_bh); | ||
| 137 | |||
| 138 | context->new_phys_cpos = new_p_cpos; | ||
| 139 | |||
| 140 | /* | ||
| 141 | * do we need to append the truncate log for the old clusters? | ||
| 142 | */ | ||
| 143 | if (old_blkno) { | ||
| 144 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | ||
| 145 | ret = ocfs2_decrease_refcount(inode, handle, | ||
| 146 | ocfs2_blocks_to_clusters(osb->sb, | ||
| 147 | old_blkno), | ||
| 148 | len, context->meta_ac, | ||
| 149 | &context->dealloc, 1); | ||
| 150 | else | ||
| 151 | ret = ocfs2_truncate_log_append(osb, handle, | ||
| 152 | old_blkno, len); | ||
| 153 | } | ||
| 154 | |||
| 155 | out: | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | |||
| 159 | /* | ||
| 160 | * lock allocators, and reserving appropriate number of bits for | ||
| 161 | * meta blocks and data clusters. | ||
| 162 | * | ||
| 163 | * in some cases, we don't need to reserve clusters, just let data_ac | ||
| 164 | * be NULL. | ||
| 165 | */ | ||
| 166 | static int ocfs2_lock_allocators_move_extents(struct inode *inode, | ||
| 167 | struct ocfs2_extent_tree *et, | ||
| 168 | u32 clusters_to_move, | ||
| 169 | u32 extents_to_split, | ||
| 170 | struct ocfs2_alloc_context **meta_ac, | ||
| 171 | struct ocfs2_alloc_context **data_ac, | ||
| 172 | int extra_blocks, | ||
| 173 | int *credits) | ||
| 174 | { | ||
| 175 | int ret, num_free_extents; | ||
| 176 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; | ||
| 177 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 178 | |||
| 179 | num_free_extents = ocfs2_num_free_extents(osb, et); | ||
| 180 | if (num_free_extents < 0) { | ||
| 181 | ret = num_free_extents; | ||
| 182 | mlog_errno(ret); | ||
| 183 | goto out; | ||
| 184 | } | ||
| 185 | |||
| 186 | if (!num_free_extents || | ||
| 187 | (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) | ||
| 188 | extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); | ||
| 189 | |||
| 190 | ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); | ||
| 191 | if (ret) { | ||
| 192 | mlog_errno(ret); | ||
| 193 | goto out; | ||
| 194 | } | ||
| 195 | |||
| 196 | if (data_ac) { | ||
| 197 | ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); | ||
| 198 | if (ret) { | ||
| 199 | mlog_errno(ret); | ||
| 200 | goto out; | ||
| 201 | } | ||
| 202 | } | ||
| 203 | |||
| 204 | *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, | ||
| 205 | clusters_to_move + 2); | ||
| 206 | |||
| 207 | mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", | ||
| 208 | extra_blocks, clusters_to_move, *credits); | ||
| 209 | out: | ||
| 210 | if (ret) { | ||
| 211 | if (*meta_ac) { | ||
| 212 | ocfs2_free_alloc_context(*meta_ac); | ||
| 213 | *meta_ac = NULL; | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 217 | return ret; | ||
| 218 | } | ||
| 219 | |||
| 220 | /* | ||
| 221 | * Using one journal handle to guarantee the data consistency in case | ||
| 222 | * crash happens anywhere. | ||
| 223 | * | ||
| 224 | * XXX: defrag can end up with finishing partial extent as requested, | ||
| 225 | * due to not enough contiguous clusters can be found in allocator. | ||
| 226 | */ | ||
| 227 | static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, | ||
| 228 | u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) | ||
| 229 | { | ||
| 230 | int ret, credits = 0, extra_blocks = 0, partial = context->partial; | ||
| 231 | handle_t *handle; | ||
| 232 | struct inode *inode = context->inode; | ||
| 233 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 234 | struct inode *tl_inode = osb->osb_tl_inode; | ||
| 235 | struct ocfs2_refcount_tree *ref_tree = NULL; | ||
| 236 | u32 new_phys_cpos, new_len; | ||
| 237 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | ||
| 238 | |||
| 239 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { | ||
| 240 | |||
| 241 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & | ||
| 242 | OCFS2_HAS_REFCOUNT_FL)); | ||
| 243 | |||
| 244 | BUG_ON(!context->refcount_loc); | ||
| 245 | |||
| 246 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | ||
| 247 | &ref_tree, NULL); | ||
| 248 | if (ret) { | ||
| 249 | mlog_errno(ret); | ||
| 250 | return ret; | ||
| 251 | } | ||
| 252 | |||
| 253 | ret = ocfs2_prepare_refcount_change_for_del(inode, | ||
| 254 | context->refcount_loc, | ||
| 255 | phys_blkno, | ||
| 256 | *len, | ||
| 257 | &credits, | ||
| 258 | &extra_blocks); | ||
| 259 | if (ret) { | ||
| 260 | mlog_errno(ret); | ||
| 261 | goto out; | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, | ||
| 266 | &context->meta_ac, | ||
| 267 | &context->data_ac, | ||
| 268 | extra_blocks, &credits); | ||
| 269 | if (ret) { | ||
| 270 | mlog_errno(ret); | ||
| 271 | goto out; | ||
| 272 | } | ||
| 273 | |||
| 274 | /* | ||
| 275 | * should be using allocation reservation strategy there? | ||
| 276 | * | ||
| 277 | * if (context->data_ac) | ||
| 278 | * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; | ||
| 279 | */ | ||
| 280 | |||
| 281 | mutex_lock(&tl_inode->i_mutex); | ||
| 282 | |||
| 283 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
| 284 | ret = __ocfs2_flush_truncate_log(osb); | ||
| 285 | if (ret < 0) { | ||
| 286 | mlog_errno(ret); | ||
| 287 | goto out_unlock_mutex; | ||
| 288 | } | ||
| 289 | } | ||
| 290 | |||
| 291 | handle = ocfs2_start_trans(osb, credits); | ||
| 292 | if (IS_ERR(handle)) { | ||
| 293 | ret = PTR_ERR(handle); | ||
| 294 | mlog_errno(ret); | ||
| 295 | goto out_unlock_mutex; | ||
| 296 | } | ||
| 297 | |||
| 298 | ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, | ||
| 299 | &new_phys_cpos, &new_len); | ||
| 300 | if (ret) { | ||
| 301 | mlog_errno(ret); | ||
| 302 | goto out_commit; | ||
| 303 | } | ||
| 304 | |||
| 305 | /* | ||
| 306 | * allowing partial extent moving is kind of 'pros and cons', it makes | ||
| 307 | * whole defragmentation less likely to fail, on the contrary, the bad | ||
| 308 | * thing is it may make the fs even more fragmented after moving, let | ||
| 309 | * userspace make a good decision here. | ||
| 310 | */ | ||
| 311 | if (new_len != *len) { | ||
| 312 | mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); | ||
| 313 | if (!partial) { | ||
| 314 | context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; | ||
| 315 | ret = -ENOSPC; | ||
| 316 | goto out_commit; | ||
| 317 | } | ||
| 318 | } | ||
| 319 | |||
| 320 | mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, | ||
| 321 | phys_cpos, new_phys_cpos); | ||
| 322 | |||
| 323 | ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, | ||
| 324 | new_phys_cpos, ext_flags); | ||
| 325 | if (ret) | ||
| 326 | mlog_errno(ret); | ||
| 327 | |||
| 328 | if (partial && (new_len != *len)) | ||
| 329 | *len = new_len; | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Here we should write the new page out first if we are | ||
| 333 | * in write-back mode. | ||
| 334 | */ | ||
| 335 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); | ||
| 336 | if (ret) | ||
| 337 | mlog_errno(ret); | ||
| 338 | |||
| 339 | out_commit: | ||
| 340 | ocfs2_commit_trans(osb, handle); | ||
| 341 | |||
| 342 | out_unlock_mutex: | ||
| 343 | mutex_unlock(&tl_inode->i_mutex); | ||
| 344 | |||
| 345 | if (context->data_ac) { | ||
| 346 | ocfs2_free_alloc_context(context->data_ac); | ||
| 347 | context->data_ac = NULL; | ||
| 348 | } | ||
| 349 | |||
| 350 | if (context->meta_ac) { | ||
| 351 | ocfs2_free_alloc_context(context->meta_ac); | ||
| 352 | context->meta_ac = NULL; | ||
| 353 | } | ||
| 354 | |||
| 355 | out: | ||
| 356 | if (ref_tree) | ||
| 357 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | ||
| 358 | |||
| 359 | return ret; | ||
| 360 | } | ||
| 361 | |||
| 362 | /* | ||
| 363 | * find the victim alloc group, where #blkno fits. | ||
| 364 | */ | ||
| 365 | static int ocfs2_find_victim_alloc_group(struct inode *inode, | ||
| 366 | u64 vict_blkno, | ||
| 367 | int type, int slot, | ||
| 368 | int *vict_bit, | ||
| 369 | struct buffer_head **ret_bh) | ||
| 370 | { | ||
| 371 | int ret, i, blocks_per_unit = 1; | ||
| 372 | u64 blkno; | ||
| 373 | char namebuf[40]; | ||
| 374 | |||
| 375 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 376 | struct buffer_head *ac_bh = NULL, *gd_bh = NULL; | ||
| 377 | struct ocfs2_chain_list *cl; | ||
| 378 | struct ocfs2_chain_rec *rec; | ||
| 379 | struct ocfs2_dinode *ac_dinode; | ||
| 380 | struct ocfs2_group_desc *bg; | ||
| 381 | |||
| 382 | ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); | ||
| 383 | ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, | ||
| 384 | strlen(namebuf), &blkno); | ||
| 385 | if (ret) { | ||
| 386 | ret = -ENOENT; | ||
| 387 | goto out; | ||
| 388 | } | ||
| 389 | |||
| 390 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); | ||
| 391 | if (ret) { | ||
| 392 | mlog_errno(ret); | ||
| 393 | goto out; | ||
| 394 | } | ||
| 395 | |||
| 396 | ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; | ||
| 397 | cl = &(ac_dinode->id2.i_chain); | ||
| 398 | rec = &(cl->cl_recs[0]); | ||
| 399 | |||
| 400 | if (type == GLOBAL_BITMAP_SYSTEM_INODE) | ||
| 401 | blocks_per_unit <<= (osb->s_clustersize_bits - | ||
| 402 | inode->i_sb->s_blocksize_bits); | ||
| 403 | /* | ||
| 404 | * 'vict_blkno' was out of the valid range. | ||
| 405 | */ | ||
| 406 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || | ||
| 407 | (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * | ||
| 408 | blocks_per_unit))) { | ||
| 409 | ret = -EINVAL; | ||
| 410 | goto out; | ||
| 411 | } | ||
| 412 | |||
| 413 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { | ||
| 414 | |||
| 415 | rec = &(cl->cl_recs[i]); | ||
| 416 | if (!rec) | ||
| 417 | continue; | ||
| 418 | |||
| 419 | bg = NULL; | ||
| 420 | |||
| 421 | do { | ||
| 422 | if (!bg) | ||
| 423 | blkno = le64_to_cpu(rec->c_blkno); | ||
| 424 | else | ||
| 425 | blkno = le64_to_cpu(bg->bg_next_group); | ||
| 426 | |||
| 427 | if (gd_bh) { | ||
| 428 | brelse(gd_bh); | ||
| 429 | gd_bh = NULL; | ||
| 430 | } | ||
| 431 | |||
| 432 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); | ||
| 433 | if (ret) { | ||
| 434 | mlog_errno(ret); | ||
| 435 | goto out; | ||
| 436 | } | ||
| 437 | |||
| 438 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | ||
| 439 | |||
| 440 | if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + | ||
| 441 | le16_to_cpu(bg->bg_bits))) { | ||
| 442 | |||
| 443 | *ret_bh = gd_bh; | ||
| 444 | *vict_bit = (vict_blkno - blkno) / | ||
| 445 | blocks_per_unit; | ||
| 446 | mlog(0, "find the victim group: #%llu, " | ||
| 447 | "total_bits: %u, vict_bit: %u\n", | ||
| 448 | blkno, le16_to_cpu(bg->bg_bits), | ||
| 449 | *vict_bit); | ||
| 450 | goto out; | ||
| 451 | } | ||
| 452 | |||
| 453 | } while (le64_to_cpu(bg->bg_next_group)); | ||
| 454 | } | ||
| 455 | |||
| 456 | ret = -EINVAL; | ||
| 457 | out: | ||
| 458 | brelse(ac_bh); | ||
| 459 | |||
| 460 | /* | ||
| 461 | * caller has to release the gd_bh properly. | ||
| 462 | */ | ||
| 463 | return ret; | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * XXX: helper to validate and adjust moving goal. | ||
| 468 | */ | ||
| 469 | static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, | ||
| 470 | struct ocfs2_move_extents *range) | ||
| 471 | { | ||
| 472 | int ret, goal_bit = 0; | ||
| 473 | |||
| 474 | struct buffer_head *gd_bh = NULL; | ||
| 475 | struct ocfs2_group_desc *bg; | ||
| 476 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 477 | int c_to_b = 1 << (osb->s_clustersize_bits - | ||
| 478 | inode->i_sb->s_blocksize_bits); | ||
| 479 | |||
| 480 | /* | ||
| 481 | * validate goal sits within global_bitmap, and return the victim | ||
| 482 | * group desc | ||
| 483 | */ | ||
| 484 | ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, | ||
| 485 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
| 486 | OCFS2_INVALID_SLOT, | ||
| 487 | &goal_bit, &gd_bh); | ||
| 488 | if (ret) | ||
| 489 | goto out; | ||
| 490 | |||
| 491 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | ||
| 492 | |||
| 493 | /* | ||
| 494 | * make goal become cluster aligned. | ||
| 495 | */ | ||
| 496 | if (range->me_goal % c_to_b) | ||
| 497 | range->me_goal = range->me_goal / c_to_b * c_to_b; | ||
| 498 | |||
| 499 | /* | ||
| 500 | * moving goal is not allowed to start with a group desc block (#0 blk), | ||
| 501 | * let's compromise to the latter cluster. | ||
| 502 | */ | ||
| 503 | if (range->me_goal == le64_to_cpu(bg->bg_blkno)) | ||
| 504 | range->me_goal += c_to_b; | ||
| 505 | |||
| 506 | /* | ||
| 507 | * movement is not going to cross two groups. | ||
| 508 | */ | ||
| 509 | if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < | ||
| 510 | range->me_len) { | ||
| 511 | ret = -EINVAL; | ||
| 512 | goto out; | ||
| 513 | } | ||
| 514 | /* | ||
| 515 | * more exact validations/adjustments will be performed later during | ||
| 516 | * moving operation for each extent range. | ||
| 517 | */ | ||
| 518 | mlog(0, "extents get ready to be moved to #%llu block\n", | ||
| 519 | range->me_goal); | ||
| 520 | |||
| 521 | out: | ||
| 522 | brelse(gd_bh); | ||
| 523 | |||
| 524 | return ret; | ||
| 525 | } | ||
| 526 | |||
| 527 | static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, | ||
| 528 | int *goal_bit, u32 move_len, u32 max_hop, | ||
| 529 | u32 *phys_cpos) | ||
| 530 | { | ||
| 531 | int i, used, last_free_bits = 0, base_bit = *goal_bit; | ||
| 532 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | ||
| 533 | u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
| 534 | le64_to_cpu(gd->bg_blkno)); | ||
| 535 | |||
| 536 | for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { | ||
| 537 | |||
| 538 | used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); | ||
| 539 | if (used) { | ||
| 540 | /* | ||
| 541 | * we even tried searching the free chunk by jumping | ||
| 542 | * a 'max_hop' distance, but still failed. | ||
| 543 | */ | ||
| 544 | if ((i - base_bit) > max_hop) { | ||
| 545 | *phys_cpos = 0; | ||
| 546 | break; | ||
| 547 | } | ||
| 548 | |||
| 549 | if (last_free_bits) | ||
| 550 | last_free_bits = 0; | ||
| 551 | |||
| 552 | continue; | ||
| 553 | } else | ||
| 554 | last_free_bits++; | ||
| 555 | |||
| 556 | if (last_free_bits == move_len) { | ||
| 557 | *goal_bit = i; | ||
| 558 | *phys_cpos = base_cpos + i; | ||
| 559 | break; | ||
| 560 | } | ||
| 561 | } | ||
| 562 | |||
| 563 | mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); | ||
| 564 | } | ||
| 565 | |||
| 566 | static int ocfs2_alloc_dinode_update_counts(struct inode *inode, | ||
| 567 | handle_t *handle, | ||
| 568 | struct buffer_head *di_bh, | ||
| 569 | u32 num_bits, | ||
| 570 | u16 chain) | ||
| 571 | { | ||
| 572 | int ret; | ||
| 573 | u32 tmp_used; | ||
| 574 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | ||
| 575 | struct ocfs2_chain_list *cl = | ||
| 576 | (struct ocfs2_chain_list *) &di->id2.i_chain; | ||
| 577 | |||
| 578 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
| 579 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 580 | if (ret < 0) { | ||
| 581 | mlog_errno(ret); | ||
| 582 | goto out; | ||
| 583 | } | ||
| 584 | |||
| 585 | tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); | ||
| 586 | di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); | ||
| 587 | le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); | ||
| 588 | ocfs2_journal_dirty(handle, di_bh); | ||
| 589 | |||
| 590 | out: | ||
| 591 | return ret; | ||
| 592 | } | ||
| 593 | |||
| 594 | static inline int ocfs2_block_group_set_bits(handle_t *handle, | ||
| 595 | struct inode *alloc_inode, | ||
| 596 | struct ocfs2_group_desc *bg, | ||
| 597 | struct buffer_head *group_bh, | ||
| 598 | unsigned int bit_off, | ||
| 599 | unsigned int num_bits) | ||
| 600 | { | ||
| 601 | int status; | ||
| 602 | void *bitmap = bg->bg_bitmap; | ||
| 603 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
| 604 | |||
| 605 | /* All callers get the descriptor via | ||
| 606 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ | ||
| 607 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); | ||
| 608 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | ||
| 609 | |||
| 610 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | ||
| 611 | num_bits); | ||
| 612 | |||
| 613 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
| 614 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
| 615 | |||
| 616 | status = ocfs2_journal_access_gd(handle, | ||
| 617 | INODE_CACHE(alloc_inode), | ||
| 618 | group_bh, | ||
| 619 | journal_type); | ||
| 620 | if (status < 0) { | ||
| 621 | mlog_errno(status); | ||
| 622 | goto bail; | ||
| 623 | } | ||
| 624 | |||
| 625 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | ||
| 626 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | ||
| 627 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | ||
| 628 | " count %u but claims %u are freed. num_bits %d", | ||
| 629 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | ||
| 630 | le16_to_cpu(bg->bg_bits), | ||
| 631 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | ||
| 632 | return -EROFS; | ||
| 633 | } | ||
| 634 | while (num_bits--) | ||
| 635 | ocfs2_set_bit(bit_off++, bitmap); | ||
| 636 | |||
| 637 | ocfs2_journal_dirty(handle, group_bh); | ||
| 638 | |||
| 639 | bail: | ||
| 640 | return status; | ||
| 641 | } | ||
| 642 | |||
| 643 | static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, | ||
| 644 | u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, | ||
| 645 | u32 len, int ext_flags) | ||
| 646 | { | ||
| 647 | int ret, credits = 0, extra_blocks = 0, goal_bit = 0; | ||
| 648 | handle_t *handle; | ||
| 649 | struct inode *inode = context->inode; | ||
| 650 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 651 | struct inode *tl_inode = osb->osb_tl_inode; | ||
| 652 | struct inode *gb_inode = NULL; | ||
| 653 | struct buffer_head *gb_bh = NULL; | ||
| 654 | struct buffer_head *gd_bh = NULL; | ||
| 655 | struct ocfs2_group_desc *gd; | ||
| 656 | struct ocfs2_refcount_tree *ref_tree = NULL; | ||
| 657 | u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, | ||
| 658 | context->range->me_threshold); | ||
| 659 | u64 phys_blkno, new_phys_blkno; | ||
| 660 | |||
| 661 | phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | ||
| 662 | |||
| 663 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { | ||
| 664 | |||
| 665 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & | ||
| 666 | OCFS2_HAS_REFCOUNT_FL)); | ||
| 667 | |||
| 668 | BUG_ON(!context->refcount_loc); | ||
| 669 | |||
| 670 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | ||
| 671 | &ref_tree, NULL); | ||
| 672 | if (ret) { | ||
| 673 | mlog_errno(ret); | ||
| 674 | return ret; | ||
| 675 | } | ||
| 676 | |||
| 677 | ret = ocfs2_prepare_refcount_change_for_del(inode, | ||
| 678 | context->refcount_loc, | ||
| 679 | phys_blkno, | ||
| 680 | len, | ||
| 681 | &credits, | ||
| 682 | &extra_blocks); | ||
| 683 | if (ret) { | ||
| 684 | mlog_errno(ret); | ||
| 685 | goto out; | ||
| 686 | } | ||
| 687 | } | ||
| 688 | |||
| 689 | ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, | ||
| 690 | &context->meta_ac, | ||
| 691 | NULL, extra_blocks, &credits); | ||
| 692 | if (ret) { | ||
| 693 | mlog_errno(ret); | ||
| 694 | goto out; | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | ||
| 698 | * need to count 2 extra credits for global_bitmap inode and | ||
| 699 | * group descriptor. | ||
| 700 | */ | ||
| 701 | credits += OCFS2_INODE_UPDATE_CREDITS + 1; | ||
| 702 | |||
| 703 | /* | ||
| 704 | * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() | ||
| 705 | * logic, while we still need to lock the global_bitmap. | ||
| 706 | */ | ||
| 707 | gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | ||
| 708 | OCFS2_INVALID_SLOT); | ||
| 709 | if (!gb_inode) { | ||
| 710 | mlog(ML_ERROR, "unable to get global_bitmap inode\n"); | ||
| 711 | ret = -EIO; | ||
| 712 | goto out; | ||
| 713 | } | ||
| 714 | |||
| 715 | mutex_lock(&gb_inode->i_mutex); | ||
| 716 | |||
| 717 | ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); | ||
| 718 | if (ret) { | ||
| 719 | mlog_errno(ret); | ||
| 720 | goto out_unlock_gb_mutex; | ||
| 721 | } | ||
| 722 | |||
| 723 | mutex_lock(&tl_inode->i_mutex); | ||
| 724 | |||
| 725 | handle = ocfs2_start_trans(osb, credits); | ||
| 726 | if (IS_ERR(handle)) { | ||
| 727 | ret = PTR_ERR(handle); | ||
| 728 | mlog_errno(ret); | ||
| 729 | goto out_unlock_tl_inode; | ||
| 730 | } | ||
| 731 | |||
| 732 | new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); | ||
| 733 | ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, | ||
| 734 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
| 735 | OCFS2_INVALID_SLOT, | ||
| 736 | &goal_bit, &gd_bh); | ||
| 737 | if (ret) { | ||
| 738 | mlog_errno(ret); | ||
| 739 | goto out_commit; | ||
| 740 | } | ||
| 741 | |||
| 742 | /* | ||
| 743 | * probe the victim cluster group to find a proper | ||
| 744 | * region to fit wanted movement, it even will perfrom | ||
| 745 | * a best-effort attempt by compromising to a threshold | ||
| 746 | * around the goal. | ||
| 747 | */ | ||
| 748 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, | ||
| 749 | new_phys_cpos); | ||
| 750 | if (!new_phys_cpos) { | ||
| 751 | ret = -ENOSPC; | ||
| 752 | goto out_commit; | ||
| 753 | } | ||
| 754 | |||
| 755 | ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, | ||
| 756 | *new_phys_cpos, ext_flags); | ||
| 757 | if (ret) { | ||
| 758 | mlog_errno(ret); | ||
| 759 | goto out_commit; | ||
| 760 | } | ||
| 761 | |||
| 762 | gd = (struct ocfs2_group_desc *)gd_bh->b_data; | ||
| 763 | ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, | ||
| 764 | le16_to_cpu(gd->bg_chain)); | ||
| 765 | if (ret) { | ||
| 766 | mlog_errno(ret); | ||
| 767 | goto out_commit; | ||
| 768 | } | ||
| 769 | |||
| 770 | ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, | ||
| 771 | goal_bit, len); | ||
| 772 | if (ret) | ||
| 773 | mlog_errno(ret); | ||
| 774 | |||
| 775 | /* | ||
| 776 | * Here we should write the new page out first if we are | ||
| 777 | * in write-back mode. | ||
| 778 | */ | ||
| 779 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); | ||
| 780 | if (ret) | ||
| 781 | mlog_errno(ret); | ||
| 782 | |||
| 783 | out_commit: | ||
| 784 | ocfs2_commit_trans(osb, handle); | ||
| 785 | brelse(gd_bh); | ||
| 786 | |||
| 787 | out_unlock_tl_inode: | ||
| 788 | mutex_unlock(&tl_inode->i_mutex); | ||
| 789 | |||
| 790 | ocfs2_inode_unlock(gb_inode, 1); | ||
| 791 | out_unlock_gb_mutex: | ||
| 792 | mutex_unlock(&gb_inode->i_mutex); | ||
| 793 | brelse(gb_bh); | ||
| 794 | iput(gb_inode); | ||
| 795 | |||
| 796 | out: | ||
| 797 | if (context->meta_ac) { | ||
| 798 | ocfs2_free_alloc_context(context->meta_ac); | ||
| 799 | context->meta_ac = NULL; | ||
| 800 | } | ||
| 801 | |||
| 802 | if (ref_tree) | ||
| 803 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | ||
| 804 | |||
| 805 | return ret; | ||
| 806 | } | ||
| 807 | |||
| 808 | /* | ||
| 809 | * Helper to calculate the defraging length in one run according to threshold. | ||
| 810 | */ | ||
| 811 | static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, | ||
| 812 | u32 threshold, int *skip) | ||
| 813 | { | ||
| 814 | if ((*alloc_size + *len_defraged) < threshold) { | ||
| 815 | /* | ||
| 816 | * proceed defragmentation until we meet the thresh | ||
| 817 | */ | ||
| 818 | *len_defraged += *alloc_size; | ||
| 819 | } else if (*len_defraged == 0) { | ||
| 820 | /* | ||
| 821 | * XXX: skip a large extent. | ||
| 822 | */ | ||
| 823 | *skip = 1; | ||
| 824 | } else { | ||
| 825 | /* | ||
| 826 | * split this extent to coalesce with former pieces as | ||
| 827 | * to reach the threshold. | ||
| 828 | * | ||
| 829 | * we're done here with one cycle of defragmentation | ||
| 830 | * in a size of 'thresh', resetting 'len_defraged' | ||
| 831 | * forces a new defragmentation. | ||
| 832 | */ | ||
| 833 | *alloc_size = threshold - *len_defraged; | ||
| 834 | *len_defraged = 0; | ||
| 835 | } | ||
| 836 | } | ||
| 837 | |||
| 838 | static int __ocfs2_move_extents_range(struct buffer_head *di_bh, | ||
| 839 | struct ocfs2_move_extents_context *context) | ||
| 840 | { | ||
| 841 | int ret = 0, flags, do_defrag, skip = 0; | ||
| 842 | u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; | ||
| 843 | u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; | ||
| 844 | |||
| 845 | struct inode *inode = context->inode; | ||
| 846 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 847 | struct ocfs2_move_extents *range = context->range; | ||
| 848 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 849 | |||
| 850 | if ((inode->i_size == 0) || (range->me_len == 0)) | ||
| 851 | return 0; | ||
| 852 | |||
| 853 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
| 854 | return 0; | ||
| 855 | |||
| 856 | context->refcount_loc = le64_to_cpu(di->i_refcount_loc); | ||
| 857 | |||
| 858 | ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); | ||
| 859 | ocfs2_init_dealloc_ctxt(&context->dealloc); | ||
| 860 | |||
| 861 | /* | ||
| 862 | * TO-DO XXX: | ||
| 863 | * | ||
| 864 | * - xattr extents. | ||
| 865 | */ | ||
| 866 | |||
| 867 | do_defrag = context->auto_defrag; | ||
| 868 | |||
| 869 | /* | ||
| 870 | * extents moving happens in unit of clusters, for the sake | ||
| 871 | * of simplicity, we may ignore two clusters where 'byte_start' | ||
| 872 | * and 'byte_start + len' were within. | ||
| 873 | */ | ||
| 874 | move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); | ||
| 875 | len_to_move = (range->me_start + range->me_len) >> | ||
| 876 | osb->s_clustersize_bits; | ||
| 877 | if (len_to_move >= move_start) | ||
| 878 | len_to_move -= move_start; | ||
| 879 | else | ||
| 880 | len_to_move = 0; | ||
| 881 | |||
| 882 | if (do_defrag) { | ||
| 883 | defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; | ||
| 884 | if (defrag_thresh <= 1) | ||
| 885 | goto done; | ||
| 886 | } else | ||
| 887 | new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
| 888 | range->me_goal); | ||
| 889 | |||
| 890 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " | ||
| 891 | "thresh: %u\n", | ||
| 892 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
| 893 | (unsigned long long)range->me_start, | ||
| 894 | (unsigned long long)range->me_len, | ||
| 895 | move_start, len_to_move, defrag_thresh); | ||
| 896 | |||
| 897 | cpos = move_start; | ||
| 898 | while (len_to_move) { | ||
| 899 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, | ||
| 900 | &flags); | ||
| 901 | if (ret) { | ||
| 902 | mlog_errno(ret); | ||
| 903 | goto out; | ||
| 904 | } | ||
| 905 | |||
| 906 | if (alloc_size > len_to_move) | ||
| 907 | alloc_size = len_to_move; | ||
| 908 | |||
| 909 | /* | ||
| 910 | * XXX: how to deal with a hole: | ||
| 911 | * | ||
| 912 | * - skip the hole of course | ||
| 913 | * - force a new defragmentation | ||
| 914 | */ | ||
| 915 | if (!phys_cpos) { | ||
| 916 | if (do_defrag) | ||
| 917 | len_defraged = 0; | ||
| 918 | |||
| 919 | goto next; | ||
| 920 | } | ||
| 921 | |||
| 922 | if (do_defrag) { | ||
| 923 | ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, | ||
| 924 | defrag_thresh, &skip); | ||
| 925 | /* | ||
| 926 | * skip large extents | ||
| 927 | */ | ||
| 928 | if (skip) { | ||
| 929 | skip = 0; | ||
| 930 | goto next; | ||
| 931 | } | ||
| 932 | |||
| 933 | mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " | ||
| 934 | "alloc_size: %u, len_defraged: %u\n", | ||
| 935 | cpos, phys_cpos, alloc_size, len_defraged); | ||
| 936 | |||
| 937 | ret = ocfs2_defrag_extent(context, cpos, phys_cpos, | ||
| 938 | &alloc_size, flags); | ||
| 939 | } else { | ||
| 940 | ret = ocfs2_move_extent(context, cpos, phys_cpos, | ||
| 941 | &new_phys_cpos, alloc_size, | ||
| 942 | flags); | ||
| 943 | |||
| 944 | new_phys_cpos += alloc_size; | ||
| 945 | } | ||
| 946 | |||
| 947 | if (ret < 0) { | ||
| 948 | mlog_errno(ret); | ||
| 949 | goto out; | ||
| 950 | } | ||
| 951 | |||
| 952 | context->clusters_moved += alloc_size; | ||
| 953 | next: | ||
| 954 | cpos += alloc_size; | ||
| 955 | len_to_move -= alloc_size; | ||
| 956 | } | ||
| 957 | |||
| 958 | done: | ||
| 959 | range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; | ||
| 960 | |||
| 961 | out: | ||
| 962 | range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, | ||
| 963 | context->clusters_moved); | ||
| 964 | range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, | ||
| 965 | context->new_phys_cpos); | ||
| 966 | |||
| 967 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
| 968 | ocfs2_run_deallocs(osb, &context->dealloc); | ||
| 969 | |||
| 970 | return ret; | ||
| 971 | } | ||
| 972 | |||
| 973 | static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) | ||
| 974 | { | ||
| 975 | int status; | ||
| 976 | handle_t *handle; | ||
| 977 | struct inode *inode = context->inode; | ||
| 978 | struct ocfs2_dinode *di; | ||
| 979 | struct buffer_head *di_bh = NULL; | ||
| 980 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 981 | |||
| 982 | if (!inode) | ||
| 983 | return -ENOENT; | ||
| 984 | |||
| 985 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | ||
| 986 | return -EROFS; | ||
| 987 | |||
| 988 | mutex_lock(&inode->i_mutex); | ||
| 989 | |||
| 990 | /* | ||
| 991 | * This prevents concurrent writes from other nodes | ||
| 992 | */ | ||
| 993 | status = ocfs2_rw_lock(inode, 1); | ||
| 994 | if (status) { | ||
| 995 | mlog_errno(status); | ||
| 996 | goto out; | ||
| 997 | } | ||
| 998 | |||
| 999 | status = ocfs2_inode_lock(inode, &di_bh, 1); | ||
| 1000 | if (status) { | ||
| 1001 | mlog_errno(status); | ||
| 1002 | goto out_rw_unlock; | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | /* | ||
| 1006 | * rememer ip_xattr_sem also needs to be held if necessary | ||
| 1007 | */ | ||
| 1008 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1009 | |||
| 1010 | status = __ocfs2_move_extents_range(di_bh, context); | ||
| 1011 | |||
| 1012 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1013 | if (status) { | ||
| 1014 | mlog_errno(status); | ||
| 1015 | goto out_inode_unlock; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | /* | ||
| 1019 | * We update ctime for these changes | ||
| 1020 | */ | ||
| 1021 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
| 1022 | if (IS_ERR(handle)) { | ||
| 1023 | status = PTR_ERR(handle); | ||
| 1024 | mlog_errno(status); | ||
| 1025 | goto out_inode_unlock; | ||
| 1026 | } | ||
| 1027 | |||
| 1028 | status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
| 1029 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1030 | if (status) { | ||
| 1031 | mlog_errno(status); | ||
| 1032 | goto out_commit; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 1036 | inode->i_ctime = CURRENT_TIME; | ||
| 1037 | di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
| 1038 | di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
| 1039 | |||
| 1040 | ocfs2_journal_dirty(handle, di_bh); | ||
| 1041 | |||
| 1042 | out_commit: | ||
| 1043 | ocfs2_commit_trans(osb, handle); | ||
| 1044 | |||
| 1045 | out_inode_unlock: | ||
| 1046 | brelse(di_bh); | ||
| 1047 | ocfs2_inode_unlock(inode, 1); | ||
| 1048 | out_rw_unlock: | ||
| 1049 | ocfs2_rw_unlock(inode, 1); | ||
| 1050 | out: | ||
| 1051 | mutex_unlock(&inode->i_mutex); | ||
| 1052 | |||
| 1053 | return status; | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) | ||
| 1057 | { | ||
| 1058 | int status; | ||
| 1059 | |||
| 1060 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
| 1061 | struct ocfs2_move_extents range; | ||
| 1062 | struct ocfs2_move_extents_context *context = NULL; | ||
| 1063 | |||
| 1064 | status = mnt_want_write(filp->f_path.mnt); | ||
| 1065 | if (status) | ||
| 1066 | return status; | ||
| 1067 | |||
| 1068 | if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) | ||
| 1069 | goto out; | ||
| 1070 | |||
| 1071 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | ||
| 1072 | status = -EPERM; | ||
| 1073 | goto out; | ||
| 1074 | } | ||
| 1075 | |||
| 1076 | context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); | ||
| 1077 | if (!context) { | ||
| 1078 | status = -ENOMEM; | ||
| 1079 | mlog_errno(status); | ||
| 1080 | goto out; | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | context->inode = inode; | ||
| 1084 | context->file = filp; | ||
| 1085 | |||
| 1086 | if (argp) { | ||
| 1087 | if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, | ||
| 1088 | sizeof(range))) { | ||
| 1089 | status = -EFAULT; | ||
| 1090 | goto out; | ||
| 1091 | } | ||
| 1092 | } else { | ||
| 1093 | status = -EINVAL; | ||
| 1094 | goto out; | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | if (range.me_start > i_size_read(inode)) | ||
| 1098 | goto out; | ||
| 1099 | |||
| 1100 | if (range.me_start + range.me_len > i_size_read(inode)) | ||
| 1101 | range.me_len = i_size_read(inode) - range.me_start; | ||
| 1102 | |||
| 1103 | context->range = ⦥ | ||
| 1104 | |||
| 1105 | if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { | ||
| 1106 | context->auto_defrag = 1; | ||
| 1107 | /* | ||
| 1108 | * ok, the default theshold for the defragmentation | ||
| 1109 | * is 1M, since our maximum clustersize was 1M also. | ||
| 1110 | * any thought? | ||
| 1111 | */ | ||
| 1112 | if (!range.me_threshold) | ||
| 1113 | range.me_threshold = 1024 * 1024; | ||
| 1114 | |||
| 1115 | if (range.me_threshold > i_size_read(inode)) | ||
| 1116 | range.me_threshold = i_size_read(inode); | ||
| 1117 | |||
| 1118 | if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) | ||
| 1119 | context->partial = 1; | ||
| 1120 | } else { | ||
| 1121 | /* | ||
| 1122 | * first best-effort attempt to validate and adjust the goal | ||
| 1123 | * (physical address in block), while it can't guarantee later | ||
| 1124 | * operation can succeed all the time since global_bitmap may | ||
| 1125 | * change a bit over time. | ||
| 1126 | */ | ||
| 1127 | |||
| 1128 | status = ocfs2_validate_and_adjust_move_goal(inode, &range); | ||
| 1129 | if (status) | ||
| 1130 | goto out; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | status = ocfs2_move_extents(context); | ||
| 1134 | if (status) | ||
| 1135 | mlog_errno(status); | ||
| 1136 | out: | ||
| 1137 | /* | ||
| 1138 | * movement/defragmentation may end up being partially completed, | ||
| 1139 | * that's the reason why we need to return userspace the finished | ||
| 1140 | * length and new_offset even if failure happens somewhere. | ||
| 1141 | */ | ||
| 1142 | if (argp) { | ||
| 1143 | if (copy_to_user((struct ocfs2_move_extents *)argp, &range, | ||
| 1144 | sizeof(range))) | ||
| 1145 | status = -EFAULT; | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | kfree(context); | ||
| 1149 | |||
| 1150 | mnt_drop_write(filp->f_path.mnt); | ||
| 1151 | |||
| 1152 | return status; | ||
| 1153 | } | ||
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h new file mode 100644 index 00000000000..4e143e81144 --- /dev/null +++ b/fs/ocfs2/move_extents.h | |||
| @@ -0,0 +1,22 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * move_extents.h | ||
| 5 | * | ||
| 6 | * Copyright (C) 2011 Oracle. All rights reserved. | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public | ||
| 10 | * License version 2 as published by the Free Software Foundation. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 15 | * General Public License for more details. | ||
| 16 | */ | ||
| 17 | #ifndef OCFS2_MOVE_EXTENTS_H | ||
| 18 | #define OCFS2_MOVE_EXTENTS_H | ||
| 19 | |||
| 20 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); | ||
| 21 | |||
| 22 | #endif /* OCFS2_MOVE_EXTENTS_H */ | ||
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index b46f39bf743..5b27ff1fa57 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h | |||
| @@ -142,6 +142,38 @@ struct ocfs2_info_journal_size { | |||
| 142 | __u64 ij_journal_size; | 142 | __u64 ij_journal_size; |
| 143 | }; | 143 | }; |
| 144 | 144 | ||
| 145 | struct ocfs2_info_freeinode { | ||
| 146 | struct ocfs2_info_request ifi_req; | ||
| 147 | struct ocfs2_info_local_freeinode { | ||
| 148 | __u64 lfi_total; | ||
| 149 | __u64 lfi_free; | ||
| 150 | } ifi_stat[OCFS2_MAX_SLOTS]; | ||
| 151 | __u32 ifi_slotnum; /* out */ | ||
| 152 | __u32 ifi_pad; | ||
| 153 | }; | ||
| 154 | |||
| 155 | #define OCFS2_INFO_MAX_HIST (32) | ||
| 156 | |||
| 157 | struct ocfs2_info_freefrag { | ||
| 158 | struct ocfs2_info_request iff_req; | ||
| 159 | struct ocfs2_info_freefrag_stats { /* (out) */ | ||
| 160 | struct ocfs2_info_free_chunk_list { | ||
| 161 | __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; | ||
| 162 | __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; | ||
| 163 | } ffs_fc_hist; | ||
| 164 | __u32 ffs_clusters; | ||
| 165 | __u32 ffs_free_clusters; | ||
| 166 | __u32 ffs_free_chunks; | ||
| 167 | __u32 ffs_free_chunks_real; | ||
| 168 | __u32 ffs_min; /* Minimum free chunksize in clusters */ | ||
| 169 | __u32 ffs_max; | ||
| 170 | __u32 ffs_avg; | ||
| 171 | __u32 ffs_pad; | ||
| 172 | } iff_ffs; | ||
| 173 | __u32 iff_chunksize; /* chunksize in clusters(in) */ | ||
| 174 | __u32 iff_pad; | ||
| 175 | }; | ||
| 176 | |||
| 145 | /* Codes for ocfs2_info_request */ | 177 | /* Codes for ocfs2_info_request */ |
| 146 | enum ocfs2_info_type { | 178 | enum ocfs2_info_type { |
| 147 | OCFS2_INFO_CLUSTERSIZE = 1, | 179 | OCFS2_INFO_CLUSTERSIZE = 1, |
| @@ -151,6 +183,8 @@ enum ocfs2_info_type { | |||
| 151 | OCFS2_INFO_UUID, | 183 | OCFS2_INFO_UUID, |
| 152 | OCFS2_INFO_FS_FEATURES, | 184 | OCFS2_INFO_FS_FEATURES, |
| 153 | OCFS2_INFO_JOURNAL_SIZE, | 185 | OCFS2_INFO_JOURNAL_SIZE, |
| 186 | OCFS2_INFO_FREEINODE, | ||
| 187 | OCFS2_INFO_FREEFRAG, | ||
| 154 | OCFS2_INFO_NUM_TYPES | 188 | OCFS2_INFO_NUM_TYPES |
| 155 | }; | 189 | }; |
| 156 | 190 | ||
| @@ -171,4 +205,38 @@ enum ocfs2_info_type { | |||
| 171 | 205 | ||
| 172 | #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) | 206 | #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) |
| 173 | 207 | ||
| 208 | struct ocfs2_move_extents { | ||
| 209 | /* All values are in bytes */ | ||
| 210 | /* in */ | ||
| 211 | __u64 me_start; /* Virtual start in the file to move */ | ||
| 212 | __u64 me_len; /* Length of the extents to be moved */ | ||
| 213 | __u64 me_goal; /* Physical offset of the goal, | ||
| 214 | it's in block unit */ | ||
| 215 | __u64 me_threshold; /* Maximum distance from goal or threshold | ||
| 216 | for auto defragmentation */ | ||
| 217 | __u64 me_flags; /* Flags for the operation: | ||
| 218 | * - auto defragmentation. | ||
| 219 | * - refcount,xattr cases. | ||
| 220 | */ | ||
| 221 | /* out */ | ||
| 222 | __u64 me_moved_len; /* Moved/defraged length */ | ||
| 223 | __u64 me_new_offset; /* Resulting physical location */ | ||
| 224 | __u32 me_reserved[2]; /* Reserved for futhure */ | ||
| 225 | }; | ||
| 226 | |||
| 227 | #define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to | ||
| 228 | claim new clusters | ||
| 229 | as the goal place | ||
| 230 | for extents moving */ | ||
| 231 | #define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent | ||
| 232 | moving, is to make | ||
| 233 | movement less likely | ||
| 234 | to fail, may make fs | ||
| 235 | even more fragmented */ | ||
| 236 | #define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation | ||
| 237 | completely gets done. | ||
| 238 | */ | ||
| 239 | |||
| 240 | #define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) | ||
| 241 | |||
| 174 | #endif /* OCFS2_IOCTL_H */ | 242 | #endif /* OCFS2_IOCTL_H */ |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3c7606cff1a..ebfd3825f12 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
| @@ -66,7 +66,7 @@ struct ocfs2_cow_context { | |||
| 66 | u32 *num_clusters, | 66 | u32 *num_clusters, |
| 67 | unsigned int *extent_flags); | 67 | unsigned int *extent_flags); |
| 68 | int (*cow_duplicate_clusters)(handle_t *handle, | 68 | int (*cow_duplicate_clusters)(handle_t *handle, |
| 69 | struct ocfs2_cow_context *context, | 69 | struct file *file, |
| 70 | u32 cpos, u32 old_cluster, | 70 | u32 cpos, u32 old_cluster, |
| 71 | u32 new_cluster, u32 new_len); | 71 | u32 new_cluster, u32 new_len); |
| 72 | }; | 72 | }; |
| @@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) | |||
| 2921 | return 0; | 2921 | return 0; |
| 2922 | } | 2922 | } |
| 2923 | 2923 | ||
| 2924 | static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | 2924 | int ocfs2_duplicate_clusters_by_page(handle_t *handle, |
| 2925 | struct ocfs2_cow_context *context, | 2925 | struct file *file, |
| 2926 | u32 cpos, u32 old_cluster, | 2926 | u32 cpos, u32 old_cluster, |
| 2927 | u32 new_cluster, u32 new_len) | 2927 | u32 new_cluster, u32 new_len) |
| 2928 | { | 2928 | { |
| 2929 | int ret = 0, partial; | 2929 | int ret = 0, partial; |
| 2930 | struct ocfs2_caching_info *ci = context->data_et.et_ci; | 2930 | struct inode *inode = file->f_path.dentry->d_inode; |
| 2931 | struct ocfs2_caching_info *ci = INODE_CACHE(inode); | ||
| 2931 | struct super_block *sb = ocfs2_metadata_cache_get_super(ci); | 2932 | struct super_block *sb = ocfs2_metadata_cache_get_super(ci); |
| 2932 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); | 2933 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); |
| 2933 | struct page *page; | 2934 | struct page *page; |
| 2934 | pgoff_t page_index; | 2935 | pgoff_t page_index; |
| 2935 | unsigned int from, to, readahead_pages; | 2936 | unsigned int from, to, readahead_pages; |
| 2936 | loff_t offset, end, map_end; | 2937 | loff_t offset, end, map_end; |
| 2937 | struct address_space *mapping = context->inode->i_mapping; | 2938 | struct address_space *mapping = inode->i_mapping; |
| 2938 | 2939 | ||
| 2939 | trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, | 2940 | trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, |
| 2940 | new_cluster, new_len); | 2941 | new_cluster, new_len); |
| @@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
| 2948 | * We only duplicate pages until we reach the page contains i_size - 1. | 2949 | * We only duplicate pages until we reach the page contains i_size - 1. |
| 2949 | * So trim 'end' to i_size. | 2950 | * So trim 'end' to i_size. |
| 2950 | */ | 2951 | */ |
| 2951 | if (end > i_size_read(context->inode)) | 2952 | if (end > i_size_read(inode)) |
| 2952 | end = i_size_read(context->inode); | 2953 | end = i_size_read(inode); |
| 2953 | 2954 | ||
| 2954 | while (offset < end) { | 2955 | while (offset < end) { |
| 2955 | page_index = offset >> PAGE_CACHE_SHIFT; | 2956 | page_index = offset >> PAGE_CACHE_SHIFT; |
| @@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
| 2972 | if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) | 2973 | if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) |
| 2973 | BUG_ON(PageDirty(page)); | 2974 | BUG_ON(PageDirty(page)); |
| 2974 | 2975 | ||
| 2975 | if (PageReadahead(page) && context->file) { | 2976 | if (PageReadahead(page)) { |
| 2976 | page_cache_async_readahead(mapping, | 2977 | page_cache_async_readahead(mapping, |
| 2977 | &context->file->f_ra, | 2978 | &file->f_ra, file, |
| 2978 | context->file, | ||
| 2979 | page, page_index, | 2979 | page, page_index, |
| 2980 | readahead_pages); | 2980 | readahead_pages); |
| 2981 | } | 2981 | } |
| @@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
| 2999 | } | 2999 | } |
| 3000 | } | 3000 | } |
| 3001 | 3001 | ||
| 3002 | ocfs2_map_and_dirty_page(context->inode, | 3002 | ocfs2_map_and_dirty_page(inode, handle, from, to, |
| 3003 | handle, from, to, | ||
| 3004 | page, 0, &new_block); | 3003 | page, 0, &new_block); |
| 3005 | mark_page_accessed(page); | 3004 | mark_page_accessed(page); |
| 3006 | unlock: | 3005 | unlock: |
| @@ -3015,14 +3014,15 @@ unlock: | |||
| 3015 | return ret; | 3014 | return ret; |
| 3016 | } | 3015 | } |
| 3017 | 3016 | ||
| 3018 | static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, | 3017 | int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, |
| 3019 | struct ocfs2_cow_context *context, | 3018 | struct file *file, |
| 3020 | u32 cpos, u32 old_cluster, | 3019 | u32 cpos, u32 old_cluster, |
| 3021 | u32 new_cluster, u32 new_len) | 3020 | u32 new_cluster, u32 new_len) |
| 3022 | { | 3021 | { |
| 3023 | int ret = 0; | 3022 | int ret = 0; |
| 3024 | struct super_block *sb = context->inode->i_sb; | 3023 | struct inode *inode = file->f_path.dentry->d_inode; |
| 3025 | struct ocfs2_caching_info *ci = context->data_et.et_ci; | 3024 | struct super_block *sb = inode->i_sb; |
| 3025 | struct ocfs2_caching_info *ci = INODE_CACHE(inode); | ||
| 3026 | int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); | 3026 | int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); |
| 3027 | u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); | 3027 | u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); |
| 3028 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); | 3028 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); |
| @@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle, | |||
| 3145 | 3145 | ||
| 3146 | /*If the old clusters is unwritten, no need to duplicate. */ | 3146 | /*If the old clusters is unwritten, no need to duplicate. */ |
| 3147 | if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { | 3147 | if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { |
| 3148 | ret = context->cow_duplicate_clusters(handle, context, cpos, | 3148 | ret = context->cow_duplicate_clusters(handle, context->file, |
| 3149 | old, new, len); | 3149 | cpos, old, new, len); |
| 3150 | if (ret) { | 3150 | if (ret) { |
| 3151 | mlog_errno(ret); | 3151 | mlog_errno(ret); |
| 3152 | goto out; | 3152 | goto out; |
| @@ -3162,22 +3162,22 @@ out: | |||
| 3162 | return ret; | 3162 | return ret; |
| 3163 | } | 3163 | } |
| 3164 | 3164 | ||
| 3165 | static int ocfs2_cow_sync_writeback(struct super_block *sb, | 3165 | int ocfs2_cow_sync_writeback(struct super_block *sb, |
| 3166 | struct ocfs2_cow_context *context, | 3166 | struct inode *inode, |
| 3167 | u32 cpos, u32 num_clusters) | 3167 | u32 cpos, u32 num_clusters) |
| 3168 | { | 3168 | { |
| 3169 | int ret = 0; | 3169 | int ret = 0; |
| 3170 | loff_t offset, end, map_end; | 3170 | loff_t offset, end, map_end; |
| 3171 | pgoff_t page_index; | 3171 | pgoff_t page_index; |
| 3172 | struct page *page; | 3172 | struct page *page; |
| 3173 | 3173 | ||
| 3174 | if (ocfs2_should_order_data(context->inode)) | 3174 | if (ocfs2_should_order_data(inode)) |
| 3175 | return 0; | 3175 | return 0; |
| 3176 | 3176 | ||
| 3177 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; | 3177 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; |
| 3178 | end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); | 3178 | end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); |
| 3179 | 3179 | ||
| 3180 | ret = filemap_fdatawrite_range(context->inode->i_mapping, | 3180 | ret = filemap_fdatawrite_range(inode->i_mapping, |
| 3181 | offset, end - 1); | 3181 | offset, end - 1); |
| 3182 | if (ret < 0) { | 3182 | if (ret < 0) { |
| 3183 | mlog_errno(ret); | 3183 | mlog_errno(ret); |
| @@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, | |||
| 3190 | if (map_end > end) | 3190 | if (map_end > end) |
| 3191 | map_end = end; | 3191 | map_end = end; |
| 3192 | 3192 | ||
| 3193 | page = find_or_create_page(context->inode->i_mapping, | 3193 | page = find_or_create_page(inode->i_mapping, |
| 3194 | page_index, GFP_NOFS); | 3194 | page_index, GFP_NOFS); |
| 3195 | BUG_ON(!page); | 3195 | BUG_ON(!page); |
| 3196 | 3196 | ||
| @@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, | |||
| 3349 | * in write-back mode. | 3349 | * in write-back mode. |
| 3350 | */ | 3350 | */ |
| 3351 | if (context->get_clusters == ocfs2_di_get_clusters) { | 3351 | if (context->get_clusters == ocfs2_di_get_clusters) { |
| 3352 | ret = ocfs2_cow_sync_writeback(sb, context, cpos, | 3352 | ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, |
| 3353 | orig_num_clusters); | 3353 | orig_num_clusters); |
| 3354 | if (ret) | 3354 | if (ret) |
| 3355 | mlog_errno(ret); | 3355 | mlog_errno(ret); |
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index c8ce46f7d8e..7754608c83a 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h | |||
| @@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, | |||
| 84 | struct buffer_head *ref_root_bh, | 84 | struct buffer_head *ref_root_bh, |
| 85 | u32 cpos, u32 write_len, | 85 | u32 cpos, u32 write_len, |
| 86 | struct ocfs2_post_refcount *post); | 86 | struct ocfs2_post_refcount *post); |
| 87 | int ocfs2_duplicate_clusters_by_page(handle_t *handle, | ||
| 88 | struct file *file, | ||
| 89 | u32 cpos, u32 old_cluster, | ||
| 90 | u32 new_cluster, u32 new_len); | ||
| 91 | int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, | ||
| 92 | struct file *file, | ||
| 93 | u32 cpos, u32 old_cluster, | ||
| 94 | u32 new_cluster, u32 new_len); | ||
| 95 | int ocfs2_cow_sync_writeback(struct super_block *sb, | ||
| 96 | struct inode *inode, | ||
| 97 | u32 cpos, u32 num_clusters); | ||
| 87 | int ocfs2_add_refcount_flag(struct inode *inode, | 98 | int ocfs2_add_refcount_flag(struct inode *inode, |
| 88 | struct ocfs2_extent_tree *data_et, | 99 | struct ocfs2_extent_tree *data_et, |
| 89 | struct ocfs2_caching_info *ref_ci, | 100 | struct ocfs2_caching_info *ref_ci, |
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 8b3a7da531e..315de66e52b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
| @@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c) | |||
| 106 | long long liab; | 106 | long long liab; |
| 107 | 107 | ||
| 108 | spin_lock(&c->space_lock); | 108 | spin_lock(&c->space_lock); |
| 109 | liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; | 109 | liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; |
| 110 | spin_unlock(&c->space_lock); | 110 | spin_unlock(&c->space_lock); |
| 111 | return liab; | 111 | return liab; |
| 112 | } | 112 | } |
| @@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) | |||
| 180 | int idx_lebs; | 180 | int idx_lebs; |
| 181 | long long idx_size; | 181 | long long idx_size; |
| 182 | 182 | ||
| 183 | idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; | 183 | idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; |
| 184 | /* And make sure we have thrice the index size of space reserved */ | 184 | /* And make sure we have thrice the index size of space reserved */ |
| 185 | idx_size += idx_size << 1; | 185 | idx_size += idx_size << 1; |
| 186 | /* | 186 | /* |
| @@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c) | |||
| 292 | * budgeted index space to the size of the current index, multiplies this by 3, | 292 | * budgeted index space to the size of the current index, multiplies this by 3, |
| 293 | * and makes sure this does not exceed the amount of free LEBs. | 293 | * and makes sure this does not exceed the amount of free LEBs. |
| 294 | * | 294 | * |
| 295 | * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: | 295 | * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: |
| 296 | * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might | 296 | * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might |
| 297 | * be large, because UBIFS does not do any index consolidation as long as | 297 | * be large, because UBIFS does not do any index consolidation as long as |
| 298 | * there is free space. IOW, the index may take a lot of LEBs, but the LEBs | 298 | * there is free space. IOW, the index may take a lot of LEBs, but the LEBs |
| 299 | * will contain a lot of dirt. | 299 | * will contain a lot of dirt. |
| 300 | * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, | 300 | * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, |
| 301 | * the index may be consolidated to take up to @c->min_idx_lebs LEBs. | 301 | * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. |
| 302 | * | 302 | * |
| 303 | * This function returns zero in case of success, and %-ENOSPC in case of | 303 | * This function returns zero in case of success, and %-ENOSPC in case of |
| 304 | * failure. | 304 | * failure. |
| @@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c) | |||
| 343 | c->lst.taken_empty_lebs; | 343 | c->lst.taken_empty_lebs; |
| 344 | if (unlikely(rsvd_idx_lebs > lebs)) { | 344 | if (unlikely(rsvd_idx_lebs > lebs)) { |
| 345 | dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " | 345 | dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " |
| 346 | "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, | 346 | "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, |
| 347 | rsvd_idx_lebs); | 347 | rsvd_idx_lebs); |
| 348 | return -ENOSPC; | 348 | return -ENOSPC; |
| 349 | } | 349 | } |
| 350 | 350 | ||
| 351 | available = ubifs_calc_available(c, min_idx_lebs); | 351 | available = ubifs_calc_available(c, min_idx_lebs); |
| 352 | outstanding = c->budg_data_growth + c->budg_dd_growth; | 352 | outstanding = c->bi.data_growth + c->bi.dd_growth; |
| 353 | 353 | ||
| 354 | if (unlikely(available < outstanding)) { | 354 | if (unlikely(available < outstanding)) { |
| 355 | dbg_budg("out of data space: available %lld, outstanding %lld", | 355 | dbg_budg("out of data space: available %lld, outstanding %lld", |
| @@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c) | |||
| 360 | if (available - outstanding <= c->rp_size && !can_use_rp(c)) | 360 | if (available - outstanding <= c->rp_size && !can_use_rp(c)) |
| 361 | return -ENOSPC; | 361 | return -ENOSPC; |
| 362 | 362 | ||
| 363 | c->min_idx_lebs = min_idx_lebs; | 363 | c->bi.min_idx_lebs = min_idx_lebs; |
| 364 | return 0; | 364 | return 0; |
| 365 | } | 365 | } |
| 366 | 366 | ||
| @@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c, | |||
| 393 | { | 393 | { |
| 394 | int data_growth; | 394 | int data_growth; |
| 395 | 395 | ||
| 396 | data_growth = req->new_ino ? c->inode_budget : 0; | 396 | data_growth = req->new_ino ? c->bi.inode_budget : 0; |
| 397 | if (req->new_page) | 397 | if (req->new_page) |
| 398 | data_growth += c->page_budget; | 398 | data_growth += c->bi.page_budget; |
| 399 | if (req->new_dent) | 399 | if (req->new_dent) |
| 400 | data_growth += c->dent_budget; | 400 | data_growth += c->bi.dent_budget; |
| 401 | data_growth += req->new_ino_d; | 401 | data_growth += req->new_ino_d; |
| 402 | return data_growth; | 402 | return data_growth; |
| 403 | } | 403 | } |
| @@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c, | |||
| 413 | { | 413 | { |
| 414 | int dd_growth; | 414 | int dd_growth; |
| 415 | 415 | ||
| 416 | dd_growth = req->dirtied_page ? c->page_budget : 0; | 416 | dd_growth = req->dirtied_page ? c->bi.page_budget : 0; |
| 417 | 417 | ||
| 418 | if (req->dirtied_ino) | 418 | if (req->dirtied_ino) |
| 419 | dd_growth += c->inode_budget << (req->dirtied_ino - 1); | 419 | dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); |
| 420 | if (req->mod_dent) | 420 | if (req->mod_dent) |
| 421 | dd_growth += c->dent_budget; | 421 | dd_growth += c->bi.dent_budget; |
| 422 | dd_growth += req->dirtied_ino_d; | 422 | dd_growth += req->dirtied_ino_d; |
| 423 | return dd_growth; | 423 | return dd_growth; |
| 424 | } | 424 | } |
| @@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
| 460 | 460 | ||
| 461 | again: | 461 | again: |
| 462 | spin_lock(&c->space_lock); | 462 | spin_lock(&c->space_lock); |
| 463 | ubifs_assert(c->budg_idx_growth >= 0); | 463 | ubifs_assert(c->bi.idx_growth >= 0); |
| 464 | ubifs_assert(c->budg_data_growth >= 0); | 464 | ubifs_assert(c->bi.data_growth >= 0); |
| 465 | ubifs_assert(c->budg_dd_growth >= 0); | 465 | ubifs_assert(c->bi.dd_growth >= 0); |
| 466 | 466 | ||
| 467 | if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { | 467 | if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { |
| 468 | dbg_budg("no space"); | 468 | dbg_budg("no space"); |
| 469 | spin_unlock(&c->space_lock); | 469 | spin_unlock(&c->space_lock); |
| 470 | return -ENOSPC; | 470 | return -ENOSPC; |
| 471 | } | 471 | } |
| 472 | 472 | ||
| 473 | c->budg_idx_growth += idx_growth; | 473 | c->bi.idx_growth += idx_growth; |
| 474 | c->budg_data_growth += data_growth; | 474 | c->bi.data_growth += data_growth; |
| 475 | c->budg_dd_growth += dd_growth; | 475 | c->bi.dd_growth += dd_growth; |
| 476 | 476 | ||
| 477 | err = do_budget_space(c); | 477 | err = do_budget_space(c); |
| 478 | if (likely(!err)) { | 478 | if (likely(!err)) { |
| @@ -484,9 +484,9 @@ again: | |||
| 484 | } | 484 | } |
| 485 | 485 | ||
| 486 | /* Restore the old values */ | 486 | /* Restore the old values */ |
| 487 | c->budg_idx_growth -= idx_growth; | 487 | c->bi.idx_growth -= idx_growth; |
| 488 | c->budg_data_growth -= data_growth; | 488 | c->bi.data_growth -= data_growth; |
| 489 | c->budg_dd_growth -= dd_growth; | 489 | c->bi.dd_growth -= dd_growth; |
| 490 | spin_unlock(&c->space_lock); | 490 | spin_unlock(&c->space_lock); |
| 491 | 491 | ||
| 492 | if (req->fast) { | 492 | if (req->fast) { |
| @@ -506,9 +506,9 @@ again: | |||
| 506 | goto again; | 506 | goto again; |
| 507 | } | 507 | } |
| 508 | dbg_budg("FS is full, -ENOSPC"); | 508 | dbg_budg("FS is full, -ENOSPC"); |
| 509 | c->nospace = 1; | 509 | c->bi.nospace = 1; |
| 510 | if (can_use_rp(c) || c->rp_size == 0) | 510 | if (can_use_rp(c) || c->rp_size == 0) |
| 511 | c->nospace_rp = 1; | 511 | c->bi.nospace_rp = 1; |
| 512 | smp_wmb(); | 512 | smp_wmb(); |
| 513 | } else | 513 | } else |
| 514 | ubifs_err("cannot budget space, error %d", err); | 514 | ubifs_err("cannot budget space, error %d", err); |
| @@ -523,8 +523,8 @@ again: | |||
| 523 | * This function releases the space budgeted by 'ubifs_budget_space()'. Note, | 523 | * This function releases the space budgeted by 'ubifs_budget_space()'. Note, |
| 524 | * since the index changes (which were budgeted for in @req->idx_growth) will | 524 | * since the index changes (which were budgeted for in @req->idx_growth) will |
| 525 | * only be written to the media on commit, this function moves the index budget | 525 | * only be written to the media on commit, this function moves the index budget |
| 526 | * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be | 526 | * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed |
| 527 | * zeroed by the commit operation. | 527 | * by the commit operation. |
| 528 | */ | 528 | */ |
| 529 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | 529 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) |
| 530 | { | 530 | { |
| @@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
| 553 | if (!req->data_growth && !req->dd_growth) | 553 | if (!req->data_growth && !req->dd_growth) |
| 554 | return; | 554 | return; |
| 555 | 555 | ||
| 556 | c->nospace = c->nospace_rp = 0; | 556 | c->bi.nospace = c->bi.nospace_rp = 0; |
| 557 | smp_wmb(); | 557 | smp_wmb(); |
| 558 | 558 | ||
| 559 | spin_lock(&c->space_lock); | 559 | spin_lock(&c->space_lock); |
| 560 | c->budg_idx_growth -= req->idx_growth; | 560 | c->bi.idx_growth -= req->idx_growth; |
| 561 | c->budg_uncommitted_idx += req->idx_growth; | 561 | c->bi.uncommitted_idx += req->idx_growth; |
| 562 | c->budg_data_growth -= req->data_growth; | 562 | c->bi.data_growth -= req->data_growth; |
| 563 | c->budg_dd_growth -= req->dd_growth; | 563 | c->bi.dd_growth -= req->dd_growth; |
| 564 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 564 | c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 565 | 565 | ||
| 566 | ubifs_assert(c->budg_idx_growth >= 0); | 566 | ubifs_assert(c->bi.idx_growth >= 0); |
| 567 | ubifs_assert(c->budg_data_growth >= 0); | 567 | ubifs_assert(c->bi.data_growth >= 0); |
| 568 | ubifs_assert(c->budg_dd_growth >= 0); | 568 | ubifs_assert(c->bi.dd_growth >= 0); |
| 569 | ubifs_assert(c->min_idx_lebs < c->main_lebs); | 569 | ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); |
| 570 | ubifs_assert(!(c->budg_idx_growth & 7)); | 570 | ubifs_assert(!(c->bi.idx_growth & 7)); |
| 571 | ubifs_assert(!(c->budg_data_growth & 7)); | 571 | ubifs_assert(!(c->bi.data_growth & 7)); |
| 572 | ubifs_assert(!(c->budg_dd_growth & 7)); | 572 | ubifs_assert(!(c->bi.dd_growth & 7)); |
| 573 | spin_unlock(&c->space_lock); | 573 | spin_unlock(&c->space_lock); |
| 574 | } | 574 | } |
| 575 | 575 | ||
| @@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c) | |||
| 586 | { | 586 | { |
| 587 | spin_lock(&c->space_lock); | 587 | spin_lock(&c->space_lock); |
| 588 | /* Release the index growth reservation */ | 588 | /* Release the index growth reservation */ |
| 589 | c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; | 589 | c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; |
| 590 | /* Release the data growth reservation */ | 590 | /* Release the data growth reservation */ |
| 591 | c->budg_data_growth -= c->page_budget; | 591 | c->bi.data_growth -= c->bi.page_budget; |
| 592 | /* Increase the dirty data growth reservation instead */ | 592 | /* Increase the dirty data growth reservation instead */ |
| 593 | c->budg_dd_growth += c->page_budget; | 593 | c->bi.dd_growth += c->bi.page_budget; |
| 594 | /* And re-calculate the indexing space reservation */ | 594 | /* And re-calculate the indexing space reservation */ |
| 595 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 595 | c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 596 | spin_unlock(&c->space_lock); | 596 | spin_unlock(&c->space_lock); |
| 597 | } | 597 | } |
| 598 | 598 | ||
| @@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, | |||
| 612 | 612 | ||
| 613 | memset(&req, 0, sizeof(struct ubifs_budget_req)); | 613 | memset(&req, 0, sizeof(struct ubifs_budget_req)); |
| 614 | /* The "no space" flags will be cleared because dd_growth is > 0 */ | 614 | /* The "no space" flags will be cleared because dd_growth is > 0 */ |
| 615 | req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); | 615 | req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); |
| 616 | ubifs_release_budget(c, &req); | 616 | ubifs_release_budget(c, &req); |
| 617 | } | 617 | } |
| 618 | 618 | ||
| @@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) | |||
| 682 | int rsvd_idx_lebs, lebs; | 682 | int rsvd_idx_lebs, lebs; |
| 683 | long long available, outstanding, free; | 683 | long long available, outstanding, free; |
| 684 | 684 | ||
| 685 | ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); | 685 | ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); |
| 686 | outstanding = c->budg_data_growth + c->budg_dd_growth; | 686 | outstanding = c->bi.data_growth + c->bi.dd_growth; |
| 687 | available = ubifs_calc_available(c, c->min_idx_lebs); | 687 | available = ubifs_calc_available(c, c->bi.min_idx_lebs); |
| 688 | 688 | ||
| 689 | /* | 689 | /* |
| 690 | * When reporting free space to user-space, UBIFS guarantees that it is | 690 | * When reporting free space to user-space, UBIFS guarantees that it is |
| @@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) | |||
| 697 | * Note, the calculations below are similar to what we have in | 697 | * Note, the calculations below are similar to what we have in |
| 698 | * 'do_budget_space()', so refer there for comments. | 698 | * 'do_budget_space()', so refer there for comments. |
| 699 | */ | 699 | */ |
| 700 | if (c->min_idx_lebs > c->lst.idx_lebs) | 700 | if (c->bi.min_idx_lebs > c->lst.idx_lebs) |
| 701 | rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; | 701 | rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; |
| 702 | else | 702 | else |
| 703 | rsvd_idx_lebs = 0; | 703 | rsvd_idx_lebs = 0; |
| 704 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - | 704 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - |
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 1bd01ded712..87cd0ead863 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c | |||
| @@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c) | |||
| 182 | c->mst_node->root_len = cpu_to_le32(zroot.len); | 182 | c->mst_node->root_len = cpu_to_le32(zroot.len); |
| 183 | c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); | 183 | c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); |
| 184 | c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); | 184 | c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); |
| 185 | c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); | 185 | c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); |
| 186 | c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); | 186 | c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); |
| 187 | c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); | 187 | c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); |
| 188 | c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); | 188 | c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); |
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 004d3745dc4..0bb2bcef0de 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c | |||
| @@ -34,7 +34,6 @@ | |||
| 34 | #include <linux/moduleparam.h> | 34 | #include <linux/moduleparam.h> |
| 35 | #include <linux/debugfs.h> | 35 | #include <linux/debugfs.h> |
| 36 | #include <linux/math64.h> | 36 | #include <linux/math64.h> |
| 37 | #include <linux/slab.h> | ||
| 38 | 37 | ||
| 39 | #ifdef CONFIG_UBIFS_FS_DEBUG | 38 | #ifdef CONFIG_UBIFS_FS_DEBUG |
| 40 | 39 | ||
| @@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock); | |||
| 43 | static char dbg_key_buf0[128]; | 42 | static char dbg_key_buf0[128]; |
| 44 | static char dbg_key_buf1[128]; | 43 | static char dbg_key_buf1[128]; |
| 45 | 44 | ||
| 46 | unsigned int ubifs_msg_flags; | ||
| 47 | unsigned int ubifs_chk_flags; | 45 | unsigned int ubifs_chk_flags; |
| 48 | unsigned int ubifs_tst_flags; | 46 | unsigned int ubifs_tst_flags; |
| 49 | 47 | ||
| 50 | module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); | ||
| 51 | module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); | 48 | module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); |
| 52 | module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); | 49 | module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); |
| 53 | 50 | ||
| 54 | MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); | ||
| 55 | MODULE_PARM_DESC(debug_chks, "Debug check flags"); | 51 | MODULE_PARM_DESC(debug_chks, "Debug check flags"); |
| 56 | MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); | 52 | MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); |
| 57 | 53 | ||
| @@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) | |||
| 317 | printk(KERN_DEBUG "\tflags %#x\n", sup_flags); | 313 | printk(KERN_DEBUG "\tflags %#x\n", sup_flags); |
| 318 | printk(KERN_DEBUG "\t big_lpt %u\n", | 314 | printk(KERN_DEBUG "\t big_lpt %u\n", |
| 319 | !!(sup_flags & UBIFS_FLG_BIGLPT)); | 315 | !!(sup_flags & UBIFS_FLG_BIGLPT)); |
| 316 | printk(KERN_DEBUG "\t space_fixup %u\n", | ||
| 317 | !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); | ||
| 320 | printk(KERN_DEBUG "\tmin_io_size %u\n", | 318 | printk(KERN_DEBUG "\tmin_io_size %u\n", |
| 321 | le32_to_cpu(sup->min_io_size)); | 319 | le32_to_cpu(sup->min_io_size)); |
| 322 | printk(KERN_DEBUG "\tleb_size %u\n", | 320 | printk(KERN_DEBUG "\tleb_size %u\n", |
| @@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst) | |||
| 602 | spin_unlock(&dbg_lock); | 600 | spin_unlock(&dbg_lock); |
| 603 | } | 601 | } |
| 604 | 602 | ||
| 605 | void dbg_dump_budg(struct ubifs_info *c) | 603 | void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) |
| 606 | { | 604 | { |
| 607 | int i; | 605 | int i; |
| 608 | struct rb_node *rb; | 606 | struct rb_node *rb; |
| @@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c) | |||
| 610 | struct ubifs_gced_idx_leb *idx_gc; | 608 | struct ubifs_gced_idx_leb *idx_gc; |
| 611 | long long available, outstanding, free; | 609 | long long available, outstanding, free; |
| 612 | 610 | ||
| 613 | ubifs_assert(spin_is_locked(&c->space_lock)); | 611 | spin_lock(&c->space_lock); |
| 614 | spin_lock(&dbg_lock); | 612 | spin_lock(&dbg_lock); |
| 615 | printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " | 613 | printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " |
| 616 | "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, | 614 | "total budget sum %lld\n", current->pid, |
| 617 | c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); | 615 | bi->data_growth + bi->dd_growth, |
| 618 | printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " | 616 | bi->data_growth + bi->dd_growth + bi->idx_growth); |
| 619 | "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, | 617 | printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " |
| 620 | c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, | 618 | "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, |
| 621 | c->freeable_cnt); | 619 | bi->idx_growth); |
| 622 | printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " | 620 | printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " |
| 623 | "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, | 621 | "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, |
| 624 | c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); | 622 | bi->uncommitted_idx); |
| 623 | printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", | ||
| 624 | bi->page_budget, bi->inode_budget, bi->dent_budget); | ||
| 625 | printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", | ||
| 626 | bi->nospace, bi->nospace_rp); | ||
| 627 | printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", | ||
| 628 | c->dark_wm, c->dead_wm, c->max_idx_node_sz); | ||
| 629 | |||
| 630 | if (bi != &c->bi) | ||
| 631 | /* | ||
| 632 | * If we are dumping saved budgeting data, do not print | ||
| 633 | * additional information which is about the current state, not | ||
| 634 | * the old one which corresponded to the saved budgeting data. | ||
| 635 | */ | ||
| 636 | goto out_unlock; | ||
| 637 | |||
| 638 | printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", | ||
| 639 | c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); | ||
| 625 | printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " | 640 | printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " |
| 626 | "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), | 641 | "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), |
| 627 | atomic_long_read(&c->dirty_zn_cnt), | 642 | atomic_long_read(&c->dirty_zn_cnt), |
| 628 | atomic_long_read(&c->clean_zn_cnt)); | 643 | atomic_long_read(&c->clean_zn_cnt)); |
| 629 | printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", | ||
| 630 | c->dark_wm, c->dead_wm, c->max_idx_node_sz); | ||
| 631 | printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", | 644 | printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", |
| 632 | c->gc_lnum, c->ihead_lnum); | 645 | c->gc_lnum, c->ihead_lnum); |
| 646 | |||
| 633 | /* If we are in R/O mode, journal heads do not exist */ | 647 | /* If we are in R/O mode, journal heads do not exist */ |
| 634 | if (c->jheads) | 648 | if (c->jheads) |
| 635 | for (i = 0; i < c->jhead_cnt; i++) | 649 | for (i = 0; i < c->jhead_cnt; i++) |
| @@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c) | |||
| 648 | printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); | 662 | printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); |
| 649 | 663 | ||
| 650 | /* Print budgeting predictions */ | 664 | /* Print budgeting predictions */ |
| 651 | available = ubifs_calc_available(c, c->min_idx_lebs); | 665 | available = ubifs_calc_available(c, c->bi.min_idx_lebs); |
| 652 | outstanding = c->budg_data_growth + c->budg_dd_growth; | 666 | outstanding = c->bi.data_growth + c->bi.dd_growth; |
| 653 | free = ubifs_get_free_space_nolock(c); | 667 | free = ubifs_get_free_space_nolock(c); |
| 654 | printk(KERN_DEBUG "Budgeting predictions:\n"); | 668 | printk(KERN_DEBUG "Budgeting predictions:\n"); |
| 655 | printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", | 669 | printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", |
| 656 | available, outstanding, free); | 670 | available, outstanding, free); |
| 671 | out_unlock: | ||
| 657 | spin_unlock(&dbg_lock); | 672 | spin_unlock(&dbg_lock); |
| 673 | spin_unlock(&c->space_lock); | ||
| 658 | } | 674 | } |
| 659 | 675 | ||
| 660 | void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) | 676 | void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) |
| @@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) | |||
| 729 | if (bud->lnum == lp->lnum) { | 745 | if (bud->lnum == lp->lnum) { |
| 730 | int head = 0; | 746 | int head = 0; |
| 731 | for (i = 0; i < c->jhead_cnt; i++) { | 747 | for (i = 0; i < c->jhead_cnt; i++) { |
| 732 | if (lp->lnum == c->jheads[i].wbuf.lnum) { | 748 | /* |
| 749 | * Note, if we are in R/O mode or in the middle | ||
| 750 | * of mounting/re-mounting, the write-buffers do | ||
| 751 | * not exist. | ||
| 752 | */ | ||
| 753 | if (c->jheads && | ||
| 754 | lp->lnum == c->jheads[i].wbuf.lnum) { | ||
| 733 | printk(KERN_CONT ", jhead %s", | 755 | printk(KERN_CONT ", jhead %s", |
| 734 | dbg_jhead(i)); | 756 | dbg_jhead(i)); |
| 735 | head = 1; | 757 | head = 1; |
| @@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c) | |||
| 976 | 998 | ||
| 977 | spin_lock(&c->space_lock); | 999 | spin_lock(&c->space_lock); |
| 978 | memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); | 1000 | memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); |
| 1001 | memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); | ||
| 1002 | d->saved_idx_gc_cnt = c->idx_gc_cnt; | ||
| 979 | 1003 | ||
| 980 | /* | 1004 | /* |
| 981 | * We use a dirty hack here and zero out @c->freeable_cnt, because it | 1005 | * We use a dirty hack here and zero out @c->freeable_cnt, because it |
| @@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c) | |||
| 1042 | out: | 1066 | out: |
| 1043 | ubifs_msg("saved lprops statistics dump"); | 1067 | ubifs_msg("saved lprops statistics dump"); |
| 1044 | dbg_dump_lstats(&d->saved_lst); | 1068 | dbg_dump_lstats(&d->saved_lst); |
| 1045 | ubifs_get_lp_stats(c, &lst); | 1069 | ubifs_msg("saved budgeting info dump"); |
| 1046 | 1070 | dbg_dump_budg(c, &d->saved_bi); | |
| 1071 | ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); | ||
| 1047 | ubifs_msg("current lprops statistics dump"); | 1072 | ubifs_msg("current lprops statistics dump"); |
| 1073 | ubifs_get_lp_stats(c, &lst); | ||
| 1048 | dbg_dump_lstats(&lst); | 1074 | dbg_dump_lstats(&lst); |
| 1049 | 1075 | ubifs_msg("current budgeting info dump"); | |
| 1050 | spin_lock(&c->space_lock); | 1076 | dbg_dump_budg(c, &c->bi); |
| 1051 | dbg_dump_budg(c); | ||
| 1052 | spin_unlock(&c->space_lock); | ||
| 1053 | dump_stack(); | 1077 | dump_stack(); |
| 1054 | return -EINVAL; | 1078 | return -EINVAL; |
| 1055 | } | 1079 | } |
| @@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, | |||
| 1793 | struct rb_node **p, *parent = NULL; | 1817 | struct rb_node **p, *parent = NULL; |
| 1794 | struct fsck_inode *fscki; | 1818 | struct fsck_inode *fscki; |
| 1795 | ino_t inum = key_inum_flash(c, &ino->key); | 1819 | ino_t inum = key_inum_flash(c, &ino->key); |
| 1820 | struct inode *inode; | ||
| 1821 | struct ubifs_inode *ui; | ||
| 1796 | 1822 | ||
| 1797 | p = &fsckd->inodes.rb_node; | 1823 | p = &fsckd->inodes.rb_node; |
| 1798 | while (*p) { | 1824 | while (*p) { |
| @@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, | |||
| 1816 | if (!fscki) | 1842 | if (!fscki) |
| 1817 | return ERR_PTR(-ENOMEM); | 1843 | return ERR_PTR(-ENOMEM); |
| 1818 | 1844 | ||
| 1845 | inode = ilookup(c->vfs_sb, inum); | ||
| 1846 | |||
| 1819 | fscki->inum = inum; | 1847 | fscki->inum = inum; |
| 1820 | fscki->nlink = le32_to_cpu(ino->nlink); | 1848 | /* |
| 1821 | fscki->size = le64_to_cpu(ino->size); | 1849 | * If the inode is present in the VFS inode cache, use it instead of |
| 1822 | fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); | 1850 | * the on-flash inode which might be out-of-date. E.g., the size might |
| 1823 | fscki->xattr_sz = le32_to_cpu(ino->xattr_size); | 1851 | * be out-of-date. If we do not do this, the following may happen, for |
| 1824 | fscki->xattr_nms = le32_to_cpu(ino->xattr_names); | 1852 | * example: |
| 1825 | fscki->mode = le32_to_cpu(ino->mode); | 1853 | * 1. A power cut happens |
| 1854 | * 2. We mount the file-system R/O, the replay process fixes up the | ||
| 1855 | * inode size in the VFS cache, but on on-flash. | ||
| 1856 | * 3. 'check_leaf()' fails because it hits a data node beyond inode | ||
| 1857 | * size. | ||
| 1858 | */ | ||
| 1859 | if (!inode) { | ||
| 1860 | fscki->nlink = le32_to_cpu(ino->nlink); | ||
| 1861 | fscki->size = le64_to_cpu(ino->size); | ||
| 1862 | fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); | ||
| 1863 | fscki->xattr_sz = le32_to_cpu(ino->xattr_size); | ||
| 1864 | fscki->xattr_nms = le32_to_cpu(ino->xattr_names); | ||
| 1865 | fscki->mode = le32_to_cpu(ino->mode); | ||
| 1866 | } else { | ||
| 1867 | ui = ubifs_inode(inode); | ||
| 1868 | fscki->nlink = inode->i_nlink; | ||
| 1869 | fscki->size = inode->i_size; | ||
| 1870 | fscki->xattr_cnt = ui->xattr_cnt; | ||
| 1871 | fscki->xattr_sz = ui->xattr_size; | ||
| 1872 | fscki->xattr_nms = ui->xattr_names; | ||
| 1873 | fscki->mode = inode->i_mode; | ||
| 1874 | iput(inode); | ||
| 1875 | } | ||
| 1876 | |||
| 1826 | if (S_ISDIR(fscki->mode)) { | 1877 | if (S_ISDIR(fscki->mode)) { |
| 1827 | fscki->calc_sz = UBIFS_INO_NODE_SZ; | 1878 | fscki->calc_sz = UBIFS_INO_NODE_SZ; |
| 1828 | fscki->calc_cnt = 2; | 1879 | fscki->calc_cnt = 2; |
| 1829 | } | 1880 | } |
| 1881 | |||
| 1830 | rb_link_node(&fscki->rb, parent, p); | 1882 | rb_link_node(&fscki->rb, parent, p); |
| 1831 | rb_insert_color(&fscki->rb, &fsckd->inodes); | 1883 | rb_insert_color(&fscki->rb, &fsckd->inodes); |
| 1884 | |||
| 1832 | return fscki; | 1885 | return fscki; |
| 1833 | } | 1886 | } |
| 1834 | 1887 | ||
| @@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) | |||
| 2421 | hashb = key_block(c, &sb->key); | 2474 | hashb = key_block(c, &sb->key); |
| 2422 | 2475 | ||
| 2423 | if (hasha > hashb) { | 2476 | if (hasha > hashb) { |
| 2424 | ubifs_err("larger hash %u goes before %u", hasha, hashb); | 2477 | ubifs_err("larger hash %u goes before %u", |
| 2478 | hasha, hashb); | ||
| 2425 | goto error_dump; | 2479 | goto error_dump; |
| 2426 | } | 2480 | } |
| 2427 | } | 2481 | } |
| @@ -2437,14 +2491,12 @@ error_dump: | |||
| 2437 | return 0; | 2491 | return 0; |
| 2438 | } | 2492 | } |
| 2439 | 2493 | ||
| 2440 | static int invocation_cnt; | ||
| 2441 | |||
| 2442 | int dbg_force_in_the_gaps(void) | 2494 | int dbg_force_in_the_gaps(void) |
| 2443 | { | 2495 | { |
| 2444 | if (!dbg_force_in_the_gaps_enabled) | 2496 | if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) |
| 2445 | return 0; | 2497 | return 0; |
| 2446 | /* Force in-the-gaps every 8th commit */ | 2498 | |
| 2447 | return !((invocation_cnt++) & 0x7); | 2499 | return !(random32() & 7); |
| 2448 | } | 2500 | } |
| 2449 | 2501 | ||
| 2450 | /* Failure mode for recovery testing */ | 2502 | /* Failure mode for recovery testing */ |
| @@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | |||
| 2632 | int len, int check) | 2684 | int len, int check) |
| 2633 | { | 2685 | { |
| 2634 | if (in_failure_mode(desc)) | 2686 | if (in_failure_mode(desc)) |
| 2635 | return -EIO; | 2687 | return -EROFS; |
| 2636 | return ubi_leb_read(desc, lnum, buf, offset, len, check); | 2688 | return ubi_leb_read(desc, lnum, buf, offset, len, check); |
| 2637 | } | 2689 | } |
| 2638 | 2690 | ||
| @@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | |||
| 2642 | int err, failing; | 2694 | int err, failing; |
| 2643 | 2695 | ||
| 2644 | if (in_failure_mode(desc)) | 2696 | if (in_failure_mode(desc)) |
| 2645 | return -EIO; | 2697 | return -EROFS; |
| 2646 | failing = do_fail(desc, lnum, 1); | 2698 | failing = do_fail(desc, lnum, 1); |
| 2647 | if (failing) | 2699 | if (failing) |
| 2648 | cut_data(buf, len); | 2700 | cut_data(buf, len); |
| @@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | |||
| 2650 | if (err) | 2702 | if (err) |
| 2651 | return err; | 2703 | return err; |
| 2652 | if (failing) | 2704 | if (failing) |
| 2653 | return -EIO; | 2705 | return -EROFS; |
| 2654 | return 0; | 2706 | return 0; |
| 2655 | } | 2707 | } |
| 2656 | 2708 | ||
| @@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, | |||
| 2660 | int err; | 2712 | int err; |
| 2661 | 2713 | ||
| 2662 | if (do_fail(desc, lnum, 1)) | 2714 | if (do_fail(desc, lnum, 1)) |
| 2663 | return -EIO; | 2715 | return -EROFS; |
| 2664 | err = ubi_leb_change(desc, lnum, buf, len, dtype); | 2716 | err = ubi_leb_change(desc, lnum, buf, len, dtype); |
| 2665 | if (err) | 2717 | if (err) |
| 2666 | return err; | 2718 | return err; |
| 2667 | if (do_fail(desc, lnum, 1)) | 2719 | if (do_fail(desc, lnum, 1)) |
| 2668 | return -EIO; | 2720 | return -EROFS; |
| 2669 | return 0; | 2721 | return 0; |
| 2670 | } | 2722 | } |
| 2671 | 2723 | ||
| @@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) | |||
| 2674 | int err; | 2726 | int err; |
| 2675 | 2727 | ||
| 2676 | if (do_fail(desc, lnum, 0)) | 2728 | if (do_fail(desc, lnum, 0)) |
| 2677 | return -EIO; | 2729 | return -EROFS; |
| 2678 | err = ubi_leb_erase(desc, lnum); | 2730 | err = ubi_leb_erase(desc, lnum); |
| 2679 | if (err) | 2731 | if (err) |
| 2680 | return err; | 2732 | return err; |
| 2681 | if (do_fail(desc, lnum, 0)) | 2733 | if (do_fail(desc, lnum, 0)) |
| 2682 | return -EIO; | 2734 | return -EROFS; |
| 2683 | return 0; | 2735 | return 0; |
| 2684 | } | 2736 | } |
| 2685 | 2737 | ||
| @@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) | |||
| 2688 | int err; | 2740 | int err; |
| 2689 | 2741 | ||
| 2690 | if (do_fail(desc, lnum, 0)) | 2742 | if (do_fail(desc, lnum, 0)) |
| 2691 | return -EIO; | 2743 | return -EROFS; |
| 2692 | err = ubi_leb_unmap(desc, lnum); | 2744 | err = ubi_leb_unmap(desc, lnum); |
| 2693 | if (err) | 2745 | if (err) |
| 2694 | return err; | 2746 | return err; |
| 2695 | if (do_fail(desc, lnum, 0)) | 2747 | if (do_fail(desc, lnum, 0)) |
| 2696 | return -EIO; | 2748 | return -EROFS; |
| 2697 | return 0; | 2749 | return 0; |
| 2698 | } | 2750 | } |
| 2699 | 2751 | ||
| 2700 | int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) | 2752 | int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) |
| 2701 | { | 2753 | { |
| 2702 | if (in_failure_mode(desc)) | 2754 | if (in_failure_mode(desc)) |
| 2703 | return -EIO; | 2755 | return -EROFS; |
| 2704 | return ubi_is_mapped(desc, lnum); | 2756 | return ubi_is_mapped(desc, lnum); |
| 2705 | } | 2757 | } |
| 2706 | 2758 | ||
| @@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) | |||
| 2709 | int err; | 2761 | int err; |
| 2710 | 2762 | ||
| 2711 | if (do_fail(desc, lnum, 0)) | 2763 | if (do_fail(desc, lnum, 0)) |
| 2712 | return -EIO; | 2764 | return -EROFS; |
| 2713 | err = ubi_leb_map(desc, lnum, dtype); | 2765 | err = ubi_leb_map(desc, lnum, dtype); |
| 2714 | if (err) | 2766 | if (err) |
| 2715 | return err; | 2767 | return err; |
| 2716 | if (do_fail(desc, lnum, 0)) | 2768 | if (do_fail(desc, lnum, 0)) |
| 2717 | return -EIO; | 2769 | return -EROFS; |
| 2718 | return 0; | 2770 | return 0; |
| 2719 | } | 2771 | } |
| 2720 | 2772 | ||
| @@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void) | |||
| 2784 | static int open_debugfs_file(struct inode *inode, struct file *file) | 2836 | static int open_debugfs_file(struct inode *inode, struct file *file) |
| 2785 | { | 2837 | { |
| 2786 | file->private_data = inode->i_private; | 2838 | file->private_data = inode->i_private; |
| 2787 | return 0; | 2839 | return nonseekable_open(inode, file); |
| 2788 | } | 2840 | } |
| 2789 | 2841 | ||
| 2790 | static ssize_t write_debugfs_file(struct file *file, const char __user *buf, | 2842 | static ssize_t write_debugfs_file(struct file *file, const char __user *buf, |
| @@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf, | |||
| 2795 | 2847 | ||
| 2796 | if (file->f_path.dentry == d->dfs_dump_lprops) | 2848 | if (file->f_path.dentry == d->dfs_dump_lprops) |
| 2797 | dbg_dump_lprops(c); | 2849 | dbg_dump_lprops(c); |
| 2798 | else if (file->f_path.dentry == d->dfs_dump_budg) { | 2850 | else if (file->f_path.dentry == d->dfs_dump_budg) |
| 2799 | spin_lock(&c->space_lock); | 2851 | dbg_dump_budg(c, &c->bi); |
| 2800 | dbg_dump_budg(c); | 2852 | else if (file->f_path.dentry == d->dfs_dump_tnc) { |
| 2801 | spin_unlock(&c->space_lock); | ||
| 2802 | } else if (file->f_path.dentry == d->dfs_dump_tnc) { | ||
| 2803 | mutex_lock(&c->tnc_mutex); | 2853 | mutex_lock(&c->tnc_mutex); |
| 2804 | dbg_dump_tnc(c); | 2854 | dbg_dump_tnc(c); |
| 2805 | mutex_unlock(&c->tnc_mutex); | 2855 | mutex_unlock(&c->tnc_mutex); |
| 2806 | } else | 2856 | } else |
| 2807 | return -EINVAL; | 2857 | return -EINVAL; |
| 2808 | 2858 | ||
| 2809 | *ppos += count; | ||
| 2810 | return count; | 2859 | return count; |
| 2811 | } | 2860 | } |
| 2812 | 2861 | ||
| @@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = { | |||
| 2814 | .open = open_debugfs_file, | 2863 | .open = open_debugfs_file, |
| 2815 | .write = write_debugfs_file, | 2864 | .write = write_debugfs_file, |
| 2816 | .owner = THIS_MODULE, | 2865 | .owner = THIS_MODULE, |
| 2817 | .llseek = default_llseek, | 2866 | .llseek = no_llseek, |
| 2818 | }; | 2867 | }; |
| 2819 | 2868 | ||
| 2820 | /** | 2869 | /** |
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index e6493cac193..a811ac4a26b 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h | |||
| @@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, | |||
| 31 | 31 | ||
| 32 | #ifdef CONFIG_UBIFS_FS_DEBUG | 32 | #ifdef CONFIG_UBIFS_FS_DEBUG |
| 33 | 33 | ||
| 34 | #include <linux/random.h> | ||
| 35 | |||
| 34 | /** | 36 | /** |
| 35 | * ubifs_debug_info - per-FS debugging information. | 37 | * ubifs_debug_info - per-FS debugging information. |
| 36 | * @old_zroot: old index root - used by 'dbg_check_old_index()' | 38 | * @old_zroot: old index root - used by 'dbg_check_old_index()' |
| @@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, | |||
| 50 | * @new_ihead_offs: used by debugging to check @c->ihead_offs | 52 | * @new_ihead_offs: used by debugging to check @c->ihead_offs |
| 51 | * | 53 | * |
| 52 | * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') | 54 | * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') |
| 53 | * @saved_free: saved free space (used by 'dbg_save_space_info()') | 55 | * @saved_bi: saved budgeting information |
| 56 | * @saved_free: saved amount of free space | ||
| 57 | * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt | ||
| 54 | * | 58 | * |
| 55 | * dfs_dir_name: name of debugfs directory containing this file-system's files | 59 | * @dfs_dir_name: name of debugfs directory containing this file-system's files |
| 56 | * dfs_dir: direntry object of the file-system debugfs directory | 60 | * @dfs_dir: direntry object of the file-system debugfs directory |
| 57 | * dfs_dump_lprops: "dump lprops" debugfs knob | 61 | * @dfs_dump_lprops: "dump lprops" debugfs knob |
| 58 | * dfs_dump_budg: "dump budgeting information" debugfs knob | 62 | * @dfs_dump_budg: "dump budgeting information" debugfs knob |
| 59 | * dfs_dump_tnc: "dump TNC" debugfs knob | 63 | * @dfs_dump_tnc: "dump TNC" debugfs knob |
| 60 | */ | 64 | */ |
| 61 | struct ubifs_debug_info { | 65 | struct ubifs_debug_info { |
| 62 | struct ubifs_zbranch old_zroot; | 66 | struct ubifs_zbranch old_zroot; |
| @@ -76,7 +80,9 @@ struct ubifs_debug_info { | |||
| 76 | int new_ihead_offs; | 80 | int new_ihead_offs; |
| 77 | 81 | ||
| 78 | struct ubifs_lp_stats saved_lst; | 82 | struct ubifs_lp_stats saved_lst; |
| 83 | struct ubifs_budg_info saved_bi; | ||
| 79 | long long saved_free; | 84 | long long saved_free; |
| 85 | int saved_idx_gc_cnt; | ||
| 80 | 86 | ||
| 81 | char dfs_dir_name[100]; | 87 | char dfs_dir_name[100]; |
| 82 | struct dentry *dfs_dir; | 88 | struct dentry *dfs_dir; |
| @@ -101,23 +107,7 @@ struct ubifs_debug_info { | |||
| 101 | } \ | 107 | } \ |
| 102 | } while (0) | 108 | } while (0) |
| 103 | 109 | ||
| 104 | #define dbg_dump_stack() do { \ | 110 | #define dbg_dump_stack() dump_stack() |
| 105 | if (!dbg_failure_mode) \ | ||
| 106 | dump_stack(); \ | ||
| 107 | } while (0) | ||
| 108 | |||
| 109 | /* Generic debugging messages */ | ||
| 110 | #define dbg_msg(fmt, ...) do { \ | ||
| 111 | spin_lock(&dbg_lock); \ | ||
| 112 | printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ | ||
| 113 | __func__, ##__VA_ARGS__); \ | ||
| 114 | spin_unlock(&dbg_lock); \ | ||
| 115 | } while (0) | ||
| 116 | |||
| 117 | #define dbg_do_msg(typ, fmt, ...) do { \ | ||
| 118 | if (ubifs_msg_flags & typ) \ | ||
| 119 | dbg_msg(fmt, ##__VA_ARGS__); \ | ||
| 120 | } while (0) | ||
| 121 | 111 | ||
| 122 | #define dbg_err(fmt, ...) do { \ | 112 | #define dbg_err(fmt, ...) do { \ |
| 123 | spin_lock(&dbg_lock); \ | 113 | spin_lock(&dbg_lock); \ |
| @@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c, | |||
| 137 | #define DBGKEY(key) dbg_key_str0(c, (key)) | 127 | #define DBGKEY(key) dbg_key_str0(c, (key)) |
| 138 | #define DBGKEY1(key) dbg_key_str1(c, (key)) | 128 | #define DBGKEY1(key) dbg_key_str1(c, (key)) |
| 139 | 129 | ||
| 140 | /* General messages */ | 130 | #define ubifs_dbg_msg(type, fmt, ...) do { \ |
| 141 | #define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) | 131 | spin_lock(&dbg_lock); \ |
| 132 | pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ | ||
| 133 | spin_unlock(&dbg_lock); \ | ||
| 134 | } while (0) | ||
| 142 | 135 | ||
| 136 | /* Just a debugging messages not related to any specific UBIFS subsystem */ | ||
| 137 | #define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) | ||
| 138 | /* General messages */ | ||
| 139 | #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) | ||
| 143 | /* Additional journal messages */ | 140 | /* Additional journal messages */ |
| 144 | #define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) | 141 | #define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) |
| 145 | |||
| 146 | /* Additional TNC messages */ | 142 | /* Additional TNC messages */ |
| 147 | #define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) | 143 | #define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) |
| 148 | |||
| 149 | /* Additional lprops messages */ | 144 | /* Additional lprops messages */ |
| 150 | #define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) | 145 | #define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) |
| 151 | |||
| 152 | /* Additional LEB find messages */ | 146 | /* Additional LEB find messages */ |
| 153 | #define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) | 147 | #define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) |
| 154 | |||
| 155 | /* Additional mount messages */ | 148 | /* Additional mount messages */ |
| 156 | #define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) | 149 | #define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) |
| 157 | |||
| 158 | /* Additional I/O messages */ | 150 | /* Additional I/O messages */ |
| 159 | #define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) | 151 | #define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) |
| 160 | |||
| 161 | /* Additional commit messages */ | 152 | /* Additional commit messages */ |
| 162 | #define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) | 153 | #define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) |
| 163 | |||
| 164 | /* Additional budgeting messages */ | 154 | /* Additional budgeting messages */ |
| 165 | #define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) | 155 | #define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) |
| 166 | |||
| 167 | /* Additional log messages */ | 156 | /* Additional log messages */ |
| 168 | #define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) | 157 | #define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) |
| 169 | |||
| 170 | /* Additional gc messages */ | 158 | /* Additional gc messages */ |
| 171 | #define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) | 159 | #define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) |
| 172 | |||
| 173 | /* Additional scan messages */ | 160 | /* Additional scan messages */ |
| 174 | #define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) | 161 | #define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) |
| 175 | |||
| 176 | /* Additional recovery messages */ | 162 | /* Additional recovery messages */ |
| 177 | #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) | 163 | #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) |
| 178 | |||
| 179 | /* | ||
| 180 | * Debugging message type flags. | ||
| 181 | * | ||
| 182 | * UBIFS_MSG_GEN: general messages | ||
| 183 | * UBIFS_MSG_JNL: journal messages | ||
| 184 | * UBIFS_MSG_MNT: mount messages | ||
| 185 | * UBIFS_MSG_CMT: commit messages | ||
| 186 | * UBIFS_MSG_FIND: LEB find messages | ||
| 187 | * UBIFS_MSG_BUDG: budgeting messages | ||
| 188 | * UBIFS_MSG_GC: garbage collection messages | ||
| 189 | * UBIFS_MSG_TNC: TNC messages | ||
| 190 | * UBIFS_MSG_LP: lprops messages | ||
| 191 | * UBIFS_MSG_IO: I/O messages | ||
| 192 | * UBIFS_MSG_LOG: log messages | ||
| 193 | * UBIFS_MSG_SCAN: scan messages | ||
| 194 | * UBIFS_MSG_RCVRY: recovery messages | ||
| 195 | */ | ||
| 196 | enum { | ||
| 197 | UBIFS_MSG_GEN = 0x1, | ||
| 198 | UBIFS_MSG_JNL = 0x2, | ||
| 199 | UBIFS_MSG_MNT = 0x4, | ||
| 200 | UBIFS_MSG_CMT = 0x8, | ||
| 201 | UBIFS_MSG_FIND = 0x10, | ||
| 202 | UBIFS_MSG_BUDG = 0x20, | ||
| 203 | UBIFS_MSG_GC = 0x40, | ||
| 204 | UBIFS_MSG_TNC = 0x80, | ||
| 205 | UBIFS_MSG_LP = 0x100, | ||
| 206 | UBIFS_MSG_IO = 0x200, | ||
| 207 | UBIFS_MSG_LOG = 0x400, | ||
| 208 | UBIFS_MSG_SCAN = 0x800, | ||
| 209 | UBIFS_MSG_RCVRY = 0x1000, | ||
| 210 | }; | ||
| 211 | 164 | ||
| 212 | /* | 165 | /* |
| 213 | * Debugging check flags. | 166 | * Debugging check flags. |
| @@ -233,11 +186,9 @@ enum { | |||
| 233 | /* | 186 | /* |
| 234 | * Special testing flags. | 187 | * Special testing flags. |
| 235 | * | 188 | * |
| 236 | * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method | ||
| 237 | * UBIFS_TST_RCVRY: failure mode for recovery testing | 189 | * UBIFS_TST_RCVRY: failure mode for recovery testing |
| 238 | */ | 190 | */ |
| 239 | enum { | 191 | enum { |
| 240 | UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, | ||
| 241 | UBIFS_TST_RCVRY = 0x4, | 192 | UBIFS_TST_RCVRY = 0x4, |
| 242 | }; | 193 | }; |
| 243 | 194 | ||
| @@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, | |||
| 262 | int offs); | 213 | int offs); |
| 263 | void dbg_dump_budget_req(const struct ubifs_budget_req *req); | 214 | void dbg_dump_budget_req(const struct ubifs_budget_req *req); |
| 264 | void dbg_dump_lstats(const struct ubifs_lp_stats *lst); | 215 | void dbg_dump_lstats(const struct ubifs_lp_stats *lst); |
| 265 | void dbg_dump_budg(struct ubifs_info *c); | 216 | void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); |
| 266 | void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); | 217 | void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); |
| 267 | void dbg_dump_lprops(struct ubifs_info *c); | 218 | void dbg_dump_lprops(struct ubifs_info *c); |
| 268 | void dbg_dump_lpt_info(struct ubifs_info *c); | 219 | void dbg_dump_lpt_info(struct ubifs_info *c); |
| @@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); | |||
| 304 | int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); | 255 | int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); |
| 305 | 256 | ||
| 306 | /* Force the use of in-the-gaps method for testing */ | 257 | /* Force the use of in-the-gaps method for testing */ |
| 307 | 258 | static inline int dbg_force_in_the_gaps_enabled(void) | |
| 308 | #define dbg_force_in_the_gaps_enabled \ | 259 | { |
| 309 | (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) | 260 | return ubifs_chk_flags & UBIFS_CHK_GEN; |
| 310 | 261 | } | |
| 311 | int dbg_force_in_the_gaps(void); | 262 | int dbg_force_in_the_gaps(void); |
| 312 | 263 | ||
| 313 | /* Failure mode for recovery testing */ | 264 | /* Failure mode for recovery testing */ |
| 314 | |||
| 315 | #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) | 265 | #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) |
| 316 | 266 | ||
| 317 | #ifndef UBIFS_DBG_PRESERVE_UBI | 267 | #ifndef UBIFS_DBG_PRESERVE_UBI |
| 318 | |||
| 319 | #define ubi_leb_read dbg_leb_read | 268 | #define ubi_leb_read dbg_leb_read |
| 320 | #define ubi_leb_write dbg_leb_write | 269 | #define ubi_leb_write dbg_leb_write |
| 321 | #define ubi_leb_change dbg_leb_change | 270 | #define ubi_leb_change dbg_leb_change |
| @@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void); | |||
| 323 | #define ubi_leb_unmap dbg_leb_unmap | 272 | #define ubi_leb_unmap dbg_leb_unmap |
| 324 | #define ubi_is_mapped dbg_is_mapped | 273 | #define ubi_is_mapped dbg_is_mapped |
| 325 | #define ubi_leb_map dbg_leb_map | 274 | #define ubi_leb_map dbg_leb_map |
| 326 | |||
| 327 | #endif | 275 | #endif |
| 328 | 276 | ||
| 329 | int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | 277 | int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, |
| @@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); | |||
| 370 | __func__, __LINE__, current->pid); \ | 318 | __func__, __LINE__, current->pid); \ |
| 371 | } while (0) | 319 | } while (0) |
| 372 | 320 | ||
| 373 | #define dbg_err(fmt, ...) do { \ | 321 | #define dbg_err(fmt, ...) do { \ |
| 374 | if (0) \ | 322 | if (0) \ |
| 375 | ubifs_err(fmt, ##__VA_ARGS__); \ | 323 | ubifs_err(fmt, ##__VA_ARGS__); \ |
| 376 | } while (0) | 324 | } while (0) |
| 377 | 325 | ||
| 378 | #define dbg_msg(fmt, ...) do { \ | 326 | #define ubifs_dbg_msg(fmt, ...) do { \ |
| 379 | if (0) \ | 327 | if (0) \ |
| 380 | printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ | 328 | pr_debug(fmt "\n", ##__VA_ARGS__); \ |
| 381 | current->pid, __func__, ##__VA_ARGS__); \ | ||
| 382 | } while (0) | 329 | } while (0) |
| 383 | 330 | ||
| 384 | #define dbg_dump_stack() | 331 | #define dbg_dump_stack() |
| 385 | #define ubifs_assert_cmt_locked(c) | 332 | #define ubifs_assert_cmt_locked(c) |
| 386 | 333 | ||
| 387 | #define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 334 | #define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 388 | #define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 335 | #define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 389 | #define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 336 | #define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 390 | #define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 337 | #define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 391 | #define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 338 | #define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 392 | #define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 339 | #define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 393 | #define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 340 | #define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 394 | #define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 341 | #define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 395 | #define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 342 | #define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 396 | #define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 343 | #define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 397 | #define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 344 | #define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 398 | #define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 345 | #define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 399 | #define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | 346 | #define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) |
| 347 | #define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) | ||
| 400 | 348 | ||
| 401 | #define DBGKEY(key) ((char *)(key)) | 349 | #define DBGKEY(key) ((char *)(key)) |
| 402 | #define DBGKEY1(key) ((char *)(key)) | 350 | #define DBGKEY1(key) ((char *)(key)) |
| @@ -420,7 +368,9 @@ static inline void | |||
| 420 | dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } | 368 | dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } |
| 421 | static inline void | 369 | static inline void |
| 422 | dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } | 370 | dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } |
| 423 | static inline void dbg_dump_budg(struct ubifs_info *c) { return; } | 371 | static inline void |
| 372 | dbg_dump_budg(struct ubifs_info *c, | ||
| 373 | const struct ubifs_budg_info *bi) { return; } | ||
| 424 | static inline void dbg_dump_lprop(const struct ubifs_info *c, | 374 | static inline void dbg_dump_lprop(const struct ubifs_info *c, |
| 425 | const struct ubifs_lprops *lp) { return; } | 375 | const struct ubifs_lprops *lp) { return; } |
| 426 | static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } | 376 | static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } |
| @@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c, | |||
| 482 | struct list_head *head) { return 0; } | 432 | struct list_head *head) { return 0; } |
| 483 | 433 | ||
| 484 | static inline int dbg_force_in_the_gaps(void) { return 0; } | 434 | static inline int dbg_force_in_the_gaps(void) { return 0; } |
| 485 | #define dbg_force_in_the_gaps_enabled 0 | 435 | #define dbg_force_in_the_gaps_enabled() 0 |
| 486 | #define dbg_failure_mode 0 | 436 | #define dbg_failure_mode 0 |
| 487 | 437 | ||
| 488 | static inline int dbg_debugfs_init(void) { return 0; } | 438 | static inline int dbg_debugfs_init(void) { return 0; } |
| 489 | static inline void dbg_debugfs_exit(void) { return; } | 439 | static inline void dbg_debugfs_exit(void) { return; } |
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7217d67a80a..ef5abd38f0b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c | |||
| @@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 603 | ubifs_release_budget(c, &req); | 603 | ubifs_release_budget(c, &req); |
| 604 | else { | 604 | else { |
| 605 | /* We've deleted something - clean the "no space" flags */ | 605 | /* We've deleted something - clean the "no space" flags */ |
| 606 | c->nospace = c->nospace_rp = 0; | 606 | c->bi.nospace = c->bi.nospace_rp = 0; |
| 607 | smp_wmb(); | 607 | smp_wmb(); |
| 608 | } | 608 | } |
| 609 | return 0; | 609 | return 0; |
| @@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 693 | ubifs_release_budget(c, &req); | 693 | ubifs_release_budget(c, &req); |
| 694 | else { | 694 | else { |
| 695 | /* We've deleted something - clean the "no space" flags */ | 695 | /* We've deleted something - clean the "no space" flags */ |
| 696 | c->nospace = c->nospace_rp = 0; | 696 | c->bi.nospace = c->bi.nospace_rp = 0; |
| 697 | smp_wmb(); | 697 | smp_wmb(); |
| 698 | } | 698 | } |
| 699 | return 0; | 699 | return 0; |
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b286db79c68..5e7fccfc4b2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
| @@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c) | |||
| 212 | */ | 212 | */ |
| 213 | static void release_existing_page_budget(struct ubifs_info *c) | 213 | static void release_existing_page_budget(struct ubifs_info *c) |
| 214 | { | 214 | { |
| 215 | struct ubifs_budget_req req = { .dd_growth = c->page_budget}; | 215 | struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; |
| 216 | 216 | ||
| 217 | ubifs_release_budget(c, &req); | 217 | ubifs_release_budget(c, &req); |
| 218 | } | 218 | } |
| @@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len) | |||
| 971 | * the page locked, and it locks @ui_mutex. However, write-back does take inode | 971 | * the page locked, and it locks @ui_mutex. However, write-back does take inode |
| 972 | * @i_mutex, which means other VFS operations may be run on this inode at the | 972 | * @i_mutex, which means other VFS operations may be run on this inode at the |
| 973 | * same time. And the problematic one is truncation to smaller size, from where | 973 | * same time. And the problematic one is truncation to smaller size, from where |
| 974 | * we have to call 'truncate_setsize()', which first changes @inode->i_size, then | 974 | * we have to call 'truncate_setsize()', which first changes @inode->i_size, |
| 975 | * drops the truncated pages. And while dropping the pages, it takes the page | 975 | * then drops the truncated pages. And while dropping the pages, it takes the |
| 976 | * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with | 976 | * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' |
| 977 | * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This | 977 | * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. |
| 978 | * means that @inode->i_size is changed while @ui_mutex is unlocked. | 978 | * This means that @inode->i_size is changed while @ui_mutex is unlocked. |
| 979 | * | 979 | * |
| 980 | * XXX(truncate): with the new truncate sequence this is not true anymore, | 980 | * XXX(truncate): with the new truncate sequence this is not true anymore, |
| 981 | * and the calls to truncate_setsize can be move around freely. They should | 981 | * and the calls to truncate_setsize can be move around freely. They should |
| @@ -1189,7 +1189,7 @@ out_budg: | |||
| 1189 | if (budgeted) | 1189 | if (budgeted) |
| 1190 | ubifs_release_budget(c, &req); | 1190 | ubifs_release_budget(c, &req); |
| 1191 | else { | 1191 | else { |
| 1192 | c->nospace = c->nospace_rp = 0; | 1192 | c->bi.nospace = c->bi.nospace_rp = 0; |
| 1193 | smp_wmb(); | 1193 | smp_wmb(); |
| 1194 | } | 1194 | } |
| 1195 | return err; | 1195 | return err; |
| @@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync) | |||
| 1312 | 1312 | ||
| 1313 | dbg_gen("syncing inode %lu", inode->i_ino); | 1313 | dbg_gen("syncing inode %lu", inode->i_ino); |
| 1314 | 1314 | ||
| 1315 | if (inode->i_sb->s_flags & MS_RDONLY) | 1315 | if (c->ro_mount) |
| 1316 | /* | ||
| 1317 | * For some really strange reasons VFS does not filter out | ||
| 1318 | * 'fsync()' for R/O mounted file-systems as per 2.6.39. | ||
| 1319 | */ | ||
| 1316 | return 0; | 1320 | return 0; |
| 1317 | 1321 | ||
| 1318 | /* | 1322 | /* |
| @@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) | |||
| 1432 | } | 1436 | } |
| 1433 | 1437 | ||
| 1434 | /* | 1438 | /* |
| 1435 | * mmap()d file has taken write protection fault and is being made | 1439 | * mmap()d file has taken write protection fault and is being made writable. |
| 1436 | * writable. UBIFS must ensure page is budgeted for. | 1440 | * UBIFS must ensure page is budgeted for. |
| 1437 | */ | 1441 | */ |
| 1438 | static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 1442 | static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, |
| 1443 | struct vm_fault *vmf) | ||
| 1439 | { | 1444 | { |
| 1440 | struct page *page = vmf->page; | 1445 | struct page *page = vmf->page; |
| 1441 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1446 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
| @@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 1536 | { | 1541 | { |
| 1537 | int err; | 1542 | int err; |
| 1538 | 1543 | ||
| 1539 | /* 'generic_file_mmap()' takes care of NOMMU case */ | ||
| 1540 | err = generic_file_mmap(file, vma); | 1544 | err = generic_file_mmap(file, vma); |
| 1541 | if (err) | 1545 | if (err) |
| 1542 | return err; | 1546 | return err; |
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1d54383d126..2559d174e00 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c | |||
| @@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | |||
| 252 | * But if the index takes fewer LEBs than it is reserved for it, | 252 | * But if the index takes fewer LEBs than it is reserved for it, |
| 253 | * this function must avoid picking those reserved LEBs. | 253 | * this function must avoid picking those reserved LEBs. |
| 254 | */ | 254 | */ |
| 255 | if (c->min_idx_lebs >= c->lst.idx_lebs) { | 255 | if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { |
| 256 | rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; | 256 | rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; |
| 257 | exclude_index = 1; | 257 | exclude_index = 1; |
| 258 | } | 258 | } |
| 259 | spin_unlock(&c->space_lock); | 259 | spin_unlock(&c->space_lock); |
| @@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | |||
| 276 | pick_free = 0; | 276 | pick_free = 0; |
| 277 | } else { | 277 | } else { |
| 278 | spin_lock(&c->space_lock); | 278 | spin_lock(&c->space_lock); |
| 279 | exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); | 279 | exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); |
| 280 | spin_unlock(&c->space_lock); | 280 | spin_unlock(&c->space_lock); |
| 281 | } | 281 | } |
| 282 | 282 | ||
| @@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, | |||
| 501 | 501 | ||
| 502 | /* Check if there are enough empty LEBs for commit */ | 502 | /* Check if there are enough empty LEBs for commit */ |
| 503 | spin_lock(&c->space_lock); | 503 | spin_lock(&c->space_lock); |
| 504 | if (c->min_idx_lebs > c->lst.idx_lebs) | 504 | if (c->bi.min_idx_lebs > c->lst.idx_lebs) |
| 505 | rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; | 505 | rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; |
| 506 | else | 506 | else |
| 507 | rsvd_idx_lebs = 0; | 507 | rsvd_idx_lebs = 0; |
| 508 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - | 508 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - |
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 151f1088282..ded29f6224c 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c | |||
| @@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c) | |||
| 100 | if (err) | 100 | if (err) |
| 101 | return err; | 101 | return err; |
| 102 | 102 | ||
| 103 | err = ubifs_wbuf_sync_nolock(wbuf); | ||
| 104 | if (err) | ||
| 105 | return err; | ||
| 106 | |||
| 103 | err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); | 107 | err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); |
| 104 | if (err) | 108 | if (err) |
| 105 | return err; | 109 | return err; |
| @@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c) | |||
| 118 | * This function compares data nodes @a and @b. Returns %1 if @a has greater | 122 | * This function compares data nodes @a and @b. Returns %1 if @a has greater |
| 119 | * inode or block number, and %-1 otherwise. | 123 | * inode or block number, and %-1 otherwise. |
| 120 | */ | 124 | */ |
| 121 | int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) | 125 | static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) |
| 122 | { | 126 | { |
| 123 | ino_t inuma, inumb; | 127 | ino_t inuma, inumb; |
| 124 | struct ubifs_info *c = priv; | 128 | struct ubifs_info *c = priv; |
| @@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) | |||
| 161 | * first and sorted by length in descending order. Directory entry nodes go | 165 | * first and sorted by length in descending order. Directory entry nodes go |
| 162 | * after inode nodes and are sorted in ascending hash valuer order. | 166 | * after inode nodes and are sorted in ascending hash valuer order. |
| 163 | */ | 167 | */ |
| 164 | int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) | 168 | static int nondata_nodes_cmp(void *priv, struct list_head *a, |
| 169 | struct list_head *b) | ||
| 165 | { | 170 | { |
| 166 | ino_t inuma, inumb; | 171 | ino_t inuma, inumb; |
| 167 | struct ubifs_info *c = priv; | 172 | struct ubifs_info *c = priv; |
| @@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) | |||
| 473 | ubifs_assert(c->gc_lnum != lnum); | 478 | ubifs_assert(c->gc_lnum != lnum); |
| 474 | ubifs_assert(wbuf->lnum != lnum); | 479 | ubifs_assert(wbuf->lnum != lnum); |
| 475 | 480 | ||
| 481 | if (lp->free + lp->dirty == c->leb_size) { | ||
| 482 | /* Special case - a free LEB */ | ||
| 483 | dbg_gc("LEB %d is free, return it", lp->lnum); | ||
| 484 | ubifs_assert(!(lp->flags & LPROPS_INDEX)); | ||
| 485 | |||
| 486 | if (lp->free != c->leb_size) { | ||
| 487 | /* | ||
| 488 | * Write buffers must be sync'd before unmapping | ||
| 489 | * freeable LEBs, because one of them may contain data | ||
| 490 | * which obsoletes something in 'lp->pnum'. | ||
| 491 | */ | ||
| 492 | err = gc_sync_wbufs(c); | ||
| 493 | if (err) | ||
| 494 | return err; | ||
| 495 | err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, | ||
| 496 | 0, 0, 0, 0); | ||
| 497 | if (err) | ||
| 498 | return err; | ||
| 499 | } | ||
| 500 | err = ubifs_leb_unmap(c, lp->lnum); | ||
| 501 | if (err) | ||
| 502 | return err; | ||
| 503 | |||
| 504 | if (c->gc_lnum == -1) { | ||
| 505 | c->gc_lnum = lnum; | ||
| 506 | return LEB_RETAINED; | ||
| 507 | } | ||
| 508 | |||
| 509 | return LEB_FREED; | ||
| 510 | } | ||
| 511 | |||
| 476 | /* | 512 | /* |
| 477 | * We scan the entire LEB even though we only really need to scan up to | 513 | * We scan the entire LEB even though we only really need to scan up to |
| 478 | * (c->leb_size - lp->free). | 514 | * (c->leb_size - lp->free). |
| @@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) | |||
| 682 | "(min. space %d)", lp.lnum, lp.free, lp.dirty, | 718 | "(min. space %d)", lp.lnum, lp.free, lp.dirty, |
| 683 | lp.free + lp.dirty, min_space); | 719 | lp.free + lp.dirty, min_space); |
| 684 | 720 | ||
| 685 | if (lp.free + lp.dirty == c->leb_size) { | ||
| 686 | /* An empty LEB was returned */ | ||
| 687 | dbg_gc("LEB %d is free, return it", lp.lnum); | ||
| 688 | /* | ||
| 689 | * ubifs_find_dirty_leb() doesn't return freeable index | ||
| 690 | * LEBs. | ||
| 691 | */ | ||
| 692 | ubifs_assert(!(lp.flags & LPROPS_INDEX)); | ||
| 693 | if (lp.free != c->leb_size) { | ||
| 694 | /* | ||
| 695 | * Write buffers must be sync'd before | ||
| 696 | * unmapping freeable LEBs, because one of them | ||
| 697 | * may contain data which obsoletes something | ||
| 698 | * in 'lp.pnum'. | ||
| 699 | */ | ||
| 700 | ret = gc_sync_wbufs(c); | ||
| 701 | if (ret) | ||
| 702 | goto out; | ||
| 703 | ret = ubifs_change_one_lp(c, lp.lnum, | ||
| 704 | c->leb_size, 0, 0, 0, | ||
| 705 | 0); | ||
| 706 | if (ret) | ||
| 707 | goto out; | ||
| 708 | } | ||
| 709 | ret = ubifs_leb_unmap(c, lp.lnum); | ||
| 710 | if (ret) | ||
| 711 | goto out; | ||
| 712 | ret = lp.lnum; | ||
| 713 | break; | ||
| 714 | } | ||
| 715 | |||
| 716 | space_before = c->leb_size - wbuf->offs - wbuf->used; | 721 | space_before = c->leb_size - wbuf->offs - wbuf->used; |
| 717 | if (wbuf->lnum == -1) | 722 | if (wbuf->lnum == -1) |
| 718 | space_before = 0; | 723 | space_before = 0; |
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index dfd168b7807..166951e0dcd 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c | |||
| @@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) | |||
| 393 | ubifs_assert(wbuf->size % c->min_io_size == 0); | 393 | ubifs_assert(wbuf->size % c->min_io_size == 0); |
| 394 | ubifs_assert(!c->ro_media && !c->ro_mount); | 394 | ubifs_assert(!c->ro_media && !c->ro_mount); |
| 395 | if (c->leb_size - wbuf->offs >= c->max_write_size) | 395 | if (c->leb_size - wbuf->offs >= c->max_write_size) |
| 396 | ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); | 396 | ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); |
| 397 | 397 | ||
| 398 | if (c->ro_error) | 398 | if (c->ro_error) |
| 399 | return -EROFS; | 399 | return -EROFS; |
| @@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) | |||
| 452 | * @dtype: data type | 452 | * @dtype: data type |
| 453 | * | 453 | * |
| 454 | * This function targets the write-buffer to logical eraseblock @lnum:@offs. | 454 | * This function targets the write-buffer to logical eraseblock @lnum:@offs. |
| 455 | * The write-buffer is synchronized if it is not empty. Returns zero in case of | 455 | * The write-buffer has to be empty. Returns zero in case of success and a |
| 456 | * success and a negative error code in case of failure. | 456 | * negative error code in case of failure. |
| 457 | */ | 457 | */ |
| 458 | int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, | 458 | int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, |
| 459 | int dtype) | 459 | int dtype) |
| @@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, | |||
| 465 | ubifs_assert(offs >= 0 && offs <= c->leb_size); | 465 | ubifs_assert(offs >= 0 && offs <= c->leb_size); |
| 466 | ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); | 466 | ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); |
| 467 | ubifs_assert(lnum != wbuf->lnum); | 467 | ubifs_assert(lnum != wbuf->lnum); |
| 468 | 468 | ubifs_assert(wbuf->used == 0); | |
| 469 | if (wbuf->used > 0) { | ||
| 470 | int err = ubifs_wbuf_sync_nolock(wbuf); | ||
| 471 | |||
| 472 | if (err) | ||
| 473 | return err; | ||
| 474 | } | ||
| 475 | 469 | ||
| 476 | spin_lock(&wbuf->lock); | 470 | spin_lock(&wbuf->lock); |
| 477 | wbuf->lnum = lnum; | 471 | wbuf->lnum = lnum; |
| @@ -573,7 +567,7 @@ out_timers: | |||
| 573 | int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | 567 | int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) |
| 574 | { | 568 | { |
| 575 | struct ubifs_info *c = wbuf->c; | 569 | struct ubifs_info *c = wbuf->c; |
| 576 | int err, written, n, aligned_len = ALIGN(len, 8), offs; | 570 | int err, written, n, aligned_len = ALIGN(len, 8); |
| 577 | 571 | ||
| 578 | dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, | 572 | dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, |
| 579 | dbg_ntype(((struct ubifs_ch *)buf)->node_type), | 573 | dbg_ntype(((struct ubifs_ch *)buf)->node_type), |
| @@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 588 | ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); | 582 | ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); |
| 589 | ubifs_assert(!c->ro_media && !c->ro_mount); | 583 | ubifs_assert(!c->ro_media && !c->ro_mount); |
| 590 | if (c->leb_size - wbuf->offs >= c->max_write_size) | 584 | if (c->leb_size - wbuf->offs >= c->max_write_size) |
| 591 | ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); | 585 | ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); |
| 592 | 586 | ||
| 593 | if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { | 587 | if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { |
| 594 | err = -ENOSPC; | 588 | err = -ENOSPC; |
| @@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 636 | goto exit; | 630 | goto exit; |
| 637 | } | 631 | } |
| 638 | 632 | ||
| 639 | offs = wbuf->offs; | ||
| 640 | written = 0; | 633 | written = 0; |
| 641 | 634 | ||
| 642 | if (wbuf->used) { | 635 | if (wbuf->used) { |
| @@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 653 | if (err) | 646 | if (err) |
| 654 | goto out; | 647 | goto out; |
| 655 | 648 | ||
| 656 | offs += wbuf->size; | 649 | wbuf->offs += wbuf->size; |
| 657 | len -= wbuf->avail; | 650 | len -= wbuf->avail; |
| 658 | aligned_len -= wbuf->avail; | 651 | aligned_len -= wbuf->avail; |
| 659 | written += wbuf->avail; | 652 | written += wbuf->avail; |
| @@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 672 | if (err) | 665 | if (err) |
| 673 | goto out; | 666 | goto out; |
| 674 | 667 | ||
| 675 | offs += wbuf->size; | 668 | wbuf->offs += wbuf->size; |
| 676 | len -= wbuf->size; | 669 | len -= wbuf->size; |
| 677 | aligned_len -= wbuf->size; | 670 | aligned_len -= wbuf->size; |
| 678 | written += wbuf->size; | 671 | written += wbuf->size; |
| @@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 687 | n = aligned_len >> c->max_write_shift; | 680 | n = aligned_len >> c->max_write_shift; |
| 688 | if (n) { | 681 | if (n) { |
| 689 | n <<= c->max_write_shift; | 682 | n <<= c->max_write_shift; |
| 690 | dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); | 683 | dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, |
| 691 | err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, | 684 | wbuf->offs); |
| 692 | wbuf->dtype); | 685 | err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, |
| 686 | wbuf->offs, n, wbuf->dtype); | ||
| 693 | if (err) | 687 | if (err) |
| 694 | goto out; | 688 | goto out; |
| 695 | offs += n; | 689 | wbuf->offs += n; |
| 696 | aligned_len -= n; | 690 | aligned_len -= n; |
| 697 | len -= n; | 691 | len -= n; |
| 698 | written += n; | 692 | written += n; |
| @@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) | |||
| 707 | */ | 701 | */ |
| 708 | memcpy(wbuf->buf, buf + written, len); | 702 | memcpy(wbuf->buf, buf + written, len); |
| 709 | 703 | ||
| 710 | wbuf->offs = offs; | ||
| 711 | if (c->leb_size - wbuf->offs >= c->max_write_size) | 704 | if (c->leb_size - wbuf->offs >= c->max_write_size) |
| 712 | wbuf->size = c->max_write_size; | 705 | wbuf->size = c->max_write_size; |
| 713 | else | 706 | else |
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index aed25e86422..34b1679e6e3 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c | |||
| @@ -141,14 +141,8 @@ again: | |||
| 141 | * LEB with some empty space. | 141 | * LEB with some empty space. |
| 142 | */ | 142 | */ |
| 143 | lnum = ubifs_find_free_space(c, len, &offs, squeeze); | 143 | lnum = ubifs_find_free_space(c, len, &offs, squeeze); |
| 144 | if (lnum >= 0) { | 144 | if (lnum >= 0) |
| 145 | /* Found an LEB, add it to the journal head */ | ||
| 146 | err = ubifs_add_bud_to_log(c, jhead, lnum, offs); | ||
| 147 | if (err) | ||
| 148 | goto out_return; | ||
| 149 | /* A new bud was successfully allocated and added to the log */ | ||
| 150 | goto out; | 145 | goto out; |
| 151 | } | ||
| 152 | 146 | ||
| 153 | err = lnum; | 147 | err = lnum; |
| 154 | if (err != -ENOSPC) | 148 | if (err != -ENOSPC) |
| @@ -203,12 +197,23 @@ again: | |||
| 203 | return 0; | 197 | return 0; |
| 204 | } | 198 | } |
| 205 | 199 | ||
| 206 | err = ubifs_add_bud_to_log(c, jhead, lnum, 0); | ||
| 207 | if (err) | ||
| 208 | goto out_return; | ||
| 209 | offs = 0; | 200 | offs = 0; |
| 210 | 201 | ||
| 211 | out: | 202 | out: |
| 203 | /* | ||
| 204 | * Make sure we synchronize the write-buffer before we add the new bud | ||
| 205 | * to the log. Otherwise we may have a power cut after the log | ||
| 206 | * reference node for the last bud (@lnum) is written but before the | ||
| 207 | * write-buffer data are written to the next-to-last bud | ||
| 208 | * (@wbuf->lnum). And the effect would be that the recovery would see | ||
| 209 | * that there is corruption in the next-to-last bud. | ||
| 210 | */ | ||
| 211 | err = ubifs_wbuf_sync_nolock(wbuf); | ||
| 212 | if (err) | ||
| 213 | goto out_return; | ||
| 214 | err = ubifs_add_bud_to_log(c, jhead, lnum, offs); | ||
| 215 | if (err) | ||
| 216 | goto out_return; | ||
| 212 | err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); | 217 | err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); |
| 213 | if (err) | 218 | if (err) |
| 214 | goto out_unlock; | 219 | goto out_unlock; |
| @@ -380,10 +385,8 @@ out: | |||
| 380 | if (err == -ENOSPC) { | 385 | if (err == -ENOSPC) { |
| 381 | /* This are some budgeting problems, print useful information */ | 386 | /* This are some budgeting problems, print useful information */ |
| 382 | down_write(&c->commit_sem); | 387 | down_write(&c->commit_sem); |
| 383 | spin_lock(&c->space_lock); | ||
| 384 | dbg_dump_stack(); | 388 | dbg_dump_stack(); |
| 385 | dbg_dump_budg(c); | 389 | dbg_dump_budg(c, &c->bi); |
| 386 | spin_unlock(&c->space_lock); | ||
| 387 | dbg_dump_lprops(c); | 390 | dbg_dump_lprops(c); |
| 388 | cmt_retries = dbg_check_lprops(c); | 391 | cmt_retries = dbg_check_lprops(c); |
| 389 | up_write(&c->commit_sem); | 392 | up_write(&c->commit_sem); |
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 40fa780ebea..affea9494ae 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c | |||
| @@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) | |||
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | /** | 102 | /** |
| 103 | * next_log_lnum - switch to the next log LEB. | ||
| 104 | * @c: UBIFS file-system description object | ||
| 105 | * @lnum: current log LEB | ||
| 106 | */ | ||
| 107 | static inline int next_log_lnum(const struct ubifs_info *c, int lnum) | ||
| 108 | { | ||
| 109 | lnum += 1; | ||
| 110 | if (lnum > c->log_last) | ||
| 111 | lnum = UBIFS_LOG_LNUM; | ||
| 112 | |||
| 113 | return lnum; | ||
| 114 | } | ||
| 115 | |||
| 116 | /** | ||
| 117 | * empty_log_bytes - calculate amount of empty space in the log. | 103 | * empty_log_bytes - calculate amount of empty space in the log. |
| 118 | * @c: UBIFS file-system description object | 104 | * @c: UBIFS file-system description object |
| 119 | */ | 105 | */ |
| @@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) | |||
| 257 | ref->jhead = cpu_to_le32(jhead); | 243 | ref->jhead = cpu_to_le32(jhead); |
| 258 | 244 | ||
| 259 | if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { | 245 | if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { |
| 260 | c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); | 246 | c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); |
| 261 | c->lhead_offs = 0; | 247 | c->lhead_offs = 0; |
| 262 | } | 248 | } |
| 263 | 249 | ||
| @@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) | |||
| 425 | 411 | ||
| 426 | /* Switch to the next log LEB */ | 412 | /* Switch to the next log LEB */ |
| 427 | if (c->lhead_offs) { | 413 | if (c->lhead_offs) { |
| 428 | c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); | 414 | c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); |
| 429 | c->lhead_offs = 0; | 415 | c->lhead_offs = 0; |
| 430 | } | 416 | } |
| 431 | 417 | ||
| @@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) | |||
| 446 | 432 | ||
| 447 | c->lhead_offs += len; | 433 | c->lhead_offs += len; |
| 448 | if (c->lhead_offs == c->leb_size) { | 434 | if (c->lhead_offs == c->leb_size) { |
| 449 | c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); | 435 | c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); |
| 450 | c->lhead_offs = 0; | 436 | c->lhead_offs = 0; |
| 451 | } | 437 | } |
| 452 | 438 | ||
| @@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) | |||
| 533 | } | 519 | } |
| 534 | mutex_lock(&c->log_mutex); | 520 | mutex_lock(&c->log_mutex); |
| 535 | for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; | 521 | for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; |
| 536 | lnum = next_log_lnum(c, lnum)) { | 522 | lnum = ubifs_next_log_lnum(c, lnum)) { |
| 537 | dbg_log("unmap log LEB %d", lnum); | 523 | dbg_log("unmap log LEB %d", lnum); |
| 538 | err = ubifs_leb_unmap(c, lnum); | 524 | err = ubifs_leb_unmap(c, lnum); |
| 539 | if (err) | 525 | if (err) |
| @@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, | |||
| 642 | err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); | 628 | err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); |
| 643 | if (err) | 629 | if (err) |
| 644 | return err; | 630 | return err; |
| 645 | *lnum = next_log_lnum(c, *lnum); | 631 | *lnum = ubifs_next_log_lnum(c, *lnum); |
| 646 | *offs = 0; | 632 | *offs = 0; |
| 647 | } | 633 | } |
| 648 | memcpy(buf + *offs, node, len); | 634 | memcpy(buf + *offs, node, len); |
| @@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) | |||
| 712 | ubifs_scan_destroy(sleb); | 698 | ubifs_scan_destroy(sleb); |
| 713 | if (lnum == c->lhead_lnum) | 699 | if (lnum == c->lhead_lnum) |
| 714 | break; | 700 | break; |
| 715 | lnum = next_log_lnum(c, lnum); | 701 | lnum = ubifs_next_log_lnum(c, lnum); |
| 716 | } | 702 | } |
| 717 | if (offs) { | 703 | if (offs) { |
| 718 | int sz = ALIGN(offs, c->min_io_size); | 704 | int sz = ALIGN(offs, c->min_io_size); |
| @@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) | |||
| 732 | /* Unmap remaining LEBs */ | 718 | /* Unmap remaining LEBs */ |
| 733 | lnum = write_lnum; | 719 | lnum = write_lnum; |
| 734 | do { | 720 | do { |
| 735 | lnum = next_log_lnum(c, lnum); | 721 | lnum = ubifs_next_log_lnum(c, lnum); |
| 736 | err = ubifs_leb_unmap(c, lnum); | 722 | err = ubifs_leb_unmap(c, lnum); |
| 737 | if (err) | 723 | if (err) |
| 738 | return err; | 724 | return err; |
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 0ee0847f242..667884f4a61 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c | |||
| @@ -1007,21 +1007,11 @@ out: | |||
| 1007 | } | 1007 | } |
| 1008 | 1008 | ||
| 1009 | /** | 1009 | /** |
| 1010 | * struct scan_check_data - data provided to scan callback function. | ||
| 1011 | * @lst: LEB properties statistics | ||
| 1012 | * @err: error code | ||
| 1013 | */ | ||
| 1014 | struct scan_check_data { | ||
| 1015 | struct ubifs_lp_stats lst; | ||
| 1016 | int err; | ||
| 1017 | }; | ||
| 1018 | |||
| 1019 | /** | ||
| 1020 | * scan_check_cb - scan callback. | 1010 | * scan_check_cb - scan callback. |
| 1021 | * @c: the UBIFS file-system description object | 1011 | * @c: the UBIFS file-system description object |
| 1022 | * @lp: LEB properties to scan | 1012 | * @lp: LEB properties to scan |
| 1023 | * @in_tree: whether the LEB properties are in main memory | 1013 | * @in_tree: whether the LEB properties are in main memory |
| 1024 | * @data: information passed to and from the caller of the scan | 1014 | * @lst: lprops statistics to update |
| 1025 | * | 1015 | * |
| 1026 | * This function returns a code that indicates whether the scan should continue | 1016 | * This function returns a code that indicates whether the scan should continue |
| 1027 | * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree | 1017 | * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree |
| @@ -1030,11 +1020,10 @@ struct scan_check_data { | |||
| 1030 | */ | 1020 | */ |
| 1031 | static int scan_check_cb(struct ubifs_info *c, | 1021 | static int scan_check_cb(struct ubifs_info *c, |
| 1032 | const struct ubifs_lprops *lp, int in_tree, | 1022 | const struct ubifs_lprops *lp, int in_tree, |
| 1033 | struct scan_check_data *data) | 1023 | struct ubifs_lp_stats *lst) |
| 1034 | { | 1024 | { |
| 1035 | struct ubifs_scan_leb *sleb; | 1025 | struct ubifs_scan_leb *sleb; |
| 1036 | struct ubifs_scan_node *snod; | 1026 | struct ubifs_scan_node *snod; |
| 1037 | struct ubifs_lp_stats *lst = &data->lst; | ||
| 1038 | int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; | 1027 | int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; |
| 1039 | void *buf = NULL; | 1028 | void *buf = NULL; |
| 1040 | 1029 | ||
| @@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c, | |||
| 1044 | if (cat != (lp->flags & LPROPS_CAT_MASK)) { | 1033 | if (cat != (lp->flags & LPROPS_CAT_MASK)) { |
| 1045 | ubifs_err("bad LEB category %d expected %d", | 1034 | ubifs_err("bad LEB category %d expected %d", |
| 1046 | (lp->flags & LPROPS_CAT_MASK), cat); | 1035 | (lp->flags & LPROPS_CAT_MASK), cat); |
| 1047 | goto out; | 1036 | return -EINVAL; |
| 1048 | } | 1037 | } |
| 1049 | } | 1038 | } |
| 1050 | 1039 | ||
| @@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c, | |||
| 1078 | } | 1067 | } |
| 1079 | if (!found) { | 1068 | if (!found) { |
| 1080 | ubifs_err("bad LPT list (category %d)", cat); | 1069 | ubifs_err("bad LPT list (category %d)", cat); |
| 1081 | goto out; | 1070 | return -EINVAL; |
| 1082 | } | 1071 | } |
| 1083 | } | 1072 | } |
| 1084 | } | 1073 | } |
| @@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c, | |||
| 1090 | if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || | 1079 | if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || |
| 1091 | lp != heap->arr[lp->hpos]) { | 1080 | lp != heap->arr[lp->hpos]) { |
| 1092 | ubifs_err("bad LPT heap (category %d)", cat); | 1081 | ubifs_err("bad LPT heap (category %d)", cat); |
| 1093 | goto out; | 1082 | return -EINVAL; |
| 1094 | } | 1083 | } |
| 1095 | } | 1084 | } |
| 1096 | 1085 | ||
| 1097 | buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); | 1086 | buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); |
| 1098 | if (!buf) { | 1087 | if (!buf) |
| 1099 | ubifs_err("cannot allocate memory to scan LEB %d", lnum); | 1088 | return -ENOMEM; |
| 1100 | goto out; | 1089 | |
| 1090 | /* | ||
| 1091 | * After an unclean unmount, empty and freeable LEBs | ||
| 1092 | * may contain garbage - do not scan them. | ||
| 1093 | */ | ||
| 1094 | if (lp->free == c->leb_size) { | ||
| 1095 | lst->empty_lebs += 1; | ||
| 1096 | lst->total_free += c->leb_size; | ||
| 1097 | lst->total_dark += ubifs_calc_dark(c, c->leb_size); | ||
| 1098 | return LPT_SCAN_CONTINUE; | ||
| 1099 | } | ||
| 1100 | if (lp->free + lp->dirty == c->leb_size && | ||
| 1101 | !(lp->flags & LPROPS_INDEX)) { | ||
| 1102 | lst->total_free += lp->free; | ||
| 1103 | lst->total_dirty += lp->dirty; | ||
| 1104 | lst->total_dark += ubifs_calc_dark(c, c->leb_size); | ||
| 1105 | return LPT_SCAN_CONTINUE; | ||
| 1101 | } | 1106 | } |
| 1102 | 1107 | ||
| 1103 | sleb = ubifs_scan(c, lnum, 0, buf, 0); | 1108 | sleb = ubifs_scan(c, lnum, 0, buf, 0); |
| 1104 | if (IS_ERR(sleb)) { | 1109 | if (IS_ERR(sleb)) { |
| 1105 | /* | 1110 | ret = PTR_ERR(sleb); |
| 1106 | * After an unclean unmount, empty and freeable LEBs | 1111 | if (ret == -EUCLEAN) { |
| 1107 | * may contain garbage. | 1112 | dbg_dump_lprops(c); |
| 1108 | */ | 1113 | dbg_dump_budg(c, &c->bi); |
| 1109 | if (lp->free == c->leb_size) { | ||
| 1110 | ubifs_err("scan errors were in empty LEB " | ||
| 1111 | "- continuing checking"); | ||
| 1112 | lst->empty_lebs += 1; | ||
| 1113 | lst->total_free += c->leb_size; | ||
| 1114 | lst->total_dark += ubifs_calc_dark(c, c->leb_size); | ||
| 1115 | ret = LPT_SCAN_CONTINUE; | ||
| 1116 | goto exit; | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | if (lp->free + lp->dirty == c->leb_size && | ||
| 1120 | !(lp->flags & LPROPS_INDEX)) { | ||
| 1121 | ubifs_err("scan errors were in freeable LEB " | ||
| 1122 | "- continuing checking"); | ||
| 1123 | lst->total_free += lp->free; | ||
| 1124 | lst->total_dirty += lp->dirty; | ||
| 1125 | lst->total_dark += ubifs_calc_dark(c, c->leb_size); | ||
| 1126 | ret = LPT_SCAN_CONTINUE; | ||
| 1127 | goto exit; | ||
| 1128 | } | 1114 | } |
| 1129 | data->err = PTR_ERR(sleb); | 1115 | goto out; |
| 1130 | ret = LPT_SCAN_STOP; | ||
| 1131 | goto exit; | ||
| 1132 | } | 1116 | } |
| 1133 | 1117 | ||
| 1134 | is_idx = -1; | 1118 | is_idx = -1; |
| @@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c, | |||
| 1246 | } | 1230 | } |
| 1247 | 1231 | ||
| 1248 | ubifs_scan_destroy(sleb); | 1232 | ubifs_scan_destroy(sleb); |
| 1249 | ret = LPT_SCAN_CONTINUE; | ||
| 1250 | exit: | ||
| 1251 | vfree(buf); | 1233 | vfree(buf); |
| 1252 | return ret; | 1234 | return LPT_SCAN_CONTINUE; |
| 1253 | 1235 | ||
| 1254 | out_print: | 1236 | out_print: |
| 1255 | ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " | 1237 | ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " |
| @@ -1258,10 +1240,10 @@ out_print: | |||
| 1258 | dbg_dump_leb(c, lnum); | 1240 | dbg_dump_leb(c, lnum); |
| 1259 | out_destroy: | 1241 | out_destroy: |
| 1260 | ubifs_scan_destroy(sleb); | 1242 | ubifs_scan_destroy(sleb); |
| 1243 | ret = -EINVAL; | ||
| 1261 | out: | 1244 | out: |
| 1262 | vfree(buf); | 1245 | vfree(buf); |
| 1263 | data->err = -EINVAL; | 1246 | return ret; |
| 1264 | return LPT_SCAN_STOP; | ||
| 1265 | } | 1247 | } |
| 1266 | 1248 | ||
| 1267 | /** | 1249 | /** |
| @@ -1278,8 +1260,7 @@ out: | |||
| 1278 | int dbg_check_lprops(struct ubifs_info *c) | 1260 | int dbg_check_lprops(struct ubifs_info *c) |
| 1279 | { | 1261 | { |
| 1280 | int i, err; | 1262 | int i, err; |
| 1281 | struct scan_check_data data; | 1263 | struct ubifs_lp_stats lst; |
| 1282 | struct ubifs_lp_stats *lst = &data.lst; | ||
| 1283 | 1264 | ||
| 1284 | if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) | 1265 | if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) |
| 1285 | return 0; | 1266 | return 0; |
| @@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c) | |||
| 1294 | return err; | 1275 | return err; |
| 1295 | } | 1276 | } |
| 1296 | 1277 | ||
| 1297 | memset(lst, 0, sizeof(struct ubifs_lp_stats)); | 1278 | memset(&lst, 0, sizeof(struct ubifs_lp_stats)); |
| 1298 | |||
| 1299 | data.err = 0; | ||
| 1300 | err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, | 1279 | err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, |
| 1301 | (ubifs_lpt_scan_callback)scan_check_cb, | 1280 | (ubifs_lpt_scan_callback)scan_check_cb, |
| 1302 | &data); | 1281 | &lst); |
| 1303 | if (err && err != -ENOSPC) | 1282 | if (err && err != -ENOSPC) |
| 1304 | goto out; | 1283 | goto out; |
| 1305 | if (data.err) { | ||
| 1306 | err = data.err; | ||
| 1307 | goto out; | ||
| 1308 | } | ||
| 1309 | 1284 | ||
| 1310 | if (lst->empty_lebs != c->lst.empty_lebs || | 1285 | if (lst.empty_lebs != c->lst.empty_lebs || |
| 1311 | lst->idx_lebs != c->lst.idx_lebs || | 1286 | lst.idx_lebs != c->lst.idx_lebs || |
| 1312 | lst->total_free != c->lst.total_free || | 1287 | lst.total_free != c->lst.total_free || |
| 1313 | lst->total_dirty != c->lst.total_dirty || | 1288 | lst.total_dirty != c->lst.total_dirty || |
| 1314 | lst->total_used != c->lst.total_used) { | 1289 | lst.total_used != c->lst.total_used) { |
| 1315 | ubifs_err("bad overall accounting"); | 1290 | ubifs_err("bad overall accounting"); |
| 1316 | ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " | 1291 | ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " |
| 1317 | "total_free %lld, total_dirty %lld, total_used %lld", | 1292 | "total_free %lld, total_dirty %lld, total_used %lld", |
| 1318 | lst->empty_lebs, lst->idx_lebs, lst->total_free, | 1293 | lst.empty_lebs, lst.idx_lebs, lst.total_free, |
| 1319 | lst->total_dirty, lst->total_used); | 1294 | lst.total_dirty, lst.total_used); |
| 1320 | ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " | 1295 | ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " |
| 1321 | "total_free %lld, total_dirty %lld, total_used %lld", | 1296 | "total_free %lld, total_dirty %lld, total_used %lld", |
| 1322 | c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, | 1297 | c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, |
| @@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c) | |||
| 1325 | goto out; | 1300 | goto out; |
| 1326 | } | 1301 | } |
| 1327 | 1302 | ||
| 1328 | if (lst->total_dead != c->lst.total_dead || | 1303 | if (lst.total_dead != c->lst.total_dead || |
| 1329 | lst->total_dark != c->lst.total_dark) { | 1304 | lst.total_dark != c->lst.total_dark) { |
| 1330 | ubifs_err("bad dead/dark space accounting"); | 1305 | ubifs_err("bad dead/dark space accounting"); |
| 1331 | ubifs_err("calculated: total_dead %lld, total_dark %lld", | 1306 | ubifs_err("calculated: total_dead %lld, total_dark %lld", |
| 1332 | lst->total_dead, lst->total_dark); | 1307 | lst.total_dead, lst.total_dark); |
| 1333 | ubifs_err("read from lprops: total_dead %lld, total_dark %lld", | 1308 | ubifs_err("read from lprops: total_dead %lld, total_dark %lld", |
| 1334 | c->lst.total_dead, c->lst.total_dark); | 1309 | c->lst.total_dead, c->lst.total_dark); |
| 1335 | err = -EINVAL; | 1310 | err = -EINVAL; |
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 0c9c69bd983..dfcb5748a7d 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c | |||
| @@ -29,6 +29,12 @@ | |||
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include "ubifs.h" | 30 | #include "ubifs.h" |
| 31 | 31 | ||
| 32 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 33 | static int dbg_populate_lsave(struct ubifs_info *c); | ||
| 34 | #else | ||
| 35 | #define dbg_populate_lsave(c) 0 | ||
| 36 | #endif | ||
| 37 | |||
| 32 | /** | 38 | /** |
| 33 | * first_dirty_cnode - find first dirty cnode. | 39 | * first_dirty_cnode - find first dirty cnode. |
| 34 | * @c: UBIFS file-system description object | 40 | * @c: UBIFS file-system description object |
| @@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, | |||
| 586 | if (nnode->nbranch[iip].lnum) | 592 | if (nnode->nbranch[iip].lnum) |
| 587 | break; | 593 | break; |
| 588 | } | 594 | } |
| 589 | } while (iip >= UBIFS_LPT_FANOUT); | 595 | } while (iip >= UBIFS_LPT_FANOUT); |
| 590 | 596 | ||
| 591 | /* Go right */ | 597 | /* Go right */ |
| 592 | nnode = ubifs_get_nnode(c, nnode, iip); | 598 | nnode = ubifs_get_nnode(c, nnode, iip); |
| @@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c) | |||
| 815 | c->lpt_drty_flgs |= LSAVE_DIRTY; | 821 | c->lpt_drty_flgs |= LSAVE_DIRTY; |
| 816 | ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); | 822 | ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); |
| 817 | } | 823 | } |
| 824 | |||
| 825 | if (dbg_populate_lsave(c)) | ||
| 826 | return; | ||
| 827 | |||
| 818 | list_for_each_entry(lprops, &c->empty_list, list) { | 828 | list_for_each_entry(lprops, &c->empty_list, list) { |
| 819 | c->lsave[cnt++] = lprops->lnum; | 829 | c->lsave[cnt++] = lprops->lnum; |
| 820 | if (cnt >= c->lsave_cnt) | 830 | if (cnt >= c->lsave_cnt) |
| @@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c) | |||
| 1994 | current->pid); | 2004 | current->pid); |
| 1995 | } | 2005 | } |
| 1996 | 2006 | ||
| 2007 | /** | ||
| 2008 | * dbg_populate_lsave - debugging version of 'populate_lsave()' | ||
| 2009 | * @c: UBIFS file-system description object | ||
| 2010 | * | ||
| 2011 | * This is a debugging version for 'populate_lsave()' which populates lsave | ||
| 2012 | * with random LEBs instead of useful LEBs, which is good for test coverage. | ||
| 2013 | * Returns zero if lsave has not been populated (this debugging feature is | ||
| 2014 | * disabled) an non-zero if lsave has been populated. | ||
| 2015 | */ | ||
| 2016 | static int dbg_populate_lsave(struct ubifs_info *c) | ||
| 2017 | { | ||
| 2018 | struct ubifs_lprops *lprops; | ||
| 2019 | struct ubifs_lpt_heap *heap; | ||
| 2020 | int i; | ||
| 2021 | |||
| 2022 | if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) | ||
| 2023 | return 0; | ||
| 2024 | if (random32() & 3) | ||
| 2025 | return 0; | ||
| 2026 | |||
| 2027 | for (i = 0; i < c->lsave_cnt; i++) | ||
| 2028 | c->lsave[i] = c->main_first; | ||
| 2029 | |||
| 2030 | list_for_each_entry(lprops, &c->empty_list, list) | ||
| 2031 | c->lsave[random32() % c->lsave_cnt] = lprops->lnum; | ||
| 2032 | list_for_each_entry(lprops, &c->freeable_list, list) | ||
| 2033 | c->lsave[random32() % c->lsave_cnt] = lprops->lnum; | ||
| 2034 | list_for_each_entry(lprops, &c->frdi_idx_list, list) | ||
| 2035 | c->lsave[random32() % c->lsave_cnt] = lprops->lnum; | ||
| 2036 | |||
| 2037 | heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; | ||
| 2038 | for (i = 0; i < heap->cnt; i++) | ||
| 2039 | c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; | ||
| 2040 | heap = &c->lpt_heap[LPROPS_DIRTY - 1]; | ||
| 2041 | for (i = 0; i < heap->cnt; i++) | ||
| 2042 | c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; | ||
| 2043 | heap = &c->lpt_heap[LPROPS_FREE - 1]; | ||
| 2044 | for (i = 0; i < heap->cnt; i++) | ||
| 2045 | c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; | ||
| 2046 | |||
| 2047 | return 1; | ||
| 2048 | } | ||
| 2049 | |||
| 1997 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | 2050 | #endif /* CONFIG_UBIFS_FS_DEBUG */ |
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 21f47afdacf..278c2382e8c 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c | |||
| @@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c) | |||
| 148 | } | 148 | } |
| 149 | 149 | ||
| 150 | main_sz = (long long)c->main_lebs * c->leb_size; | 150 | main_sz = (long long)c->main_lebs * c->leb_size; |
| 151 | if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { | 151 | if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { |
| 152 | err = 9; | 152 | err = 9; |
| 153 | goto out; | 153 | goto out; |
| 154 | } | 154 | } |
| @@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c) | |||
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | if (c->lst.total_dead + c->lst.total_dark + | 220 | if (c->lst.total_dead + c->lst.total_dark + |
| 221 | c->lst.total_used + c->old_idx_sz > main_sz) { | 221 | c->lst.total_used + c->bi.old_idx_sz > main_sz) { |
| 222 | err = 21; | 222 | err = 21; |
| 223 | goto out; | 223 | goto out; |
| 224 | } | 224 | } |
| @@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c) | |||
| 286 | c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); | 286 | c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); |
| 287 | c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); | 287 | c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); |
| 288 | c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); | 288 | c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); |
| 289 | c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); | 289 | c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); |
| 290 | c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); | 290 | c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); |
| 291 | c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); | 291 | c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); |
| 292 | c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); | 292 | c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); |
| @@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c) | |||
| 305 | c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); | 305 | c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); |
| 306 | c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); | 306 | c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); |
| 307 | 307 | ||
| 308 | c->calc_idx_sz = c->old_idx_sz; | 308 | c->calc_idx_sz = c->bi.old_idx_sz; |
| 309 | 309 | ||
| 310 | if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) | 310 | if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) |
| 311 | c->no_orphs = 1; | 311 | c->no_orphs = 1; |
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index c3de04dc952..0b5296a9a4c 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h | |||
| @@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c) | |||
| 340 | mutex_unlock(&c->lp_mutex); | 340 | mutex_unlock(&c->lp_mutex); |
| 341 | } | 341 | } |
| 342 | 342 | ||
| 343 | /** | ||
| 344 | * ubifs_next_log_lnum - switch to the next log LEB. | ||
| 345 | * @c: UBIFS file-system description object | ||
| 346 | * @lnum: current log LEB | ||
| 347 | * | ||
| 348 | * This helper function returns the log LEB number which goes next after LEB | ||
| 349 | * 'lnum'. | ||
| 350 | */ | ||
| 351 | static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) | ||
| 352 | { | ||
| 353 | lnum += 1; | ||
| 354 | if (lnum > c->log_last) | ||
| 355 | lnum = UBIFS_LOG_LNUM; | ||
| 356 | |||
| 357 | return lnum; | ||
| 358 | } | ||
| 359 | |||
| 343 | #endif /* __UBIFS_MISC_H__ */ | 360 | #endif /* __UBIFS_MISC_H__ */ |
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 09df318e368..bd644bf587a 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c | |||
| @@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c) | |||
| 673 | sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); | 673 | sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); |
| 674 | if (IS_ERR(sleb)) { | 674 | if (IS_ERR(sleb)) { |
| 675 | if (PTR_ERR(sleb) == -EUCLEAN) | 675 | if (PTR_ERR(sleb) == -EUCLEAN) |
| 676 | sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); | 676 | sleb = ubifs_recover_leb(c, lnum, 0, |
| 677 | c->sbuf, 0); | ||
| 677 | if (IS_ERR(sleb)) { | 678 | if (IS_ERR(sleb)) { |
| 678 | err = PTR_ERR(sleb); | 679 | err = PTR_ERR(sleb); |
| 679 | break; | 680 | break; |
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 3dbad6fbd1e..731d9e2e7b5 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c | |||
| @@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, | |||
| 564 | } | 564 | } |
| 565 | 565 | ||
| 566 | /** | 566 | /** |
| 567 | * drop_incomplete_group - drop nodes from an incomplete group. | 567 | * drop_last_node - drop the last node or group of nodes. |
| 568 | * @sleb: scanned LEB information | 568 | * @sleb: scanned LEB information |
| 569 | * @offs: offset of dropped nodes is returned here | 569 | * @offs: offset of dropped nodes is returned here |
| 570 | * @grouped: non-zero if whole group of nodes have to be dropped | ||
| 570 | * | 571 | * |
| 571 | * This function returns %1 if nodes are dropped and %0 otherwise. | 572 | * This is a helper function for 'ubifs_recover_leb()' which drops the last |
| 573 | * node of the scanned LEB or the last group of nodes if @grouped is not zero. | ||
| 574 | * This function returns %1 if a node was dropped and %0 otherwise. | ||
| 572 | */ | 575 | */ |
| 573 | static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) | 576 | static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) |
| 574 | { | 577 | { |
| 575 | int dropped = 0; | 578 | int dropped = 0; |
| 576 | 579 | ||
| @@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) | |||
| 589 | kfree(snod); | 592 | kfree(snod); |
| 590 | sleb->nodes_cnt -= 1; | 593 | sleb->nodes_cnt -= 1; |
| 591 | dropped = 1; | 594 | dropped = 1; |
| 595 | if (!grouped) | ||
| 596 | break; | ||
| 592 | } | 597 | } |
| 593 | return dropped; | 598 | return dropped; |
| 594 | } | 599 | } |
| @@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) | |||
| 609 | struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | 614 | struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, |
| 610 | int offs, void *sbuf, int grouped) | 615 | int offs, void *sbuf, int grouped) |
| 611 | { | 616 | { |
| 612 | int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; | 617 | int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; |
| 613 | int empty_chkd = 0, start = offs; | ||
| 614 | struct ubifs_scan_leb *sleb; | 618 | struct ubifs_scan_leb *sleb; |
| 615 | void *buf = sbuf + offs; | 619 | void *buf = sbuf + offs; |
| 616 | 620 | ||
| @@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | |||
| 620 | if (IS_ERR(sleb)) | 624 | if (IS_ERR(sleb)) |
| 621 | return sleb; | 625 | return sleb; |
| 622 | 626 | ||
| 623 | if (sleb->ecc) | 627 | ubifs_assert(len >= 8); |
| 624 | need_clean = 1; | ||
| 625 | |||
| 626 | while (len >= 8) { | 628 | while (len >= 8) { |
| 627 | int ret; | ||
| 628 | |||
| 629 | dbg_scan("look at LEB %d:%d (%d bytes left)", | 629 | dbg_scan("look at LEB %d:%d (%d bytes left)", |
| 630 | lnum, offs, len); | 630 | lnum, offs, len); |
| 631 | 631 | ||
| @@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | |||
| 635 | * Scan quietly until there is an error from which we cannot | 635 | * Scan quietly until there is an error from which we cannot |
| 636 | * recover | 636 | * recover |
| 637 | */ | 637 | */ |
| 638 | ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); | 638 | ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); |
| 639 | |||
| 640 | if (ret == SCANNED_A_NODE) { | 639 | if (ret == SCANNED_A_NODE) { |
| 641 | /* A valid node, and not a padding node */ | 640 | /* A valid node, and not a padding node */ |
| 642 | struct ubifs_ch *ch = buf; | 641 | struct ubifs_ch *ch = buf; |
| @@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | |||
| 649 | offs += node_len; | 648 | offs += node_len; |
| 650 | buf += node_len; | 649 | buf += node_len; |
| 651 | len -= node_len; | 650 | len -= node_len; |
| 652 | continue; | 651 | } else if (ret > 0) { |
| 653 | } | ||
| 654 | |||
| 655 | if (ret > 0) { | ||
| 656 | /* Padding bytes or a valid padding node */ | 652 | /* Padding bytes or a valid padding node */ |
| 657 | offs += ret; | 653 | offs += ret; |
| 658 | buf += ret; | 654 | buf += ret; |
| 659 | len -= ret; | 655 | len -= ret; |
| 660 | continue; | 656 | } else if (ret == SCANNED_EMPTY_SPACE || |
| 661 | } | 657 | ret == SCANNED_GARBAGE || |
| 662 | 658 | ret == SCANNED_A_BAD_PAD_NODE || | |
| 663 | if (ret == SCANNED_EMPTY_SPACE) { | 659 | ret == SCANNED_A_CORRUPT_NODE) { |
| 664 | if (!is_empty(buf, len)) { | 660 | dbg_rcvry("found corruption - %d", ret); |
| 665 | if (!is_last_write(c, buf, offs)) | ||
| 666 | break; | ||
| 667 | clean_buf(c, &buf, lnum, &offs, &len); | ||
| 668 | need_clean = 1; | ||
| 669 | } | ||
| 670 | empty_chkd = 1; | ||
| 671 | break; | 661 | break; |
| 672 | } | 662 | } else { |
| 673 | 663 | dbg_err("unexpected return value %d", ret); | |
| 674 | if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) | ||
| 675 | if (is_last_write(c, buf, offs)) { | ||
| 676 | clean_buf(c, &buf, lnum, &offs, &len); | ||
| 677 | need_clean = 1; | ||
| 678 | empty_chkd = 1; | ||
| 679 | break; | ||
| 680 | } | ||
| 681 | |||
| 682 | if (ret == SCANNED_A_CORRUPT_NODE) | ||
| 683 | if (no_more_nodes(c, buf, len, lnum, offs)) { | ||
| 684 | clean_buf(c, &buf, lnum, &offs, &len); | ||
| 685 | need_clean = 1; | ||
| 686 | empty_chkd = 1; | ||
| 687 | break; | ||
| 688 | } | ||
| 689 | |||
| 690 | if (quiet) { | ||
| 691 | /* Redo the last scan but noisily */ | ||
| 692 | quiet = 0; | ||
| 693 | continue; | ||
| 694 | } | ||
| 695 | |||
| 696 | switch (ret) { | ||
| 697 | case SCANNED_GARBAGE: | ||
| 698 | dbg_err("garbage"); | ||
| 699 | goto corrupted; | ||
| 700 | case SCANNED_A_CORRUPT_NODE: | ||
| 701 | case SCANNED_A_BAD_PAD_NODE: | ||
| 702 | dbg_err("bad node"); | ||
| 703 | goto corrupted; | ||
| 704 | default: | ||
| 705 | dbg_err("unknown"); | ||
| 706 | err = -EINVAL; | 664 | err = -EINVAL; |
| 707 | goto error; | 665 | goto error; |
| 708 | } | 666 | } |
| 709 | } | 667 | } |
| 710 | 668 | ||
| 711 | if (!empty_chkd && !is_empty(buf, len)) { | 669 | if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { |
| 712 | if (is_last_write(c, buf, offs)) { | 670 | if (!is_last_write(c, buf, offs)) |
| 713 | clean_buf(c, &buf, lnum, &offs, &len); | 671 | goto corrupted_rescan; |
| 714 | need_clean = 1; | 672 | } else if (ret == SCANNED_A_CORRUPT_NODE) { |
| 715 | } else { | 673 | if (!no_more_nodes(c, buf, len, lnum, offs)) |
| 674 | goto corrupted_rescan; | ||
| 675 | } else if (!is_empty(buf, len)) { | ||
| 676 | if (!is_last_write(c, buf, offs)) { | ||
| 716 | int corruption = first_non_ff(buf, len); | 677 | int corruption = first_non_ff(buf, len); |
| 717 | 678 | ||
| 718 | /* | 679 | /* |
| @@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | |||
| 728 | } | 689 | } |
| 729 | } | 690 | } |
| 730 | 691 | ||
| 731 | /* Drop nodes from incomplete group */ | 692 | min_io_unit = round_down(offs, c->min_io_size); |
| 732 | if (grouped && drop_incomplete_group(sleb, &offs)) { | 693 | if (grouped) |
| 733 | buf = sbuf + offs; | 694 | /* |
| 734 | len = c->leb_size - offs; | 695 | * If nodes are grouped, always drop the incomplete group at |
| 735 | clean_buf(c, &buf, lnum, &offs, &len); | 696 | * the end. |
| 736 | need_clean = 1; | 697 | */ |
| 737 | } | 698 | drop_last_node(sleb, &offs, 1); |
| 738 | 699 | ||
| 739 | if (offs % c->min_io_size) { | 700 | /* |
| 740 | clean_buf(c, &buf, lnum, &offs, &len); | 701 | * While we are in the middle of the same min. I/O unit keep dropping |
| 741 | need_clean = 1; | 702 | * nodes. So basically, what we want is to make sure that the last min. |
| 742 | } | 703 | * I/O unit where we saw the corruption is dropped completely with all |
| 704 | * the uncorrupted node which may possibly sit there. | ||
| 705 | * | ||
| 706 | * In other words, let's name the min. I/O unit where the corruption | ||
| 707 | * starts B, and the previous min. I/O unit A. The below code tries to | ||
| 708 | * deal with a situation when half of B contains valid nodes or the end | ||
| 709 | * of a valid node, and the second half of B contains corrupted data or | ||
| 710 | * garbage. This means that UBIFS had been writing to B just before the | ||
| 711 | * power cut happened. I do not know how realistic is this scenario | ||
| 712 | * that half of the min. I/O unit had been written successfully and the | ||
| 713 | * other half not, but this is possible in our 'failure mode emulation' | ||
| 714 | * infrastructure at least. | ||
| 715 | * | ||
| 716 | * So what is the problem, why we need to drop those nodes? Whey can't | ||
| 717 | * we just clean-up the second half of B by putting a padding node | ||
| 718 | * there? We can, and this works fine with one exception which was | ||
| 719 | * reproduced with power cut emulation testing and happens extremely | ||
| 720 | * rarely. The description follows, but it is worth noting that that is | ||
| 721 | * only about the GC head, so we could do this trick only if the bud | ||
| 722 | * belongs to the GC head, but it does not seem to be worth an | ||
| 723 | * additional "if" statement. | ||
| 724 | * | ||
| 725 | * So, imagine the file-system is full, we run GC which is moving valid | ||
| 726 | * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head | ||
| 727 | * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X | ||
| 728 | * and will try to continue. Imagine that LEB X is currently the | ||
| 729 | * dirtiest LEB, and the amount of used space in LEB Y is exactly the | ||
| 730 | * same as amount of free space in LEB X. | ||
| 731 | * | ||
| 732 | * And a power cut happens when nodes are moved from LEB X to LEB Y. We | ||
| 733 | * are here trying to recover LEB Y which is the GC head LEB. We find | ||
| 734 | * the min. I/O unit B as described above. Then we clean-up LEB Y by | ||
| 735 | * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function | ||
| 736 | * fails, because it cannot find a dirty LEB which could be GC'd into | ||
| 737 | * LEB Y! Even LEB X does not match because the amount of valid nodes | ||
| 738 | * there does not fit the free space in LEB Y any more! And this is | ||
| 739 | * because of the padding node which we added to LEB Y. The | ||
| 740 | * user-visible effect of this which I once observed and analysed is | ||
| 741 | * that we cannot mount the file-system with -ENOSPC error. | ||
| 742 | * | ||
| 743 | * So obviously, to make sure that situation does not happen we should | ||
| 744 | * free min. I/O unit B in LEB Y completely and the last used min. I/O | ||
| 745 | * unit in LEB Y should be A. This is basically what the below code | ||
| 746 | * tries to do. | ||
| 747 | */ | ||
| 748 | while (min_io_unit == round_down(offs, c->min_io_size) && | ||
| 749 | min_io_unit != offs && | ||
| 750 | drop_last_node(sleb, &offs, grouped)); | ||
| 751 | |||
| 752 | buf = sbuf + offs; | ||
| 753 | len = c->leb_size - offs; | ||
| 743 | 754 | ||
| 755 | clean_buf(c, &buf, lnum, &offs, &len); | ||
| 744 | ubifs_end_scan(c, sleb, lnum, offs); | 756 | ubifs_end_scan(c, sleb, lnum, offs); |
| 745 | 757 | ||
| 746 | if (need_clean) { | 758 | err = fix_unclean_leb(c, sleb, start); |
| 747 | err = fix_unclean_leb(c, sleb, start); | 759 | if (err) |
| 748 | if (err) | 760 | goto error; |
| 749 | goto error; | ||
| 750 | } | ||
| 751 | 761 | ||
| 752 | return sleb; | 762 | return sleb; |
| 753 | 763 | ||
| 764 | corrupted_rescan: | ||
| 765 | /* Re-scan the corrupted data with verbose messages */ | ||
| 766 | dbg_err("corruptio %d", ret); | ||
| 767 | ubifs_scan_a_node(c, buf, len, lnum, offs, 1); | ||
| 754 | corrupted: | 768 | corrupted: |
| 755 | ubifs_scanned_corruption(c, lnum, offs, buf); | 769 | ubifs_scanned_corruption(c, lnum, offs, buf); |
| 756 | err = -EUCLEAN; | 770 | err = -EUCLEAN; |
| @@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) | |||
| 1070 | } | 1084 | } |
| 1071 | 1085 | ||
| 1072 | /** | 1086 | /** |
| 1087 | * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. | ||
| 1088 | * @c: UBIFS file-system description object | ||
| 1089 | * | ||
| 1090 | * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty | ||
| 1091 | * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns | ||
| 1092 | * zero in case of success and a negative error code in case of failure. | ||
| 1093 | */ | ||
| 1094 | static int grab_empty_leb(struct ubifs_info *c) | ||
| 1095 | { | ||
| 1096 | int lnum, err; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * Note, it is very important to first search for an empty LEB and then | ||
| 1100 | * run the commit, not vice-versa. The reason is that there might be | ||
| 1101 | * only one empty LEB at the moment, the one which has been the | ||
| 1102 | * @c->gc_lnum just before the power cut happened. During the regular | ||
| 1103 | * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no | ||
| 1104 | * one but GC can grab it. But at this moment this single empty LEB is | ||
| 1105 | * not marked as taken, so if we run commit - what happens? Right, the | ||
| 1106 | * commit will grab it and write the index there. Remember that the | ||
| 1107 | * index always expands as long as there is free space, and it only | ||
| 1108 | * starts consolidating when we run out of space. | ||
| 1109 | * | ||
| 1110 | * IOW, if we run commit now, we might not be able to find a free LEB | ||
| 1111 | * after this. | ||
| 1112 | */ | ||
| 1113 | lnum = ubifs_find_free_leb_for_idx(c); | ||
| 1114 | if (lnum < 0) { | ||
| 1115 | dbg_err("could not find an empty LEB"); | ||
| 1116 | dbg_dump_lprops(c); | ||
| 1117 | dbg_dump_budg(c, &c->bi); | ||
| 1118 | return lnum; | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | /* Reset the index flag */ | ||
| 1122 | err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, | ||
| 1123 | LPROPS_INDEX, 0); | ||
| 1124 | if (err) | ||
| 1125 | return err; | ||
| 1126 | |||
| 1127 | c->gc_lnum = lnum; | ||
| 1128 | dbg_rcvry("found empty LEB %d, run commit", lnum); | ||
| 1129 | |||
| 1130 | return ubifs_run_commit(c); | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | /** | ||
| 1073 | * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. | 1134 | * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. |
| 1074 | * @c: UBIFS file-system description object | 1135 | * @c: UBIFS file-system description object |
| 1075 | * | 1136 | * |
| @@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) | |||
| 1091 | { | 1152 | { |
| 1092 | struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; | 1153 | struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; |
| 1093 | struct ubifs_lprops lp; | 1154 | struct ubifs_lprops lp; |
| 1094 | int lnum, err; | 1155 | int err; |
| 1156 | |||
| 1157 | dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); | ||
| 1095 | 1158 | ||
| 1096 | c->gc_lnum = -1; | 1159 | c->gc_lnum = -1; |
| 1097 | if (wbuf->lnum == -1) { | 1160 | if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) |
| 1098 | dbg_rcvry("no GC head LEB"); | 1161 | return grab_empty_leb(c); |
| 1099 | goto find_free; | 1162 | |
| 1100 | } | ||
| 1101 | /* | ||
| 1102 | * See whether the used space in the dirtiest LEB fits in the GC head | ||
| 1103 | * LEB. | ||
| 1104 | */ | ||
| 1105 | if (wbuf->offs == c->leb_size) { | ||
| 1106 | dbg_rcvry("no room in GC head LEB"); | ||
| 1107 | goto find_free; | ||
| 1108 | } | ||
| 1109 | err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); | 1163 | err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); |
| 1110 | if (err) { | 1164 | if (err) { |
| 1111 | /* | 1165 | if (err != -ENOSPC) |
| 1112 | * There are no dirty or empty LEBs subject to here being | ||
| 1113 | * enough for the index. Try to use | ||
| 1114 | * 'ubifs_find_free_leb_for_idx()', which will return any empty | ||
| 1115 | * LEBs (ignoring index requirements). If the index then | ||
| 1116 | * doesn't have enough LEBs the recovery commit will fail - | ||
| 1117 | * which is the same result anyway i.e. recovery fails. So | ||
| 1118 | * there is no problem ignoring index requirements and just | ||
| 1119 | * grabbing a free LEB since we have already established there | ||
| 1120 | * is not a dirty LEB we could have used instead. | ||
| 1121 | */ | ||
| 1122 | if (err == -ENOSPC) { | ||
| 1123 | dbg_rcvry("could not find a dirty LEB"); | ||
| 1124 | goto find_free; | ||
| 1125 | } | ||
| 1126 | return err; | ||
| 1127 | } | ||
| 1128 | ubifs_assert(!(lp.flags & LPROPS_INDEX)); | ||
| 1129 | lnum = lp.lnum; | ||
| 1130 | if (lp.free + lp.dirty == c->leb_size) { | ||
| 1131 | /* An empty LEB was returned */ | ||
| 1132 | if (lp.free != c->leb_size) { | ||
| 1133 | err = ubifs_change_one_lp(c, lnum, c->leb_size, | ||
| 1134 | 0, 0, 0, 0); | ||
| 1135 | if (err) | ||
| 1136 | return err; | ||
| 1137 | } | ||
| 1138 | err = ubifs_leb_unmap(c, lnum); | ||
| 1139 | if (err) | ||
| 1140 | return err; | 1166 | return err; |
| 1141 | c->gc_lnum = lnum; | 1167 | |
| 1142 | dbg_rcvry("allocated LEB %d for GC", lnum); | 1168 | dbg_rcvry("could not find a dirty LEB"); |
| 1143 | /* Run the commit */ | 1169 | return grab_empty_leb(c); |
| 1144 | dbg_rcvry("committing"); | ||
| 1145 | return ubifs_run_commit(c); | ||
| 1146 | } | ||
| 1147 | /* | ||
| 1148 | * There was no empty LEB so the used space in the dirtiest LEB must fit | ||
| 1149 | * in the GC head LEB. | ||
| 1150 | */ | ||
| 1151 | if (lp.free + lp.dirty < wbuf->offs) { | ||
| 1152 | dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", | ||
| 1153 | lnum, wbuf->lnum, wbuf->offs); | ||
| 1154 | err = ubifs_return_leb(c, lnum); | ||
| 1155 | if (err) | ||
| 1156 | return err; | ||
| 1157 | goto find_free; | ||
| 1158 | } | 1170 | } |
| 1171 | |||
| 1172 | ubifs_assert(!(lp.flags & LPROPS_INDEX)); | ||
| 1173 | ubifs_assert(lp.free + lp.dirty >= wbuf->offs); | ||
| 1174 | |||
| 1159 | /* | 1175 | /* |
| 1160 | * We run the commit before garbage collection otherwise subsequent | 1176 | * We run the commit before garbage collection otherwise subsequent |
| 1161 | * mounts will see the GC and orphan deletion in a different order. | 1177 | * mounts will see the GC and orphan deletion in a different order. |
| @@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) | |||
| 1164 | err = ubifs_run_commit(c); | 1180 | err = ubifs_run_commit(c); |
| 1165 | if (err) | 1181 | if (err) |
| 1166 | return err; | 1182 | return err; |
| 1167 | /* | 1183 | |
| 1168 | * The data in the dirtiest LEB fits in the GC head LEB, so do the GC | 1184 | dbg_rcvry("GC'ing LEB %d", lp.lnum); |
| 1169 | * - use locking to keep 'ubifs_assert()' happy. | ||
| 1170 | */ | ||
| 1171 | dbg_rcvry("GC'ing LEB %d", lnum); | ||
| 1172 | mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); | 1185 | mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); |
| 1173 | err = ubifs_garbage_collect_leb(c, &lp); | 1186 | err = ubifs_garbage_collect_leb(c, &lp); |
| 1174 | if (err >= 0) { | 1187 | if (err >= 0) { |
| @@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) | |||
| 1184 | err = -EINVAL; | 1197 | err = -EINVAL; |
| 1185 | return err; | 1198 | return err; |
| 1186 | } | 1199 | } |
| 1187 | if (err != LEB_RETAINED) { | 1200 | |
| 1188 | dbg_err("GC returned %d", err); | 1201 | ubifs_assert(err == LEB_RETAINED); |
| 1202 | if (err != LEB_RETAINED) | ||
| 1189 | return -EINVAL; | 1203 | return -EINVAL; |
| 1190 | } | 1204 | |
| 1191 | err = ubifs_leb_unmap(c, c->gc_lnum); | 1205 | err = ubifs_leb_unmap(c, c->gc_lnum); |
| 1192 | if (err) | 1206 | if (err) |
| 1193 | return err; | 1207 | return err; |
| 1194 | dbg_rcvry("allocated LEB %d for GC", lnum); | ||
| 1195 | return 0; | ||
| 1196 | 1208 | ||
| 1197 | find_free: | 1209 | dbg_rcvry("allocated LEB %d for GC", lp.lnum); |
| 1198 | /* | 1210 | return 0; |
| 1199 | * There is no GC head LEB or the free space in the GC head LEB is too | ||
| 1200 | * small, or there are not dirty LEBs. Allocate gc_lnum by calling | ||
| 1201 | * 'ubifs_find_free_leb_for_idx()' so GC is not run. | ||
| 1202 | */ | ||
| 1203 | lnum = ubifs_find_free_leb_for_idx(c); | ||
| 1204 | if (lnum < 0) { | ||
| 1205 | dbg_err("could not find an empty LEB"); | ||
| 1206 | return lnum; | ||
| 1207 | } | ||
| 1208 | /* And reset the index flag */ | ||
| 1209 | err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, | ||
| 1210 | LPROPS_INDEX, 0); | ||
| 1211 | if (err) | ||
| 1212 | return err; | ||
| 1213 | c->gc_lnum = lnum; | ||
| 1214 | dbg_rcvry("allocated LEB %d for GC", lnum); | ||
| 1215 | /* Run the commit */ | ||
| 1216 | dbg_rcvry("committing"); | ||
| 1217 | return ubifs_run_commit(c); | ||
| 1218 | } | 1211 | } |
| 1219 | 1212 | ||
| 1220 | /** | 1213 | /** |
| @@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) | |||
| 1456 | err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); | 1449 | err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); |
| 1457 | if (err) | 1450 | if (err) |
| 1458 | goto out; | 1451 | goto out; |
| 1459 | dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", | 1452 | dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", |
| 1460 | (unsigned long)e->inum, lnum, offs, i_size, e->d_size); | 1453 | (unsigned long)e->inum, lnum, offs, i_size, e->d_size); |
| 1461 | return 0; | 1454 | return 0; |
| 1462 | 1455 | ||
| @@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c) | |||
| 1505 | e->i_size = le64_to_cpu(ino->size); | 1498 | e->i_size = le64_to_cpu(ino->size); |
| 1506 | } | 1499 | } |
| 1507 | } | 1500 | } |
| 1501 | |||
| 1508 | if (e->exists && e->i_size < e->d_size) { | 1502 | if (e->exists && e->i_size < e->d_size) { |
| 1509 | if (!e->inode && c->ro_mount) { | 1503 | if (c->ro_mount) { |
| 1510 | /* Fix the inode size and pin it in memory */ | 1504 | /* Fix the inode size and pin it in memory */ |
| 1511 | struct inode *inode; | 1505 | struct inode *inode; |
| 1506 | struct ubifs_inode *ui; | ||
| 1507 | |||
| 1508 | ubifs_assert(!e->inode); | ||
| 1512 | 1509 | ||
| 1513 | inode = ubifs_iget(c->vfs_sb, e->inum); | 1510 | inode = ubifs_iget(c->vfs_sb, e->inum); |
| 1514 | if (IS_ERR(inode)) | 1511 | if (IS_ERR(inode)) |
| 1515 | return PTR_ERR(inode); | 1512 | return PTR_ERR(inode); |
| 1513 | |||
| 1514 | ui = ubifs_inode(inode); | ||
| 1516 | if (inode->i_size < e->d_size) { | 1515 | if (inode->i_size < e->d_size) { |
| 1517 | dbg_rcvry("ino %lu size %lld -> %lld", | 1516 | dbg_rcvry("ino %lu size %lld -> %lld", |
| 1518 | (unsigned long)e->inum, | 1517 | (unsigned long)e->inum, |
| 1519 | e->d_size, inode->i_size); | 1518 | inode->i_size, e->d_size); |
| 1520 | inode->i_size = e->d_size; | 1519 | inode->i_size = e->d_size; |
| 1521 | ubifs_inode(inode)->ui_size = e->d_size; | 1520 | ui->ui_size = e->d_size; |
| 1521 | ui->synced_i_size = e->d_size; | ||
| 1522 | e->inode = inode; | 1522 | e->inode = inode; |
| 1523 | this = rb_next(this); | 1523 | this = rb_next(this); |
| 1524 | continue; | 1524 | continue; |
| @@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c) | |||
| 1533 | iput(e->inode); | 1533 | iput(e->inode); |
| 1534 | } | 1534 | } |
| 1535 | } | 1535 | } |
| 1536 | |||
| 1536 | this = rb_next(this); | 1537 | this = rb_next(this); |
| 1537 | rb_erase(&e->rb, &c->size_tree); | 1538 | rb_erase(&e->rb, &c->size_tree); |
| 1538 | kfree(e); | 1539 | kfree(e); |
| 1539 | } | 1540 | } |
| 1541 | |||
| 1540 | return 0; | 1542 | return 0; |
| 1541 | } | 1543 | } |
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index d3d6d365bfc..6617280d167 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c | |||
| @@ -33,44 +33,32 @@ | |||
| 33 | */ | 33 | */ |
| 34 | 34 | ||
| 35 | #include "ubifs.h" | 35 | #include "ubifs.h" |
| 36 | 36 | #include <linux/list_sort.h> | |
| 37 | /* | ||
| 38 | * Replay flags. | ||
| 39 | * | ||
| 40 | * REPLAY_DELETION: node was deleted | ||
| 41 | * REPLAY_REF: node is a reference node | ||
| 42 | */ | ||
| 43 | enum { | ||
| 44 | REPLAY_DELETION = 1, | ||
| 45 | REPLAY_REF = 2, | ||
| 46 | }; | ||
| 47 | 37 | ||
| 48 | /** | 38 | /** |
| 49 | * struct replay_entry - replay tree entry. | 39 | * struct replay_entry - replay list entry. |
| 50 | * @lnum: logical eraseblock number of the node | 40 | * @lnum: logical eraseblock number of the node |
| 51 | * @offs: node offset | 41 | * @offs: node offset |
| 52 | * @len: node length | 42 | * @len: node length |
| 43 | * @deletion: non-zero if this entry corresponds to a node deletion | ||
| 53 | * @sqnum: node sequence number | 44 | * @sqnum: node sequence number |
| 54 | * @flags: replay flags | 45 | * @list: links the replay list |
| 55 | * @rb: links the replay tree | ||
| 56 | * @key: node key | 46 | * @key: node key |
| 57 | * @nm: directory entry name | 47 | * @nm: directory entry name |
| 58 | * @old_size: truncation old size | 48 | * @old_size: truncation old size |
| 59 | * @new_size: truncation new size | 49 | * @new_size: truncation new size |
| 60 | * @free: amount of free space in a bud | ||
| 61 | * @dirty: amount of dirty space in a bud from padding and deletion nodes | ||
| 62 | * @jhead: journal head number of the bud | ||
| 63 | * | 50 | * |
| 64 | * UBIFS journal replay must compare node sequence numbers, which means it must | 51 | * The replay process first scans all buds and builds the replay list, then |
| 65 | * build a tree of node information to insert into the TNC. | 52 | * sorts the replay list in nodes sequence number order, and then inserts all |
| 53 | * the replay entries to the TNC. | ||
| 66 | */ | 54 | */ |
| 67 | struct replay_entry { | 55 | struct replay_entry { |
| 68 | int lnum; | 56 | int lnum; |
| 69 | int offs; | 57 | int offs; |
| 70 | int len; | 58 | int len; |
| 59 | unsigned int deletion:1; | ||
| 71 | unsigned long long sqnum; | 60 | unsigned long long sqnum; |
| 72 | int flags; | 61 | struct list_head list; |
| 73 | struct rb_node rb; | ||
| 74 | union ubifs_key key; | 62 | union ubifs_key key; |
| 75 | union { | 63 | union { |
| 76 | struct qstr nm; | 64 | struct qstr nm; |
| @@ -78,11 +66,6 @@ struct replay_entry { | |||
| 78 | loff_t old_size; | 66 | loff_t old_size; |
| 79 | loff_t new_size; | 67 | loff_t new_size; |
| 80 | }; | 68 | }; |
| 81 | struct { | ||
| 82 | int free; | ||
| 83 | int dirty; | ||
| 84 | int jhead; | ||
| 85 | }; | ||
| 86 | }; | 69 | }; |
| 87 | }; | 70 | }; |
| 88 | 71 | ||
| @@ -90,57 +73,64 @@ struct replay_entry { | |||
| 90 | * struct bud_entry - entry in the list of buds to replay. | 73 | * struct bud_entry - entry in the list of buds to replay. |
| 91 | * @list: next bud in the list | 74 | * @list: next bud in the list |
| 92 | * @bud: bud description object | 75 | * @bud: bud description object |
| 93 | * @free: free bytes in the bud | ||
| 94 | * @sqnum: reference node sequence number | 76 | * @sqnum: reference node sequence number |
| 77 | * @free: free bytes in the bud | ||
| 78 | * @dirty: dirty bytes in the bud | ||
| 95 | */ | 79 | */ |
| 96 | struct bud_entry { | 80 | struct bud_entry { |
| 97 | struct list_head list; | 81 | struct list_head list; |
| 98 | struct ubifs_bud *bud; | 82 | struct ubifs_bud *bud; |
| 99 | int free; | ||
| 100 | unsigned long long sqnum; | 83 | unsigned long long sqnum; |
| 84 | int free; | ||
| 85 | int dirty; | ||
| 101 | }; | 86 | }; |
| 102 | 87 | ||
| 103 | /** | 88 | /** |
| 104 | * set_bud_lprops - set free and dirty space used by a bud. | 89 | * set_bud_lprops - set free and dirty space used by a bud. |
| 105 | * @c: UBIFS file-system description object | 90 | * @c: UBIFS file-system description object |
| 106 | * @r: replay entry of bud | 91 | * @b: bud entry which describes the bud |
| 92 | * | ||
| 93 | * This function makes sure the LEB properties of bud @b are set correctly | ||
| 94 | * after the replay. Returns zero in case of success and a negative error code | ||
| 95 | * in case of failure. | ||
| 107 | */ | 96 | */ |
| 108 | static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) | 97 | static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) |
| 109 | { | 98 | { |
| 110 | const struct ubifs_lprops *lp; | 99 | const struct ubifs_lprops *lp; |
| 111 | int err = 0, dirty; | 100 | int err = 0, dirty; |
| 112 | 101 | ||
| 113 | ubifs_get_lprops(c); | 102 | ubifs_get_lprops(c); |
| 114 | 103 | ||
| 115 | lp = ubifs_lpt_lookup_dirty(c, r->lnum); | 104 | lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); |
| 116 | if (IS_ERR(lp)) { | 105 | if (IS_ERR(lp)) { |
| 117 | err = PTR_ERR(lp); | 106 | err = PTR_ERR(lp); |
| 118 | goto out; | 107 | goto out; |
| 119 | } | 108 | } |
| 120 | 109 | ||
| 121 | dirty = lp->dirty; | 110 | dirty = lp->dirty; |
| 122 | if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { | 111 | if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { |
| 123 | /* | 112 | /* |
| 124 | * The LEB was added to the journal with a starting offset of | 113 | * The LEB was added to the journal with a starting offset of |
| 125 | * zero which means the LEB must have been empty. The LEB | 114 | * zero which means the LEB must have been empty. The LEB |
| 126 | * property values should be lp->free == c->leb_size and | 115 | * property values should be @lp->free == @c->leb_size and |
| 127 | * lp->dirty == 0, but that is not the case. The reason is that | 116 | * @lp->dirty == 0, but that is not the case. The reason is that |
| 128 | * the LEB was garbage collected. The garbage collector resets | 117 | * the LEB had been garbage collected before it became the bud, |
| 129 | * the free and dirty space without recording it anywhere except | 118 | * and there was no commit in between. The garbage collector |
| 130 | * lprops, so if there is not a commit then lprops does not have | 119 | * resets the free and dirty space without recording it |
| 131 | * that information next time the file system is mounted. | 120 | * anywhere except lprops, so if there was no commit then |
| 121 | * lprops does not have that information. | ||
| 132 | * | 122 | * |
| 133 | * We do not need to adjust free space because the scan has told | 123 | * We do not need to adjust free space because the scan has told |
| 134 | * us the exact value which is recorded in the replay entry as | 124 | * us the exact value which is recorded in the replay entry as |
| 135 | * r->free. | 125 | * @b->free. |
| 136 | * | 126 | * |
| 137 | * However we do need to subtract from the dirty space the | 127 | * However we do need to subtract from the dirty space the |
| 138 | * amount of space that the garbage collector reclaimed, which | 128 | * amount of space that the garbage collector reclaimed, which |
| 139 | * is the whole LEB minus the amount of space that was free. | 129 | * is the whole LEB minus the amount of space that was free. |
| 140 | */ | 130 | */ |
| 141 | dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, | 131 | dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, |
| 142 | lp->free, lp->dirty); | 132 | lp->free, lp->dirty); |
| 143 | dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, | 133 | dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, |
| 144 | lp->free, lp->dirty); | 134 | lp->free, lp->dirty); |
| 145 | dirty -= c->leb_size - lp->free; | 135 | dirty -= c->leb_size - lp->free; |
| 146 | /* | 136 | /* |
| @@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) | |||
| 152 | */ | 142 | */ |
| 153 | if (dirty != 0) | 143 | if (dirty != 0) |
| 154 | dbg_msg("LEB %d lp: %d free %d dirty " | 144 | dbg_msg("LEB %d lp: %d free %d dirty " |
| 155 | "replay: %d free %d dirty", r->lnum, lp->free, | 145 | "replay: %d free %d dirty", b->bud->lnum, |
| 156 | lp->dirty, r->free, r->dirty); | 146 | lp->free, lp->dirty, b->free, b->dirty); |
| 157 | } | 147 | } |
| 158 | lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, | 148 | lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, |
| 159 | lp->flags | LPROPS_TAKEN, 0); | 149 | lp->flags | LPROPS_TAKEN, 0); |
| 160 | if (IS_ERR(lp)) { | 150 | if (IS_ERR(lp)) { |
| 161 | err = PTR_ERR(lp); | 151 | err = PTR_ERR(lp); |
| @@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) | |||
| 163 | } | 153 | } |
| 164 | 154 | ||
| 165 | /* Make sure the journal head points to the latest bud */ | 155 | /* Make sure the journal head points to the latest bud */ |
| 166 | err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, | 156 | err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, |
| 167 | c->leb_size - r->free, UBI_SHORTTERM); | 157 | b->bud->lnum, c->leb_size - b->free, |
| 158 | UBI_SHORTTERM); | ||
| 168 | 159 | ||
| 169 | out: | 160 | out: |
| 170 | ubifs_release_lprops(c); | 161 | ubifs_release_lprops(c); |
| @@ -172,6 +163,27 @@ out: | |||
| 172 | } | 163 | } |
| 173 | 164 | ||
| 174 | /** | 165 | /** |
| 166 | * set_buds_lprops - set free and dirty space for all replayed buds. | ||
| 167 | * @c: UBIFS file-system description object | ||
| 168 | * | ||
| 169 | * This function sets LEB properties for all replayed buds. Returns zero in | ||
| 170 | * case of success and a negative error code in case of failure. | ||
| 171 | */ | ||
| 172 | static int set_buds_lprops(struct ubifs_info *c) | ||
| 173 | { | ||
| 174 | struct bud_entry *b; | ||
| 175 | int err; | ||
| 176 | |||
| 177 | list_for_each_entry(b, &c->replay_buds, list) { | ||
| 178 | err = set_bud_lprops(c, b); | ||
| 179 | if (err) | ||
| 180 | return err; | ||
| 181 | } | ||
| 182 | |||
| 183 | return 0; | ||
| 184 | } | ||
| 185 | |||
| 186 | /** | ||
| 175 | * trun_remove_range - apply a replay entry for a truncation to the TNC. | 187 | * trun_remove_range - apply a replay entry for a truncation to the TNC. |
| 176 | * @c: UBIFS file-system description object | 188 | * @c: UBIFS file-system description object |
| 177 | * @r: replay entry of truncation | 189 | * @r: replay entry of truncation |
| @@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) | |||
| 207 | */ | 219 | */ |
| 208 | static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) | 220 | static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) |
| 209 | { | 221 | { |
| 210 | int err, deletion = ((r->flags & REPLAY_DELETION) != 0); | 222 | int err; |
| 211 | 223 | ||
| 212 | dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, | 224 | dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, |
| 213 | r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); | 225 | r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); |
| 214 | 226 | ||
| 215 | /* Set c->replay_sqnum to help deal with dangling branches. */ | 227 | /* Set c->replay_sqnum to help deal with dangling branches. */ |
| 216 | c->replay_sqnum = r->sqnum; | 228 | c->replay_sqnum = r->sqnum; |
| 217 | 229 | ||
| 218 | if (r->flags & REPLAY_REF) | 230 | if (is_hash_key(c, &r->key)) { |
| 219 | err = set_bud_lprops(c, r); | 231 | if (r->deletion) |
| 220 | else if (is_hash_key(c, &r->key)) { | ||
| 221 | if (deletion) | ||
| 222 | err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); | 232 | err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); |
| 223 | else | 233 | else |
| 224 | err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, | 234 | err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, |
| 225 | r->len, &r->nm); | 235 | r->len, &r->nm); |
| 226 | } else { | 236 | } else { |
| 227 | if (deletion) | 237 | if (r->deletion) |
| 228 | switch (key_type(c, &r->key)) { | 238 | switch (key_type(c, &r->key)) { |
| 229 | case UBIFS_INO_KEY: | 239 | case UBIFS_INO_KEY: |
| 230 | { | 240 | { |
| @@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) | |||
| 247 | return err; | 257 | return err; |
| 248 | 258 | ||
| 249 | if (c->need_recovery) | 259 | if (c->need_recovery) |
| 250 | err = ubifs_recover_size_accum(c, &r->key, deletion, | 260 | err = ubifs_recover_size_accum(c, &r->key, r->deletion, |
| 251 | r->new_size); | 261 | r->new_size); |
| 252 | } | 262 | } |
| 253 | 263 | ||
| @@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) | |||
| 255 | } | 265 | } |
| 256 | 266 | ||
| 257 | /** | 267 | /** |
| 258 | * destroy_replay_tree - destroy the replay. | 268 | * replay_entries_cmp - compare 2 replay entries. |
| 259 | * @c: UBIFS file-system description object | 269 | * @priv: UBIFS file-system description object |
| 270 | * @a: first replay entry | ||
| 271 | * @b: second replay entry | ||
| 260 | * | 272 | * |
| 261 | * Destroy the replay tree. | 273 | * This is a comparison function for 'list_sort()' which compares 2 replay |
| 274 | * entries @a and @b by comparing their sequence number. Returns %1 if @a has | ||
| 275 | * greater sequence number and %-1 otherwise. | ||
| 262 | */ | 276 | */ |
| 263 | static void destroy_replay_tree(struct ubifs_info *c) | 277 | static int replay_entries_cmp(void *priv, struct list_head *a, |
| 278 | struct list_head *b) | ||
| 264 | { | 279 | { |
| 265 | struct rb_node *this = c->replay_tree.rb_node; | 280 | struct replay_entry *ra, *rb; |
| 266 | struct replay_entry *r; | 281 | |
| 267 | 282 | cond_resched(); | |
| 268 | while (this) { | 283 | if (a == b) |
| 269 | if (this->rb_left) { | 284 | return 0; |
| 270 | this = this->rb_left; | 285 | |
| 271 | continue; | 286 | ra = list_entry(a, struct replay_entry, list); |
| 272 | } else if (this->rb_right) { | 287 | rb = list_entry(b, struct replay_entry, list); |
| 273 | this = this->rb_right; | 288 | ubifs_assert(ra->sqnum != rb->sqnum); |
| 274 | continue; | 289 | if (ra->sqnum > rb->sqnum) |
| 275 | } | 290 | return 1; |
| 276 | r = rb_entry(this, struct replay_entry, rb); | 291 | return -1; |
| 277 | this = rb_parent(this); | ||
| 278 | if (this) { | ||
| 279 | if (this->rb_left == &r->rb) | ||
| 280 | this->rb_left = NULL; | ||
| 281 | else | ||
| 282 | this->rb_right = NULL; | ||
| 283 | } | ||
| 284 | if (is_hash_key(c, &r->key)) | ||
| 285 | kfree(r->nm.name); | ||
| 286 | kfree(r); | ||
| 287 | } | ||
| 288 | c->replay_tree = RB_ROOT; | ||
| 289 | } | 292 | } |
| 290 | 293 | ||
| 291 | /** | 294 | /** |
| 292 | * apply_replay_tree - apply the replay tree to the TNC. | 295 | * apply_replay_list - apply the replay list to the TNC. |
| 293 | * @c: UBIFS file-system description object | 296 | * @c: UBIFS file-system description object |
| 294 | * | 297 | * |
| 295 | * Apply the replay tree. | 298 | * Apply all entries in the replay list to the TNC. Returns zero in case of |
| 296 | * Returns zero in case of success and a negative error code in case of | 299 | * success and a negative error code in case of failure. |
| 297 | * failure. | ||
| 298 | */ | 300 | */ |
| 299 | static int apply_replay_tree(struct ubifs_info *c) | 301 | static int apply_replay_list(struct ubifs_info *c) |
| 300 | { | 302 | { |
| 301 | struct rb_node *this = rb_first(&c->replay_tree); | 303 | struct replay_entry *r; |
| 304 | int err; | ||
| 302 | 305 | ||
| 303 | while (this) { | 306 | list_sort(c, &c->replay_list, &replay_entries_cmp); |
| 304 | struct replay_entry *r; | ||
| 305 | int err; | ||
| 306 | 307 | ||
| 308 | list_for_each_entry(r, &c->replay_list, list) { | ||
| 307 | cond_resched(); | 309 | cond_resched(); |
| 308 | 310 | ||
| 309 | r = rb_entry(this, struct replay_entry, rb); | ||
| 310 | err = apply_replay_entry(c, r); | 311 | err = apply_replay_entry(c, r); |
| 311 | if (err) | 312 | if (err) |
| 312 | return err; | 313 | return err; |
| 313 | this = rb_next(this); | ||
| 314 | } | 314 | } |
| 315 | |||
| 315 | return 0; | 316 | return 0; |
| 316 | } | 317 | } |
| 317 | 318 | ||
| 318 | /** | 319 | /** |
| 319 | * insert_node - insert a node to the replay tree. | 320 | * destroy_replay_list - destroy the replay. |
| 321 | * @c: UBIFS file-system description object | ||
| 322 | * | ||
| 323 | * Destroy the replay list. | ||
| 324 | */ | ||
| 325 | static void destroy_replay_list(struct ubifs_info *c) | ||
| 326 | { | ||
| 327 | struct replay_entry *r, *tmp; | ||
| 328 | |||
| 329 | list_for_each_entry_safe(r, tmp, &c->replay_list, list) { | ||
| 330 | if (is_hash_key(c, &r->key)) | ||
| 331 | kfree(r->nm.name); | ||
| 332 | list_del(&r->list); | ||
| 333 | kfree(r); | ||
| 334 | } | ||
| 335 | } | ||
| 336 | |||
| 337 | /** | ||
| 338 | * insert_node - insert a node to the replay list | ||
| 320 | * @c: UBIFS file-system description object | 339 | * @c: UBIFS file-system description object |
| 321 | * @lnum: node logical eraseblock number | 340 | * @lnum: node logical eraseblock number |
| 322 | * @offs: node offset | 341 | * @offs: node offset |
| @@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c) | |||
| 328 | * @old_size: truncation old size | 347 | * @old_size: truncation old size |
| 329 | * @new_size: truncation new size | 348 | * @new_size: truncation new size |
| 330 | * | 349 | * |
| 331 | * This function inserts a scanned non-direntry node to the replay tree. The | 350 | * This function inserts a scanned non-direntry node to the replay list. The |
| 332 | * replay tree is an RB-tree containing @struct replay_entry elements which are | 351 | * replay list contains @struct replay_entry elements, and we sort this list in |
| 333 | * indexed by the sequence number. The replay tree is applied at the very end | 352 | * sequence number order before applying it. The replay list is applied at the |
| 334 | * of the replay process. Since the tree is sorted in sequence number order, | 353 | * very end of the replay process. Since the list is sorted in sequence number |
| 335 | * the older modifications are applied first. This function returns zero in | 354 | * order, the older modifications are applied first. This function returns zero |
| 336 | * case of success and a negative error code in case of failure. | 355 | * in case of success and a negative error code in case of failure. |
| 337 | */ | 356 | */ |
| 338 | static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, | 357 | static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, |
| 339 | union ubifs_key *key, unsigned long long sqnum, | 358 | union ubifs_key *key, unsigned long long sqnum, |
| 340 | int deletion, int *used, loff_t old_size, | 359 | int deletion, int *used, loff_t old_size, |
| 341 | loff_t new_size) | 360 | loff_t new_size) |
| 342 | { | 361 | { |
| 343 | struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; | ||
| 344 | struct replay_entry *r; | 362 | struct replay_entry *r; |
| 345 | 363 | ||
| 364 | dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); | ||
| 365 | |||
| 346 | if (key_inum(c, key) >= c->highest_inum) | 366 | if (key_inum(c, key) >= c->highest_inum) |
| 347 | c->highest_inum = key_inum(c, key); | 367 | c->highest_inum = key_inum(c, key); |
| 348 | 368 | ||
| 349 | dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); | ||
| 350 | while (*p) { | ||
| 351 | parent = *p; | ||
| 352 | r = rb_entry(parent, struct replay_entry, rb); | ||
| 353 | if (sqnum < r->sqnum) { | ||
| 354 | p = &(*p)->rb_left; | ||
| 355 | continue; | ||
| 356 | } else if (sqnum > r->sqnum) { | ||
| 357 | p = &(*p)->rb_right; | ||
| 358 | continue; | ||
| 359 | } | ||
| 360 | ubifs_err("duplicate sqnum in replay"); | ||
| 361 | return -EINVAL; | ||
| 362 | } | ||
| 363 | |||
| 364 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); | 369 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); |
| 365 | if (!r) | 370 | if (!r) |
| 366 | return -ENOMEM; | 371 | return -ENOMEM; |
| @@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, | |||
| 370 | r->lnum = lnum; | 375 | r->lnum = lnum; |
| 371 | r->offs = offs; | 376 | r->offs = offs; |
| 372 | r->len = len; | 377 | r->len = len; |
| 378 | r->deletion = !!deletion; | ||
| 373 | r->sqnum = sqnum; | 379 | r->sqnum = sqnum; |
| 374 | r->flags = (deletion ? REPLAY_DELETION : 0); | 380 | key_copy(c, key, &r->key); |
| 375 | r->old_size = old_size; | 381 | r->old_size = old_size; |
| 376 | r->new_size = new_size; | 382 | r->new_size = new_size; |
| 377 | key_copy(c, key, &r->key); | ||
| 378 | 383 | ||
| 379 | rb_link_node(&r->rb, parent, p); | 384 | list_add_tail(&r->list, &c->replay_list); |
| 380 | rb_insert_color(&r->rb, &c->replay_tree); | ||
| 381 | return 0; | 385 | return 0; |
| 382 | } | 386 | } |
| 383 | 387 | ||
| 384 | /** | 388 | /** |
| 385 | * insert_dent - insert a directory entry node into the replay tree. | 389 | * insert_dent - insert a directory entry node into the replay list. |
| 386 | * @c: UBIFS file-system description object | 390 | * @c: UBIFS file-system description object |
| 387 | * @lnum: node logical eraseblock number | 391 | * @lnum: node logical eraseblock number |
| 388 | * @offs: node offset | 392 | * @offs: node offset |
| @@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, | |||
| 394 | * @deletion: non-zero if this is a deletion | 398 | * @deletion: non-zero if this is a deletion |
| 395 | * @used: number of bytes in use in a LEB | 399 | * @used: number of bytes in use in a LEB |
| 396 | * | 400 | * |
| 397 | * This function inserts a scanned directory entry node to the replay tree. | 401 | * This function inserts a scanned directory entry node or an extended |
| 398 | * Returns zero in case of success and a negative error code in case of | 402 | * attribute entry to the replay list. Returns zero in case of success and a |
| 399 | * failure. | 403 | * negative error code in case of failure. |
| 400 | * | ||
| 401 | * This function is also used for extended attribute entries because they are | ||
| 402 | * implemented as directory entry nodes. | ||
| 403 | */ | 404 | */ |
| 404 | static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, | 405 | static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, |
| 405 | union ubifs_key *key, const char *name, int nlen, | 406 | union ubifs_key *key, const char *name, int nlen, |
| 406 | unsigned long long sqnum, int deletion, int *used) | 407 | unsigned long long sqnum, int deletion, int *used) |
| 407 | { | 408 | { |
| 408 | struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; | ||
| 409 | struct replay_entry *r; | 409 | struct replay_entry *r; |
| 410 | char *nbuf; | 410 | char *nbuf; |
| 411 | 411 | ||
| 412 | dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); | ||
| 412 | if (key_inum(c, key) >= c->highest_inum) | 413 | if (key_inum(c, key) >= c->highest_inum) |
| 413 | c->highest_inum = key_inum(c, key); | 414 | c->highest_inum = key_inum(c, key); |
| 414 | 415 | ||
| 415 | dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); | ||
| 416 | while (*p) { | ||
| 417 | parent = *p; | ||
| 418 | r = rb_entry(parent, struct replay_entry, rb); | ||
| 419 | if (sqnum < r->sqnum) { | ||
| 420 | p = &(*p)->rb_left; | ||
| 421 | continue; | ||
| 422 | } | ||
| 423 | if (sqnum > r->sqnum) { | ||
| 424 | p = &(*p)->rb_right; | ||
| 425 | continue; | ||
| 426 | } | ||
| 427 | ubifs_err("duplicate sqnum in replay"); | ||
| 428 | return -EINVAL; | ||
| 429 | } | ||
| 430 | |||
| 431 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); | 416 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); |
| 432 | if (!r) | 417 | if (!r) |
| 433 | return -ENOMEM; | 418 | return -ENOMEM; |
| 419 | |||
| 434 | nbuf = kmalloc(nlen + 1, GFP_KERNEL); | 420 | nbuf = kmalloc(nlen + 1, GFP_KERNEL); |
| 435 | if (!nbuf) { | 421 | if (!nbuf) { |
| 436 | kfree(r); | 422 | kfree(r); |
| @@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, | |||
| 442 | r->lnum = lnum; | 428 | r->lnum = lnum; |
| 443 | r->offs = offs; | 429 | r->offs = offs; |
| 444 | r->len = len; | 430 | r->len = len; |
| 431 | r->deletion = !!deletion; | ||
| 445 | r->sqnum = sqnum; | 432 | r->sqnum = sqnum; |
| 433 | key_copy(c, key, &r->key); | ||
| 446 | r->nm.len = nlen; | 434 | r->nm.len = nlen; |
| 447 | memcpy(nbuf, name, nlen); | 435 | memcpy(nbuf, name, nlen); |
| 448 | nbuf[nlen] = '\0'; | 436 | nbuf[nlen] = '\0'; |
| 449 | r->nm.name = nbuf; | 437 | r->nm.name = nbuf; |
| 450 | r->flags = (deletion ? REPLAY_DELETION : 0); | ||
| 451 | key_copy(c, key, &r->key); | ||
| 452 | 438 | ||
| 453 | ubifs_assert(!*p); | 439 | list_add_tail(&r->list, &c->replay_list); |
| 454 | rb_link_node(&r->rb, parent, p); | ||
| 455 | rb_insert_color(&r->rb, &c->replay_tree); | ||
| 456 | return 0; | 440 | return 0; |
| 457 | } | 441 | } |
| 458 | 442 | ||
| @@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c, | |||
| 489 | } | 473 | } |
| 490 | 474 | ||
| 491 | /** | 475 | /** |
| 476 | * is_last_bud - check if the bud is the last in the journal head. | ||
| 477 | * @c: UBIFS file-system description object | ||
| 478 | * @bud: bud description object | ||
| 479 | * | ||
| 480 | * This function checks if bud @bud is the last bud in its journal head. This | ||
| 481 | * information is then used by 'replay_bud()' to decide whether the bud can | ||
| 482 | * have corruptions or not. Indeed, only last buds can be corrupted by power | ||
| 483 | * cuts. Returns %1 if this is the last bud, and %0 if not. | ||
| 484 | */ | ||
| 485 | static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) | ||
| 486 | { | ||
| 487 | struct ubifs_jhead *jh = &c->jheads[bud->jhead]; | ||
| 488 | struct ubifs_bud *next; | ||
| 489 | uint32_t data; | ||
| 490 | int err; | ||
| 491 | |||
| 492 | if (list_is_last(&bud->list, &jh->buds_list)) | ||
| 493 | return 1; | ||
| 494 | |||
| 495 | /* | ||
| 496 | * The following is a quirk to make sure we work correctly with UBIFS | ||
| 497 | * images used with older UBIFS. | ||
| 498 | * | ||
| 499 | * Normally, the last bud will be the last in the journal head's list | ||
| 500 | * of buds. However, there is one exception if the UBIFS image belongs | ||
| 501 | * to older UBIFS. This is fairly unlikely: one would need to use old | ||
| 502 | * UBIFS, then have a power cut exactly at the right point, and then | ||
| 503 | * try to mount this image with new UBIFS. | ||
| 504 | * | ||
| 505 | * The exception is: it is possible to have 2 buds A and B, A goes | ||
| 506 | * before B, and B is the last, bud B contains no data, and bud A is | ||
| 507 | * corrupted at the end. The reason is that in older versions when the | ||
| 508 | * journal code switched the next bud (from A to B), it first added a | ||
| 509 | * log reference node for the new bud (B), and only after this it | ||
| 510 | * synchronized the write-buffer of current bud (A). But later this was | ||
| 511 | * changed and UBIFS started to always synchronize the write-buffer of | ||
| 512 | * the bud (A) before writing the log reference for the new bud (B). | ||
| 513 | * | ||
| 514 | * But because older UBIFS always synchronized A's write-buffer before | ||
| 515 | * writing to B, we can recognize this exceptional situation by | ||
| 516 | * checking the contents of bud B - if it is empty, then A can be | ||
| 517 | * treated as the last and we can recover it. | ||
| 518 | * | ||
| 519 | * TODO: remove this piece of code in a couple of years (today it is | ||
| 520 | * 16.05.2011). | ||
| 521 | */ | ||
| 522 | next = list_entry(bud->list.next, struct ubifs_bud, list); | ||
| 523 | if (!list_is_last(&next->list, &jh->buds_list)) | ||
| 524 | return 0; | ||
| 525 | |||
| 526 | err = ubi_read(c->ubi, next->lnum, (char *)&data, | ||
| 527 | next->start, 4); | ||
| 528 | if (err) | ||
| 529 | return 0; | ||
| 530 | |||
| 531 | return data == 0xFFFFFFFF; | ||
| 532 | } | ||
| 533 | |||
| 534 | /** | ||
| 492 | * replay_bud - replay a bud logical eraseblock. | 535 | * replay_bud - replay a bud logical eraseblock. |
| 493 | * @c: UBIFS file-system description object | 536 | * @c: UBIFS file-system description object |
| 494 | * @lnum: bud logical eraseblock number to replay | 537 | * @b: bud entry which describes the bud |
| 495 | * @offs: bud start offset | ||
| 496 | * @jhead: journal head to which this bud belongs | ||
| 497 | * @free: amount of free space in the bud is returned here | ||
| 498 | * @dirty: amount of dirty space from padding and deletion nodes is returned | ||
| 499 | * here | ||
| 500 | * | 538 | * |
| 501 | * This function returns zero in case of success and a negative error code in | 539 | * This function replays bud @bud, recovers it if needed, and adds all nodes |
| 502 | * case of failure. | 540 | * from this bud to the replay list. Returns zero in case of success and a |
| 541 | * negative error code in case of failure. | ||
| 503 | */ | 542 | */ |
| 504 | static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, | 543 | static int replay_bud(struct ubifs_info *c, struct bud_entry *b) |
| 505 | int *free, int *dirty) | ||
| 506 | { | 544 | { |
| 507 | int err = 0, used = 0; | 545 | int is_last = is_last_bud(c, b->bud); |
| 546 | int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; | ||
| 508 | struct ubifs_scan_leb *sleb; | 547 | struct ubifs_scan_leb *sleb; |
| 509 | struct ubifs_scan_node *snod; | 548 | struct ubifs_scan_node *snod; |
| 510 | struct ubifs_bud *bud; | ||
| 511 | 549 | ||
| 512 | dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); | 550 | dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", |
| 513 | if (c->need_recovery) | 551 | lnum, b->bud->jhead, offs, is_last); |
| 514 | sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); | 552 | |
| 553 | if (c->need_recovery && is_last) | ||
| 554 | /* | ||
| 555 | * Recover only last LEBs in the journal heads, because power | ||
| 556 | * cuts may cause corruptions only in these LEBs, because only | ||
| 557 | * these LEBs could possibly be written to at the power cut | ||
| 558 | * time. | ||
| 559 | */ | ||
| 560 | sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, | ||
| 561 | b->bud->jhead != GCHD); | ||
| 515 | else | 562 | else |
| 516 | sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); | 563 | sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); |
| 517 | if (IS_ERR(sleb)) | 564 | if (IS_ERR(sleb)) |
| @@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, | |||
| 627 | goto out; | 674 | goto out; |
| 628 | } | 675 | } |
| 629 | 676 | ||
| 630 | bud = ubifs_search_bud(c, lnum); | 677 | ubifs_assert(ubifs_search_bud(c, lnum)); |
| 631 | if (!bud) | ||
| 632 | BUG(); | ||
| 633 | |||
| 634 | ubifs_assert(sleb->endpt - offs >= used); | 678 | ubifs_assert(sleb->endpt - offs >= used); |
| 635 | ubifs_assert(sleb->endpt % c->min_io_size == 0); | 679 | ubifs_assert(sleb->endpt % c->min_io_size == 0); |
| 636 | 680 | ||
| 637 | *dirty = sleb->endpt - offs - used; | 681 | b->dirty = sleb->endpt - offs - used; |
| 638 | *free = c->leb_size - sleb->endpt; | 682 | b->free = c->leb_size - sleb->endpt; |
| 683 | dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); | ||
| 639 | 684 | ||
| 640 | out: | 685 | out: |
| 641 | ubifs_scan_destroy(sleb); | 686 | ubifs_scan_destroy(sleb); |
| @@ -649,58 +694,6 @@ out_dump: | |||
| 649 | } | 694 | } |
| 650 | 695 | ||
| 651 | /** | 696 | /** |
| 652 | * insert_ref_node - insert a reference node to the replay tree. | ||
| 653 | * @c: UBIFS file-system description object | ||
| 654 | * @lnum: node logical eraseblock number | ||
| 655 | * @offs: node offset | ||
| 656 | * @sqnum: sequence number | ||
| 657 | * @free: amount of free space in bud | ||
| 658 | * @dirty: amount of dirty space from padding and deletion nodes | ||
| 659 | * @jhead: journal head number for the bud | ||
| 660 | * | ||
| 661 | * This function inserts a reference node to the replay tree and returns zero | ||
| 662 | * in case of success or a negative error code in case of failure. | ||
| 663 | */ | ||
| 664 | static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, | ||
| 665 | unsigned long long sqnum, int free, int dirty, | ||
| 666 | int jhead) | ||
| 667 | { | ||
| 668 | struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; | ||
| 669 | struct replay_entry *r; | ||
| 670 | |||
| 671 | dbg_mnt("add ref LEB %d:%d", lnum, offs); | ||
| 672 | while (*p) { | ||
| 673 | parent = *p; | ||
| 674 | r = rb_entry(parent, struct replay_entry, rb); | ||
| 675 | if (sqnum < r->sqnum) { | ||
| 676 | p = &(*p)->rb_left; | ||
| 677 | continue; | ||
| 678 | } else if (sqnum > r->sqnum) { | ||
| 679 | p = &(*p)->rb_right; | ||
| 680 | continue; | ||
| 681 | } | ||
| 682 | ubifs_err("duplicate sqnum in replay tree"); | ||
| 683 | return -EINVAL; | ||
| 684 | } | ||
| 685 | |||
| 686 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); | ||
| 687 | if (!r) | ||
| 688 | return -ENOMEM; | ||
| 689 | |||
| 690 | r->lnum = lnum; | ||
| 691 | r->offs = offs; | ||
| 692 | r->sqnum = sqnum; | ||
| 693 | r->flags = REPLAY_REF; | ||
| 694 | r->free = free; | ||
| 695 | r->dirty = dirty; | ||
| 696 | r->jhead = jhead; | ||
| 697 | |||
| 698 | rb_link_node(&r->rb, parent, p); | ||
| 699 | rb_insert_color(&r->rb, &c->replay_tree); | ||
| 700 | return 0; | ||
| 701 | } | ||
| 702 | |||
| 703 | /** | ||
| 704 | * replay_buds - replay all buds. | 697 | * replay_buds - replay all buds. |
| 705 | * @c: UBIFS file-system description object | 698 | * @c: UBIFS file-system description object |
| 706 | * | 699 | * |
| @@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, | |||
| 710 | static int replay_buds(struct ubifs_info *c) | 703 | static int replay_buds(struct ubifs_info *c) |
| 711 | { | 704 | { |
| 712 | struct bud_entry *b; | 705 | struct bud_entry *b; |
| 713 | int err, uninitialized_var(free), uninitialized_var(dirty); | 706 | int err; |
| 707 | unsigned long long prev_sqnum = 0; | ||
| 714 | 708 | ||
| 715 | list_for_each_entry(b, &c->replay_buds, list) { | 709 | list_for_each_entry(b, &c->replay_buds, list) { |
| 716 | err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, | 710 | err = replay_bud(c, b); |
| 717 | &free, &dirty); | ||
| 718 | if (err) | ||
| 719 | return err; | ||
| 720 | err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, | ||
| 721 | free, dirty, b->bud->jhead); | ||
| 722 | if (err) | 711 | if (err) |
| 723 | return err; | 712 | return err; |
| 713 | |||
| 714 | ubifs_assert(b->sqnum > prev_sqnum); | ||
| 715 | prev_sqnum = b->sqnum; | ||
| 724 | } | 716 | } |
| 725 | 717 | ||
| 726 | return 0; | 718 | return 0; |
| @@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c) | |||
| 1060 | if (err) | 1052 | if (err) |
| 1061 | goto out; | 1053 | goto out; |
| 1062 | 1054 | ||
| 1063 | err = apply_replay_tree(c); | 1055 | err = apply_replay_list(c); |
| 1056 | if (err) | ||
| 1057 | goto out; | ||
| 1058 | |||
| 1059 | err = set_buds_lprops(c); | ||
| 1064 | if (err) | 1060 | if (err) |
| 1065 | goto out; | 1061 | goto out; |
| 1066 | 1062 | ||
| 1067 | /* | 1063 | /* |
| 1068 | * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable | 1064 | * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable |
| 1069 | * to roughly estimate index growth. Things like @c->min_idx_lebs | 1065 | * to roughly estimate index growth. Things like @c->bi.min_idx_lebs |
| 1070 | * depend on it. This means we have to initialize it to make sure | 1066 | * depend on it. This means we have to initialize it to make sure |
| 1071 | * budgeting works properly. | 1067 | * budgeting works properly. |
| 1072 | */ | 1068 | */ |
| 1073 | c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); | 1069 | c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); |
| 1074 | c->budg_uncommitted_idx *= c->max_idx_node_sz; | 1070 | c->bi.uncommitted_idx *= c->max_idx_node_sz; |
| 1075 | 1071 | ||
| 1076 | ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); | 1072 | ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); |
| 1077 | dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " | 1073 | dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " |
| 1078 | "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, | 1074 | "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, |
| 1079 | (unsigned long)c->highest_inum); | 1075 | (unsigned long)c->highest_inum); |
| 1080 | out: | 1076 | out: |
| 1081 | destroy_replay_tree(c); | 1077 | destroy_replay_list(c); |
| 1082 | destroy_bud_list(c); | 1078 | destroy_bud_list(c); |
| 1083 | c->replaying = 0; | 1079 | c->replaying = 0; |
| 1084 | return err; | 1080 | return err; |
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index bf31b4729e5..c606f010e8d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c | |||
| @@ -475,7 +475,8 @@ failed: | |||
| 475 | * @c: UBIFS file-system description object | 475 | * @c: UBIFS file-system description object |
| 476 | * | 476 | * |
| 477 | * This function returns a pointer to the superblock node or a negative error | 477 | * This function returns a pointer to the superblock node or a negative error |
| 478 | * code. | 478 | * code. Note, the user of this function is responsible for kfree()'ing the |
| 479 | * returned superblock buffer. | ||
| 479 | */ | 480 | */ |
| 480 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) | 481 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) |
| 481 | { | 482 | { |
| @@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c) | |||
| 616 | c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); | 617 | c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); |
| 617 | memcpy(&c->uuid, &sup->uuid, 16); | 618 | memcpy(&c->uuid, &sup->uuid, 16); |
| 618 | c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); | 619 | c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); |
| 620 | c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); | ||
| 619 | 621 | ||
| 620 | /* Automatically increase file system size to the maximum size */ | 622 | /* Automatically increase file system size to the maximum size */ |
| 621 | c->old_leb_cnt = c->leb_cnt; | 623 | c->old_leb_cnt = c->leb_cnt; |
| @@ -650,3 +652,152 @@ out: | |||
| 650 | kfree(sup); | 652 | kfree(sup); |
| 651 | return err; | 653 | return err; |
| 652 | } | 654 | } |
| 655 | |||
| 656 | /** | ||
| 657 | * fixup_leb - fixup/unmap an LEB containing free space. | ||
| 658 | * @c: UBIFS file-system description object | ||
| 659 | * @lnum: the LEB number to fix up | ||
| 660 | * @len: number of used bytes in LEB (starting at offset 0) | ||
| 661 | * | ||
| 662 | * This function reads the contents of the given LEB number @lnum, then fixes | ||
| 663 | * it up, so that empty min. I/O units at the end of the LEB are actually erased on | ||
| 664 | * flash (rather than being just all-0xff real data). If the LEB is completely | ||
| 665 | * empty, it is simply unmapped. | ||
| 666 | */ | ||
| 667 | static int fixup_leb(struct ubifs_info *c, int lnum, int len) | ||
| 668 | { | ||
| 669 | int err; | ||
| 670 | |||
| 671 | ubifs_assert(len >= 0); | ||
| 672 | ubifs_assert(len % c->min_io_size == 0); | ||
| 673 | ubifs_assert(len < c->leb_size); | ||
| 674 | |||
| 675 | if (len == 0) { | ||
| 676 | dbg_mnt("unmap empty LEB %d", lnum); | ||
| 677 | return ubi_leb_unmap(c->ubi, lnum); | ||
| 678 | } | ||
| 679 | |||
| 680 | dbg_mnt("fixup LEB %d, data len %d", lnum, len); | ||
| 681 | err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); | ||
| 682 | if (err) | ||
| 683 | return err; | ||
| 684 | |||
| 685 | return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); | ||
| 686 | } | ||
| 687 | |||
| 688 | /** | ||
| 689 | * fixup_free_space - find & remap all LEBs containing free space. | ||
| 690 | * @c: UBIFS file-system description object | ||
| 691 | * | ||
| 692 | * This function walks through all LEBs in the filesystem and fixes up those | ||
| 693 | * containing free/empty space. | ||
| 694 | */ | ||
| 695 | static int fixup_free_space(struct ubifs_info *c) | ||
| 696 | { | ||
| 697 | int lnum, err = 0; | ||
| 698 | struct ubifs_lprops *lprops; | ||
| 699 | |||
| 700 | ubifs_get_lprops(c); | ||
| 701 | |||
| 702 | /* Fixup LEBs in the master area */ | ||
| 703 | for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { | ||
| 704 | err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); | ||
| 705 | if (err) | ||
| 706 | goto out; | ||
| 707 | } | ||
| 708 | |||
| 709 | /* Unmap unused log LEBs */ | ||
| 710 | lnum = ubifs_next_log_lnum(c, c->lhead_lnum); | ||
| 711 | while (lnum != c->ltail_lnum) { | ||
| 712 | err = fixup_leb(c, lnum, 0); | ||
| 713 | if (err) | ||
| 714 | goto out; | ||
| 715 | lnum = ubifs_next_log_lnum(c, lnum); | ||
| 716 | } | ||
| 717 | |||
| 718 | /* Fixup the current log head */ | ||
| 719 | err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); | ||
| 720 | if (err) | ||
| 721 | goto out; | ||
| 722 | |||
| 723 | /* Fixup LEBs in the LPT area */ | ||
| 724 | for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { | ||
| 725 | int free = c->ltab[lnum - c->lpt_first].free; | ||
| 726 | |||
| 727 | if (free > 0) { | ||
| 728 | err = fixup_leb(c, lnum, c->leb_size - free); | ||
| 729 | if (err) | ||
| 730 | goto out; | ||
| 731 | } | ||
| 732 | } | ||
| 733 | |||
| 734 | /* Unmap LEBs in the orphans area */ | ||
| 735 | for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { | ||
| 736 | err = fixup_leb(c, lnum, 0); | ||
| 737 | if (err) | ||
| 738 | goto out; | ||
| 739 | } | ||
| 740 | |||
| 741 | /* Fixup LEBs in the main area */ | ||
| 742 | for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { | ||
| 743 | lprops = ubifs_lpt_lookup(c, lnum); | ||
| 744 | if (IS_ERR(lprops)) { | ||
| 745 | err = PTR_ERR(lprops); | ||
| 746 | goto out; | ||
| 747 | } | ||
| 748 | |||
| 749 | if (lprops->free > 0) { | ||
| 750 | err = fixup_leb(c, lnum, c->leb_size - lprops->free); | ||
| 751 | if (err) | ||
| 752 | goto out; | ||
| 753 | } | ||
| 754 | } | ||
| 755 | |||
| 756 | out: | ||
| 757 | ubifs_release_lprops(c); | ||
| 758 | return err; | ||
| 759 | } | ||
| 760 | |||
| 761 | /** | ||
| 762 | * ubifs_fixup_free_space - find & fix all LEBs with free space. | ||
| 763 | * @c: UBIFS file-system description object | ||
| 764 | * | ||
| 765 | * This function fixes up LEBs containing free space on first mount, if the | ||
| 766 | * appropriate flag was set when the FS was created. Each LEB with one or more | ||
| 767 | * empty min. I/O units (i.e. free-space-count > 0) is re-written, to make sure | ||
| 768 | * the free space is actually erased. E.g., this is necessary for some NAND | ||
| 769 | * chips, since the free space may have been programmed like real "0xff" data | ||
| 770 | * (generating a non-0xff ECC), causing future writes to the not-really-erased | ||
| 771 | * NAND pages to behave badly. After the space is fixed up, the superblock flag | ||
| 772 | * is cleared, so that this is skipped for all future mounts. | ||
| 773 | */ | ||
| 774 | int ubifs_fixup_free_space(struct ubifs_info *c) | ||
| 775 | { | ||
| 776 | int err; | ||
| 777 | struct ubifs_sb_node *sup; | ||
| 778 | |||
| 779 | ubifs_assert(c->space_fixup); | ||
| 780 | ubifs_assert(!c->ro_mount); | ||
| 781 | |||
| 782 | ubifs_msg("start fixing up free space"); | ||
| 783 | |||
| 784 | err = fixup_free_space(c); | ||
| 785 | if (err) | ||
| 786 | return err; | ||
| 787 | |||
| 788 | sup = ubifs_read_sb_node(c); | ||
| 789 | if (IS_ERR(sup)) | ||
| 790 | return PTR_ERR(sup); | ||
| 791 | |||
| 792 | /* Free-space fixup is no longer required */ | ||
| 793 | c->space_fixup = 0; | ||
| 794 | sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); | ||
| 795 | |||
| 796 | err = ubifs_write_sb_node(c, sup); | ||
| 797 | kfree(sup); | ||
| 798 | if (err) | ||
| 799 | return err; | ||
| 800 | |||
| 801 | ubifs_msg("free space fixup complete"); | ||
| 802 | return err; | ||
| 803 | } | ||
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 04ad07f4fcc..6db0bdaa9f7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
| @@ -375,7 +375,7 @@ out: | |||
| 375 | ubifs_release_dirty_inode_budget(c, ui); | 375 | ubifs_release_dirty_inode_budget(c, ui); |
| 376 | else { | 376 | else { |
| 377 | /* We've deleted something - clean the "no space" flags */ | 377 | /* We've deleted something - clean the "no space" flags */ |
| 378 | c->nospace = c->nospace_rp = 0; | 378 | c->bi.nospace = c->bi.nospace_rp = 0; |
| 379 | smp_wmb(); | 379 | smp_wmb(); |
| 380 | } | 380 | } |
| 381 | done: | 381 | done: |
| @@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c) | |||
| 694 | * be compressed and direntries are of the maximum size. | 694 | * be compressed and direntries are of the maximum size. |
| 695 | * | 695 | * |
| 696 | * Note, data, which may be stored in inodes is budgeted separately, so | 696 | * Note, data, which may be stored in inodes is budgeted separately, so |
| 697 | * it is not included into 'c->inode_budget'. | 697 | * it is not included into 'c->bi.inode_budget'. |
| 698 | */ | 698 | */ |
| 699 | c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; | 699 | c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; |
| 700 | c->inode_budget = UBIFS_INO_NODE_SZ; | 700 | c->bi.inode_budget = UBIFS_INO_NODE_SZ; |
| 701 | c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; | 701 | c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; |
| 702 | 702 | ||
| 703 | /* | 703 | /* |
| 704 | * When the amount of flash space used by buds becomes | 704 | * When the amount of flash space used by buds becomes |
| @@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c) | |||
| 742 | { | 742 | { |
| 743 | long long tmp64; | 743 | long long tmp64; |
| 744 | 744 | ||
| 745 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 745 | c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 746 | c->report_rp_size = ubifs_reported_space(c, c->rp_size); | 746 | c->report_rp_size = ubifs_reported_space(c, c->rp_size); |
| 747 | 747 | ||
| 748 | /* | 748 | /* |
| @@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c) | |||
| 1144 | { | 1144 | { |
| 1145 | ubifs_assert(c->dark_wm > 0); | 1145 | ubifs_assert(c->dark_wm > 0); |
| 1146 | if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { | 1146 | if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { |
| 1147 | ubifs_err("insufficient free space to mount in read/write mode"); | 1147 | ubifs_err("insufficient free space to mount in R/W mode"); |
| 1148 | dbg_dump_budg(c); | 1148 | dbg_dump_budg(c, &c->bi); |
| 1149 | dbg_dump_lprops(c); | 1149 | dbg_dump_lprops(c); |
| 1150 | return -ENOSPC; | 1150 | return -ENOSPC; |
| 1151 | } | 1151 | } |
| @@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1304 | if (err) | 1304 | if (err) |
| 1305 | goto out_lpt; | 1305 | goto out_lpt; |
| 1306 | 1306 | ||
| 1307 | err = dbg_check_idx_size(c, c->old_idx_sz); | 1307 | err = dbg_check_idx_size(c, c->bi.old_idx_sz); |
| 1308 | if (err) | 1308 | if (err) |
| 1309 | goto out_lpt; | 1309 | goto out_lpt; |
| 1310 | 1310 | ||
| @@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1313 | goto out_journal; | 1313 | goto out_journal; |
| 1314 | 1314 | ||
| 1315 | /* Calculate 'min_idx_lebs' after journal replay */ | 1315 | /* Calculate 'min_idx_lebs' after journal replay */ |
| 1316 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 1316 | c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 1317 | 1317 | ||
| 1318 | err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); | 1318 | err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); |
| 1319 | if (err) | 1319 | if (err) |
| @@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1396 | } else | 1396 | } else |
| 1397 | ubifs_assert(c->lst.taken_empty_lebs > 0); | 1397 | ubifs_assert(c->lst.taken_empty_lebs > 0); |
| 1398 | 1398 | ||
| 1399 | if (!c->ro_mount && c->space_fixup) { | ||
| 1400 | err = ubifs_fixup_free_space(c); | ||
| 1401 | if (err) | ||
| 1402 | goto out_infos; | ||
| 1403 | } | ||
| 1404 | |||
| 1399 | err = dbg_check_filesystem(c); | 1405 | err = dbg_check_filesystem(c); |
| 1400 | if (err) | 1406 | if (err) |
| 1401 | goto out_infos; | 1407 | goto out_infos; |
| @@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1442 | c->main_lebs, c->main_first, c->leb_cnt - 1); | 1448 | c->main_lebs, c->main_first, c->leb_cnt - 1); |
| 1443 | dbg_msg("index LEBs: %d", c->lst.idx_lebs); | 1449 | dbg_msg("index LEBs: %d", c->lst.idx_lebs); |
| 1444 | dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", | 1450 | dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", |
| 1445 | c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); | 1451 | c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, |
| 1452 | c->bi.old_idx_sz >> 20); | ||
| 1446 | dbg_msg("key hash type: %d", c->key_hash_type); | 1453 | dbg_msg("key hash type: %d", c->key_hash_type); |
| 1447 | dbg_msg("tree fanout: %d", c->fanout); | 1454 | dbg_msg("tree fanout: %d", c->fanout); |
| 1448 | dbg_msg("reserved GC LEB: %d", c->gc_lnum); | 1455 | dbg_msg("reserved GC LEB: %d", c->gc_lnum); |
| @@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1456 | dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", | 1463 | dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", |
| 1457 | UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); | 1464 | UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); |
| 1458 | dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", | 1465 | dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", |
| 1459 | UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, | 1466 | UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, |
| 1460 | UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); | 1467 | UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); |
| 1461 | dbg_msg("dead watermark: %d", c->dead_wm); | 1468 | dbg_msg("dead watermark: %d", c->dead_wm); |
| 1462 | dbg_msg("dark watermark: %d", c->dark_wm); | 1469 | dbg_msg("dark watermark: %d", c->dark_wm); |
| @@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) | |||
| 1584 | } | 1591 | } |
| 1585 | sup->leb_cnt = cpu_to_le32(c->leb_cnt); | 1592 | sup->leb_cnt = cpu_to_le32(c->leb_cnt); |
| 1586 | err = ubifs_write_sb_node(c, sup); | 1593 | err = ubifs_write_sb_node(c, sup); |
| 1594 | kfree(sup); | ||
| 1587 | if (err) | 1595 | if (err) |
| 1588 | goto out; | 1596 | goto out; |
| 1589 | } | 1597 | } |
| @@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c) | |||
| 1684 | */ | 1692 | */ |
| 1685 | err = dbg_check_space_info(c); | 1693 | err = dbg_check_space_info(c); |
| 1686 | } | 1694 | } |
| 1695 | |||
| 1696 | if (c->space_fixup) { | ||
| 1697 | err = ubifs_fixup_free_space(c); | ||
| 1698 | if (err) | ||
| 1699 | goto out; | ||
| 1700 | } | ||
| 1701 | |||
| 1687 | mutex_unlock(&c->umount_mutex); | 1702 | mutex_unlock(&c->umount_mutex); |
| 1688 | return err; | 1703 | return err; |
| 1689 | 1704 | ||
| @@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb) | |||
| 1766 | * to write them back because of I/O errors. | 1781 | * to write them back because of I/O errors. |
| 1767 | */ | 1782 | */ |
| 1768 | if (!c->ro_error) { | 1783 | if (!c->ro_error) { |
| 1769 | ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); | 1784 | ubifs_assert(c->bi.idx_growth == 0); |
| 1770 | ubifs_assert(c->budg_idx_growth == 0); | 1785 | ubifs_assert(c->bi.dd_growth == 0); |
| 1771 | ubifs_assert(c->budg_dd_growth == 0); | 1786 | ubifs_assert(c->bi.data_growth == 0); |
| 1772 | ubifs_assert(c->budg_data_growth == 0); | ||
| 1773 | } | 1787 | } |
| 1774 | 1788 | ||
| 1775 | /* | 1789 | /* |
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index de485979ca3..8119b1fd8d9 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c | |||
| @@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, | |||
| 2557 | if (err) { | 2557 | if (err) { |
| 2558 | /* Ensure the znode is dirtied */ | 2558 | /* Ensure the znode is dirtied */ |
| 2559 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | 2559 | if (znode->cnext || !ubifs_zn_dirty(znode)) { |
| 2560 | znode = dirty_cow_bottom_up(c, znode); | 2560 | znode = dirty_cow_bottom_up(c, znode); |
| 2561 | if (IS_ERR(znode)) { | 2561 | if (IS_ERR(znode)) { |
| 2562 | err = PTR_ERR(znode); | 2562 | err = PTR_ERR(znode); |
| 2563 | goto out_unlock; | 2563 | goto out_unlock; |
| 2564 | } | 2564 | } |
| 2565 | } | 2565 | } |
| 2566 | err = tnc_delete(c, znode, n); | 2566 | err = tnc_delete(c, znode, n); |
| 2567 | } | 2567 | } |
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 53288e5d604..41920f357bb 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c | |||
| @@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) | |||
| 377 | c->gap_lebs = NULL; | 377 | c->gap_lebs = NULL; |
| 378 | return err; | 378 | return err; |
| 379 | } | 379 | } |
| 380 | if (!dbg_force_in_the_gaps_enabled) { | 380 | if (dbg_force_in_the_gaps_enabled()) { |
| 381 | /* | 381 | /* |
| 382 | * Do not print scary warnings if the debugging | 382 | * Do not print scary warnings if the debugging |
| 383 | * option which forces in-the-gaps is enabled. | 383 | * option which forces in-the-gaps is enabled. |
| 384 | */ | 384 | */ |
| 385 | ubifs_err("out of space"); | 385 | ubifs_warn("out of space"); |
| 386 | spin_lock(&c->space_lock); | 386 | dbg_dump_budg(c, &c->bi); |
| 387 | dbg_dump_budg(c); | ||
| 388 | spin_unlock(&c->space_lock); | ||
| 389 | dbg_dump_lprops(c); | 387 | dbg_dump_lprops(c); |
| 390 | } | 388 | } |
| 391 | /* Try to commit anyway */ | 389 | /* Try to commit anyway */ |
| @@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) | |||
| 796 | spin_lock(&c->space_lock); | 794 | spin_lock(&c->space_lock); |
| 797 | /* | 795 | /* |
| 798 | * Although we have not finished committing yet, update size of the | 796 | * Although we have not finished committing yet, update size of the |
| 799 | * committed index ('c->old_idx_sz') and zero out the index growth | 797 | * committed index ('c->bi.old_idx_sz') and zero out the index growth |
| 800 | * budget. It is OK to do this now, because we've reserved all the | 798 | * space which is needed to commit the index, and it is safe for the |
| 801 | * space which is needed to commit the index, and it is safe for the | 799 | * budgeting subsystem to assume the index is already committed, |
| 802 | * budgeting subsystem to assume the index is already committed, | 800 | * budgeting subsystem to assume the index is already committed, |
| 803 | * even though it is not. | 801 | * even though it is not. |
| 804 | */ | 802 | */ |
| 805 | ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); | 803 | ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); |
| 806 | c->old_idx_sz = c->calc_idx_sz; | 804 | c->bi.old_idx_sz = c->calc_idx_sz; |
| 807 | c->budg_uncommitted_idx = 0; | 805 | c->bi.uncommitted_idx = 0; |
| 808 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 806 | c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 809 | spin_unlock(&c->space_lock); | 807 | spin_unlock(&c->space_lock); |
| 810 | mutex_unlock(&c->tnc_mutex); | 808 | mutex_unlock(&c->tnc_mutex); |
| 811 | 809 | ||
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 191ca7863fe..e24380cf46e 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h | |||
| @@ -408,9 +408,11 @@ enum { | |||
| 408 | * Superblock flags. | 408 | * Superblock flags. |
| 409 | * | 409 | * |
| 410 | * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set | 410 | * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set |
| 411 | * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed | ||
| 411 | */ | 412 | */ |
| 412 | enum { | 413 | enum { |
| 413 | UBIFS_FLG_BIGLPT = 0x02, | 414 | UBIFS_FLG_BIGLPT = 0x02, |
| 415 | UBIFS_FLG_SPACE_FIXUP = 0x04, | ||
| 414 | }; | 416 | }; |
| 415 | 417 | ||
| 416 | /** | 418 | /** |
| @@ -434,7 +436,7 @@ struct ubifs_ch { | |||
| 434 | __u8 node_type; | 436 | __u8 node_type; |
| 435 | __u8 group_type; | 437 | __u8 group_type; |
| 436 | __u8 padding[2]; | 438 | __u8 padding[2]; |
| 437 | } __attribute__ ((packed)); | 439 | } __packed; |
| 438 | 440 | ||
| 439 | /** | 441 | /** |
| 440 | * union ubifs_dev_desc - device node descriptor. | 442 | * union ubifs_dev_desc - device node descriptor. |
| @@ -448,7 +450,7 @@ struct ubifs_ch { | |||
| 448 | union ubifs_dev_desc { | 450 | union ubifs_dev_desc { |
| 449 | __le32 new; | 451 | __le32 new; |
| 450 | __le64 huge; | 452 | __le64 huge; |
| 451 | } __attribute__ ((packed)); | 453 | } __packed; |
| 452 | 454 | ||
| 453 | /** | 455 | /** |
| 454 | * struct ubifs_ino_node - inode node. | 456 | * struct ubifs_ino_node - inode node. |
| @@ -509,7 +511,7 @@ struct ubifs_ino_node { | |||
| 509 | __le16 compr_type; | 511 | __le16 compr_type; |
| 510 | __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ | 512 | __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ |
| 511 | __u8 data[]; | 513 | __u8 data[]; |
| 512 | } __attribute__ ((packed)); | 514 | } __packed; |
| 513 | 515 | ||
| 514 | /** | 516 | /** |
| 515 | * struct ubifs_dent_node - directory entry node. | 517 | * struct ubifs_dent_node - directory entry node. |
| @@ -534,7 +536,7 @@ struct ubifs_dent_node { | |||
| 534 | __le16 nlen; | 536 | __le16 nlen; |
| 535 | __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ | 537 | __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ |
| 536 | __u8 name[]; | 538 | __u8 name[]; |
| 537 | } __attribute__ ((packed)); | 539 | } __packed; |
| 538 | 540 | ||
| 539 | /** | 541 | /** |
| 540 | * struct ubifs_data_node - data node. | 542 | * struct ubifs_data_node - data node. |
| @@ -555,7 +557,7 @@ struct ubifs_data_node { | |||
| 555 | __le16 compr_type; | 557 | __le16 compr_type; |
| 556 | __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ | 558 | __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ |
| 557 | __u8 data[]; | 559 | __u8 data[]; |
| 558 | } __attribute__ ((packed)); | 560 | } __packed; |
| 559 | 561 | ||
| 560 | /** | 562 | /** |
| 561 | * struct ubifs_trun_node - truncation node. | 563 | * struct ubifs_trun_node - truncation node. |
| @@ -575,7 +577,7 @@ struct ubifs_trun_node { | |||
| 575 | __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ | 577 | __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ |
| 576 | __le64 old_size; | 578 | __le64 old_size; |
| 577 | __le64 new_size; | 579 | __le64 new_size; |
| 578 | } __attribute__ ((packed)); | 580 | } __packed; |
| 579 | 581 | ||
| 580 | /** | 582 | /** |
| 581 | * struct ubifs_pad_node - padding node. | 583 | * struct ubifs_pad_node - padding node. |
| @@ -586,7 +588,7 @@ struct ubifs_trun_node { | |||
| 586 | struct ubifs_pad_node { | 588 | struct ubifs_pad_node { |
| 587 | struct ubifs_ch ch; | 589 | struct ubifs_ch ch; |
| 588 | __le32 pad_len; | 590 | __le32 pad_len; |
| 589 | } __attribute__ ((packed)); | 591 | } __packed; |
| 590 | 592 | ||
| 591 | /** | 593 | /** |
| 592 | * struct ubifs_sb_node - superblock node. | 594 | * struct ubifs_sb_node - superblock node. |
| @@ -644,7 +646,7 @@ struct ubifs_sb_node { | |||
| 644 | __u8 uuid[16]; | 646 | __u8 uuid[16]; |
| 645 | __le32 ro_compat_version; | 647 | __le32 ro_compat_version; |
| 646 | __u8 padding2[3968]; | 648 | __u8 padding2[3968]; |
| 647 | } __attribute__ ((packed)); | 649 | } __packed; |
| 648 | 650 | ||
| 649 | /** | 651 | /** |
| 650 | * struct ubifs_mst_node - master node. | 652 | * struct ubifs_mst_node - master node. |
| @@ -711,7 +713,7 @@ struct ubifs_mst_node { | |||
| 711 | __le32 idx_lebs; | 713 | __le32 idx_lebs; |
| 712 | __le32 leb_cnt; | 714 | __le32 leb_cnt; |
| 713 | __u8 padding[344]; | 715 | __u8 padding[344]; |
| 714 | } __attribute__ ((packed)); | 716 | } __packed; |
| 715 | 717 | ||
| 716 | /** | 718 | /** |
| 717 | * struct ubifs_ref_node - logical eraseblock reference node. | 719 | * struct ubifs_ref_node - logical eraseblock reference node. |
| @@ -727,7 +729,7 @@ struct ubifs_ref_node { | |||
| 727 | __le32 offs; | 729 | __le32 offs; |
| 728 | __le32 jhead; | 730 | __le32 jhead; |
| 729 | __u8 padding[28]; | 731 | __u8 padding[28]; |
| 730 | } __attribute__ ((packed)); | 732 | } __packed; |
| 731 | 733 | ||
| 732 | /** | 734 | /** |
| 733 | * struct ubifs_branch - key/reference/length branch | 735 | * struct ubifs_branch - key/reference/length branch |
| @@ -741,7 +743,7 @@ struct ubifs_branch { | |||
| 741 | __le32 offs; | 743 | __le32 offs; |
| 742 | __le32 len; | 744 | __le32 len; |
| 743 | __u8 key[]; | 745 | __u8 key[]; |
| 744 | } __attribute__ ((packed)); | 746 | } __packed; |
| 745 | 747 | ||
| 746 | /** | 748 | /** |
| 747 | * struct ubifs_idx_node - indexing node. | 749 | * struct ubifs_idx_node - indexing node. |
| @@ -755,7 +757,7 @@ struct ubifs_idx_node { | |||
| 755 | __le16 child_cnt; | 757 | __le16 child_cnt; |
| 756 | __le16 level; | 758 | __le16 level; |
| 757 | __u8 branches[]; | 759 | __u8 branches[]; |
| 758 | } __attribute__ ((packed)); | 760 | } __packed; |
| 759 | 761 | ||
| 760 | /** | 762 | /** |
| 761 | * struct ubifs_cs_node - commit start node. | 763 | * struct ubifs_cs_node - commit start node. |
| @@ -765,7 +767,7 @@ struct ubifs_idx_node { | |||
| 765 | struct ubifs_cs_node { | 767 | struct ubifs_cs_node { |
| 766 | struct ubifs_ch ch; | 768 | struct ubifs_ch ch; |
| 767 | __le64 cmt_no; | 769 | __le64 cmt_no; |
| 768 | } __attribute__ ((packed)); | 770 | } __packed; |
| 769 | 771 | ||
| 770 | /** | 772 | /** |
| 771 | * struct ubifs_orph_node - orphan node. | 773 | * struct ubifs_orph_node - orphan node. |
| @@ -777,6 +779,6 @@ struct ubifs_orph_node { | |||
| 777 | struct ubifs_ch ch; | 779 | struct ubifs_ch ch; |
| 778 | __le64 cmt_no; | 780 | __le64 cmt_no; |
| 779 | __le64 inos[]; | 781 | __le64 inos[]; |
| 780 | } __attribute__ ((packed)); | 782 | } __packed; |
| 781 | 783 | ||
| 782 | #endif /* __UBIFS_MEDIA_H__ */ | 784 | #endif /* __UBIFS_MEDIA_H__ */ |
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 8c40ad3c672..93d1412a06f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h | |||
| @@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb { | |||
| 389 | * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses | 389 | * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses |
| 390 | * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot | 390 | * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot |
| 391 | * make sure @inode->i_size is always changed under @ui_mutex, because it | 391 | * make sure @inode->i_size is always changed under @ui_mutex, because it |
| 392 | * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock | 392 | * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would |
| 393 | * with 'ubifs_writepage()' (see file.c). All the other inode fields are | 393 | * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields |
| 394 | * changed under @ui_mutex, so they do not need "shadow" fields. Note, one | 394 | * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one |
| 395 | * could consider to rework locking and base it on "shadow" fields. | 395 | * could consider to rework locking and base it on "shadow" fields. |
| 396 | */ | 396 | */ |
| 397 | struct ubifs_inode { | 397 | struct ubifs_inode { |
| @@ -937,6 +937,40 @@ struct ubifs_mount_opts { | |||
| 937 | unsigned int compr_type:2; | 937 | unsigned int compr_type:2; |
| 938 | }; | 938 | }; |
| 939 | 939 | ||
| 940 | /** | ||
| 941 | * struct ubifs_budg_info - UBIFS budgeting information. | ||
| 942 | * @idx_growth: amount of bytes budgeted for index growth | ||
| 943 | * @data_growth: amount of bytes budgeted for cached data | ||
| 944 | * @dd_growth: amount of bytes budgeted for cached data that will make | ||
| 945 | * other data dirty | ||
| 946 | * @uncommitted_idx: amount of bytes that were budgeted for growth of the index, but | ||
| 947 | * which still have to be taken into account because the index | ||
| 948 | * has not been committed so far | ||
| 949 | * @old_idx_sz: size of index on flash | ||
| 950 | * @min_idx_lebs: minimum number of LEBs required for the index | ||
| 951 | * @nospace: non-zero if the file-system does not have flash space (used as | ||
| 952 | * optimization) | ||
| 953 | * @nospace_rp: the same as @nospace, but additionally means that even reserved | ||
| 954 | * pool is full | ||
| 955 | * @page_budget: budget for a page (constant, never changed after mount) | ||
| 956 | * @inode_budget: budget for an inode (constant, never changed after mount) | ||
| 957 | * @dent_budget: budget for a directory entry (constant, never changed after | ||
| 958 | * mount) | ||
| 959 | */ | ||
| 960 | struct ubifs_budg_info { | ||
| 961 | long long idx_growth; | ||
| 962 | long long data_growth; | ||
| 963 | long long dd_growth; | ||
| 964 | long long uncommitted_idx; | ||
| 965 | unsigned long long old_idx_sz; | ||
| 966 | int min_idx_lebs; | ||
| 967 | unsigned int nospace:1; | ||
| 968 | unsigned int nospace_rp:1; | ||
| 969 | int page_budget; | ||
| 970 | int inode_budget; | ||
| 971 | int dent_budget; | ||
| 972 | }; | ||
| 973 | |||
| 940 | struct ubifs_debug_info; | 974 | struct ubifs_debug_info; |
| 941 | 975 | ||
| 942 | /** | 976 | /** |
| @@ -980,6 +1014,7 @@ struct ubifs_debug_info; | |||
| 980 | * @cmt_wq: wait queue to sleep on if the log is full and a commit is running | 1014 | * @cmt_wq: wait queue to sleep on if the log is full and a commit is running |
| 981 | * | 1015 | * |
| 982 | * @big_lpt: flag that LPT is too big to write whole during commit | 1016 | * @big_lpt: flag that LPT is too big to write whole during commit |
| 1017 | * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up | ||
| 983 | * @no_chk_data_crc: do not check CRCs when reading data nodes (except during | 1018 | * @no_chk_data_crc: do not check CRCs when reading data nodes (except during |
| 984 | * recovery) | 1019 | * recovery) |
| 985 | * @bulk_read: enable bulk-reads | 1020 | * @bulk_read: enable bulk-reads |
| @@ -1057,32 +1092,14 @@ struct ubifs_debug_info; | |||
| 1057 | * @dirty_zn_cnt: number of dirty znodes | 1092 | * @dirty_zn_cnt: number of dirty znodes |
| 1058 | * @clean_zn_cnt: number of clean znodes | 1093 | * @clean_zn_cnt: number of clean znodes |
| 1059 | * | 1094 | * |
| 1060 | * @budg_idx_growth: amount of bytes budgeted for index growth | 1095 | * @space_lock: protects @bi and @lst |
| 1061 | * @budg_data_growth: amount of bytes budgeted for cached data | 1096 | * @lst: lprops statistics |
| 1062 | * @budg_dd_growth: amount of bytes budgeted for cached data that will make | 1097 | * @bi: budgeting information |
| 1063 | * other data dirty | ||
| 1064 | * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, | ||
| 1065 | * but which still have to be taken into account because | ||
| 1066 | * the index has not been committed so far | ||
| 1067 | * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, | ||
| 1068 | * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, | ||
| 1069 | * @nospace, and @nospace_rp; | ||
| 1070 | * @min_idx_lebs: minimum number of LEBs required for the index | ||
| 1071 | * @old_idx_sz: size of index on flash | ||
| 1072 | * @calc_idx_sz: temporary variable which is used to calculate new index size | 1098 | * @calc_idx_sz: temporary variable which is used to calculate new index size |
| 1073 | * (contains accurate new index size at end of TNC commit start) | 1099 | * (contains accurate new index size at end of TNC commit start) |
| 1074 | * @lst: lprops statistics | ||
| 1075 | * @nospace: non-zero if the file-system does not have flash space (used as | ||
| 1076 | * optimization) | ||
| 1077 | * @nospace_rp: the same as @nospace, but additionally means that even reserved | ||
| 1078 | * pool is full | ||
| 1079 | * | ||
| 1080 | * @page_budget: budget for a page | ||
| 1081 | * @inode_budget: budget for an inode | ||
| 1082 | * @dent_budget: budget for a directory entry | ||
| 1083 | * | 1100 | * |
| 1084 | * @ref_node_alsz: size of the LEB reference node aligned to the min. flash | 1101 | * @ref_node_alsz: size of the LEB reference node aligned to the min. flash |
| 1085 | * I/O unit | 1102 | * I/O unit |
| 1086 | * @mst_node_alsz: master node aligned size | 1103 | * @mst_node_alsz: master node aligned size |
| 1087 | * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary | 1104 | * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary |
| 1088 | * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary | 1105 | * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary |
| @@ -1189,7 +1206,6 @@ struct ubifs_debug_info; | |||
| 1189 | * @replaying: %1 during journal replay | 1206 | * @replaying: %1 during journal replay |
| 1190 | * @mounting: %1 while mounting | 1207 | * @mounting: %1 while mounting |
| 1191 | * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode | 1208 | * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode |
| 1192 | * @replay_tree: temporary tree used during journal replay | ||
| 1193 | * @replay_list: temporary list used during journal replay | 1209 | * @replay_list: temporary list used during journal replay |
| 1194 | * @replay_buds: list of buds to replay | 1210 | * @replay_buds: list of buds to replay |
| 1195 | * @cs_sqnum: sequence number of first node in the log (commit start node) | 1211 | * @cs_sqnum: sequence number of first node in the log (commit start node) |
| @@ -1238,6 +1254,7 @@ struct ubifs_info { | |||
| 1238 | wait_queue_head_t cmt_wq; | 1254 | wait_queue_head_t cmt_wq; |
| 1239 | 1255 | ||
| 1240 | unsigned int big_lpt:1; | 1256 | unsigned int big_lpt:1; |
| 1257 | unsigned int space_fixup:1; | ||
| 1241 | unsigned int no_chk_data_crc:1; | 1258 | unsigned int no_chk_data_crc:1; |
| 1242 | unsigned int bulk_read:1; | 1259 | unsigned int bulk_read:1; |
| 1243 | unsigned int default_compr:2; | 1260 | unsigned int default_compr:2; |
| @@ -1308,21 +1325,10 @@ struct ubifs_info { | |||
| 1308 | atomic_long_t dirty_zn_cnt; | 1325 | atomic_long_t dirty_zn_cnt; |
| 1309 | atomic_long_t clean_zn_cnt; | 1326 | atomic_long_t clean_zn_cnt; |
| 1310 | 1327 | ||
| 1311 | long long budg_idx_growth; | ||
| 1312 | long long budg_data_growth; | ||
| 1313 | long long budg_dd_growth; | ||
| 1314 | long long budg_uncommitted_idx; | ||
| 1315 | spinlock_t space_lock; | 1328 | spinlock_t space_lock; |
| 1316 | int min_idx_lebs; | ||
| 1317 | unsigned long long old_idx_sz; | ||
| 1318 | unsigned long long calc_idx_sz; | ||
| 1319 | struct ubifs_lp_stats lst; | 1329 | struct ubifs_lp_stats lst; |
| 1320 | unsigned int nospace:1; | 1330 | struct ubifs_budg_info bi; |
| 1321 | unsigned int nospace_rp:1; | 1331 | unsigned long long calc_idx_sz; |
| 1322 | |||
| 1323 | int page_budget; | ||
| 1324 | int inode_budget; | ||
| 1325 | int dent_budget; | ||
| 1326 | 1332 | ||
| 1327 | int ref_node_alsz; | 1333 | int ref_node_alsz; |
| 1328 | int mst_node_alsz; | 1334 | int mst_node_alsz; |
| @@ -1430,7 +1436,6 @@ struct ubifs_info { | |||
| 1430 | unsigned int replaying:1; | 1436 | unsigned int replaying:1; |
| 1431 | unsigned int mounting:1; | 1437 | unsigned int mounting:1; |
| 1432 | unsigned int remounting_rw:1; | 1438 | unsigned int remounting_rw:1; |
| 1433 | struct rb_root replay_tree; | ||
| 1434 | struct list_head replay_list; | 1439 | struct list_head replay_list; |
| 1435 | struct list_head replay_buds; | 1440 | struct list_head replay_buds; |
| 1436 | unsigned long long cs_sqnum; | 1441 | unsigned long long cs_sqnum; |
| @@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c); | |||
| 1628 | int ubifs_read_superblock(struct ubifs_info *c); | 1633 | int ubifs_read_superblock(struct ubifs_info *c); |
| 1629 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); | 1634 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); |
| 1630 | int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); | 1635 | int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); |
| 1636 | int ubifs_fixup_free_space(struct ubifs_info *c); | ||
| 1631 | 1637 | ||
| 1632 | /* replay.c */ | 1638 | /* replay.c */ |
| 1633 | int ubifs_validate_entry(struct ubifs_info *c, | 1639 | int ubifs_validate_entry(struct ubifs_info *c, |
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3299f469e71..16f19f55e63 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c | |||
| @@ -80,8 +80,8 @@ enum { | |||
| 80 | SECURITY_XATTR, | 80 | SECURITY_XATTR, |
| 81 | }; | 81 | }; |
| 82 | 82 | ||
| 83 | static const struct inode_operations none_inode_operations; | 83 | static const struct inode_operations empty_iops; |
| 84 | static const struct file_operations none_file_operations; | 84 | static const struct file_operations empty_fops; |
| 85 | 85 | ||
| 86 | /** | 86 | /** |
| 87 | * create_xattr - create an extended attribute. | 87 | * create_xattr - create an extended attribute. |
| @@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, | |||
| 131 | 131 | ||
| 132 | /* Re-define all operations to be "nothing" */ | 132 | /* Re-define all operations to be "nothing" */ |
| 133 | inode->i_mapping->a_ops = &empty_aops; | 133 | inode->i_mapping->a_ops = &empty_aops; |
| 134 | inode->i_op = &none_inode_operations; | 134 | inode->i_op = &empty_iops; |
| 135 | inode->i_fop = &none_file_operations; | 135 | inode->i_fop = &empty_fops; |
| 136 | 136 | ||
| 137 | inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; | 137 | inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; |
| 138 | ui = ubifs_inode(inode); | 138 | ui = ubifs_inode(inode); |
