author    Joel Becker <jlbec@evilplan.org>  2011-05-26 00:51:55 -0400
committer Joel Becker <jlbec@evilplan.org>  2011-05-26 00:51:55 -0400
commit    ece928df16494becd43f999aff9bd530182e7e81 (patch)
tree      905042764ea5d8ab6eda63666406e19f607bcf4c /fs
parent    3d1c1829ebe7e8bb48a997b39b4865abc9197e5e (diff)
parent    dda54e76d7dba0532ebdd72e0b4f492a03f83225 (diff)
Merge branch 'move_extents' of git://oss.oracle.com/git/tye/linux-2.6 into ocfs2-merge-window
Conflicts: fs/ocfs2/ioctl.c
Diffstat (limited to 'fs')
-rw-r--r--  fs/binfmt_flat.c        |    8
-rw-r--r--  fs/dlm/config.c         |    9
-rw-r--r--  fs/dlm/config.h         |    1
-rw-r--r--  fs/dlm/dlm_internal.h   |    3
-rw-r--r--  fs/dlm/lock.c           |  182
-rw-r--r--  fs/dlm/lock.h           |    1
-rw-r--r--  fs/dlm/lockspace.c      |    6
-rw-r--r--  fs/dlm/plock.c          |   65
-rw-r--r--  fs/dlm/user.c           |    1
-rw-r--r--  fs/ext2/super.c         |    3
-rw-r--r--  fs/ext3/namei.c         |   80
-rw-r--r--  fs/jbd/commit.c         |   15
-rw-r--r--  fs/jbd/journal.c        |   16
-rw-r--r--  fs/jbd/transaction.c    |    3
-rw-r--r--  fs/jbd2/commit.c        |    6
-rw-r--r--  fs/ocfs2/Makefile       |    1
-rw-r--r--  fs/ocfs2/ioctl.c        |  468
-rw-r--r--  fs/ocfs2/move_extents.c | 1153
-rw-r--r--  fs/ocfs2/move_extents.h |   22
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h  |   68
-rw-r--r--  fs/ocfs2/refcounttree.c |   58
-rw-r--r--  fs/ocfs2/refcounttree.h |   11
-rw-r--r--  fs/ubifs/budget.c       |  104
-rw-r--r--  fs/ubifs/commit.c       |    2
-rw-r--r--  fs/ubifs/debug.c        |  167
-rw-r--r--  fs/ubifs/debug.h        |  178
-rw-r--r--  fs/ubifs/dir.c          |    4
-rw-r--r--  fs/ubifs/file.c         |   28
-rw-r--r--  fs/ubifs/find.c         |   10
-rw-r--r--  fs/ubifs/gc.c           |   71
-rw-r--r--  fs/ubifs/io.c           |   33
-rw-r--r--  fs/ubifs/journal.c      |   29
-rw-r--r--  fs/ubifs/log.c          |   28
-rw-r--r--  fs/ubifs/lprops.c       |  115
-rw-r--r--  fs/ubifs/lpt_commit.c   |   55
-rw-r--r--  fs/ubifs/master.c       |    8
-rw-r--r--  fs/ubifs/misc.h         |   17
-rw-r--r--  fs/ubifs/orphan.c       |    3
-rw-r--r--  fs/ubifs/recovery.c     |  354
-rw-r--r--  fs/ubifs/replay.c       |  468
-rw-r--r--  fs/ubifs/sb.c           |  153
-rw-r--r--  fs/ubifs/super.c        |   46
-rw-r--r--  fs/ubifs/tnc.c          |   10
-rw-r--r--  fs/ubifs/tnc_commit.c   |   18
-rw-r--r--  fs/ubifs/ubifs-media.h  |   30
-rw-r--r--  fs/ubifs/ubifs.h        |   86
-rw-r--r--  fs/ubifs/xattr.c        |    8
47 files changed, 3158 insertions(+), 1047 deletions(-)
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
         int res;
         char buf[16];
 
+        memset(&bprm, 0, sizeof(bprm));
+
         /* Create the file name */
         sprintf(buf, "/lib/lib%d.so", id);
 
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
         if (!bprm.cred)
                 goto out;
 
+        /* We don't really care about recalculating credentials at this point
+         * as we're past the point of no return and are dealing with shared
+         * libraries.
+         */
+        bprm.cred_prepared = 1;
+
         res = prepare_binprm(&bprm);
 
         if (!IS_ERR_VALUE(res))
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
         unsigned int cl_log_debug;
         unsigned int cl_protocol;
         unsigned int cl_timewarn_cs;
+        unsigned int cl_waitwarn_us;
 };
 
 enum {
@@ -114,6 +115,7 @@ enum {
         CLUSTER_ATTR_LOG_DEBUG,
         CLUSTER_ATTR_PROTOCOL,
         CLUSTER_ATTR_TIMEWARN_CS,
+        CLUSTER_ATTR_WAITWARN_US,
 };
 
 struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
 CLUSTER_ATTR(protocol, 0);
 CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
 
 static struct configfs_attribute *cluster_attrs[] = {
         [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
         [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
         [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
         [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+        [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
         NULL,
 };
 
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
         cl->cl_log_debug = dlm_config.ci_log_debug;
         cl->cl_protocol = dlm_config.ci_protocol;
         cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+        cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
 
         space_list = &sps->ss_group;
         comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_LOG_DEBUG  0
 #define DEFAULT_PROTOCOL   0
 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US 0
 
 struct dlm_config_info dlm_config = {
         .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
         .ci_scan_secs = DEFAULT_SCAN_SECS,
         .ci_log_debug = DEFAULT_LOG_DEBUG,
         .ci_protocol = DEFAULT_PROTOCOL,
-        .ci_timewarn_cs = DEFAULT_TIMEWARN_CS
+        .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+        .ci_waitwarn_us = DEFAULT_WAITWARN_US
 };
 
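Note on the new knob: waitwarn_us surfaces through dlm's cluster configfs directory just like the existing timewarn_cs attribute; each CLUSTER_ATTR(name, check_zero) line generates a read and a write handler around the shared cluster_set() helper in config.c. A rough, illustrative expansion for waitwarn_us follows — the helper names come from config.c, but the bodies here are a simplified sketch, not the literal macro output:

static ssize_t waitwarn_us_write(struct dlm_cluster *cl, const char *buf,
                                 size_t len)
{
        /* simplified sketch: parse the value and mirror it into
         * dlm_config.ci_waitwarn_us; check_zero is 0 for this attribute,
         * so writing 0 (the default) simply disables the warnings */
        return cluster_set(cl, &cl->cl_waitwarn_us,
                           &dlm_config.ci_waitwarn_us, 0, buf, len);
}

static ssize_t waitwarn_us_read(struct dlm_cluster *cl, char *buf)
{
        return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_waitwarn_us);
}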
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
         int ci_log_debug;
         int ci_protocol;
         int ci_timewarn_cs;
+        int ci_waitwarn_us;
 };
 
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
 #define DLM_IFL_WATCH_TIMEWARN  0x00400000
 #define DLM_IFL_TIMEOUT_CANCEL  0x00800000
 #define DLM_IFL_DEADLOCK_CANCEL 0x01000000
+#define DLM_IFL_STUB_MS         0x02000000 /* magic number for m_flags */
 #define DLM_IFL_USER            0x00000001
 #define DLM_IFL_ORPHAN          0x00000002
 
@@ -245,6 +246,7 @@ struct dlm_lkb {
 
         int8_t                  lkb_wait_type;  /* type of reply waiting for */
         int8_t                  lkb_wait_count;
+        int                     lkb_wait_nodeid; /* for debugging */
 
         struct list_head        lkb_idtbl_list; /* lockspace lkbtbl */
         struct list_head        lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
         struct list_head        lkb_ownqueue;   /* list of locks for a process */
         struct list_head        lkb_time_list;
         ktime_t                 lkb_timestamp;
+        ktime_t                 lkb_wait_time;
         unsigned long           lkb_timeout_cs;
 
         struct dlm_callback     lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
         return -1;
 }
 
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+        int i;
+
+        for (i = 0; i < num_nodes; i++) {
+                if (!warned[i]) {
+                        warned[i] = nodeid;
+                        return 0;
+                }
+                if (warned[i] == nodeid)
+                        return 1;
+        }
+        return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb;
+        ktime_t zero = ktime_set(0, 0);
+        s64 us;
+        s64 debug_maxus = 0;
+        u32 debug_scanned = 0;
+        u32 debug_expired = 0;
+        int num_nodes = 0;
+        int *warned = NULL;
+
+        if (!dlm_config.ci_waitwarn_us)
+                return;
+
+        mutex_lock(&ls->ls_waiters_mutex);
+
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                if (ktime_equal(lkb->lkb_wait_time, zero))
+                        continue;
+
+                debug_scanned++;
+
+                us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+                if (us < dlm_config.ci_waitwarn_us)
+                        continue;
+
+                lkb->lkb_wait_time = zero;
+
+                debug_expired++;
+                if (us > debug_maxus)
+                        debug_maxus = us;
+
+                if (!num_nodes) {
+                        num_nodes = ls->ls_num_nodes;
+                        warned = kmalloc(num_nodes * sizeof(int), GFP_KERNEL);
+                        if (warned)
+                                memset(warned, 0, num_nodes * sizeof(int));
+                }
+                if (!warned)
+                        continue;
+                if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+                        continue;
+
+                log_error(ls, "waitwarn %x %lld %d us check connection to "
+                          "node %d", lkb->lkb_id, (long long)us,
+                          dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+
+        if (warned)
+                kfree(warned);
+
+        if (debug_expired)
+                log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+                          debug_scanned, debug_expired,
+                          dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 {
         struct dlm_ls *ls = lkb->lkb_resource->res_ls;
         int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 
         lkb->lkb_wait_count++;
         lkb->lkb_wait_type = mstype;
+        lkb->lkb_wait_time = ktime_get();
+        lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
         hold_lkb(lkb);
         list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
         struct dlm_ls *ls = lkb->lkb_resource->res_ls;
         int error;
 
-        if (ms != &ls->ls_stub_ms)
+        if (ms->m_flags != DLM_IFL_STUB_MS)
                 mutex_lock(&ls->ls_waiters_mutex);
         error = _remove_from_waiters(lkb, ms->m_type, ms);
-        if (ms != &ls->ls_stub_ms)
+        if (ms->m_flags != DLM_IFL_STUB_MS)
                 mutex_unlock(&ls->ls_waiters_mutex);
         return error;
 }
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
         list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
                 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
         mutex_unlock(&ls->ls_timeout_mutex);
+
+        if (!dlm_config.ci_waitwarn_us)
+                return;
+
+        mutex_lock(&ls->ls_waiters_mutex);
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                if (ktime_to_us(lkb->lkb_wait_time))
+                        lkb->lkb_wait_time = ktime_get();
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
 }
 
 /* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
    ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
    compatible with other granted locks */
 
-static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_demoted(struct dlm_lkb *lkb)
 {
-        if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
-                log_print("munge_demoted %x invalid reply type %d",
-                          lkb->lkb_id, ms->m_type);
-                return;
-        }
-
         if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
                 log_print("munge_demoted %x invalid modes gr %d rq %d",
                           lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
         struct dlm_mhandle *mh;
         int to_nodeid, error;
 
-        error = add_to_waiters(lkb, mstype);
+        to_nodeid = r->res_nodeid;
+
+        error = add_to_waiters(lkb, mstype, to_nodeid);
         if (error)
                 return error;
 
-        to_nodeid = r->res_nodeid;
-
         error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
         if (error)
                 goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
         /* down conversions go without a reply from the master */
         if (!error && down_conversion(lkb)) {
                 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+                r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
                 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
                 r->res_ls->ls_stub_ms.m_result = 0;
-                r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
                 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
         }
 
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
         struct dlm_mhandle *mh;
         int to_nodeid, error;
 
-        error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+        to_nodeid = dlm_dir_nodeid(r);
+
+        error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
         if (error)
                 return error;
 
-        to_nodeid = dlm_dir_nodeid(r);
-
         error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
         if (error)
                 goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
+        if (ms->m_flags == DLM_IFL_STUB_MS)
+                return;
+
         lkb->lkb_sbflags = ms->m_sbflags;
         lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
                          (ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
                 /* convert was queued on remote master */
                 receive_flags_reply(lkb, ms);
                 if (is_demoted(lkb))
-                        munge_demoted(lkb, ms);
+                        munge_demoted(lkb);
                 del_lkb(r, lkb);
                 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
                 add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
                 /* convert was granted on remote master */
                 receive_flags_reply(lkb, ms);
                 if (is_demoted(lkb))
-                        munge_demoted(lkb, ms);
+                        munge_demoted(lkb);
                 grant_lock_pc(r, lkb, ms);
                 queue_cast(r, lkb, 0);
                 break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
         dlm_put_lockspace(ls);
 }
 
-static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                   struct dlm_message *ms_stub)
 {
         if (middle_conversion(lkb)) {
                 hold_lkb(lkb);
-                ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
-                ls->ls_stub_ms.m_result = -EINPROGRESS;
-                ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-                ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-                _receive_convert_reply(lkb, &ls->ls_stub_ms);
+                memset(ms_stub, 0, sizeof(struct dlm_message));
+                ms_stub->m_flags = DLM_IFL_STUB_MS;
+                ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
+                ms_stub->m_result = -EINPROGRESS;
+                ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                _receive_convert_reply(lkb, ms_stub);
 
                 /* Same special case as in receive_rcom_lock_args() */
                 lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
         struct dlm_lkb *lkb, *safe;
+        struct dlm_message *ms_stub;
         int wait_type, stub_unlock_result, stub_cancel_result;
 
+        ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
+        if (!ms_stub) {
+                log_error(ls, "dlm_recover_waiters_pre no mem");
+                return;
+        }
+
         mutex_lock(&ls->ls_waiters_mutex);
 
         list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
-                log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
-                          lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+                /* exclude debug messages about unlocks because there can be so
+                   many and they aren't very interesting */
+
+                if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+                        log_debug(ls, "recover_waiter %x nodeid %d "
+                                  "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
+                                  lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+                }
 
         /* all outstanding lookups, regardless of destination  will be
            resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                         break;
 
                 case DLM_MSG_CONVERT:
-                        recover_convert_waiter(ls, lkb);
+                        recover_convert_waiter(ls, lkb, ms_stub);
                         break;
 
                 case DLM_MSG_UNLOCK:
                         hold_lkb(lkb);
-                        ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
-                        ls->ls_stub_ms.m_result = stub_unlock_result;
-                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+                        memset(ms_stub, 0, sizeof(struct dlm_message));
+                        ms_stub->m_flags = DLM_IFL_STUB_MS;
+                        ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
+                        ms_stub->m_result = stub_unlock_result;
+                        ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                        _receive_unlock_reply(lkb, ms_stub);
                         dlm_put_lkb(lkb);
                         break;
 
                 case DLM_MSG_CANCEL:
                         hold_lkb(lkb);
-                        ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
-                        ls->ls_stub_ms.m_result = stub_cancel_result;
-                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-                        ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+                        memset(ms_stub, 0, sizeof(struct dlm_message));
+                        ms_stub->m_flags = DLM_IFL_STUB_MS;
+                        ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
+                        ms_stub->m_result = stub_cancel_result;
+                        ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+                        _receive_cancel_reply(lkb, ms_stub);
                         dlm_put_lkb(lkb);
                         break;
 
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                 schedule();
         }
         mutex_unlock(&ls->ls_waiters_mutex);
+        kfree(ms_stub);
 }
 
 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
                 ou = is_overlap_unlock(lkb);
                 err = 0;
 
-                log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
-                          lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+                log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
+                          lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
 
                 /* At this point we assume that we won't get a reply to any
                    previous op or overlap op on this lock. First, do a big
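dlm_scan_waiters() deliberately warns at most once per node per scan; nodeid_warned() implements that dedup with a small linear table, where the first free slot records a node the scan has already warned about and a match means the warning is suppressed. A stand-alone user-space harness showing the behaviour (the helper body is copied from the hunk above; main() is ours):

#include <stdio.h>
#include <string.h>

static int nodeid_warned(int nodeid, int num_nodes, int *warned)
{
        int i;

        /* record nodeid in the first free slot, or report it as seen */
        for (i = 0; i < num_nodes; i++) {
                if (!warned[i]) {
                        warned[i] = nodeid;
                        return 0;
                }
                if (warned[i] == nodeid)
                        return 1;
        }
        return 0;
}

int main(void)
{
        int warned[3];

        memset(warned, 0, sizeof(warned));
        printf("%d\n", nodeid_warned(7, 3, warned)); /* 0: first warning for node 7 */
        printf("%d\n", nodeid_warned(7, 3, warned)); /* 1: suppressed */
        printf("%d\n", nodeid_warned(9, 3, warned)); /* 0: different node still warns */
        return 0;
}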
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
 void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
 void dlm_scan_timeout(struct dlm_ls *ls);
 void dlm_adjust_timeouts(struct dlm_ls *ls);
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
 static int dlm_scand(void *data)
 {
         struct dlm_ls *ls;
-        int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
         while (!kthread_should_stop()) {
                 ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
                                 ls->ls_scan_time = jiffies;
                                 dlm_scan_rsbs(ls);
                                 dlm_scan_timeout(ls);
+                                dlm_scan_waiters(ls);
                                 dlm_unlock_recovery(ls);
                         } else {
                                 ls->ls_scan_time += HZ;
                         }
-                } else {
-                        schedule_timeout_interruptible(timeout_jiffies);
+                        continue;
                 }
+                schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
         }
         return 0;
 }
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
         wake_up(&send_wq);
 }
 
+/* If a process was killed while waiting for the only plock on a file,
+   locks_remove_posix will not see any lock on the file so it won't
+   send an unlock-close to us to pass on to userspace to clean up the
+   abandoned waiter.  So, we have to insert the unlock-close when the
+   lock call is interrupted. */
+
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+                            struct file *file, struct file_lock *fl)
+{
+        struct plock_op *op;
+
+        op = kzalloc(sizeof(*op), GFP_NOFS);
+        if (!op)
+                return;
+
+        op->info.optype = DLM_PLOCK_OP_UNLOCK;
+        op->info.pid = fl->fl_pid;
+        op->info.fsid = ls->ls_global_id;
+        op->info.number = number;
+        op->info.start = 0;
+        op->info.end = OFFSET_MAX;
+        if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+                op->info.owner = (__u64) fl->fl_pid;
+        else
+                op->info.owner = (__u64)(long) fl->fl_owner;
+
+        op->info.flags |= DLM_PLOCK_FL_CLOSE;
+        send_op(op);
+}
+
 int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
                    int cmd, struct file_lock *fl)
 {
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 
         send_op(op);
 
-        if (xop->callback == NULL)
-                wait_event(recv_wq, (op->done != 0));
-        else {
+        if (xop->callback == NULL) {
+                rv = wait_event_killable(recv_wq, (op->done != 0));
+                if (rv == -ERESTARTSYS) {
+                        log_debug(ls, "dlm_posix_lock: wait killed %llx",
+                                  (unsigned long long)number);
+                        spin_lock(&ops_lock);
+                        list_del(&op->list);
+                        spin_unlock(&ops_lock);
+                        kfree(xop);
+                        do_unlock_close(ls, number, file, fl);
+                        goto out;
+                }
+        } else {
                 rv = FILE_LOCK_DEFERRED;
                 goto out;
         }
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
         else
                 op->info.owner = (__u64)(long) fl->fl_owner;
 
+        if (fl->fl_flags & FL_CLOSE) {
+                op->info.flags |= DLM_PLOCK_FL_CLOSE;
+                send_op(op);
+                rv = 0;
+                goto out;
+        }
+
         send_op(op);
         wait_event(recv_wq, (op->done != 0));
 
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
         spin_lock(&ops_lock);
         if (!list_empty(&send_list)) {
                 op = list_entry(send_list.next, struct plock_op, list);
-                list_move(&op->list, &recv_list);
+                if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+                        list_del(&op->list);
+                else
+                        list_move(&op->list, &recv_list);
                 memcpy(&info, &op->info, sizeof(info));
         }
         spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
         if (!op)
                 return -EAGAIN;
 
+        /* there is no need to get a reply from userspace for unlocks
+           that were generated by the vfs cleaning up for a close
+           (the process did not make an unlock call). */
+
+        if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+                kfree(op);
+
         if (copy_to_user(u, &info, sizeof(info)))
                 return -EFAULT;
         return sizeof(info);
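The DLM_PLOCK_FL_CLOSE handling above is fire-and-forget: dev_read() still hands the op to the userspace daemon, but unlinks and frees it immediately, since no reply will ever come back for a vfs-generated unlock-on-close. A toy user-space model of that queue handling (our own simplified types, not the kernel's list_head machinery):

#include <stdio.h>
#include <stdlib.h>

struct op {
        int close;              /* stands in for DLM_PLOCK_FL_CLOSE */
        struct op *next;
};

static struct op *send_list, *recv_list;

static void dev_read_one(void)
{
        struct op *op = send_list;

        if (!op)
                return;
        send_list = op->next;

        if (op->close) {
                free(op);       /* close op: no reply will ever arrive */
                return;
        }
        op->next = recv_list;   /* ordinary op: park it to await a reply */
        recv_list = op;
}

int main(void)
{
        struct op *a = calloc(1, sizeof(*a));
        struct op *b = calloc(1, sizeof(*b));

        b->close = 1;
        a->next = b;
        send_list = a;

        dev_read_one();         /* a moves to recv_list */
        dev_read_one();         /* b is freed immediately */
        printf("recv_list holds close=%d op\n", recv_list->close); /* 0 */
        free(recv_list);
        return 0;
}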
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
  out_sig:
         sigprocmask(SIG_SETMASK, &tmpsig, NULL);
-        recalc_sigpending();
  out_free:
         kfree(kbuf);
         return error;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
         brelse(bh);
 
         if (!sb_set_blocksize(sb, blocksize)) {
-                ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+                ext2_msg(sb, KERN_ERR,
+                        "error: bad blocksize %d", blocksize);
                 goto failed_sbi;
         }
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
         frame->at = entries;
         frame->bh = bh;
         bh = bh2;
+        /*
+         * Mark buffers dirty here so that if do_split() fails we write a
+         * consistent set of buffers to disk.
+         */
+        ext3_journal_dirty_metadata(handle, frame->bh);
+        ext3_journal_dirty_metadata(handle, bh);
         de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-        dx_release (frames);
-        if (!(de))
+        if (!de) {
+                ext3_mark_inode_dirty(handle, dir);
+                dx_release(frames);
                 return retval;
+        }
+        dx_release(frames);
 
         return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
         handle_t *handle;
         struct inode * inode;
         int l, err, retries = 0;
+        int credits;
 
         l = strlen(symname)+1;
         if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
 
         dquot_initialize(dir);
 
+        if (l > EXT3_N_BLOCKS * 4) {
+                /*
+                 * For non-fast symlinks, we just allocate inode and put it on
+                 * orphan list in the first transaction => we need bitmap,
+                 * group descriptor, sb, inode block, quota blocks.
+                 */
+                credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+        } else {
+                /*
+                 * Fast symlink. We have to add entry to directory
+                 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+                 * allocate new inode (bitmap, group descriptor, inode block,
+                 * quota blocks, sb is already counted in previous macros).
+                 */
+                credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                          EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+                          EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+        }
 retry:
-        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+        handle = ext3_journal_start(dir, credits);
         if (IS_ERR(handle))
                 return PTR_ERR(handle);
 
@@ -2211,21 +2237,45 @@ retry:
         if (IS_ERR(inode))
                 goto out_stop;
 
-        if (l > sizeof (EXT3_I(inode)->i_data)) {
+        if (l > EXT3_N_BLOCKS * 4) {
                 inode->i_op = &ext3_symlink_inode_operations;
                 ext3_set_aops(inode);
                 /*
-                 * page_symlink() calls into ext3_prepare/commit_write.
-                 * We have a transaction open.  All is sweetness.  It also sets
-                 * i_size in generic_commit_write().
+                 * We cannot call page_symlink() with transaction started
+                 * because it calls into ext3_write_begin() which acquires page
+                 * lock which ranks below transaction start (and it can also
+                 * wait for journal commit if we are running out of space). So
+                 * we have to stop transaction now and restart it when symlink
+                 * contents is written.
+                 *
+                 * To keep fs consistent in case of crash, we have to put inode
+                 * to orphan list in the mean time.
                  */
+                drop_nlink(inode);
+                err = ext3_orphan_add(handle, inode);
+                ext3_journal_stop(handle);
+                if (err)
+                        goto err_drop_inode;
                 err = __page_symlink(inode, symname, l, 1);
+                if (err)
+                        goto err_drop_inode;
+                /*
+                 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+                 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+                 */
+                handle = ext3_journal_start(dir,
+                                EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                                EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+                if (IS_ERR(handle)) {
+                        err = PTR_ERR(handle);
+                        goto err_drop_inode;
+                }
+                inc_nlink(inode);
+                err = ext3_orphan_del(handle, inode);
                 if (err) {
+                        ext3_journal_stop(handle);
                         drop_nlink(inode);
-                        unlock_new_inode(inode);
-                        ext3_mark_inode_dirty(handle, inode);
-                        iput (inode);
-                        goto out_stop;
+                        goto err_drop_inode;
                 }
         } else {
                 inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
         if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                 goto retry;
         return err;
+err_drop_inode:
+        unlock_new_inode(inode);
+        iput(inode);
+        return err;
 }
 
 static int ext3_link (struct dentry * old_dentry,
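One detail worth noting in the symlink hunk: the new `l > EXT3_N_BLOCKS * 4` test is the same bound as the old `l > sizeof(EXT3_I(inode)->i_data)`, because the in-inode data area is an array of EXT3_N_BLOCKS (15) 32-bit block pointers. A stand-alone compile-time check of that equivalence (the struct below is our stand-in for the real ext3_inode_info member, not the kernel definition):

#include <stdint.h>

#define EXT3_N_BLOCKS 15        /* 12 direct + indirect + double + triple */

struct fake_ext3_inode_info {
        uint32_t i_data[EXT3_N_BLOCKS]; /* stand-in for EXT3_I(inode)->i_data */
};

_Static_assert(sizeof(((struct fake_ext3_inode_info *)0)->i_data) ==
               EXT3_N_BLOCKS * 4,
               "fast-symlink bound matches sizeof(i_data)");

int main(void)
{
        return 0;
}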
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
          * all outstanding updates to complete.
          */
 
-#ifdef COMMIT_STATS
-        spin_lock(&journal->j_list_lock);
-        summarise_journal_usage(journal);
-        spin_unlock(&journal->j_list_lock);
-#endif
-
         /* Do we need to erase the effects of a prior journal_flush? */
         if (journal->j_flags & JFS_FLUSHED) {
                 jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
                    required. */
                 JBUFFER_TRACE(jh, "file as BJ_Forget");
                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
-                /* Wake up any transactions which were waiting for this
-                   IO to complete */
+                /*
+                 * Wake up any transactions which were waiting for this
+                 * IO to complete. The barrier must be here so that changes
+                 * by journal_file_buffer() take effect before wake_up_bit()
+                 * does the waitqueue check.
+                 */
+                smp_mb();
                 wake_up_bit(&bh->b_state, BH_Unshadow);
                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
                 __brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
 int __log_start_commit(journal_t *journal, tid_t target)
 {
         /*
-         * Are we already doing a recent enough commit?
+         * The only transaction we can possibly wait upon is the
+         * currently running transaction (if it exists).  Otherwise,
+         * the target tid must be an old one.
          */
-        if (!tid_geq(journal->j_commit_request, target)) {
+        if (journal->j_running_transaction &&
+            journal->j_running_transaction->t_tid == target) {
                 /*
                  * We want a new commit: OK, mark the request and wakeup the
                  * commit thread.  We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
                           journal->j_commit_sequence);
                 wake_up(&journal->j_wait_commit);
                 return 1;
-        }
+        } else if (!tid_geq(journal->j_commit_request, target))
+                /* This should never happen, but if it does, preserve
+                   the evidence before kjournald goes into a loop and
+                   increments j_commit_sequence beyond all recognition. */
+                WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+                    journal->j_commit_request, journal->j_commit_sequence,
+                    target, journal->j_running_transaction ?
+                    journal->j_running_transaction->t_tid : 0);
         return 0;
 }
 
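The WARN_ONCE branch leans on jbd's wrap-safe tid comparison: transaction ids come from a free-running 32-bit counter, so ordering is decided in signed-difference space rather than by plain `<`. For reference, the helpers as defined in include/linux/jbd.h, plus a small demonstration of the wrap behaviour:

typedef unsigned int tid_t;     /* unique transaction identifier */

static inline int tid_gt(tid_t x, tid_t y)
{
        int difference = (x - y);
        return (difference > 0);
}

static inline int tid_geq(tid_t x, tid_t y)
{
        int difference = (x - y);
        return (difference >= 0);
}

int main(void)
{
        /* wrap-safe: tid 5 is "after" 0xfffffffb even though it is
           numerically smaller */
        return tid_geq(5u, 0xfffffffbu) ? 0 : 1;
}

The comparison stays correct across the 2^32 wrap as long as the two tids being compared are within 2^31 of each other.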
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *journal_start(journal_t *journal, int nblocks)
 {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..29148a81c783 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
          * all outstanding updates to complete.
          */
 
-#ifdef COMMIT_STATS
-        spin_lock(&journal->j_list_lock);
-        summarise_journal_usage(journal);
-        spin_unlock(&journal->j_list_lock);
-#endif
-
         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
         if (journal->j_flags & JBD2_FLUSHED) {
                 jbd_debug(3, "super block updated\n");
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
         namei.o \
         refcounttree.o \
         reservations.o \
+        move_extents.o \
         resize.o \
         slot_map.o \
         suballoc.o \
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 312a28f433a4..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
 #include "ioctl.h"
 #include "resize.h"
 #include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
 
 #include <linux/ext2_fs.h>
 
@@ -35,31 +40,27 @@
  * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
  * just a best-effort to tell userspace that this request caused the error.
  */
-static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
                                             struct ocfs2_info_request __user *req)
 {
         kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
         (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
 }
 
-#define o2info_set_request_error(a, b) \
-                __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
-
-static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
 {
         req->ir_flags |= OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_set_request_filled(a) \
-                __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
-
-static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
 {
         req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_clear_request_filled(a) \
-                __o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+        return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
 
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
 
         oib.ib_blocksize = inode->i_sb->s_blocksize;
 
-        o2info_set_request_filled(oib);
+        o2info_set_request_filled(&oib.ib_req);
 
         if (o2info_to_user(oib, req))
                 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oib, req);
+                o2info_set_request_error(&oib.ib_req, req);
 
         return status;
 }
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
 
         oic.ic_clustersize = osb->s_clustersize;
 
-        o2info_set_request_filled(oic);
+        o2info_set_request_filled(&oic.ic_req);
 
         if (o2info_to_user(oic, req))
                 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oic, req);
+                o2info_set_request_error(&oic.ic_req, req);
 
         return status;
 }
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
 
         oim.im_max_slots = osb->max_slots;
 
-        o2info_set_request_filled(oim);
+        o2info_set_request_filled(&oim.im_req);
 
         if (o2info_to_user(oim, req))
                 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oim, req);
+                o2info_set_request_error(&oim.im_req, req);
 
         return status;
 }
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
 
         memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
-        o2info_set_request_filled(oil);
+        o2info_set_request_filled(&oil.il_req);
 
         if (o2info_to_user(oil, req))
                 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oil, req);
+                o2info_set_request_error(&oil.il_req, req);
 
         return status;
 }
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
 
         memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
-        o2info_set_request_filled(oiu);
+        o2info_set_request_filled(&oiu.iu_req);
 
         if (o2info_to_user(oiu, req))
                 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oiu, req);
+                o2info_set_request_error(&oiu.iu_req, req);
 
         return status;
 }
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
         oif.if_incompat_features = osb->s_feature_incompat;
         oif.if_ro_compat_features = osb->s_feature_ro_compat;
 
-        o2info_set_request_filled(oif);
+        o2info_set_request_filled(&oif.if_req);
 
         if (o2info_to_user(oif, req))
                 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
         status = 0;
 bail:
         if (status)
-                o2info_set_request_error(oif, req);
+                o2info_set_request_error(&oif.if_req, req);
 
         return status;
 }
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
 
         oij.ij_journal_size = osb->journal->j_inode->i_size;
 
-        o2info_set_request_filled(oij);
+        o2info_set_request_filled(&oij.ij_req);
 
         if (o2info_to_user(oij, req))
                 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
450
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
526 * last chunk may be not an entire one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
538 * - chunk_free counts free clusters in #N chunk.
539 * - last_chunksize records the size(in) clusters
540 * for the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
561 * need to update the info for last free chunk.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
608 * Chunksize(in) clusters from userspace should be
609 * less than clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
675 * chunksize from userspace should be power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
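The chunksize validation in ocfs2_info_handle_freefrag() above relies on the classic bit trick: x is a power of two iff x is non-zero and (x & (x - 1)) == 0. A standalone sketch of that check, with a hypothetical chunksize_is_valid() helper that is not part of this patch:

#include <stdio.h>

/* Mirrors the kernel-side check above: a chunksize is accepted
 * iff it is a non-zero power of two. */
static int chunksize_is_valid(unsigned int chunksize)
{
	return chunksize && !(chunksize & (chunksize - 1));
}

int main(void)
{
	unsigned int sizes[] = { 0, 1, 4, 6, 256 };
	unsigned int i;

	/* prints: invalid, valid, valid, invalid, valid */
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%u -> %s\n", sizes[i],
		       chunksize_is_valid(sizes[i]) ? "valid" : "invalid");
	return 0;
}
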
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
@@ -565,6 +975,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
565 975
566 return 0; 976 return 0;
567 } 977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
568 default: 980 default:
569 return -ENOTTY; 981 return -ENOTTY;
570 } 982 }
@@ -608,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
608 return -EFAULT; 1020 return -EFAULT;
609 1021
610 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
611 default: 1025 default:
612 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
613 } 1027 }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
115	 * After moving/defragging to the new location, the extent is
116	 * no longer going to be refcounted.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
141	 * Do we need to append to the truncate log for the old clusters?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
160 * Lock the allocators and reserve the appropriate number of bits for
161 * metadata blocks and data clusters.
162 *
163 * In some cases we don't need to reserve clusters; just pass data_ac
164 * as NULL.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
219
220/*
221 * Use one journal handle to guarantee data consistency in case a
222 * crash happens anywhere.
223 *
224 * XXX: defrag can end up finishing only part of the requested extent,
225 * when not enough contiguous clusters can be found in the allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
275	 * Should we be using the allocation reservation strategy here?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
306	 * Allowing partial extent moving is a trade-off: it makes the whole
307	 * defragmentation less likely to fail; on the other hand, it may leave
308	 * the fs even more fragmented after moving. Let userspace make the
309	 * decision here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
363 * Find the victim alloc group, where 'vict_blkno' fits.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, blocks_per_unit = 1;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 blocks_per_unit <<= (osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits);
403 /*
404	 * Check that 'vict_blkno' is within the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
408 blocks_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) /
445 blocks_per_unit;
446				mlog(0, "found the victim group: #%llu, "
447				     "total_bits: %u, vict_bit: %u\n",
448				     (unsigned long long)blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
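The bit arithmetic above can be exercised in isolation: for the global bitmap, one bitmap bit covers one cluster, i.e. 1 << (clustersize_bits - blocksize_bits) blocks, and the victim bit is the block offset into the group divided by that unit. A standalone sketch with made-up sample values (4096-byte clusters, 512-byte blocks; not taken from a real volume):

#include <stdio.h>

int main(void)
{
	unsigned long long group_blkno = 1024;	/* group start, assumed */
	unsigned long long vict_blkno = 1044;	/* victim block, assumed */
	int clustersize_bits = 12, blocksize_bits = 9;
	int blocks_per_unit = 1 << (clustersize_bits - blocksize_bits);
	int vict_bit = (int)((vict_blkno - group_blkno) / blocks_per_unit);

	/* 8 blocks per cluster; 20 blocks into the group -> bit 2 */
	printf("blocks_per_unit=%d vict_bit=%d\n", blocks_per_unit, vict_bit);
	return 0;
}
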
465
466/*
467 * XXX: helper to validate and adjust the moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
480 /*
481	 * Validate that the goal sits within the global_bitmap, and return
482	 * the victim group desc.
483 */
484 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
485 GLOBAL_BITMAP_SYSTEM_INODE,
486 OCFS2_INVALID_SLOT,
487 &goal_bit, &gd_bh);
488 if (ret)
489 goto out;
490
491 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
492
493 /*
494	 * Make the goal cluster-aligned.
495 */
496 if (range->me_goal % c_to_b)
497 range->me_goal = range->me_goal / c_to_b * c_to_b;
498
499 /*
500	 * The moving goal is not allowed to start at a group desc block (#0 blk);
501	 * let's compromise to the next cluster.
502 */
503 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
504 range->me_goal += c_to_b;
505
506 /*
507	 * The movement may not cross two groups.
508 */
509 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
510 range->me_len) {
511 ret = -EINVAL;
512 goto out;
513 }
514 /*
515	 * More exact validations/adjustments will be performed later during
516	 * the moving operation for each extent range.
517 */
518	mlog(0, "extents are ready to be moved to block #%llu\n",
519	     (unsigned long long)range->me_goal);
520
521out:
522 brelse(gd_bh);
523
524 return ret;
525}
526
527static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
528 int *goal_bit, u32 move_len, u32 max_hop,
529 u32 *phys_cpos)
530{
531 int i, used, last_free_bits = 0, base_bit = *goal_bit;
532 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
533 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
534 le64_to_cpu(gd->bg_blkno));
535
536 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
537
538 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
539 if (used) {
540 /*
541			 * We even tried searching for the free chunk by jumping
542 * a 'max_hop' distance, but still failed.
543 */
544 if ((i - base_bit) > max_hop) {
545 *phys_cpos = 0;
546 break;
547 }
548
549 if (last_free_bits)
550 last_free_bits = 0;
551
552 continue;
553 } else
554 last_free_bits++;
555
556 if (last_free_bits == move_len) {
557 *goal_bit = i;
558 *phys_cpos = base_cpos + i;
559 break;
560 }
561 }
562
563	mlog(0, "found phys_cpos: %u to fit the wanted move.\n", *phys_cpos);
564}
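A standalone sketch of the probing loop above: scan from base_bit looking for move_len consecutive free bits, giving up once the distance from base_bit exceeds max_hop. The probe() helper and the sample bitmap are made up for illustration; like the helper above, it records the bit at which the free run completes:

#include <stdio.h>

static int probe(const unsigned char *used, int nbits, int base_bit,
		 int move_len, int max_hop)
{
	int i, last_free = 0;

	for (i = base_bit; i < nbits; i++) {
		if (used[i]) {
			if (i - base_bit > max_hop)
				return -1;	/* hopped too far from the goal */
			last_free = 0;
			continue;
		}
		if (++last_free == move_len)
			return i;	/* bit where the free run completes */
	}
	return -1;
}

int main(void)
{
	unsigned char used[16] = { 1, 1, 0, 1, 0, 0, 0, 1,
				   0, 0, 0, 0, 1, 1, 0, 0 };

	/* the 4-bit free run spanning bits 8..11 completes at bit 11 */
	printf("bit=%d\n", probe(used, 16, 2, 4, 8));
	return 0;
}
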
565
566static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
567 handle_t *handle,
568 struct buffer_head *di_bh,
569 u32 num_bits,
570 u16 chain)
571{
572 int ret;
573 u32 tmp_used;
574 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
575 struct ocfs2_chain_list *cl =
576 (struct ocfs2_chain_list *) &di->id2.i_chain;
577
578 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
579 OCFS2_JOURNAL_ACCESS_WRITE);
580 if (ret < 0) {
581 mlog_errno(ret);
582 goto out;
583 }
584
585 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
586 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
587 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
588 ocfs2_journal_dirty(handle, di_bh);
589
590out:
591 return ret;
592}
593
594static inline int ocfs2_block_group_set_bits(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_group_desc *bg,
597 struct buffer_head *group_bh,
598 unsigned int bit_off,
599 unsigned int num_bits)
600{
601 int status;
602 void *bitmap = bg->bg_bitmap;
603 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
604
605 /* All callers get the descriptor via
606 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
607 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
608 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
609
610 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
611 num_bits);
612
613 if (ocfs2_is_cluster_bitmap(alloc_inode))
614 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
615
616 status = ocfs2_journal_access_gd(handle,
617 INODE_CACHE(alloc_inode),
618 group_bh,
619 journal_type);
620 if (status < 0) {
621 mlog_errno(status);
622 goto bail;
623 }
624
625 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
626 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
627 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
628 " count %u but claims %u are freed. num_bits %d",
629 (unsigned long long)le64_to_cpu(bg->bg_blkno),
630 le16_to_cpu(bg->bg_bits),
631 le16_to_cpu(bg->bg_free_bits_count), num_bits);
632 return -EROFS;
633 }
634 while (num_bits--)
635 ocfs2_set_bit(bit_off++, bitmap);
636
637 ocfs2_journal_dirty(handle, group_bh);
638
639bail:
640 return status;
641}
642
643static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
644 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
645 u32 len, int ext_flags)
646{
647 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
648 handle_t *handle;
649 struct inode *inode = context->inode;
650 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
651 struct inode *tl_inode = osb->osb_tl_inode;
652 struct inode *gb_inode = NULL;
653 struct buffer_head *gb_bh = NULL;
654 struct buffer_head *gd_bh = NULL;
655 struct ocfs2_group_desc *gd;
656 struct ocfs2_refcount_tree *ref_tree = NULL;
657 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
658 context->range->me_threshold);
659 u64 phys_blkno, new_phys_blkno;
660
661 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
662
663 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
664
665 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
666 OCFS2_HAS_REFCOUNT_FL));
667
668 BUG_ON(!context->refcount_loc);
669
670 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
671 &ref_tree, NULL);
672 if (ret) {
673 mlog_errno(ret);
674 return ret;
675 }
676
677 ret = ocfs2_prepare_refcount_change_for_del(inode,
678 context->refcount_loc,
679 phys_blkno,
680 len,
681 &credits,
682 &extra_blocks);
683 if (ret) {
684 mlog_errno(ret);
685 goto out;
686 }
687 }
688
689 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
690 &context->meta_ac,
691 NULL, extra_blocks, &credits);
692 if (ret) {
693 mlog_errno(ret);
694 goto out;
695 }
696
697 /*
698 * need to count 2 extra credits for global_bitmap inode and
699 * group descriptor.
700 */
701 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
702
703 /*
704	 * ocfs2_move_extent() didn't reserve any clusters via the
705	 * lock_allocators() logic, but we still need to lock the global_bitmap.
706 */
707 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
708 OCFS2_INVALID_SLOT);
709 if (!gb_inode) {
710 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
711 ret = -EIO;
712 goto out;
713 }
714
715 mutex_lock(&gb_inode->i_mutex);
716
717 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
718 if (ret) {
719 mlog_errno(ret);
720 goto out_unlock_gb_mutex;
721 }
722
723 mutex_lock(&tl_inode->i_mutex);
724
725 handle = ocfs2_start_trans(osb, credits);
726 if (IS_ERR(handle)) {
727 ret = PTR_ERR(handle);
728 mlog_errno(ret);
729 goto out_unlock_tl_inode;
730 }
731
732 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
733 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
734 GLOBAL_BITMAP_SYSTEM_INODE,
735 OCFS2_INVALID_SLOT,
736 &goal_bit, &gd_bh);
737 if (ret) {
738 mlog_errno(ret);
739 goto out_commit;
740 }
741
742 /*
743	 * Probe the victim cluster group to find a proper
744	 * region to fit the wanted movement; it will even perform
745 * a best-effort attempt by compromising to a threshold
746 * around the goal.
747 */
748 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
749 new_phys_cpos);
750	if (!*new_phys_cpos) {
751 ret = -ENOSPC;
752 goto out_commit;
753 }
754
755 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
756 *new_phys_cpos, ext_flags);
757 if (ret) {
758 mlog_errno(ret);
759 goto out_commit;
760 }
761
762 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
763 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
764 le16_to_cpu(gd->bg_chain));
765 if (ret) {
766 mlog_errno(ret);
767 goto out_commit;
768 }
769
770 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
771 goal_bit, len);
772 if (ret)
773 mlog_errno(ret);
774
775 /*
776 * Here we should write the new page out first if we are
777 * in write-back mode.
778 */
779 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
780 if (ret)
781 mlog_errno(ret);
782
783out_commit:
784 ocfs2_commit_trans(osb, handle);
785 brelse(gd_bh);
786
787out_unlock_tl_inode:
788 mutex_unlock(&tl_inode->i_mutex);
789
790 ocfs2_inode_unlock(gb_inode, 1);
791out_unlock_gb_mutex:
792 mutex_unlock(&gb_inode->i_mutex);
793 brelse(gb_bh);
794 iput(gb_inode);
795
796out:
797 if (context->meta_ac) {
798 ocfs2_free_alloc_context(context->meta_ac);
799 context->meta_ac = NULL;
800 }
801
802 if (ref_tree)
803 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
804
805 return ret;
806}
807
808/*
809 * Helper to calculate the defrag length in one run according to the threshold.
810 */
811static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
812 u32 threshold, int *skip)
813{
814 if ((*alloc_size + *len_defraged) < threshold) {
815 /*
816		 * Proceed with defragmentation until we meet the threshold.
817 */
818 *len_defraged += *alloc_size;
819 } else if (*len_defraged == 0) {
820 /*
821 * XXX: skip a large extent.
822 */
823 *skip = 1;
824 } else {
825 /*
826		 * Split this extent to coalesce with the former pieces so
827		 * as to reach the threshold.
828		 *
829		 * We're done here with one cycle of defragmentation of
830		 * size 'thresh'; resetting 'len_defraged' forces a new
831		 * defragmentation cycle.
832 */
833 *alloc_size = threshold - *len_defraged;
834 *len_defraged = 0;
835 }
836}
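A standalone sketch of the three-way threshold logic above, fed with made-up extent lengths and a threshold of 8 clusters: short extents accumulate, a lone extent at or over the threshold is skipped, and an extent that would overshoot is trimmed so one defrag cycle ends exactly at the threshold. The calc_defrag_len() copy is for illustration only:

#include <stdio.h>

static void calc_defrag_len(unsigned int *alloc_size, unsigned int *len_defraged,
			    unsigned int threshold, int *skip)
{
	if (*alloc_size + *len_defraged < threshold)
		*len_defraged += *alloc_size;		/* keep accumulating */
	else if (*len_defraged == 0)
		*skip = 1;				/* lone large extent */
	else {
		*alloc_size = threshold - *len_defraged;	/* trim to fit */
		*len_defraged = 0;			/* start a new cycle */
	}
}

int main(void)
{
	unsigned int extents[] = { 3, 2, 5, 10 }, len_defraged = 0;
	int i, skip;

	for (i = 0; i < 4; i++) {
		unsigned int alloc_size = extents[i];

		skip = 0;
		calc_defrag_len(&alloc_size, &len_defraged, 8, &skip);
		/* accumulates 3 then 5, trims 5 down to 3, then skips 10 */
		printf("in=%u out=%u defraged=%u skip=%d\n",
		       extents[i], alloc_size, len_defraged, skip);
	}
	return 0;
}
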
837
838static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
839 struct ocfs2_move_extents_context *context)
840{
841 int ret = 0, flags, do_defrag, skip = 0;
842 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
843 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
844
845 struct inode *inode = context->inode;
846 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
847 struct ocfs2_move_extents *range = context->range;
848 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
849
850 if ((inode->i_size == 0) || (range->me_len == 0))
851 return 0;
852
853 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
854 return 0;
855
856 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
857
858 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
859 ocfs2_init_dealloc_ctxt(&context->dealloc);
860
861 /*
862 * TO-DO XXX:
863 *
864 * - xattr extents.
865 */
866
867 do_defrag = context->auto_defrag;
868
869 /*
870	 * Extent moving happens in units of clusters; for the sake
871	 * of simplicity, we may ignore the two clusters that contain
872	 * 'byte_start' and 'byte_start + len'.
873 */
874 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
875 len_to_move = (range->me_start + range->me_len) >>
876 osb->s_clustersize_bits;
877 if (len_to_move >= move_start)
878 len_to_move -= move_start;
879 else
880 len_to_move = 0;
881
882 if (do_defrag) {
883 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
884 if (defrag_thresh <= 1)
885 goto done;
886 } else
887 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
888 range->me_goal);
889
890 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
891 "thresh: %u\n",
892 (unsigned long long)OCFS2_I(inode)->ip_blkno,
893 (unsigned long long)range->me_start,
894 (unsigned long long)range->me_len,
895 move_start, len_to_move, defrag_thresh);
896
897 cpos = move_start;
898 while (len_to_move) {
899 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
900 &flags);
901 if (ret) {
902 mlog_errno(ret);
903 goto out;
904 }
905
906 if (alloc_size > len_to_move)
907 alloc_size = len_to_move;
908
909 /*
910 * XXX: how to deal with a hole:
911 *
912 * - skip the hole of course
913 * - force a new defragmentation
914 */
915 if (!phys_cpos) {
916 if (do_defrag)
917 len_defraged = 0;
918
919 goto next;
920 }
921
922 if (do_defrag) {
923 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
924 defrag_thresh, &skip);
925 /*
926 * skip large extents
927 */
928 if (skip) {
929 skip = 0;
930 goto next;
931 }
932
933 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
934 "alloc_size: %u, len_defraged: %u\n",
935 cpos, phys_cpos, alloc_size, len_defraged);
936
937 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
938 &alloc_size, flags);
939 } else {
940 ret = ocfs2_move_extent(context, cpos, phys_cpos,
941 &new_phys_cpos, alloc_size,
942 flags);
943
944 new_phys_cpos += alloc_size;
945 }
946
947 if (ret < 0) {
948 mlog_errno(ret);
949 goto out;
950 }
951
952 context->clusters_moved += alloc_size;
953next:
954 cpos += alloc_size;
955 len_to_move -= alloc_size;
956 }
957
958done:
959 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
960
961out:
962 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
963 context->clusters_moved);
964 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
965 context->new_phys_cpos);
966
967 ocfs2_schedule_truncate_log_flush(osb, 1);
968 ocfs2_run_deallocs(osb, &context->dealloc);
969
970 return ret;
971}
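A standalone sketch of the cluster-range arithmetic above, with a hypothetical 4096-byte cluster size: the start rounds up and the end rounds down, so the two partially covered clusters at the edges of [me_start, me_start + me_len) are ignored:

#include <stdio.h>

int main(void)
{
	unsigned long long me_start = 6000, me_len = 20000;	/* bytes, made up */
	int bits = 12;				/* 4096-byte clusters */
	unsigned long long csize = 1ULL << bits;
	unsigned int move_start = (unsigned int)((me_start + csize - 1) >> bits);
	unsigned int end = (unsigned int)((me_start + me_len) >> bits);
	unsigned int len_to_move = end >= move_start ? end - move_start : 0;

	/* prints: move_start=2 len_to_move=4 */
	printf("move_start=%u len_to_move=%u\n", move_start, len_to_move);
	return 0;
}
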
972
973static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
974{
975 int status;
976 handle_t *handle;
977 struct inode *inode = context->inode;
978 struct ocfs2_dinode *di;
979 struct buffer_head *di_bh = NULL;
980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
981
982 if (!inode)
983 return -ENOENT;
984
985 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
986 return -EROFS;
987
988 mutex_lock(&inode->i_mutex);
989
990 /*
991 * This prevents concurrent writes from other nodes
992 */
993 status = ocfs2_rw_lock(inode, 1);
994 if (status) {
995 mlog_errno(status);
996 goto out;
997 }
998
999 status = ocfs2_inode_lock(inode, &di_bh, 1);
1000 if (status) {
1001 mlog_errno(status);
1002 goto out_rw_unlock;
1003 }
1004
1005 /*
1006	 * Remember that ip_xattr_sem also needs to be held if necessary.
1007 */
1008 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1009
1010 status = __ocfs2_move_extents_range(di_bh, context);
1011
1012 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1013 if (status) {
1014 mlog_errno(status);
1015 goto out_inode_unlock;
1016 }
1017
1018 /*
1019 * We update ctime for these changes
1020 */
1021 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1022 if (IS_ERR(handle)) {
1023 status = PTR_ERR(handle);
1024 mlog_errno(status);
1025 goto out_inode_unlock;
1026 }
1027
1028 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1029 OCFS2_JOURNAL_ACCESS_WRITE);
1030 if (status) {
1031 mlog_errno(status);
1032 goto out_commit;
1033 }
1034
1035 di = (struct ocfs2_dinode *)di_bh->b_data;
1036 inode->i_ctime = CURRENT_TIME;
1037 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1038 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1039
1040 ocfs2_journal_dirty(handle, di_bh);
1041
1042out_commit:
1043 ocfs2_commit_trans(osb, handle);
1044
1045out_inode_unlock:
1046 brelse(di_bh);
1047 ocfs2_inode_unlock(inode, 1);
1048out_rw_unlock:
1049 ocfs2_rw_unlock(inode, 1);
1050out:
1051 mutex_unlock(&inode->i_mutex);
1052
1053 return status;
1054}
1055
1056int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1057{
1058 int status;
1059
1060 struct inode *inode = filp->f_path.dentry->d_inode;
1061 struct ocfs2_move_extents range;
1062 struct ocfs2_move_extents_context *context = NULL;
1063
1064 status = mnt_want_write(filp->f_path.mnt);
1065 if (status)
1066 return status;
1067
1068 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1069 goto out;
1070
1071 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1072 status = -EPERM;
1073 goto out;
1074 }
1075
1076 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1077 if (!context) {
1078 status = -ENOMEM;
1079 mlog_errno(status);
1080 goto out;
1081 }
1082
1083 context->inode = inode;
1084 context->file = filp;
1085
1086 if (argp) {
1087 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1088 sizeof(range))) {
1089 status = -EFAULT;
1090 goto out;
1091 }
1092 } else {
1093 status = -EINVAL;
1094 goto out;
1095 }
1096
1097 if (range.me_start > i_size_read(inode))
1098 goto out;
1099
1100 if (range.me_start + range.me_len > i_size_read(inode))
1101 range.me_len = i_size_read(inode) - range.me_start;
1102
1103 context->range = &range;
1104
1105 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1106 context->auto_defrag = 1;
1107 /*
1108		 * OK, the default threshold for defragmentation
1109		 * is 1M, since our maximum clustersize is 1M also.
1110		 * Any thoughts?
1111 */
1112 if (!range.me_threshold)
1113 range.me_threshold = 1024 * 1024;
1114
1115 if (range.me_threshold > i_size_read(inode))
1116 range.me_threshold = i_size_read(inode);
1117
1118 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1119 context->partial = 1;
1120 } else {
1121 /*
1122		 * First make a best-effort attempt to validate and adjust the
1123		 * goal (a physical address in blocks), though it can't guarantee
1124		 * that the later operation will always succeed, since the
1125		 * global_bitmap may change a bit over time.
1126 */
1127
1128 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1129 if (status)
1130 goto out;
1131 }
1132
1133 status = ocfs2_move_extents(context);
1134 if (status)
1135 mlog_errno(status);
1136out:
1137 /*
1138	 * Movement/defragmentation may end up being partially completed,
1139	 * which is why we need to return the finished length and
1140	 * new_offset to userspace even if a failure happens somewhere.
1141 */
1142 if (argp) {
1143 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1144 sizeof(range)))
1145 status = -EFAULT;
1146 }
1147
1148 kfree(context);
1149
1150 mnt_drop_write(filp->f_path.mnt);
1151
1152 return status;
1153}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
173	__u32 iff_chunksize;		/* chunksize in clusters (in) */
174 __u32 iff_pad;
175};
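For context, a hedged userspace sketch of querying the new free-space fragmentation statistics through OCFS2_IOC_INFO. The query_freefrag() helper and the include path are made up; the ocfs2_info_request header fields (ir_magic, ir_code) and the ocfs2_info container fields (oi_requests, oi_count) are assumed to follow the existing info-request conventions of this header:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <ocfs2/ocfs2_ioctl.h>	/* assumed install location */

/* Hypothetical helper: fetch the free-space histogram for the volume
 * containing 'path', with 'chunksize' clusters per chunk (must be a
 * non-zero power of two, per the kernel-side check). */
static int query_freefrag(const char *path, __u32 chunksize)
{
	struct ocfs2_info_freefrag ff;
	struct ocfs2_info info;
	__u64 reqs[1];
	int fd, ret;

	memset(&ff, 0, sizeof(ff));
	ff.iff_req.ir_magic = OCFS2_INFO_MAGIC;	/* assumed header fields */
	ff.iff_req.ir_code = OCFS2_INFO_FREEFRAG;
	ff.iff_req.ir_size = sizeof(ff);
	ff.iff_chunksize = chunksize;

	reqs[0] = (unsigned long)&ff;
	memset(&info, 0, sizeof(info));
	info.oi_requests = (unsigned long)reqs;	/* assumed field names */
	info.oi_count = 1;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	ret = ioctl(fd, OCFS2_IOC_INFO, &info);
	close(fd);

	if (!ret)
		printf("free clusters: %u, whole free chunks: %u\n",
		       ff.iff_ffs.ffs_free_clusters,
		       ff.iff_ffs.ffs_free_chunks);
	return ret;
}
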
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
208struct ocfs2_move_extents {
209/* All values are in bytes */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
213 __u64 me_goal; /* Physical offset of the goal,
214				   it's in block units */
215 __u64 me_threshold; /* Maximum distance from goal or threshold
216 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation:
218 * - auto defragmentation.
219				 * - refcount, xattr cases.
220 */
221 /* out */
222 __u64 me_moved_len; /* Moved/defraged length */
223 __u64 me_new_offset; /* Resulting physical location */
224	__u32 me_reserved[2];	/* Reserved for future use */
225};
226
227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG	(0x00000001)	/* Kernel claims new
228							   clusters as the goal
229							   location for extent
230							   moving */
231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG	(0x00000002)	/* Allow partial extent
232							   moving; this makes
233							   movement less likely
234							   to fail, but may leave
235							   the fs more fragmented */
236#define OCFS2_MOVE_EXT_FL_COMPLETE	(0x00000004)	/* Move or defragmentation
237							   completed fully.
238 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
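A companion userspace sketch of driving the new OCFS2_IOC_MOVE_EXT ioctl above in auto-defrag mode; the defrag_file() helper, include path, and sample sizes are made up for illustration:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <ocfs2/ocfs2_ioctl.h>	/* assumed install location */

static int defrag_file(const char *path, __u64 len)
{
	struct ocfs2_move_extents me;
	int fd, ret;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&me, 0, sizeof(me));
	me.me_start = 0;		/* from the start of the file */
	me.me_len = len;		/* bytes to defragment */
	me.me_threshold = 1024 * 1024;	/* 1M chunks, per the kernel default */
	me.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
		      OCFS2_MOVE_EXT_FL_PART_DEFRAG;

	ret = ioctl(fd, OCFS2_IOC_MOVE_EXT, &me);
	close(fd);

	/* even on failure, me_moved_len reports the completed part */
	printf("moved %llu bytes, %s\n",
	       (unsigned long long)me.me_moved_len,
	       (me.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE) ?
	       "complete" : "partial");
	return ret;
}
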
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
106 long long liab; 106 long long liab;
107 107
108 spin_lock(&c->space_lock); 108 spin_lock(&c->space_lock);
109 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; 109 liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
110 spin_unlock(&c->space_lock); 110 spin_unlock(&c->space_lock);
111 return liab; 111 return liab;
112} 112}
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
180 int idx_lebs; 180 int idx_lebs;
181 long long idx_size; 181 long long idx_size;
182 182
183 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 183 idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
184 /* And make sure we have thrice the index size of space reserved */ 184 /* And make sure we have thrice the index size of space reserved */
185 idx_size += idx_size << 1; 185 idx_size += idx_size << 1;
186 /* 186 /*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
292 * budgeted index space to the size of the current index, multiplies this by 3, 292 * budgeted index space to the size of the current index, multiplies this by 3,
293 * and makes sure this does not exceed the amount of free LEBs. 293 * and makes sure this does not exceed the amount of free LEBs.
294 * 294 *
295 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 295 * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
296 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 296 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
297 * be large, because UBIFS does not do any index consolidation as long as 297 * be large, because UBIFS does not do any index consolidation as long as
298 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 298 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
299 * will contain a lot of dirt. 299 * will contain a lot of dirt.
300 * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, 300 * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
301 * the index may be consolidated to take up to @c->min_idx_lebs LEBs. 301 * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
302 * 302 *
303 * This function returns zero in case of success, and %-ENOSPC in case of 303 * This function returns zero in case of success, and %-ENOSPC in case of
304 * failure. 304 * failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
343 c->lst.taken_empty_lebs; 343 c->lst.taken_empty_lebs;
344 if (unlikely(rsvd_idx_lebs > lebs)) { 344 if (unlikely(rsvd_idx_lebs > lebs)) {
345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " 345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
346 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, 346 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
347 rsvd_idx_lebs); 347 rsvd_idx_lebs);
348 return -ENOSPC; 348 return -ENOSPC;
349 } 349 }
350 350
351 available = ubifs_calc_available(c, min_idx_lebs); 351 available = ubifs_calc_available(c, min_idx_lebs);
352 outstanding = c->budg_data_growth + c->budg_dd_growth; 352 outstanding = c->bi.data_growth + c->bi.dd_growth;
353 353
354 if (unlikely(available < outstanding)) { 354 if (unlikely(available < outstanding)) {
355 dbg_budg("out of data space: available %lld, outstanding %lld", 355 dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
360 if (available - outstanding <= c->rp_size && !can_use_rp(c)) 360 if (available - outstanding <= c->rp_size && !can_use_rp(c))
361 return -ENOSPC; 361 return -ENOSPC;
362 362
363 c->min_idx_lebs = min_idx_lebs; 363 c->bi.min_idx_lebs = min_idx_lebs;
364 return 0; 364 return 0;
365} 365}
366 366
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
393{ 393{
394 int data_growth; 394 int data_growth;
395 395
396 data_growth = req->new_ino ? c->inode_budget : 0; 396 data_growth = req->new_ino ? c->bi.inode_budget : 0;
397 if (req->new_page) 397 if (req->new_page)
398 data_growth += c->page_budget; 398 data_growth += c->bi.page_budget;
399 if (req->new_dent) 399 if (req->new_dent)
400 data_growth += c->dent_budget; 400 data_growth += c->bi.dent_budget;
401 data_growth += req->new_ino_d; 401 data_growth += req->new_ino_d;
402 return data_growth; 402 return data_growth;
403} 403}
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
413{ 413{
414 int dd_growth; 414 int dd_growth;
415 415
416 dd_growth = req->dirtied_page ? c->page_budget : 0; 416 dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
417 417
418 if (req->dirtied_ino) 418 if (req->dirtied_ino)
419 dd_growth += c->inode_budget << (req->dirtied_ino - 1); 419 dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
420 if (req->mod_dent) 420 if (req->mod_dent)
421 dd_growth += c->dent_budget; 421 dd_growth += c->bi.dent_budget;
422 dd_growth += req->dirtied_ino_d; 422 dd_growth += req->dirtied_ino_d;
423 return dd_growth; 423 return dd_growth;
424} 424}
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
460 460
461again: 461again:
462 spin_lock(&c->space_lock); 462 spin_lock(&c->space_lock);
463 ubifs_assert(c->budg_idx_growth >= 0); 463 ubifs_assert(c->bi.idx_growth >= 0);
464 ubifs_assert(c->budg_data_growth >= 0); 464 ubifs_assert(c->bi.data_growth >= 0);
465 ubifs_assert(c->budg_dd_growth >= 0); 465 ubifs_assert(c->bi.dd_growth >= 0);
466 466
467 if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { 467 if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
468 dbg_budg("no space"); 468 dbg_budg("no space");
469 spin_unlock(&c->space_lock); 469 spin_unlock(&c->space_lock);
470 return -ENOSPC; 470 return -ENOSPC;
471 } 471 }
472 472
473 c->budg_idx_growth += idx_growth; 473 c->bi.idx_growth += idx_growth;
474 c->budg_data_growth += data_growth; 474 c->bi.data_growth += data_growth;
475 c->budg_dd_growth += dd_growth; 475 c->bi.dd_growth += dd_growth;
476 476
477 err = do_budget_space(c); 477 err = do_budget_space(c);
478 if (likely(!err)) { 478 if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
484 } 484 }
485 485
486 /* Restore the old values */ 486 /* Restore the old values */
487 c->budg_idx_growth -= idx_growth; 487 c->bi.idx_growth -= idx_growth;
488 c->budg_data_growth -= data_growth; 488 c->bi.data_growth -= data_growth;
489 c->budg_dd_growth -= dd_growth; 489 c->bi.dd_growth -= dd_growth;
490 spin_unlock(&c->space_lock); 490 spin_unlock(&c->space_lock);
491 491
492 if (req->fast) { 492 if (req->fast) {
@@ -506,9 +506,9 @@ again:
506 goto again; 506 goto again;
507 } 507 }
508 dbg_budg("FS is full, -ENOSPC"); 508 dbg_budg("FS is full, -ENOSPC");
509 c->nospace = 1; 509 c->bi.nospace = 1;
510 if (can_use_rp(c) || c->rp_size == 0) 510 if (can_use_rp(c) || c->rp_size == 0)
511 c->nospace_rp = 1; 511 c->bi.nospace_rp = 1;
512 smp_wmb(); 512 smp_wmb();
513 } else 513 } else
514 ubifs_err("cannot budget space, error %d", err); 514 ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
523 * This function releases the space budgeted by 'ubifs_budget_space()'. Note, 523 * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
524 * since the index changes (which were budgeted for in @req->idx_growth) will 524 * since the index changes (which were budgeted for in @req->idx_growth) will
525 * only be written to the media on commit, this function moves the index budget 525 * only be written to the media on commit, this function moves the index budget
526 * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be 526 * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
527 * zeroed by the commit operation. 527 * by the commit operation.
528 */ 528 */
529void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) 529void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
530{ 530{
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
553 if (!req->data_growth && !req->dd_growth) 553 if (!req->data_growth && !req->dd_growth)
554 return; 554 return;
555 555
556 c->nospace = c->nospace_rp = 0; 556 c->bi.nospace = c->bi.nospace_rp = 0;
557 smp_wmb(); 557 smp_wmb();
558 558
559 spin_lock(&c->space_lock); 559 spin_lock(&c->space_lock);
560 c->budg_idx_growth -= req->idx_growth; 560 c->bi.idx_growth -= req->idx_growth;
561 c->budg_uncommitted_idx += req->idx_growth; 561 c->bi.uncommitted_idx += req->idx_growth;
562 c->budg_data_growth -= req->data_growth; 562 c->bi.data_growth -= req->data_growth;
563 c->budg_dd_growth -= req->dd_growth; 563 c->bi.dd_growth -= req->dd_growth;
564 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 564 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
565 565
566 ubifs_assert(c->budg_idx_growth >= 0); 566 ubifs_assert(c->bi.idx_growth >= 0);
567 ubifs_assert(c->budg_data_growth >= 0); 567 ubifs_assert(c->bi.data_growth >= 0);
568 ubifs_assert(c->budg_dd_growth >= 0); 568 ubifs_assert(c->bi.dd_growth >= 0);
569 ubifs_assert(c->min_idx_lebs < c->main_lebs); 569 ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
570 ubifs_assert(!(c->budg_idx_growth & 7)); 570 ubifs_assert(!(c->bi.idx_growth & 7));
571 ubifs_assert(!(c->budg_data_growth & 7)); 571 ubifs_assert(!(c->bi.data_growth & 7));
572 ubifs_assert(!(c->budg_dd_growth & 7)); 572 ubifs_assert(!(c->bi.dd_growth & 7));
573 spin_unlock(&c->space_lock); 573 spin_unlock(&c->space_lock);
574} 574}
575 575
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
586{ 586{
587 spin_lock(&c->space_lock); 587 spin_lock(&c->space_lock);
588 /* Release the index growth reservation */ 588 /* Release the index growth reservation */
589 c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; 589 c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
590 /* Release the data growth reservation */ 590 /* Release the data growth reservation */
591 c->budg_data_growth -= c->page_budget; 591 c->bi.data_growth -= c->bi.page_budget;
592 /* Increase the dirty data growth reservation instead */ 592 /* Increase the dirty data growth reservation instead */
593 c->budg_dd_growth += c->page_budget; 593 c->bi.dd_growth += c->bi.page_budget;
594 /* And re-calculate the indexing space reservation */ 594 /* And re-calculate the indexing space reservation */
595 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 595 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
596 spin_unlock(&c->space_lock); 596 spin_unlock(&c->space_lock);
597} 597}
598 598
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
612 612
613 memset(&req, 0, sizeof(struct ubifs_budget_req)); 613 memset(&req, 0, sizeof(struct ubifs_budget_req));
614 /* The "no space" flags will be cleared because dd_growth is > 0 */ 614 /* The "no space" flags will be cleared because dd_growth is > 0 */
615 req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); 615 req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
616 ubifs_release_budget(c, &req); 616 ubifs_release_budget(c, &req);
617} 617}
618 618
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
682 int rsvd_idx_lebs, lebs; 682 int rsvd_idx_lebs, lebs;
683 long long available, outstanding, free; 683 long long available, outstanding, free;
684 684
685 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); 685 ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
686 outstanding = c->budg_data_growth + c->budg_dd_growth; 686 outstanding = c->bi.data_growth + c->bi.dd_growth;
687 available = ubifs_calc_available(c, c->min_idx_lebs); 687 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
688 688
689 /* 689 /*
690 * When reporting free space to user-space, UBIFS guarantees that it is 690 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
697 * Note, the calculations below are similar to what we have in 697 * Note, the calculations below are similar to what we have in
698 * 'do_budget_space()', so refer there for comments. 698 * 'do_budget_space()', so refer there for comments.
699 */ 699 */
700 if (c->min_idx_lebs > c->lst.idx_lebs) 700 if (c->bi.min_idx_lebs > c->lst.idx_lebs)
701 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 701 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
702 else 702 else
703 rsvd_idx_lebs = 0; 703 rsvd_idx_lebs = 0;
704 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 704 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
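Pieced together, the free-space prediction above reads roughly as below; the tail of the 'lebs' expression is truncated in the hunk, so subtracting 'rsvd_idx_lebs' on the continuation line is an assumption:

	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
		rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
	else
		rsvd_idx_lebs = 0;
	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
	       rsvd_idx_lebs;  /* continuation assumed */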
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
182 c->mst_node->root_len = cpu_to_le32(zroot.len); 182 c->mst_node->root_len = cpu_to_le32(zroot.len);
183 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); 183 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
184 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); 184 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
185 c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); 185 c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz);
186 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); 186 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
187 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); 187 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
188 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); 188 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
38 37
39#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
40 39
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 42static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 43static char dbg_key_buf1[128];
45 44
46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags; 45unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 46unsigned int ubifs_tst_flags;
49 47
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
51module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); 48module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
52module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); 49module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
53 50
54MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
55MODULE_PARM_DESC(debug_chks, "Debug check flags"); 51MODULE_PARM_DESC(debug_chks, "Debug check flags");
56MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); 52MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
57 53
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
317 printk(KERN_DEBUG "\tflags %#x\n", sup_flags); 313 printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
318 printk(KERN_DEBUG "\t big_lpt %u\n", 314 printk(KERN_DEBUG "\t big_lpt %u\n",
319 !!(sup_flags & UBIFS_FLG_BIGLPT)); 315 !!(sup_flags & UBIFS_FLG_BIGLPT));
316 printk(KERN_DEBUG "\t space_fixup %u\n",
317 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
320 printk(KERN_DEBUG "\tmin_io_size %u\n", 318 printk(KERN_DEBUG "\tmin_io_size %u\n",
321 le32_to_cpu(sup->min_io_size)); 319 le32_to_cpu(sup->min_io_size));
322 printk(KERN_DEBUG "\tleb_size %u\n", 320 printk(KERN_DEBUG "\tleb_size %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
602 spin_unlock(&dbg_lock); 600 spin_unlock(&dbg_lock);
603} 601}
604 602
605void dbg_dump_budg(struct ubifs_info *c) 603void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
606{ 604{
607 int i; 605 int i;
608 struct rb_node *rb; 606 struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
610 struct ubifs_gced_idx_leb *idx_gc; 608 struct ubifs_gced_idx_leb *idx_gc;
611 long long available, outstanding, free; 609 long long available, outstanding, free;
612 610
613 ubifs_assert(spin_is_locked(&c->space_lock)); 611 spin_lock(&c->space_lock);
614 spin_lock(&dbg_lock); 612 spin_lock(&dbg_lock);
615 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 613 printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
616 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 614 "total budget sum %lld\n", current->pid,
617 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); 615 bi->data_growth + bi->dd_growth,
618 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " 616 bi->data_growth + bi->dd_growth + bi->idx_growth);
619 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, 617 printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
620 c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, 618 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
621 c->freeable_cnt); 619 bi->idx_growth);
622 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " 620 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
623 "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, 621 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
624 c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); 622 bi->uncommitted_idx);
623 printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
624 bi->page_budget, bi->inode_budget, bi->dent_budget);
625 printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
626 bi->nospace, bi->nospace_rp);
627 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
628 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
629
630 if (bi != &c->bi)
631 /*
632 * If we are dumping saved budgeting data, do not print
633 * additional information which is about the current state, not
634 * the old one which corresponded to the saved budgeting data.
635 */
636 goto out_unlock;
637
638 printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
639 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
625 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " 640 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
626 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), 641 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
627 atomic_long_read(&c->dirty_zn_cnt), 642 atomic_long_read(&c->dirty_zn_cnt),
628 atomic_long_read(&c->clean_zn_cnt)); 643 atomic_long_read(&c->clean_zn_cnt));
629 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
630 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
631 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 644 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
632 c->gc_lnum, c->ihead_lnum); 645 c->gc_lnum, c->ihead_lnum);
646
633 /* If we are in R/O mode, journal heads do not exist */ 647 /* If we are in R/O mode, journal heads do not exist */
634 if (c->jheads) 648 if (c->jheads)
635 for (i = 0; i < c->jhead_cnt; i++) 649 for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
648 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 662 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
649 663
650 /* Print budgeting predictions */ 664 /* Print budgeting predictions */
651 available = ubifs_calc_available(c, c->min_idx_lebs); 665 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
652 outstanding = c->budg_data_growth + c->budg_dd_growth; 666 outstanding = c->bi.data_growth + c->bi.dd_growth;
653 free = ubifs_get_free_space_nolock(c); 667 free = ubifs_get_free_space_nolock(c);
654 printk(KERN_DEBUG "Budgeting predictions:\n"); 668 printk(KERN_DEBUG "Budgeting predictions:\n");
655 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 669 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
656 available, outstanding, free); 670 available, outstanding, free);
671out_unlock:
657 spin_unlock(&dbg_lock); 672 spin_unlock(&dbg_lock);
673 spin_unlock(&c->space_lock);
658} 674}
659 675
660void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) 676void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
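Because dbg_dump_budg() now takes @c->space_lock itself and accepts the budgeting structure to print, callers shrink to a single line; both call styles appear in later hunks of this series:

	dbg_dump_budg(c, &c->bi);        /* dump the current budgeting state */
	dbg_dump_budg(c, &d->saved_bi);  /* dump a previously saved snapshot */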
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
729 if (bud->lnum == lp->lnum) { 745 if (bud->lnum == lp->lnum) {
730 int head = 0; 746 int head = 0;
731 for (i = 0; i < c->jhead_cnt; i++) { 747 for (i = 0; i < c->jhead_cnt; i++) {
732 if (lp->lnum == c->jheads[i].wbuf.lnum) { 748 /*
749 * Note, if we are in R/O mode or in the middle
750 * of mounting/re-mounting, the write-buffers do
751 * not exist.
752 */
753 if (c->jheads &&
754 lp->lnum == c->jheads[i].wbuf.lnum) {
733 printk(KERN_CONT ", jhead %s", 755 printk(KERN_CONT ", jhead %s",
734 dbg_jhead(i)); 756 dbg_jhead(i));
735 head = 1; 757 head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
976 998
977 spin_lock(&c->space_lock); 999 spin_lock(&c->space_lock);
978 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); 1000 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
1001 memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
1002 d->saved_idx_gc_cnt = c->idx_gc_cnt;
979 1003
980 /* 1004 /*
981 * We use a dirty hack here and zero out @c->freeable_cnt, because it 1005 * We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
1042out: 1066out:
1043 ubifs_msg("saved lprops statistics dump"); 1067 ubifs_msg("saved lprops statistics dump");
1044 dbg_dump_lstats(&d->saved_lst); 1068 dbg_dump_lstats(&d->saved_lst);
1045 ubifs_get_lp_stats(c, &lst); 1069 ubifs_msg("saved budgeting info dump");
1046 1070 dbg_dump_budg(c, &d->saved_bi);
1071 ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
1047 ubifs_msg("current lprops statistics dump"); 1072 ubifs_msg("current lprops statistics dump");
1073 ubifs_get_lp_stats(c, &lst);
1048 dbg_dump_lstats(&lst); 1074 dbg_dump_lstats(&lst);
1049 1075 ubifs_msg("current budgeting info dump");
1050 spin_lock(&c->space_lock); 1076 dbg_dump_budg(c, &c->bi);
1051 dbg_dump_budg(c);
1052 spin_unlock(&c->space_lock);
1053 dump_stack(); 1077 dump_stack();
1054 return -EINVAL; 1078 return -EINVAL;
1055} 1079}
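The save/verify workflow these hunks implement, as a sketch; wrapping it around a remount-like operation is an assumption, not something this hunk shows:

	dbg_save_space_info(c);        /* snapshot c->lst, c->bi, idx_gc_cnt */
	/* ... operation that must not change the accounted free space ... */
	err = dbg_check_space_info(c); /* dumps saved vs. current on mismatch */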
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
1793 struct rb_node **p, *parent = NULL; 1817 struct rb_node **p, *parent = NULL;
1794 struct fsck_inode *fscki; 1818 struct fsck_inode *fscki;
1795 ino_t inum = key_inum_flash(c, &ino->key); 1819 ino_t inum = key_inum_flash(c, &ino->key);
1820 struct inode *inode;
1821 struct ubifs_inode *ui;
1796 1822
1797 p = &fsckd->inodes.rb_node; 1823 p = &fsckd->inodes.rb_node;
1798 while (*p) { 1824 while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
1816 if (!fscki) 1842 if (!fscki)
1817 return ERR_PTR(-ENOMEM); 1843 return ERR_PTR(-ENOMEM);
1818 1844
1845 inode = ilookup(c->vfs_sb, inum);
1846
1819 fscki->inum = inum; 1847 fscki->inum = inum;
1820 fscki->nlink = le32_to_cpu(ino->nlink); 1848 /*
1821 fscki->size = le64_to_cpu(ino->size); 1849 * If the inode is present in the VFS inode cache, use it instead of
1822 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); 1850 * the on-flash inode which might be out-of-date. E.g., the size might
1823 fscki->xattr_sz = le32_to_cpu(ino->xattr_size); 1851 * be out-of-date. If we do not do this, the following may happen, for
1824 fscki->xattr_nms = le32_to_cpu(ino->xattr_names); 1852 * example:
1825 fscki->mode = le32_to_cpu(ino->mode); 1853 * 1. A power cut happens
1854 * 2. We mount the file-system R/O, the replay process fixes up the
1855 * inode size in the VFS cache, but on on-flash.
1856 * 3. 'check_leaf()' fails because it hits a data node beyond inode
1857 * size.
1858 */
1859 if (!inode) {
1860 fscki->nlink = le32_to_cpu(ino->nlink);
1861 fscki->size = le64_to_cpu(ino->size);
1862 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
1863 fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
1864 fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
1865 fscki->mode = le32_to_cpu(ino->mode);
1866 } else {
1867 ui = ubifs_inode(inode);
1868 fscki->nlink = inode->i_nlink;
1869 fscki->size = inode->i_size;
1870 fscki->xattr_cnt = ui->xattr_cnt;
1871 fscki->xattr_sz = ui->xattr_size;
1872 fscki->xattr_nms = ui->xattr_names;
1873 fscki->mode = inode->i_mode;
1874 iput(inode);
1875 }
1876
1826 if (S_ISDIR(fscki->mode)) { 1877 if (S_ISDIR(fscki->mode)) {
1827 fscki->calc_sz = UBIFS_INO_NODE_SZ; 1878 fscki->calc_sz = UBIFS_INO_NODE_SZ;
1828 fscki->calc_cnt = 2; 1879 fscki->calc_cnt = 2;
1829 } 1880 }
1881
1830 rb_link_node(&fscki->rb, parent, p); 1882 rb_link_node(&fscki->rb, parent, p);
1831 rb_insert_color(&fscki->rb, &fsckd->inodes); 1883 rb_insert_color(&fscki->rb, &fsckd->inodes);
1884
1832 return fscki; 1885 return fscki;
1833} 1886}
1834 1887
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2421 hashb = key_block(c, &sb->key); 2474 hashb = key_block(c, &sb->key);
2422 2475
2423 if (hasha > hashb) { 2476 if (hasha > hashb) {
2424 ubifs_err("larger hash %u goes before %u", hasha, hashb); 2477 ubifs_err("larger hash %u goes before %u",
2478 hasha, hashb);
2425 goto error_dump; 2479 goto error_dump;
2426 } 2480 }
2427 } 2481 }
@@ -2437,14 +2491,12 @@ error_dump:
2437 return 0; 2491 return 0;
2438} 2492}
2439 2493
2440static int invocation_cnt;
2441
2442int dbg_force_in_the_gaps(void) 2494int dbg_force_in_the_gaps(void)
2443{ 2495{
2444 if (!dbg_force_in_the_gaps_enabled) 2496 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2445 return 0; 2497 return 0;
2446 /* Force in-the-gaps every 8th commit */ 2498
2447 return !((invocation_cnt++) & 0x7); 2499 return !(random32() & 7);
2448} 2500}
2449 2501
2450/* Failure mode for recovery testing */ 2502/* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2632 int len, int check) 2684 int len, int check)
2633{ 2685{
2634 if (in_failure_mode(desc)) 2686 if (in_failure_mode(desc))
2635 return -EIO; 2687 return -EROFS;
2636 return ubi_leb_read(desc, lnum, buf, offset, len, check); 2688 return ubi_leb_read(desc, lnum, buf, offset, len, check);
2637} 2689}
2638 2690
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2642 int err, failing; 2694 int err, failing;
2643 2695
2644 if (in_failure_mode(desc)) 2696 if (in_failure_mode(desc))
2645 return -EIO; 2697 return -EROFS;
2646 failing = do_fail(desc, lnum, 1); 2698 failing = do_fail(desc, lnum, 1);
2647 if (failing) 2699 if (failing)
2648 cut_data(buf, len); 2700 cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2650 if (err) 2702 if (err)
2651 return err; 2703 return err;
2652 if (failing) 2704 if (failing)
2653 return -EIO; 2705 return -EROFS;
2654 return 0; 2706 return 0;
2655} 2707}
2656 2708
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
2660 int err; 2712 int err;
2661 2713
2662 if (do_fail(desc, lnum, 1)) 2714 if (do_fail(desc, lnum, 1))
2663 return -EIO; 2715 return -EROFS;
2664 err = ubi_leb_change(desc, lnum, buf, len, dtype); 2716 err = ubi_leb_change(desc, lnum, buf, len, dtype);
2665 if (err) 2717 if (err)
2666 return err; 2718 return err;
2667 if (do_fail(desc, lnum, 1)) 2719 if (do_fail(desc, lnum, 1))
2668 return -EIO; 2720 return -EROFS;
2669 return 0; 2721 return 0;
2670} 2722}
2671 2723
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
2674 int err; 2726 int err;
2675 2727
2676 if (do_fail(desc, lnum, 0)) 2728 if (do_fail(desc, lnum, 0))
2677 return -EIO; 2729 return -EROFS;
2678 err = ubi_leb_erase(desc, lnum); 2730 err = ubi_leb_erase(desc, lnum);
2679 if (err) 2731 if (err)
2680 return err; 2732 return err;
2681 if (do_fail(desc, lnum, 0)) 2733 if (do_fail(desc, lnum, 0))
2682 return -EIO; 2734 return -EROFS;
2683 return 0; 2735 return 0;
2684} 2736}
2685 2737
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
2688 int err; 2740 int err;
2689 2741
2690 if (do_fail(desc, lnum, 0)) 2742 if (do_fail(desc, lnum, 0))
2691 return -EIO; 2743 return -EROFS;
2692 err = ubi_leb_unmap(desc, lnum); 2744 err = ubi_leb_unmap(desc, lnum);
2693 if (err) 2745 if (err)
2694 return err; 2746 return err;
2695 if (do_fail(desc, lnum, 0)) 2747 if (do_fail(desc, lnum, 0))
2696 return -EIO; 2748 return -EROFS;
2697 return 0; 2749 return 0;
2698} 2750}
2699 2751
2700int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) 2752int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
2701{ 2753{
2702 if (in_failure_mode(desc)) 2754 if (in_failure_mode(desc))
2703 return -EIO; 2755 return -EROFS;
2704 return ubi_is_mapped(desc, lnum); 2756 return ubi_is_mapped(desc, lnum);
2705} 2757}
2706 2758
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2709 int err; 2761 int err;
2710 2762
2711 if (do_fail(desc, lnum, 0)) 2763 if (do_fail(desc, lnum, 0))
2712 return -EIO; 2764 return -EROFS;
2713 err = ubi_leb_map(desc, lnum, dtype); 2765 err = ubi_leb_map(desc, lnum, dtype);
2714 if (err) 2766 if (err)
2715 return err; 2767 return err;
2716 if (do_fail(desc, lnum, 0)) 2768 if (do_fail(desc, lnum, 0))
2717 return -EIO; 2769 return -EROFS;
2718 return 0; 2770 return 0;
2719} 2771}
2720 2772
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
2784static int open_debugfs_file(struct inode *inode, struct file *file) 2836static int open_debugfs_file(struct inode *inode, struct file *file)
2785{ 2837{
2786 file->private_data = inode->i_private; 2838 file->private_data = inode->i_private;
2787 return 0; 2839 return nonseekable_open(inode, file);
2788} 2840}
2789 2841
2790static ssize_t write_debugfs_file(struct file *file, const char __user *buf, 2842static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2795 2847
2796 if (file->f_path.dentry == d->dfs_dump_lprops) 2848 if (file->f_path.dentry == d->dfs_dump_lprops)
2797 dbg_dump_lprops(c); 2849 dbg_dump_lprops(c);
2798 else if (file->f_path.dentry == d->dfs_dump_budg) { 2850 else if (file->f_path.dentry == d->dfs_dump_budg)
2799 spin_lock(&c->space_lock); 2851 dbg_dump_budg(c, &c->bi);
2800 dbg_dump_budg(c); 2852 else if (file->f_path.dentry == d->dfs_dump_tnc) {
2801 spin_unlock(&c->space_lock);
2802 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2803 mutex_lock(&c->tnc_mutex); 2853 mutex_lock(&c->tnc_mutex);
2804 dbg_dump_tnc(c); 2854 dbg_dump_tnc(c);
2805 mutex_unlock(&c->tnc_mutex); 2855 mutex_unlock(&c->tnc_mutex);
2806 } else 2856 } else
2807 return -EINVAL; 2857 return -EINVAL;
2808 2858
2809 *ppos += count;
2810 return count; 2859 return count;
2811} 2860}
2812 2861
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
2814 .open = open_debugfs_file, 2863 .open = open_debugfs_file,
2815 .write = write_debugfs_file, 2864 .write = write_debugfs_file,
2816 .owner = THIS_MODULE, 2865 .owner = THIS_MODULE,
2817 .llseek = default_llseek, 2866 .llseek = no_llseek,
2818}; 2867};
2819 2868
2820/** 2869/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG 32#ifdef CONFIG_UBIFS_FS_DEBUG
33 33
34#include <linux/random.h>
35
34/** 36/**
35 * ubifs_debug_info - per-FS debugging information. 37 * ubifs_debug_info - per-FS debugging information.
36 * @old_zroot: old index root - used by 'dbg_check_old_index()' 38 * @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
50 * @new_ihead_offs: used by debugging to check @c->ihead_offs 52 * @new_ihead_offs: used by debugging to check @c->ihead_offs
51 * 53 *
52 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') 54 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
53 * @saved_free: saved free space (used by 'dbg_save_space_info()') 55 * @saved_bi: saved budgeting information
56 * @saved_free: saved amount of free space
57 * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
54 * 58 *
55 * dfs_dir_name: name of debugfs directory containing this file-system's files 59 * @dfs_dir_name: name of debugfs directory containing this file-system's files
56 * dfs_dir: direntry object of the file-system debugfs directory 60 * @dfs_dir: direntry object of the file-system debugfs directory
57 * dfs_dump_lprops: "dump lprops" debugfs knob 61 * @dfs_dump_lprops: "dump lprops" debugfs knob
58 * dfs_dump_budg: "dump budgeting information" debugfs knob 62 * @dfs_dump_budg: "dump budgeting information" debugfs knob
59 * dfs_dump_tnc: "dump TNC" debugfs knob 63 * @dfs_dump_tnc: "dump TNC" debugfs knob
60 */ 64 */
61struct ubifs_debug_info { 65struct ubifs_debug_info {
62 struct ubifs_zbranch old_zroot; 66 struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
76 int new_ihead_offs; 80 int new_ihead_offs;
77 81
78 struct ubifs_lp_stats saved_lst; 82 struct ubifs_lp_stats saved_lst;
83 struct ubifs_budg_info saved_bi;
79 long long saved_free; 84 long long saved_free;
85 int saved_idx_gc_cnt;
80 86
81 char dfs_dir_name[100]; 87 char dfs_dir_name[100];
82 struct dentry *dfs_dir; 88 struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
101 } \ 107 } \
102} while (0) 108} while (0)
103 109
104#define dbg_dump_stack() do { \ 110#define dbg_dump_stack() dump_stack()
105 if (!dbg_failure_mode) \
106 dump_stack(); \
107} while (0)
108
109/* Generic debugging messages */
110#define dbg_msg(fmt, ...) do { \
111 spin_lock(&dbg_lock); \
112 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
113 __func__, ##__VA_ARGS__); \
114 spin_unlock(&dbg_lock); \
115} while (0)
116
117#define dbg_do_msg(typ, fmt, ...) do { \
118 if (ubifs_msg_flags & typ) \
119 dbg_msg(fmt, ##__VA_ARGS__); \
120} while (0)
121 111
122#define dbg_err(fmt, ...) do { \ 112#define dbg_err(fmt, ...) do { \
123 spin_lock(&dbg_lock); \ 113 spin_lock(&dbg_lock); \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
137#define DBGKEY(key) dbg_key_str0(c, (key)) 127#define DBGKEY(key) dbg_key_str0(c, (key))
138#define DBGKEY1(key) dbg_key_str1(c, (key)) 128#define DBGKEY1(key) dbg_key_str1(c, (key))
139 129
140/* General messages */ 130#define ubifs_dbg_msg(type, fmt, ...) do { \
141#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) 131 spin_lock(&dbg_lock); \
132 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
133 spin_unlock(&dbg_lock); \
134} while (0)
142 135
 136/* Just debugging messages not related to any specific UBIFS subsystem */
137#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
138/* General messages */
139#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
143/* Additional journal messages */ 140/* Additional journal messages */
144#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) 141#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
145
146/* Additional TNC messages */ 142/* Additional TNC messages */
147#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) 143#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
148
149/* Additional lprops messages */ 144/* Additional lprops messages */
150#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) 145#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
151
152/* Additional LEB find messages */ 146/* Additional LEB find messages */
153#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) 147#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
154
155/* Additional mount messages */ 148/* Additional mount messages */
156#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) 149#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
157
158/* Additional I/O messages */ 150/* Additional I/O messages */
159#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) 151#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
160
161/* Additional commit messages */ 152/* Additional commit messages */
162#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) 153#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
163
164/* Additional budgeting messages */ 154/* Additional budgeting messages */
165#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) 155#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
166
167/* Additional log messages */ 156/* Additional log messages */
168#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) 157#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
169
170/* Additional gc messages */ 158/* Additional gc messages */
171#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) 159#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
172
173/* Additional scan messages */ 160/* Additional scan messages */
174#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) 161#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
175
176/* Additional recovery messages */ 162/* Additional recovery messages */
177#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 163#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
178
179/*
180 * Debugging message type flags.
181 *
182 * UBIFS_MSG_GEN: general messages
183 * UBIFS_MSG_JNL: journal messages
184 * UBIFS_MSG_MNT: mount messages
185 * UBIFS_MSG_CMT: commit messages
186 * UBIFS_MSG_FIND: LEB find messages
187 * UBIFS_MSG_BUDG: budgeting messages
188 * UBIFS_MSG_GC: garbage collection messages
189 * UBIFS_MSG_TNC: TNC messages
190 * UBIFS_MSG_LP: lprops messages
191 * UBIFS_MSG_IO: I/O messages
192 * UBIFS_MSG_LOG: log messages
193 * UBIFS_MSG_SCAN: scan messages
194 * UBIFS_MSG_RCVRY: recovery messages
195 */
196enum {
197 UBIFS_MSG_GEN = 0x1,
198 UBIFS_MSG_JNL = 0x2,
199 UBIFS_MSG_MNT = 0x4,
200 UBIFS_MSG_CMT = 0x8,
201 UBIFS_MSG_FIND = 0x10,
202 UBIFS_MSG_BUDG = 0x20,
203 UBIFS_MSG_GC = 0x40,
204 UBIFS_MSG_TNC = 0x80,
205 UBIFS_MSG_LP = 0x100,
206 UBIFS_MSG_IO = 0x200,
207 UBIFS_MSG_LOG = 0x400,
208 UBIFS_MSG_SCAN = 0x800,
209 UBIFS_MSG_RCVRY = 0x1000,
210};
211 164
212/* 165/*
213 * Debugging check flags. 166 * Debugging check flags.
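With the per-type message flags gone, every dbg_*() macro funnels into ubifs_dbg_msg(), which is built on pr_debug(); message selection therefore moves from the removed debug_msgs module parameter to the kernel's dynamic debug facility. A usage sketch:

	/*
	 * When built with CONFIG_DYNAMIC_DEBUG, the messages become
	 * run-time selectable, e.g.:
	 *   echo 'file fs/ubifs/* +p' > <debugfs>/dynamic_debug/control
	 */
	dbg_gen("started commit");  /* expands to pr_debug("UBIFS DBG gen: ...") */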
@@ -233,11 +186,9 @@ enum {
233/* 186/*
234 * Special testing flags. 187 * Special testing flags.
235 * 188 *
236 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
237 * UBIFS_TST_RCVRY: failure mode for recovery testing 189 * UBIFS_TST_RCVRY: failure mode for recovery testing
238 */ 190 */
239enum { 191enum {
240 UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
241 UBIFS_TST_RCVRY = 0x4, 192 UBIFS_TST_RCVRY = 0x4,
242}; 193};
243 194
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
262 int offs); 213 int offs);
263void dbg_dump_budget_req(const struct ubifs_budget_req *req); 214void dbg_dump_budget_req(const struct ubifs_budget_req *req);
264void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 215void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
265void dbg_dump_budg(struct ubifs_info *c); 216void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
266void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); 217void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
267void dbg_dump_lprops(struct ubifs_info *c); 218void dbg_dump_lprops(struct ubifs_info *c);
268void dbg_dump_lpt_info(struct ubifs_info *c); 219void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
304int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); 255int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
305 256
306/* Force the use of in-the-gaps method for testing */ 257/* Force the use of in-the-gaps method for testing */
307 258static inline int dbg_force_in_the_gaps_enabled(void)
308#define dbg_force_in_the_gaps_enabled \ 259{
309 (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) 260 return ubifs_chk_flags & UBIFS_CHK_GEN;
310 261}
311int dbg_force_in_the_gaps(void); 262int dbg_force_in_the_gaps(void);
312 263
313/* Failure mode for recovery testing */ 264/* Failure mode for recovery testing */
314
315#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 265#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
316 266
317#ifndef UBIFS_DBG_PRESERVE_UBI 267#ifndef UBIFS_DBG_PRESERVE_UBI
318
319#define ubi_leb_read dbg_leb_read 268#define ubi_leb_read dbg_leb_read
320#define ubi_leb_write dbg_leb_write 269#define ubi_leb_write dbg_leb_write
321#define ubi_leb_change dbg_leb_change 270#define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
323#define ubi_leb_unmap dbg_leb_unmap 272#define ubi_leb_unmap dbg_leb_unmap
324#define ubi_is_mapped dbg_is_mapped 273#define ubi_is_mapped dbg_is_mapped
325#define ubi_leb_map dbg_leb_map 274#define ubi_leb_map dbg_leb_map
326
327#endif 275#endif
328 276
329int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, 277int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
370 __func__, __LINE__, current->pid); \ 318 __func__, __LINE__, current->pid); \
371} while (0) 319} while (0)
372 320
373#define dbg_err(fmt, ...) do { \ 321#define dbg_err(fmt, ...) do { \
374 if (0) \ 322 if (0) \
375 ubifs_err(fmt, ##__VA_ARGS__); \ 323 ubifs_err(fmt, ##__VA_ARGS__); \
376} while (0) 324} while (0)
377 325
378#define dbg_msg(fmt, ...) do { \ 326#define ubifs_dbg_msg(fmt, ...) do { \
379 if (0) \ 327 if (0) \
380 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ 328 pr_debug(fmt "\n", ##__VA_ARGS__); \
381 current->pid, __func__, ##__VA_ARGS__); \
382} while (0) 329} while (0)
383 330
384#define dbg_dump_stack() 331#define dbg_dump_stack()
385#define ubifs_assert_cmt_locked(c) 332#define ubifs_assert_cmt_locked(c)
386 333
387#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 334#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
388#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 335#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
389#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 336#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
390#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 337#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
391#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 338#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
392#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 339#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
393#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 340#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
394#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 341#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
395#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 342#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
396#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 343#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
397#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 344#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
398#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 345#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
399#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 346#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
347#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
400 348
401#define DBGKEY(key) ((char *)(key)) 349#define DBGKEY(key) ((char *)(key))
402#define DBGKEY1(key) ((char *)(key)) 350#define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
420dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } 368dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
421static inline void 369static inline void
422dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } 370dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
423static inline void dbg_dump_budg(struct ubifs_info *c) { return; } 371static inline void
372dbg_dump_budg(struct ubifs_info *c,
373 const struct ubifs_budg_info *bi) { return; }
424static inline void dbg_dump_lprop(const struct ubifs_info *c, 374static inline void dbg_dump_lprop(const struct ubifs_info *c,
425 const struct ubifs_lprops *lp) { return; } 375 const struct ubifs_lprops *lp) { return; }
426static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } 376static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
482 struct list_head *head) { return 0; } 432 struct list_head *head) { return 0; }
483 433
484static inline int dbg_force_in_the_gaps(void) { return 0; } 434static inline int dbg_force_in_the_gaps(void) { return 0; }
485#define dbg_force_in_the_gaps_enabled 0 435#define dbg_force_in_the_gaps_enabled() 0
486#define dbg_failure_mode 0 436#define dbg_failure_mode 0
487 437
488static inline int dbg_debugfs_init(void) { return 0; } 438static inline int dbg_debugfs_init(void) { return 0; }
489static inline void dbg_debugfs_exit(void) { return; } 439static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
603 ubifs_release_budget(c, &req); 603 ubifs_release_budget(c, &req);
604 else { 604 else {
605 /* We've deleted something - clean the "no space" flags */ 605 /* We've deleted something - clean the "no space" flags */
606 c->nospace = c->nospace_rp = 0; 606 c->bi.nospace = c->bi.nospace_rp = 0;
607 smp_wmb(); 607 smp_wmb();
608 } 608 }
609 return 0; 609 return 0;
@@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
693 ubifs_release_budget(c, &req); 693 ubifs_release_budget(c, &req);
694 else { 694 else {
695 /* We've deleted something - clean the "no space" flags */ 695 /* We've deleted something - clean the "no space" flags */
696 c->nospace = c->nospace_rp = 0; 696 c->bi.nospace = c->bi.nospace_rp = 0;
697 smp_wmb(); 697 smp_wmb();
698 } 698 }
699 return 0; 699 return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
212 */ 212 */
213static void release_existing_page_budget(struct ubifs_info *c) 213static void release_existing_page_budget(struct ubifs_info *c)
214{ 214{
215 struct ubifs_budget_req req = { .dd_growth = c->page_budget}; 215 struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
216 216
217 ubifs_release_budget(c, &req); 217 ubifs_release_budget(c, &req);
218} 218}
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
971 * the page locked, and it locks @ui_mutex. However, write-back does take inode 971 * the page locked, and it locks @ui_mutex. However, write-back does take inode
972 * @i_mutex, which means other VFS operations may be run on this inode at the 972 * @i_mutex, which means other VFS operations may be run on this inode at the
973 * same time. And the problematic one is truncation to smaller size, from where 973 * same time. And the problematic one is truncation to smaller size, from where
974 * we have to call 'truncate_setsize()', which first changes @inode->i_size, then 974 * we have to call 'truncate_setsize()', which first changes @inode->i_size,
975 * drops the truncated pages. And while dropping the pages, it takes the page 975 * then drops the truncated pages. And while dropping the pages, it takes the
976 * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with 976 * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
977 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 977 * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
978 * means that @inode->i_size is changed while @ui_mutex is unlocked. 978 * This means that @inode->i_size is changed while @ui_mutex is unlocked.
979 * 979 *
980 * XXX(truncate): with the new truncate sequence this is not true anymore, 980 * XXX(truncate): with the new truncate sequence this is not true anymore,
 981 * and the calls to truncate_setsize can be moved around freely. They should 981 * and the calls to truncate_setsize can be moved around freely. They should
@@ -1189,7 +1189,7 @@ out_budg:
1189 if (budgeted) 1189 if (budgeted)
1190 ubifs_release_budget(c, &req); 1190 ubifs_release_budget(c, &req);
1191 else { 1191 else {
1192 c->nospace = c->nospace_rp = 0; 1192 c->bi.nospace = c->bi.nospace_rp = 0;
1193 smp_wmb(); 1193 smp_wmb();
1194 } 1194 }
1195 return err; 1195 return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
1312 1312
1313 dbg_gen("syncing inode %lu", inode->i_ino); 1313 dbg_gen("syncing inode %lu", inode->i_ino);
1314 1314
1315 if (inode->i_sb->s_flags & MS_RDONLY) 1315 if (c->ro_mount)
1316 /*
1317 * For some really strange reasons VFS does not filter out
1318 * 'fsync()' for R/O mounted file-systems as per 2.6.39.
1319 */
1316 return 0; 1320 return 0;
1317 1321
1318 /* 1322 /*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1432} 1436}
1433 1437
1434/* 1438/*
1435 * mmap()d file has taken write protection fault and is being made 1439 * mmap()d file has taken write protection fault and is being made writable.
1436 * writable. UBIFS must ensure page is budgeted for. 1440 * UBIFS must ensure page is budgeted for.
1437 */ 1441 */
1438static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1442static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
1443 struct vm_fault *vmf)
1439{ 1444{
1440 struct page *page = vmf->page; 1445 struct page *page = vmf->page;
1441 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1446 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1536{ 1541{
1537 int err; 1542 int err;
1538 1543
1539 /* 'generic_file_mmap()' takes care of NOMMU case */
1540 err = generic_file_mmap(file, vma); 1544 err = generic_file_mmap(file, vma);
1541 if (err) 1545 if (err)
1542 return err; 1546 return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
252 * But if the index takes fewer LEBs than it is reserved for it, 252 * But if the index takes fewer LEBs than it is reserved for it,
253 * this function must avoid picking those reserved LEBs. 253 * this function must avoid picking those reserved LEBs.
254 */ 254 */
255 if (c->min_idx_lebs >= c->lst.idx_lebs) { 255 if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
256 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 256 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
257 exclude_index = 1; 257 exclude_index = 1;
258 } 258 }
259 spin_unlock(&c->space_lock); 259 spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
276 pick_free = 0; 276 pick_free = 0;
277 } else { 277 } else {
278 spin_lock(&c->space_lock); 278 spin_lock(&c->space_lock);
279 exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); 279 exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
280 spin_unlock(&c->space_lock); 280 spin_unlock(&c->space_lock);
281 } 281 }
282 282
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
501 501
502 /* Check if there are enough empty LEBs for commit */ 502 /* Check if there are enough empty LEBs for commit */
503 spin_lock(&c->space_lock); 503 spin_lock(&c->space_lock);
504 if (c->min_idx_lebs > c->lst.idx_lebs) 504 if (c->bi.min_idx_lebs > c->lst.idx_lebs)
505 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 505 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
506 else 506 else
507 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
100 if (err) 100 if (err)
101 return err; 101 return err;
102 102
103 err = ubifs_wbuf_sync_nolock(wbuf);
104 if (err)
105 return err;
106
103 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); 107 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
104 if (err) 108 if (err)
105 return err; 109 return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
118 * This function compares data nodes @a and @b. Returns %1 if @a has greater 122 * This function compares data nodes @a and @b. Returns %1 if @a has greater
119 * inode or block number, and %-1 otherwise. 123 * inode or block number, and %-1 otherwise.
120 */ 124 */
121int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 125static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
122{ 126{
123 ino_t inuma, inumb; 127 ino_t inuma, inumb;
124 struct ubifs_info *c = priv; 128 struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
161 * first and sorted by length in descending order. Directory entry nodes go 165 * first and sorted by length in descending order. Directory entry nodes go
 162 * after inode nodes and are sorted in ascending hash value order. 166 * after inode nodes and are sorted in ascending hash value order.
163 */ 167 */
164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 168static int nondata_nodes_cmp(void *priv, struct list_head *a,
169 struct list_head *b)
165{ 170{
166 ino_t inuma, inumb; 171 ino_t inuma, inumb;
167 struct ubifs_info *c = priv; 172 struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
473 ubifs_assert(c->gc_lnum != lnum); 478 ubifs_assert(c->gc_lnum != lnum);
474 ubifs_assert(wbuf->lnum != lnum); 479 ubifs_assert(wbuf->lnum != lnum);
475 480
481 if (lp->free + lp->dirty == c->leb_size) {
482 /* Special case - a free LEB */
483 dbg_gc("LEB %d is free, return it", lp->lnum);
484 ubifs_assert(!(lp->flags & LPROPS_INDEX));
485
486 if (lp->free != c->leb_size) {
487 /*
488 * Write buffers must be sync'd before unmapping
489 * freeable LEBs, because one of them may contain data
 490 * which obsoletes something in 'lp->lnum'.
491 */
492 err = gc_sync_wbufs(c);
493 if (err)
494 return err;
495 err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
496 0, 0, 0, 0);
497 if (err)
498 return err;
499 }
500 err = ubifs_leb_unmap(c, lp->lnum);
501 if (err)
502 return err;
503
504 if (c->gc_lnum == -1) {
505 c->gc_lnum = lnum;
506 return LEB_RETAINED;
507 }
508
509 return LEB_FREED;
510 }
511
476 /* 512 /*
477 * We scan the entire LEB even though we only really need to scan up to 513 * We scan the entire LEB even though we only really need to scan up to
478 * (c->leb_size - lp->free). 514 * (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
682 "(min. space %d)", lp.lnum, lp.free, lp.dirty, 718 "(min. space %d)", lp.lnum, lp.free, lp.dirty,
683 lp.free + lp.dirty, min_space); 719 lp.free + lp.dirty, min_space);
684 720
685 if (lp.free + lp.dirty == c->leb_size) {
686 /* An empty LEB was returned */
687 dbg_gc("LEB %d is free, return it", lp.lnum);
688 /*
689 * ubifs_find_dirty_leb() doesn't return freeable index
690 * LEBs.
691 */
692 ubifs_assert(!(lp.flags & LPROPS_INDEX));
693 if (lp.free != c->leb_size) {
694 /*
695 * Write buffers must be sync'd before
696 * unmapping freeable LEBs, because one of them
697 * may contain data which obsoletes something
698 * in 'lp.pnum'.
699 */
700 ret = gc_sync_wbufs(c);
701 if (ret)
702 goto out;
703 ret = ubifs_change_one_lp(c, lp.lnum,
704 c->leb_size, 0, 0, 0,
705 0);
706 if (ret)
707 goto out;
708 }
709 ret = ubifs_leb_unmap(c, lp.lnum);
710 if (ret)
711 goto out;
712 ret = lp.lnum;
713 break;
714 }
715
716 space_before = c->leb_size - wbuf->offs - wbuf->used; 721 space_before = c->leb_size - wbuf->offs - wbuf->used;
717 if (wbuf->lnum == -1) 722 if (wbuf->lnum == -1)
718 space_before = 0; 723 space_before = 0;
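With the free-LEB special case moved into ubifs_garbage_collect_leb(), the caller reacts to the return value instead of open-coding the logic. A sketch of the caller-side interpretation (return-code names are taken from the hunk above; the surrounding control flow is assumed):

	ret = ubifs_garbage_collect_leb(c, &lp);
	if (ret == LEB_RETAINED) {
		/* the empty LEB was kept as the new GC head (c->gc_lnum) */
	} else if (ret == LEB_FREED) {
		/* the LEB was unmapped and is immediately reusable */
	}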
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
393 ubifs_assert(wbuf->size % c->min_io_size == 0); 393 ubifs_assert(wbuf->size % c->min_io_size == 0);
394 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size) 395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
397 397
398 if (c->ro_error) 398 if (c->ro_error)
399 return -EROFS; 399 return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
452 * @dtype: data type 452 * @dtype: data type
453 * 453 *
454 * This function targets the write-buffer to logical eraseblock @lnum:@offs. 454 * This function targets the write-buffer to logical eraseblock @lnum:@offs.
455 * The write-buffer is synchronized if it is not empty. Returns zero in case of 455 * The write-buffer has to be empty. Returns zero in case of success and a
456 * success and a negative error code in case of failure. 456 * negative error code in case of failure.
457 */ 457 */
458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, 458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
459 int dtype) 459 int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
465 ubifs_assert(offs >= 0 && offs <= c->leb_size); 465 ubifs_assert(offs >= 0 && offs <= c->leb_size);
466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); 466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
467 ubifs_assert(lnum != wbuf->lnum); 467 ubifs_assert(lnum != wbuf->lnum);
468 468 ubifs_assert(wbuf->used == 0);
469 if (wbuf->used > 0) {
470 int err = ubifs_wbuf_sync_nolock(wbuf);
471
472 if (err)
473 return err;
474 }
475 469
476 spin_lock(&wbuf->lock); 470 spin_lock(&wbuf->lock);
477 wbuf->lnum = lnum; 471 wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
573int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) 567int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
574{ 568{
575 struct ubifs_info *c = wbuf->c; 569 struct ubifs_info *c = wbuf->c;
576 int err, written, n, aligned_len = ALIGN(len, 8), offs; 570 int err, written, n, aligned_len = ALIGN(len, 8);
577 571
578 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, 572 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
579 dbg_ntype(((struct ubifs_ch *)buf)->node_type), 573 dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 582 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
589 ubifs_assert(!c->ro_media && !c->ro_mount); 583 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size) 584 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 585 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
592 586
593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 587 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
594 err = -ENOSPC; 588 err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
636 goto exit; 630 goto exit;
637 } 631 }
638 632
639 offs = wbuf->offs;
640 written = 0; 633 written = 0;
641 634
642 if (wbuf->used) { 635 if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
653 if (err) 646 if (err)
654 goto out; 647 goto out;
655 648
656 offs += wbuf->size; 649 wbuf->offs += wbuf->size;
657 len -= wbuf->avail; 650 len -= wbuf->avail;
658 aligned_len -= wbuf->avail; 651 aligned_len -= wbuf->avail;
659 written += wbuf->avail; 652 written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
672 if (err) 665 if (err)
673 goto out; 666 goto out;
674 667
675 offs += wbuf->size; 668 wbuf->offs += wbuf->size;
676 len -= wbuf->size; 669 len -= wbuf->size;
677 aligned_len -= wbuf->size; 670 aligned_len -= wbuf->size;
678 written += wbuf->size; 671 written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
687 n = aligned_len >> c->max_write_shift; 680 n = aligned_len >> c->max_write_shift;
688 if (n) { 681 if (n) {
689 n <<= c->max_write_shift; 682 n <<= c->max_write_shift;
690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 683 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 684 wbuf->offs);
692 wbuf->dtype); 685 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
686 wbuf->offs, n, wbuf->dtype);
693 if (err) 687 if (err)
694 goto out; 688 goto out;
695 offs += n; 689 wbuf->offs += n;
696 aligned_len -= n; 690 aligned_len -= n;
697 len -= n; 691 len -= n;
698 written += n; 692 written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
707 */ 701 */
708 memcpy(wbuf->buf, buf + written, len); 702 memcpy(wbuf->buf, buf + written, len);
709 703
710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size) 704 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size; 705 wbuf->size = c->max_write_size;
713 else 706 else
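The local 'offs' copy is gone: @wbuf->offs now advances as each chunk is flushed, so the next buffer size can be re-derived from the media position immediately. The invariant, roughly:

	wbuf->offs += wbuf->size;  /* media position moves with every flush */
	if (c->leb_size - wbuf->offs >= c->max_write_size)
		wbuf->size = c->max_write_size;  /* else it shrinks toward the LEB end */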
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
141 * LEB with some empty space. 141 * LEB with some empty space.
142 */ 142 */
143 lnum = ubifs_find_free_space(c, len, &offs, squeeze); 143 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
144 if (lnum >= 0) { 144 if (lnum >= 0)
145 /* Found an LEB, add it to the journal head */
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err)
148 goto out_return;
149 /* A new bud was successfully allocated and added to the log */
150 goto out; 145 goto out;
151 }
152 146
153 err = lnum; 147 err = lnum;
154 if (err != -ENOSPC) 148 if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
203 return 0; 197 return 0;
204 } 198 }
205 199
206 err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
207 if (err)
208 goto out_return;
209 offs = 0; 200 offs = 0;
210 201
211out: 202out:
203 /*
204 * Make sure we synchronize the write-buffer before we add the new bud
205 * to the log. Otherwise we may have a power cut after the log
206 * reference node for the last bud (@lnum) is written but before the
207 * write-buffer data are written to the next-to-last bud
208 * (@wbuf->lnum). And the effect would be that the recovery would see
209 * that there is corruption in the next-to-last bud.
210 */
211 err = ubifs_wbuf_sync_nolock(wbuf);
212 if (err)
213 goto out_return;
214 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
215 if (err)
216 goto out_return;
212 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); 217 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
213 if (err) 218 if (err)
214 goto out_unlock; 219 goto out_unlock;
@@ -380,10 +385,8 @@ out:
380 if (err == -ENOSPC) { 385 if (err == -ENOSPC) {
 381 /* There are some budgeting problems, print useful information */ 386 /* There are some budgeting problems, print useful information */
382 down_write(&c->commit_sem); 387 down_write(&c->commit_sem);
383 spin_lock(&c->space_lock);
384 dbg_dump_stack(); 388 dbg_dump_stack();
385 dbg_dump_budg(c); 389 dbg_dump_budg(c, &c->bi);
386 spin_unlock(&c->space_lock);
387 dbg_dump_lprops(c); 390 dbg_dump_lprops(c);
388 cmt_retries = dbg_check_lprops(c); 391 cmt_retries = dbg_check_lprops(c);
389 up_write(&c->commit_sem); 392 up_write(&c->commit_sem);
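The resulting order in the journal's space-reservation path is: synchronize the write-buffer, then reference the new bud in the log, then re-target the write-buffer. A power cut can therefore never leave the log pointing at a bud whose predecessor's buffered data was lost. Condensed, with error handling elided:

	err = ubifs_wbuf_sync_nolock(wbuf);               /* 1. flush old bud's data */
	err = ubifs_add_bud_to_log(c, jhead, lnum, offs); /* 2. reference new bud    */
	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); /* 3. re-target */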
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
100} 100}
101 101
102/** 102/**
103 * next_log_lnum - switch to the next log LEB.
104 * @c: UBIFS file-system description object
105 * @lnum: current log LEB
106 */
107static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
108{
109 lnum += 1;
110 if (lnum > c->log_last)
111 lnum = UBIFS_LOG_LNUM;
112
113 return lnum;
114}
115
116/**
117 * empty_log_bytes - calculate amount of empty space in the log. 103 * empty_log_bytes - calculate amount of empty space in the log.
118 * @c: UBIFS file-system description object 104 * @c: UBIFS file-system description object
119 */ 105 */
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
257 ref->jhead = cpu_to_le32(jhead); 243 ref->jhead = cpu_to_le32(jhead);
258 244
259 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { 245 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
260 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 246 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
261 c->lhead_offs = 0; 247 c->lhead_offs = 0;
262 } 248 }
263 249
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
425 411
426 /* Switch to the next log LEB */ 412 /* Switch to the next log LEB */
427 if (c->lhead_offs) { 413 if (c->lhead_offs) {
428 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 414 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
429 c->lhead_offs = 0; 415 c->lhead_offs = 0;
430 } 416 }
431 417
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
446 432
447 c->lhead_offs += len; 433 c->lhead_offs += len;
448 if (c->lhead_offs == c->leb_size) { 434 if (c->lhead_offs == c->leb_size) {
449 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 435 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
450 c->lhead_offs = 0; 436 c->lhead_offs = 0;
451 } 437 }
452 438
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
533 } 519 }
534 mutex_lock(&c->log_mutex); 520 mutex_lock(&c->log_mutex);
535 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; 521 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
536 lnum = next_log_lnum(c, lnum)) { 522 lnum = ubifs_next_log_lnum(c, lnum)) {
537 dbg_log("unmap log LEB %d", lnum); 523 dbg_log("unmap log LEB %d", lnum);
538 err = ubifs_leb_unmap(c, lnum); 524 err = ubifs_leb_unmap(c, lnum);
539 if (err) 525 if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
642 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); 628 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
643 if (err) 629 if (err)
644 return err; 630 return err;
645 *lnum = next_log_lnum(c, *lnum); 631 *lnum = ubifs_next_log_lnum(c, *lnum);
646 *offs = 0; 632 *offs = 0;
647 } 633 }
648 memcpy(buf + *offs, node, len); 634 memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
712 ubifs_scan_destroy(sleb); 698 ubifs_scan_destroy(sleb);
713 if (lnum == c->lhead_lnum) 699 if (lnum == c->lhead_lnum)
714 break; 700 break;
715 lnum = next_log_lnum(c, lnum); 701 lnum = ubifs_next_log_lnum(c, lnum);
716 } 702 }
717 if (offs) { 703 if (offs) {
718 int sz = ALIGN(offs, c->min_io_size); 704 int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
732 /* Unmap remaining LEBs */ 718 /* Unmap remaining LEBs */
733 lnum = write_lnum; 719 lnum = write_lnum;
734 do { 720 do {
735 lnum = next_log_lnum(c, lnum); 721 lnum = ubifs_next_log_lnum(c, lnum);
736 err = ubifs_leb_unmap(c, lnum); 722 err = ubifs_leb_unmap(c, lnum);
737 if (err) 723 if (err)
738 return err; 724 return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
1007} 1007}
1008 1008
1009/** 1009/**
1010 * struct scan_check_data - data provided to scan callback function.
1011 * @lst: LEB properties statistics
1012 * @err: error code
1013 */
1014struct scan_check_data {
1015 struct ubifs_lp_stats lst;
1016 int err;
1017};
1018
1019/**
1020 * scan_check_cb - scan callback. 1010 * scan_check_cb - scan callback.
1021 * @c: the UBIFS file-system description object 1011 * @c: the UBIFS file-system description object
1022 * @lp: LEB properties to scan 1012 * @lp: LEB properties to scan
1023 * @in_tree: whether the LEB properties are in main memory 1013 * @in_tree: whether the LEB properties are in main memory
1024 * @data: information passed to and from the caller of the scan 1014 * @lst: lprops statistics to update
1025 * 1015 *
1026 * This function returns a code that indicates whether the scan should continue 1016 * This function returns a code that indicates whether the scan should continue
1027 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree 1017 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
1030 */ 1020 */
1031static int scan_check_cb(struct ubifs_info *c, 1021static int scan_check_cb(struct ubifs_info *c,
1032 const struct ubifs_lprops *lp, int in_tree, 1022 const struct ubifs_lprops *lp, int in_tree,
1033 struct scan_check_data *data) 1023 struct ubifs_lp_stats *lst)
1034{ 1024{
1035 struct ubifs_scan_leb *sleb; 1025 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1026 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; 1027 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL; 1028 void *buf = NULL;
1040 1029
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
1044 if (cat != (lp->flags & LPROPS_CAT_MASK)) { 1033 if (cat != (lp->flags & LPROPS_CAT_MASK)) {
1045 ubifs_err("bad LEB category %d expected %d", 1034 ubifs_err("bad LEB category %d expected %d",
1046 (lp->flags & LPROPS_CAT_MASK), cat); 1035 (lp->flags & LPROPS_CAT_MASK), cat);
1047 goto out; 1036 return -EINVAL;
1048 } 1037 }
1049 } 1038 }
1050 1039
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
1078 } 1067 }
1079 if (!found) { 1068 if (!found) {
1080 ubifs_err("bad LPT list (category %d)", cat); 1069 ubifs_err("bad LPT list (category %d)", cat);
1081 goto out; 1070 return -EINVAL;
1082 } 1071 }
1083 } 1072 }
1084 } 1073 }
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
1090 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || 1079 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
1091 lp != heap->arr[lp->hpos]) { 1080 lp != heap->arr[lp->hpos]) {
1092 ubifs_err("bad LPT heap (category %d)", cat); 1081 ubifs_err("bad LPT heap (category %d)", cat);
1093 goto out; 1082 return -EINVAL;
1094 } 1083 }
1095 } 1084 }
1096 1085
1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 1086 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) { 1087 if (!buf)
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum); 1088 return -ENOMEM;
1100 goto out; 1089
1090 /*
1091 * After an unclean unmount, empty and freeable LEBs
1092 * may contain garbage - do not scan them.
1093 */
1094 if (lp->free == c->leb_size) {
1095 lst->empty_lebs += 1;
1096 lst->total_free += c->leb_size;
1097 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1098 return LPT_SCAN_CONTINUE;
1099 }
1100 if (lp->free + lp->dirty == c->leb_size &&
1101 !(lp->flags & LPROPS_INDEX)) {
1102 lst->total_free += lp->free;
1103 lst->total_dirty += lp->dirty;
1104 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1105 return LPT_SCAN_CONTINUE;
1101 } 1106 }
1102 1107
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0); 1108 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1104 if (IS_ERR(sleb)) { 1109 if (IS_ERR(sleb)) {
1105 /* 1110 ret = PTR_ERR(sleb);
1106 * After an unclean unmount, empty and freeable LEBs 1111 if (ret == -EUCLEAN) {
1107 * may contain garbage. 1112 dbg_dump_lprops(c);
1108 */ 1113 dbg_dump_budg(c, &c->bi);
1109 if (lp->free == c->leb_size) {
1110 ubifs_err("scan errors were in empty LEB "
1111 "- continuing checking");
1112 lst->empty_lebs += 1;
1113 lst->total_free += c->leb_size;
1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1117 }
1118
1119 if (lp->free + lp->dirty == c->leb_size &&
1120 !(lp->flags & LPROPS_INDEX)) {
1121 ubifs_err("scan errors were in freeable LEB "
1122 "- continuing checking");
1123 lst->total_free += lp->free;
1124 lst->total_dirty += lp->dirty;
1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1128 } 1114 }
1129 data->err = PTR_ERR(sleb); 1115 goto out;
1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1132 } 1116 }
1133 1117
1134 is_idx = -1; 1118 is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
1246 } 1230 }
1247 1231
1248 ubifs_scan_destroy(sleb); 1232 ubifs_scan_destroy(sleb);
1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf); 1233 vfree(buf);
1252 return ret; 1234 return LPT_SCAN_CONTINUE;
1253 1235
1254out_print: 1236out_print:
1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1237 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
1258 dbg_dump_leb(c, lnum); 1240 dbg_dump_leb(c, lnum);
1259out_destroy: 1241out_destroy:
1260 ubifs_scan_destroy(sleb); 1242 ubifs_scan_destroy(sleb);
1243 ret = -EINVAL;
1261out: 1244out:
1262 vfree(buf); 1245 vfree(buf);
1263 data->err = -EINVAL; 1246 return ret;
1264 return LPT_SCAN_STOP;
1265} 1247}
1266 1248
1267/** 1249/**
@@ -1278,8 +1260,7 @@ out:
1278int dbg_check_lprops(struct ubifs_info *c) 1260int dbg_check_lprops(struct ubifs_info *c)
1279{ 1261{
1280 int i, err; 1262 int i, err;
1281 struct scan_check_data data; 1263 struct ubifs_lp_stats lst;
1282 struct ubifs_lp_stats *lst = &data.lst;
1283 1264
1284 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1265 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1285 return 0; 1266 return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
1294 return err; 1275 return err;
1295 } 1276 }
1296 1277
1297 memset(lst, 0, sizeof(struct ubifs_lp_stats)); 1278 memset(&lst, 0, sizeof(struct ubifs_lp_stats));
1298
1299 data.err = 0;
1300 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, 1279 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
1301 (ubifs_lpt_scan_callback)scan_check_cb, 1280 (ubifs_lpt_scan_callback)scan_check_cb,
1302 &data); 1281 &lst);
1303 if (err && err != -ENOSPC) 1282 if (err && err != -ENOSPC)
1304 goto out; 1283 goto out;
1305 if (data.err) {
1306 err = data.err;
1307 goto out;
1308 }
1309 1284
1310 if (lst->empty_lebs != c->lst.empty_lebs || 1285 if (lst.empty_lebs != c->lst.empty_lebs ||
1311 lst->idx_lebs != c->lst.idx_lebs || 1286 lst.idx_lebs != c->lst.idx_lebs ||
1312 lst->total_free != c->lst.total_free || 1287 lst.total_free != c->lst.total_free ||
1313 lst->total_dirty != c->lst.total_dirty || 1288 lst.total_dirty != c->lst.total_dirty ||
1314 lst->total_used != c->lst.total_used) { 1289 lst.total_used != c->lst.total_used) {
1315 ubifs_err("bad overall accounting"); 1290 ubifs_err("bad overall accounting");
1316 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " 1291 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
1317 "total_free %lld, total_dirty %lld, total_used %lld", 1292 "total_free %lld, total_dirty %lld, total_used %lld",
1318 lst->empty_lebs, lst->idx_lebs, lst->total_free, 1293 lst.empty_lebs, lst.idx_lebs, lst.total_free,
1319 lst->total_dirty, lst->total_used); 1294 lst.total_dirty, lst.total_used);
1320 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " 1295 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
1321 "total_free %lld, total_dirty %lld, total_used %lld", 1296 "total_free %lld, total_dirty %lld, total_used %lld",
1322 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, 1297 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
1325 goto out; 1300 goto out;
1326 } 1301 }
1327 1302
1328 if (lst->total_dead != c->lst.total_dead || 1303 if (lst.total_dead != c->lst.total_dead ||
1329 lst->total_dark != c->lst.total_dark) { 1304 lst.total_dark != c->lst.total_dark) {
1330 ubifs_err("bad dead/dark space accounting"); 1305 ubifs_err("bad dead/dark space accounting");
1331 ubifs_err("calculated: total_dead %lld, total_dark %lld", 1306 ubifs_err("calculated: total_dead %lld, total_dark %lld",
1332 lst->total_dead, lst->total_dark); 1307 lst.total_dead, lst.total_dark);
1333 ubifs_err("read from lprops: total_dead %lld, total_dark %lld", 1308 ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
1334 c->lst.total_dead, c->lst.total_dark); 1309 c->lst.total_dead, c->lst.total_dark);
1335 err = -EINVAL; 1310 err = -EINVAL;
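
For orientation, the rewritten scan_check_cb() above now follows the plain ubifs_lpt_scan_callback contract: statistics are accumulated directly into the ubifs_lp_stats the caller passes, LPT_SCAN_CONTINUE keeps the scan going, and a negative errno aborts it. A hedged sketch of that contract, using a hypothetical callback:

static int count_empty_cb(struct ubifs_info *c, const struct ubifs_lprops *lp,
			  int in_tree, struct ubifs_lp_stats *lst)
{
	if (lp->free == c->leb_size)
		lst->empty_lebs += 1;	/* accumulate into the caller's stats */
	return LPT_SCAN_CONTINUE;	/* or a negative errno to stop the scan */
}

/* Caller side, mirroring dbg_check_lprops() above: */
memset(&lst, 0, sizeof(lst));
err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
			    (ubifs_lpt_scan_callback)count_empty_cb, &lst);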
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include "ubifs.h" 30#include "ubifs.h"
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG
33static int dbg_populate_lsave(struct ubifs_info *c);
34#else
35#define dbg_populate_lsave(c) 0
36#endif
37
32/** 38/**
33 * first_dirty_cnode - find first dirty cnode. 39 * first_dirty_cnode - find first dirty cnode.
34 * @c: UBIFS file-system description object 40 * @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
586 if (nnode->nbranch[iip].lnum) 592 if (nnode->nbranch[iip].lnum)
587 break; 593 break;
588 } 594 }
589 } while (iip >= UBIFS_LPT_FANOUT); 595 } while (iip >= UBIFS_LPT_FANOUT);
590 596
591 /* Go right */ 597 /* Go right */
592 nnode = ubifs_get_nnode(c, nnode, iip); 598 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
815 c->lpt_drty_flgs |= LSAVE_DIRTY; 821 c->lpt_drty_flgs |= LSAVE_DIRTY;
816 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); 822 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
817 } 823 }
824
825 if (dbg_populate_lsave(c))
826 return;
827
818 list_for_each_entry(lprops, &c->empty_list, list) { 828 list_for_each_entry(lprops, &c->empty_list, list) {
819 c->lsave[cnt++] = lprops->lnum; 829 c->lsave[cnt++] = lprops->lnum;
820 if (cnt >= c->lsave_cnt) 830 if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1994 current->pid); 2004 current->pid);
1995} 2005}
1996 2006
2007/**
2008 * dbg_populate_lsave - debugging version of 'populate_lsave()'
2009 * @c: UBIFS file-system description object
2010 *
2011 * This is a debugging version of 'populate_lsave()' which populates lsave
2012 * with random LEBs instead of useful LEBs, which is good for test coverage.
2013 * Returns zero if lsave has not been populated (this debugging feature is
2014 * disabled) and non-zero if lsave has been populated.
2015 */
2016static int dbg_populate_lsave(struct ubifs_info *c)
2017{
2018 struct ubifs_lprops *lprops;
2019 struct ubifs_lpt_heap *heap;
2020 int i;
2021
2022 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2023 return 0;
2024 if (random32() & 3)
2025 return 0;
2026
2027 for (i = 0; i < c->lsave_cnt; i++)
2028 c->lsave[i] = c->main_first;
2029
2030 list_for_each_entry(lprops, &c->empty_list, list)
2031 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2032 list_for_each_entry(lprops, &c->freeable_list, list)
2033 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2034 list_for_each_entry(lprops, &c->frdi_idx_list, list)
2035 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2036
2037 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
2038 for (i = 0; i < heap->cnt; i++)
2039 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2040 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
2041 for (i = 0; i < heap->cnt; i++)
2042 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2043 heap = &c->lpt_heap[LPROPS_FREE - 1];
2044 for (i = 0; i < heap->cnt; i++)
2045 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2046
2047 return 1;
2048}
2049
1997#endif /* CONFIG_UBIFS_FS_DEBUG */ 2050#endif /* CONFIG_UBIFS_FS_DEBUG */
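
The forward declaration plus '#define dbg_populate_lsave(c) 0' at the top of this file is the usual compile-out pattern for debug hooks: in non-debug builds the macro constant-folds to 0 and the hook costs nothing, while debug builds call the real function, which here fires on roughly a quarter of commits (random32() & 3 is zero with probability 1/4) when UBIFS_CHK_GEN is set. A minimal sketch of the pattern, with hypothetical names:

#ifdef CONFIG_UBIFS_FS_DEBUG
static int dbg_hook(struct ubifs_info *c);	/* real implementation elsewhere */
#else
#define dbg_hook(c) 0				/* constant-folds away */
#endif

static void fast_path(struct ubifs_info *c)
{
	if (dbg_hook(c))	/* dead code when debugging is compiled out */
		return;
	/* ... normal population logic ... */
}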
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
148 } 148 }
149 149
150 main_sz = (long long)c->main_lebs * c->leb_size; 150 main_sz = (long long)c->main_lebs * c->leb_size;
151 if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { 151 if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
152 err = 9; 152 err = 9;
153 goto out; 153 goto out;
154 } 154 }
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
218 } 218 }
219 219
220 if (c->lst.total_dead + c->lst.total_dark + 220 if (c->lst.total_dead + c->lst.total_dark +
221 c->lst.total_used + c->old_idx_sz > main_sz) { 221 c->lst.total_used + c->bi.old_idx_sz > main_sz) {
222 err = 21; 222 err = 21;
223 goto out; 223 goto out;
224 } 224 }
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); 286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); 287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); 288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
289 c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); 289 c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size);
290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); 290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); 291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); 292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); 305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); 306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
307 307
308 c->calc_idx_sz = c->old_idx_sz; 308 c->calc_idx_sz = c->bi.old_idx_sz;
309 309
310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) 310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
311 c->no_orphs = 1; 311 c->no_orphs = 1;
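
As a side note on the renamed fields above: validate_master() rejects an index size that is not a multiple of 8 (presumably because on-flash UBIFS nodes are 8-byte aligned) or that does not fit in the main area. A hypothetical helper restating the err-9 test from the first hunk:

static int index_size_ok(const struct ubifs_info *c)
{
	long long main_sz = (long long)c->main_lebs * c->leb_size;

	/* 'sz & 7' is the usual bit-trick for "not a multiple of 8" */
	return !(c->bi.old_idx_sz & 7) && c->bi.old_idx_sz < main_sz;
}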
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
340 mutex_unlock(&c->lp_mutex); 340 mutex_unlock(&c->lp_mutex);
341} 341}
342 342
343/**
344 * ubifs_next_log_lnum - switch to the next log LEB.
345 * @c: UBIFS file-system description object
346 * @lnum: current log LEB
347 *
348 * This helper function returns the log LEB number which comes after LEB
349 * 'lnum'.
350 */
351static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
352{
353 lnum += 1;
354 if (lnum > c->log_last)
355 lnum = UBIFS_LOG_LNUM;
356
357 return lnum;
358}
359
343#endif /* __UBIFS_MISC_H__ */ 360#endif /* __UBIFS_MISC_H__ */
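
A usage sketch for the helper above, mirroring the unmap loop in ubifs_log_post_commit() from the log.c hunk earlier: the log LEBs form a ring, so walking from the old tail to the current tail wraps from c->log_last back to UBIFS_LOG_LNUM automatically (illustrative, error handling trimmed):

int lnum, err;

for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
     lnum = ubifs_next_log_lnum(c, lnum)) {
	/* e.g. unmap the log LEB that is no longer needed */
	err = ubifs_leb_unmap(c, lnum);
	if (err)
		break;
}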
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 if (PTR_ERR(sleb) == -EUCLEAN) 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 676 sleb = ubifs_recover_leb(c, lnum, 0,
677 c->sbuf, 0);
677 if (IS_ERR(sleb)) { 678 if (IS_ERR(sleb)) {
678 err = PTR_ERR(sleb); 679 err = PTR_ERR(sleb);
679 break; 680 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
564} 564}
565 565
566/** 566/**
567 * drop_incomplete_group - drop nodes from an incomplete group. 567 * drop_last_node - drop the last node or group of nodes.
568 * @sleb: scanned LEB information 568 * @sleb: scanned LEB information
569 * @offs: offset of dropped nodes is returned here 569 * @offs: offset of dropped nodes is returned here
570 * @grouped: non-zero if whole group of nodes have to be dropped
570 * 571 *
571 * This function returns %1 if nodes are dropped and %0 otherwise. 572 * This is a helper function for 'ubifs_recover_leb()' which drops the last
573 * node of the scanned LEB or the last group of nodes if @grouped is not zero.
574 * This function returns %1 if a node was dropped and %0 otherwise.
572 */ 575 */
573static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) 576static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
574{ 577{
575 int dropped = 0; 578 int dropped = 0;
576 579
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
589 kfree(snod); 592 kfree(snod);
590 sleb->nodes_cnt -= 1; 593 sleb->nodes_cnt -= 1;
591 dropped = 1; 594 dropped = 1;
595 if (!grouped)
596 break;
592 } 597 }
593 return dropped; 598 return dropped;
594} 599}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
609struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 614struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
610 int offs, void *sbuf, int grouped) 615 int offs, void *sbuf, int grouped)
611{ 616{
612 int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; 617 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
613 int empty_chkd = 0, start = offs;
614 struct ubifs_scan_leb *sleb; 618 struct ubifs_scan_leb *sleb;
615 void *buf = sbuf + offs; 619 void *buf = sbuf + offs;
616 620
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
620 if (IS_ERR(sleb)) 624 if (IS_ERR(sleb))
621 return sleb; 625 return sleb;
622 626
623 if (sleb->ecc) 627 ubifs_assert(len >= 8);
624 need_clean = 1;
625
626 while (len >= 8) { 628 while (len >= 8) {
627 int ret;
628
629 dbg_scan("look at LEB %d:%d (%d bytes left)", 629 dbg_scan("look at LEB %d:%d (%d bytes left)",
630 lnum, offs, len); 630 lnum, offs, len);
631 631
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
635 * Scan quietly until there is an error from which we cannot 635 * Scan quietly until there is an error from which we cannot
636 * recover 636 * recover
637 */ 637 */
638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); 638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
639
640 if (ret == SCANNED_A_NODE) { 639 if (ret == SCANNED_A_NODE) {
641 /* A valid node, and not a padding node */ 640 /* A valid node, and not a padding node */
642 struct ubifs_ch *ch = buf; 641 struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
649 offs += node_len; 648 offs += node_len;
650 buf += node_len; 649 buf += node_len;
651 len -= node_len; 650 len -= node_len;
652 continue; 651 } else if (ret > 0) {
653 }
654
655 if (ret > 0) {
656 /* Padding bytes or a valid padding node */ 652 /* Padding bytes or a valid padding node */
657 offs += ret; 653 offs += ret;
658 buf += ret; 654 buf += ret;
659 len -= ret; 655 len -= ret;
660 continue; 656 } else if (ret == SCANNED_EMPTY_SPACE ||
661 } 657 ret == SCANNED_GARBAGE ||
662 658 ret == SCANNED_A_BAD_PAD_NODE ||
663 if (ret == SCANNED_EMPTY_SPACE) { 659 ret == SCANNED_A_CORRUPT_NODE) {
664 if (!is_empty(buf, len)) { 660 dbg_rcvry("found corruption - %d", ret);
665 if (!is_last_write(c, buf, offs))
666 break;
667 clean_buf(c, &buf, lnum, &offs, &len);
668 need_clean = 1;
669 }
670 empty_chkd = 1;
671 break; 661 break;
672 } 662 } else {
673 663 dbg_err("unexpected return value %d", ret);
674 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
675 if (is_last_write(c, buf, offs)) {
676 clean_buf(c, &buf, lnum, &offs, &len);
677 need_clean = 1;
678 empty_chkd = 1;
679 break;
680 }
681
682 if (ret == SCANNED_A_CORRUPT_NODE)
683 if (no_more_nodes(c, buf, len, lnum, offs)) {
684 clean_buf(c, &buf, lnum, &offs, &len);
685 need_clean = 1;
686 empty_chkd = 1;
687 break;
688 }
689
690 if (quiet) {
691 /* Redo the last scan but noisily */
692 quiet = 0;
693 continue;
694 }
695
696 switch (ret) {
697 case SCANNED_GARBAGE:
698 dbg_err("garbage");
699 goto corrupted;
700 case SCANNED_A_CORRUPT_NODE:
701 case SCANNED_A_BAD_PAD_NODE:
702 dbg_err("bad node");
703 goto corrupted;
704 default:
705 dbg_err("unknown");
706 err = -EINVAL; 664 err = -EINVAL;
707 goto error; 665 goto error;
708 } 666 }
709 } 667 }
710 668
711 if (!empty_chkd && !is_empty(buf, len)) { 669 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
712 if (is_last_write(c, buf, offs)) { 670 if (!is_last_write(c, buf, offs))
713 clean_buf(c, &buf, lnum, &offs, &len); 671 goto corrupted_rescan;
714 need_clean = 1; 672 } else if (ret == SCANNED_A_CORRUPT_NODE) {
715 } else { 673 if (!no_more_nodes(c, buf, len, lnum, offs))
674 goto corrupted_rescan;
675 } else if (!is_empty(buf, len)) {
676 if (!is_last_write(c, buf, offs)) {
716 int corruption = first_non_ff(buf, len); 677 int corruption = first_non_ff(buf, len);
717 678
718 /* 679 /*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
728 } 689 }
729 } 690 }
730 691
731 /* Drop nodes from incomplete group */ 692 min_io_unit = round_down(offs, c->min_io_size);
732 if (grouped && drop_incomplete_group(sleb, &offs)) { 693 if (grouped)
733 buf = sbuf + offs; 694 /*
734 len = c->leb_size - offs; 695 * If nodes are grouped, always drop the incomplete group at
735 clean_buf(c, &buf, lnum, &offs, &len); 696 * the end.
736 need_clean = 1; 697 */
737 } 698 drop_last_node(sleb, &offs, 1);
738 699
739 if (offs % c->min_io_size) { 700 /*
740 clean_buf(c, &buf, lnum, &offs, &len); 701 * While we are in the middle of the same min. I/O unit, keep dropping
741 need_clean = 1; 702 * nodes. So basically, what we want is to make sure that the last min.
742 } 703 * I/O unit where we saw the corruption is dropped completely, together
704 * with any uncorrupted nodes which may possibly sit there.
705 *
706 * In other words, let's name the min. I/O unit where the corruption
707 * starts B, and the previous min. I/O unit A. The below code tries to
708 * deal with a situation where half of B contains valid nodes or the end
709 * of a valid node, and the second half of B contains corrupted data or
710 * garbage. This means that UBIFS had been writing to B just before the
711 * power cut happened. I do not know how realistic this scenario is,
712 * where half of the min. I/O unit has been written successfully and the
713 * other half not, but it is possible in our 'failure mode emulation'
714 * infrastructure at least.
715 *
716 * So what is the problem, and why do we need to drop those nodes? Why
717 * can't we just clean up the second half of B by putting a padding node
718 * there? We can, and this works fine with one exception which was
719 * reproduced with power cut emulation testing and happens extremely
720 * rarely. The description follows, but it is worth noting that this is
721 * only about the GC head, so we could do this trick only if the bud
722 * belongs to the GC head, but it does not seem to be worth an
723 * additional "if" statement.
724 *
725 * So, imagine the file-system is full, we run GC which is moving valid
726 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
727 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
728 * and will try to continue. Imagine that LEB X is currently the
729 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
730 * same as amount of free space in LEB X.
731 *
732 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
733 * are here trying to recover LEB Y which is the GC head LEB. We find
734 * the min. I/O unit B as described above. Then we clean-up LEB Y by
735 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
736 * fails, because it cannot find a dirty LEB which could be GC'd into
737 * LEB Y! Even LEB X does not match because the amount of valid nodes
738 * there does not fit the free space in LEB Y any more! And this is
739 * because of the padding node which we added to LEB Y. The
740 * user-visible effect of this which I once observed and analysed is
741 * that we cannot mount the file-system with -ENOSPC error.
742 *
743 * So obviously, to make sure that situation does not happen we should
744 * free min. I/O unit B in LEB Y completely and the last used min. I/O
745 * unit in LEB Y should be A. This is basically what the below code
746 * tries to do.
747 */
748 while (min_io_unit == round_down(offs, c->min_io_size) &&
749 min_io_unit != offs &&
750 drop_last_node(sleb, &offs, grouped));
751
752 buf = sbuf + offs;
753 len = c->leb_size - offs;
743 754
755 clean_buf(c, &buf, lnum, &offs, &len);
744 ubifs_end_scan(c, sleb, lnum, offs); 756 ubifs_end_scan(c, sleb, lnum, offs);
745 757
746 if (need_clean) { 758 err = fix_unclean_leb(c, sleb, start);
747 err = fix_unclean_leb(c, sleb, start); 759 if (err)
748 if (err) 760 goto error;
749 goto error;
750 }
751 761
752 return sleb; 762 return sleb;
753 763
764corrupted_rescan:
765 /* Re-scan the corrupted data with verbose messages */
766 dbg_err("corruptio %d", ret);
767 ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
754corrupted: 768corrupted:
755 ubifs_scanned_corruption(c, lnum, offs, buf); 769 ubifs_scanned_corruption(c, lnum, offs, buf);
756 err = -EUCLEAN; 770 err = -EUCLEAN;
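
A worked example of the drop loop in the hunk above, with illustrative numbers: assume c->min_io_size == 2048 and the corruption is first seen at offs == 5000, so min_io_unit = round_down(5000, 2048) = 4096. If the surviving nodes end at offsets 4800, 4300 and 4096, the loop drops the ones ending at 4800 and 4300 (both still round down to 4096), then stops when offs reaches exactly 4096: the min. I/O unit that contained the corruption is left completely unwritten, and the last used unit is the previous one (unit A in the comment's terms).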
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
1070} 1084}
1071 1085
1072/** 1086/**
1087 * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
1088 * @c: UBIFS file-system description object
1089 *
1090 * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
1091 * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
1092 * zero in case of success and a negative error code in case of failure.
1093 */
1094static int grab_empty_leb(struct ubifs_info *c)
1095{
1096 int lnum, err;
1097
1098 /*
1099 * Note, it is very important to first search for an empty LEB and then
1100 * run the commit, not vice-versa. The reason is that there might be
1101 * only one empty LEB at the moment, the one which has been the
1102 * @c->gc_lnum just before the power cut happened. During the regular
1103 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
1104 * one but GC can grab it. But at this moment this single empty LEB is
1105 * not marked as taken, so if we run commit - what happens? Right, the
1106 * commit will grab it and write the index there. Remember that the
1107 * index always expands as long as there is free space, and it only
1108 * starts consolidating when we run out of space.
1109 *
1110 * IOW, if we run commit now, we might not be able to find a free LEB
1111 * after this.
1112 */
1113 lnum = ubifs_find_free_leb_for_idx(c);
1114 if (lnum < 0) {
1115 dbg_err("could not find an empty LEB");
1116 dbg_dump_lprops(c);
1117 dbg_dump_budg(c, &c->bi);
1118 return lnum;
1119 }
1120
1121 /* Reset the index flag */
1122 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1123 LPROPS_INDEX, 0);
1124 if (err)
1125 return err;
1126
1127 c->gc_lnum = lnum;
1128 dbg_rcvry("found empty LEB %d, run commit", lnum);
1129
1130 return ubifs_run_commit(c);
1131}
1132
1133/**
1073 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. 1134 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1074 * @c: UBIFS file-system description object 1135 * @c: UBIFS file-system description object
1075 * 1136 *
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1091{ 1152{
1092 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 1153 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1093 struct ubifs_lprops lp; 1154 struct ubifs_lprops lp;
1094 int lnum, err; 1155 int err;
1156
1157 dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
1095 1158
1096 c->gc_lnum = -1; 1159 c->gc_lnum = -1;
1097 if (wbuf->lnum == -1) { 1160 if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
1098 dbg_rcvry("no GC head LEB"); 1161 return grab_empty_leb(c);
1099 goto find_free; 1162
1100 }
1101 /*
1102 * See whether the used space in the dirtiest LEB fits in the GC head
1103 * LEB.
1104 */
1105 if (wbuf->offs == c->leb_size) {
1106 dbg_rcvry("no room in GC head LEB");
1107 goto find_free;
1108 }
1109 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); 1163 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1110 if (err) { 1164 if (err) {
1111 /* 1165 if (err != -ENOSPC)
1112 * There are no dirty or empty LEBs subject to here being
1113 * enough for the index. Try to use
1114 * 'ubifs_find_free_leb_for_idx()', which will return any empty
1115 * LEBs (ignoring index requirements). If the index then
1116 * doesn't have enough LEBs the recovery commit will fail -
1117 * which is the same result anyway i.e. recovery fails. So
1118 * there is no problem ignoring index requirements and just
1119 * grabbing a free LEB since we have already established there
1120 * is not a dirty LEB we could have used instead.
1121 */
1122 if (err == -ENOSPC) {
1123 dbg_rcvry("could not find a dirty LEB");
1124 goto find_free;
1125 }
1126 return err;
1127 }
1128 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1129 lnum = lp.lnum;
1130 if (lp.free + lp.dirty == c->leb_size) {
1131 /* An empty LEB was returned */
1132 if (lp.free != c->leb_size) {
1133 err = ubifs_change_one_lp(c, lnum, c->leb_size,
1134 0, 0, 0, 0);
1135 if (err)
1136 return err;
1137 }
1138 err = ubifs_leb_unmap(c, lnum);
1139 if (err)
1140 return err; 1166 return err;
1141 c->gc_lnum = lnum; 1167
1142 dbg_rcvry("allocated LEB %d for GC", lnum); 1168 dbg_rcvry("could not find a dirty LEB");
1143 /* Run the commit */ 1169 return grab_empty_leb(c);
1144 dbg_rcvry("committing");
1145 return ubifs_run_commit(c);
1146 }
1147 /*
1148 * There was no empty LEB so the used space in the dirtiest LEB must fit
1149 * in the GC head LEB.
1150 */
1151 if (lp.free + lp.dirty < wbuf->offs) {
1152 dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
1153 lnum, wbuf->lnum, wbuf->offs);
1154 err = ubifs_return_leb(c, lnum);
1155 if (err)
1156 return err;
1157 goto find_free;
1158 } 1170 }
1171
1172 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1173 ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
1174
1159 /* 1175 /*
1160 * We run the commit before garbage collection otherwise subsequent 1176 * We run the commit before garbage collection otherwise subsequent
1161 * mounts will see the GC and orphan deletion in a different order. 1177 * mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1164 err = ubifs_run_commit(c); 1180 err = ubifs_run_commit(c);
1165 if (err) 1181 if (err)
1166 return err; 1182 return err;
1167 /* 1183
1168 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC 1184 dbg_rcvry("GC'ing LEB %d", lp.lnum);
1169 * - use locking to keep 'ubifs_assert()' happy.
1170 */
1171 dbg_rcvry("GC'ing LEB %d", lnum);
1172 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 1185 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1173 err = ubifs_garbage_collect_leb(c, &lp); 1186 err = ubifs_garbage_collect_leb(c, &lp);
1174 if (err >= 0) { 1187 if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1184 err = -EINVAL; 1197 err = -EINVAL;
1185 return err; 1198 return err;
1186 } 1199 }
1187 if (err != LEB_RETAINED) { 1200
1188 dbg_err("GC returned %d", err); 1201 ubifs_assert(err == LEB_RETAINED);
1202 if (err != LEB_RETAINED)
1189 return -EINVAL; 1203 return -EINVAL;
1190 } 1204
1191 err = ubifs_leb_unmap(c, c->gc_lnum); 1205 err = ubifs_leb_unmap(c, c->gc_lnum);
1192 if (err) 1206 if (err)
1193 return err; 1207 return err;
1194 dbg_rcvry("allocated LEB %d for GC", lnum);
1195 return 0;
1196 1208
1197find_free: 1209 dbg_rcvry("allocated LEB %d for GC", lp.lnum);
1198 /* 1210 return 0;
1199 * There is no GC head LEB or the free space in the GC head LEB is too
1200 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
1201 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
1202 */
1203 lnum = ubifs_find_free_leb_for_idx(c);
1204 if (lnum < 0) {
1205 dbg_err("could not find an empty LEB");
1206 return lnum;
1207 }
1208 /* And reset the index flag */
1209 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1210 LPROPS_INDEX, 0);
1211 if (err)
1212 return err;
1213 c->gc_lnum = lnum;
1214 dbg_rcvry("allocated LEB %d for GC", lnum);
1215 /* Run the commit */
1216 dbg_rcvry("committing");
1217 return ubifs_run_commit(c);
1218} 1211}
1219 1212
1220/** 1213/**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1456 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); 1449 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
1457 if (err) 1450 if (err)
1458 goto out; 1451 goto out;
1459 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", 1452 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
1460 (unsigned long)e->inum, lnum, offs, i_size, e->d_size); 1453 (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
1461 return 0; 1454 return 0;
1462 1455
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
1505 e->i_size = le64_to_cpu(ino->size); 1498 e->i_size = le64_to_cpu(ino->size);
1506 } 1499 }
1507 } 1500 }
1501
1508 if (e->exists && e->i_size < e->d_size) { 1502 if (e->exists && e->i_size < e->d_size) {
1509 if (!e->inode && c->ro_mount) { 1503 if (c->ro_mount) {
1510 /* Fix the inode size and pin it in memory */ 1504 /* Fix the inode size and pin it in memory */
1511 struct inode *inode; 1505 struct inode *inode;
1506 struct ubifs_inode *ui;
1507
1508 ubifs_assert(!e->inode);
1512 1509
1513 inode = ubifs_iget(c->vfs_sb, e->inum); 1510 inode = ubifs_iget(c->vfs_sb, e->inum);
1514 if (IS_ERR(inode)) 1511 if (IS_ERR(inode))
1515 return PTR_ERR(inode); 1512 return PTR_ERR(inode);
1513
1514 ui = ubifs_inode(inode);
1516 if (inode->i_size < e->d_size) { 1515 if (inode->i_size < e->d_size) {
1517 dbg_rcvry("ino %lu size %lld -> %lld", 1516 dbg_rcvry("ino %lu size %lld -> %lld",
1518 (unsigned long)e->inum, 1517 (unsigned long)e->inum,
1519 e->d_size, inode->i_size); 1518 inode->i_size, e->d_size);
1520 inode->i_size = e->d_size; 1519 inode->i_size = e->d_size;
1521 ubifs_inode(inode)->ui_size = e->d_size; 1520 ui->ui_size = e->d_size;
1521 ui->synced_i_size = e->d_size;
1522 e->inode = inode; 1522 e->inode = inode;
1523 this = rb_next(this); 1523 this = rb_next(this);
1524 continue; 1524 continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
1533 iput(e->inode); 1533 iput(e->inode);
1534 } 1534 }
1535 } 1535 }
1536
1536 this = rb_next(this); 1537 this = rb_next(this);
1537 rb_erase(&e->rb, &c->size_tree); 1538 rb_erase(&e->rb, &c->size_tree);
1538 kfree(e); 1539 kfree(e);
1539 } 1540 }
1541
1540 return 0; 1542 return 0;
1541} 1543}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
33 */ 33 */
34 34
35#include "ubifs.h" 35#include "ubifs.h"
36 36#include <linux/list_sort.h>
37/*
38 * Replay flags.
39 *
40 * REPLAY_DELETION: node was deleted
41 * REPLAY_REF: node is a reference node
42 */
43enum {
44 REPLAY_DELETION = 1,
45 REPLAY_REF = 2,
46};
47 37
48/** 38/**
49 * struct replay_entry - replay tree entry. 39 * struct replay_entry - replay list entry.
50 * @lnum: logical eraseblock number of the node 40 * @lnum: logical eraseblock number of the node
51 * @offs: node offset 41 * @offs: node offset
52 * @len: node length 42 * @len: node length
43 * @deletion: non-zero if this entry corresponds to a node deletion
53 * @sqnum: node sequence number 44 * @sqnum: node sequence number
54 * @flags: replay flags 45 * @list: links the replay list
55 * @rb: links the replay tree
56 * @key: node key 46 * @key: node key
57 * @nm: directory entry name 47 * @nm: directory entry name
58 * @old_size: truncation old size 48 * @old_size: truncation old size
59 * @new_size: truncation new size 49 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
62 * @jhead: journal head number of the bud
63 * 50 *
64 * UBIFS journal replay must compare node sequence numbers, which means it must 51 * The replay process first scans all buds and builds the replay list, then
65 * build a tree of node information to insert into the TNC. 52 * sorts the replay list in node sequence number order, and then inserts all
53 * the replay entries to the TNC.
66 */ 54 */
67struct replay_entry { 55struct replay_entry {
68 int lnum; 56 int lnum;
69 int offs; 57 int offs;
70 int len; 58 int len;
59 unsigned int deletion:1;
71 unsigned long long sqnum; 60 unsigned long long sqnum;
72 int flags; 61 struct list_head list;
73 struct rb_node rb;
74 union ubifs_key key; 62 union ubifs_key key;
75 union { 63 union {
76 struct qstr nm; 64 struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
78 loff_t old_size; 66 loff_t old_size;
79 loff_t new_size; 67 loff_t new_size;
80 }; 68 };
81 struct {
82 int free;
83 int dirty;
84 int jhead;
85 };
86 }; 69 };
87}; 70};
88 71
@@ -90,57 +73,64 @@ struct replay_entry {
90 * struct bud_entry - entry in the list of buds to replay. 73 * struct bud_entry - entry in the list of buds to replay.
91 * @list: next bud in the list 74 * @list: next bud in the list
92 * @bud: bud description object 75 * @bud: bud description object
93 * @free: free bytes in the bud
94 * @sqnum: reference node sequence number 76 * @sqnum: reference node sequence number
77 * @free: free bytes in the bud
78 * @dirty: dirty bytes in the bud
95 */ 79 */
96struct bud_entry { 80struct bud_entry {
97 struct list_head list; 81 struct list_head list;
98 struct ubifs_bud *bud; 82 struct ubifs_bud *bud;
99 int free;
100 unsigned long long sqnum; 83 unsigned long long sqnum;
84 int free;
85 int dirty;
101}; 86};
102 87
103/** 88/**
104 * set_bud_lprops - set free and dirty space used by a bud. 89 * set_bud_lprops - set free and dirty space used by a bud.
105 * @c: UBIFS file-system description object 90 * @c: UBIFS file-system description object
106 * @r: replay entry of bud 91 * @b: bud entry which describes the bud
92 *
93 * This function makes sure the LEB properties of bud @b are set correctly
94 * after the replay. Returns zero in case of success and a negative error code
95 * in case of failure.
107 */ 96 */
108static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) 97static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
109{ 98{
110 const struct ubifs_lprops *lp; 99 const struct ubifs_lprops *lp;
111 int err = 0, dirty; 100 int err = 0, dirty;
112 101
113 ubifs_get_lprops(c); 102 ubifs_get_lprops(c);
114 103
115 lp = ubifs_lpt_lookup_dirty(c, r->lnum); 104 lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
116 if (IS_ERR(lp)) { 105 if (IS_ERR(lp)) {
117 err = PTR_ERR(lp); 106 err = PTR_ERR(lp);
118 goto out; 107 goto out;
119 } 108 }
120 109
121 dirty = lp->dirty; 110 dirty = lp->dirty;
122 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { 111 if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
123 /* 112 /*
124 * The LEB was added to the journal with a starting offset of 113 * The LEB was added to the journal with a starting offset of
125 * zero which means the LEB must have been empty. The LEB 114 * zero which means the LEB must have been empty. The LEB
126 * property values should be lp->free == c->leb_size and 115 * property values should be @lp->free == @c->leb_size and
127 * lp->dirty == 0, but that is not the case. The reason is that 116 * @lp->dirty == 0, but that is not the case. The reason is that
128 * the LEB was garbage collected. The garbage collector resets 117 * the LEB had been garbage collected before it became the bud,
129 the free and dirty space without recording it anywhere except 118 * and there was no commit in between. The garbage collector
130 * lprops, so if there is not a commit then lprops does not have 119 * resets the free and dirty space without recording it
131 * that information next time the file system is mounted. 120 * anywhere except lprops, so if there was no commit then
121 * lprops does not have that information.
132 * 122 *
133 * We do not need to adjust free space because the scan has told 123 * We do not need to adjust free space because the scan has told
134 * us the exact value which is recorded in the replay entry as 124 * us the exact value which is recorded in the replay entry as
135 * r->free. 125 * @b->free.
136 * 126 *
137 * However we do need to subtract from the dirty space the 127 * However we do need to subtract from the dirty space the
138 * amount of space that the garbage collector reclaimed, which 128 * amount of space that the garbage collector reclaimed, which
139 * is the whole LEB minus the amount of space that was free. 129 * is the whole LEB minus the amount of space that was free.
140 */ 130 */
141 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 131 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
142 lp->free, lp->dirty); 132 lp->free, lp->dirty);
143 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 133 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
144 lp->free, lp->dirty); 134 lp->free, lp->dirty);
145 dirty -= c->leb_size - lp->free; 135 dirty -= c->leb_size - lp->free;
146 /* 136 /*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
152 */ 142 */
153 if (dirty != 0) 143 if (dirty != 0)
154 dbg_msg("LEB %d lp: %d free %d dirty " 144 dbg_msg("LEB %d lp: %d free %d dirty "
155 "replay: %d free %d dirty", r->lnum, lp->free, 145 "replay: %d free %d dirty", b->bud->lnum,
156 lp->dirty, r->free, r->dirty); 146 lp->free, lp->dirty, b->free, b->dirty);
157 } 147 }
158 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, 148 lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
159 lp->flags | LPROPS_TAKEN, 0); 149 lp->flags | LPROPS_TAKEN, 0);
160 if (IS_ERR(lp)) { 150 if (IS_ERR(lp)) {
161 err = PTR_ERR(lp); 151 err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
163 } 153 }
164 154
165 /* Make sure the journal head points to the latest bud */ 155 /* Make sure the journal head points to the latest bud */
166 err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, 156 err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
167 c->leb_size - r->free, UBI_SHORTTERM); 157 b->bud->lnum, c->leb_size - b->free,
158 UBI_SHORTTERM);
168 159
169out: 160out:
170 ubifs_release_lprops(c); 161 ubifs_release_lprops(c);
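
To make the adjustment in set_bud_lprops() above concrete, a worked example with illustrative numbers: say c->leb_size is 131072 bytes and lprops says the GC'd bud has lp->free == 102400. The garbage collector then reclaimed c->leb_size - lp->free == 28672 bytes without recording that anywhere but lprops, so replay subtracts those 28672 bytes from the dirty figure before installing the scanned b->free and b->dirty values via ubifs_change_lp().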
@@ -172,6 +163,27 @@ out:
172} 163}
173 164
174/** 165/**
166 * set_buds_lprops - set free and dirty space for all replayed buds.
167 * @c: UBIFS file-system description object
168 *
169 * This function sets LEB properties for all replayed buds. Returns zero in
170 * case of success and a negative error code in case of failure.
171 */
172static int set_buds_lprops(struct ubifs_info *c)
173{
174 struct bud_entry *b;
175 int err;
176
177 list_for_each_entry(b, &c->replay_buds, list) {
178 err = set_bud_lprops(c, b);
179 if (err)
180 return err;
181 }
182
183 return 0;
184}
185
186/**
175 * trun_remove_range - apply a replay entry for a truncation to the TNC. 187 * trun_remove_range - apply a replay entry for a truncation to the TNC.
176 * @c: UBIFS file-system description object 188 * @c: UBIFS file-system description object
177 * @r: replay entry of truncation 189 * @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
207 */ 219 */
208static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) 220static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
209{ 221{
210 int err, deletion = ((r->flags & REPLAY_DELETION) != 0); 222 int err;
211 223
212 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, 224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
213 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); 225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
214 226
215 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
216 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
217 229
218 if (r->flags & REPLAY_REF) 230 if (is_hash_key(c, &r->key)) {
219 err = set_bud_lprops(c, r); 231 if (r->deletion)
220 else if (is_hash_key(c, &r->key)) {
221 if (deletion)
222 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); 232 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
223 else 233 else
224 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, 234 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
225 r->len, &r->nm); 235 r->len, &r->nm);
226 } else { 236 } else {
227 if (deletion) 237 if (r->deletion)
228 switch (key_type(c, &r->key)) { 238 switch (key_type(c, &r->key)) {
229 case UBIFS_INO_KEY: 239 case UBIFS_INO_KEY:
230 { 240 {
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
247 return err; 257 return err;
248 258
249 if (c->need_recovery) 259 if (c->need_recovery)
250 err = ubifs_recover_size_accum(c, &r->key, deletion, 260 err = ubifs_recover_size_accum(c, &r->key, r->deletion,
251 r->new_size); 261 r->new_size);
252 } 262 }
253 263
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
255} 265}
256 266
257/** 267/**
258 * destroy_replay_tree - destroy the replay. 268 * replay_entries_cmp - compare 2 replay entries.
259 * @c: UBIFS file-system description object 269 * @priv: UBIFS file-system description object
270 * @a: first replay entry
271 * @b: second replay entry
260 * 272 *
261 * Destroy the replay tree. 273 * This is a comparios function for 'list_sort()' which compares 2 replay
274 * entries @a and @b by comparing their sequence numer. Returns %1 if @a has
275 * greater sequence number and %-1 otherwise.
262 */ 276 */
263static void destroy_replay_tree(struct ubifs_info *c) 277static int replay_entries_cmp(void *priv, struct list_head *a,
278 struct list_head *b)
264{ 279{
265 struct rb_node *this = c->replay_tree.rb_node; 280 struct replay_entry *ra, *rb;
266 struct replay_entry *r; 281
267 282 cond_resched();
268 while (this) { 283 if (a == b)
269 if (this->rb_left) { 284 return 0;
270 this = this->rb_left; 285
271 continue; 286 ra = list_entry(a, struct replay_entry, list);
272 } else if (this->rb_right) { 287 rb = list_entry(b, struct replay_entry, list);
273 this = this->rb_right; 288 ubifs_assert(ra->sqnum != rb->sqnum);
274 continue; 289 if (ra->sqnum > rb->sqnum)
275 } 290 return 1;
276 r = rb_entry(this, struct replay_entry, rb); 291 return -1;
277 this = rb_parent(this);
278 if (this) {
279 if (this->rb_left == &r->rb)
280 this->rb_left = NULL;
281 else
282 this->rb_right = NULL;
283 }
284 if (is_hash_key(c, &r->key))
285 kfree(r->nm.name);
286 kfree(r);
287 }
288 c->replay_tree = RB_ROOT;
289} 292}
290 293
291/** 294/**
292 * apply_replay_tree - apply the replay tree to the TNC. 295 * apply_replay_list - apply the replay list to the TNC.
293 * @c: UBIFS file-system description object 296 * @c: UBIFS file-system description object
294 * 297 *
295 * Apply the replay tree. 298 * Apply all entries in the replay list to the TNC. Returns zero in case of
296 * Returns zero in case of success and a negative error code in case of 299 * success and a negative error code in case of failure.
297 * failure.
298 */ 300 */
299static int apply_replay_tree(struct ubifs_info *c) 301static int apply_replay_list(struct ubifs_info *c)
300{ 302{
301 struct rb_node *this = rb_first(&c->replay_tree); 303 struct replay_entry *r;
304 int err;
302 305
303 while (this) { 306 list_sort(c, &c->replay_list, &replay_entries_cmp);
304 struct replay_entry *r;
305 int err;
306 307
308 list_for_each_entry(r, &c->replay_list, list) {
307 cond_resched(); 309 cond_resched();
308 310
309 r = rb_entry(this, struct replay_entry, rb);
310 err = apply_replay_entry(c, r); 311 err = apply_replay_entry(c, r);
311 if (err) 312 if (err)
312 return err; 313 return err;
313 this = rb_next(this);
314 } 314 }
315
315 return 0; 316 return 0;
316} 317}
317 318
318/** 319/**
319 * insert_node - insert a node to the replay tree. 320 * destroy_replay_list - destroy the replay.
321 * @c: UBIFS file-system description object
322 *
323 * Destroy the replay list.
324 */
325static void destroy_replay_list(struct ubifs_info *c)
326{
327 struct replay_entry *r, *tmp;
328
329 list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
330 if (is_hash_key(c, &r->key))
331 kfree(r->nm.name);
332 list_del(&r->list);
333 kfree(r);
334 }
335}
336
337/**
338 * insert_node - insert a node to the replay list
320 * @c: UBIFS file-system description object 339 * @c: UBIFS file-system description object
321 * @lnum: node logical eraseblock number 340 * @lnum: node logical eraseblock number
322 * @offs: node offset 341 * @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
328 * @old_size: truncation old size 347 * @old_size: truncation old size
329 * @new_size: truncation new size 348 * @new_size: truncation new size
330 * 349 *
331 * This function inserts a scanned non-direntry node to the replay tree. The 350 * This function inserts a scanned non-direntry node to the replay list. The
332 * replay tree is an RB-tree containing @struct replay_entry elements which are 351 * replay list contains @struct replay_entry elements, and we sort this list in
333 * indexed by the sequence number. The replay tree is applied at the very end 352 * sequence number order before applying it. The replay list is applied at the
334 * of the replay process. Since the tree is sorted in sequence number order, 353 * very end of the replay process. Since the list is sorted in sequence number
335 * the older modifications are applied first. This function returns zero in 354 * order, the older modifications are applied first. This function returns zero
336 * case of success and a negative error code in case of failure. 355 * in case of success and a negative error code in case of failure.
337 */ 356 */
338static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, 357static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
339 union ubifs_key *key, unsigned long long sqnum, 358 union ubifs_key *key, unsigned long long sqnum,
340 int deletion, int *used, loff_t old_size, 359 int deletion, int *used, loff_t old_size,
341 loff_t new_size) 360 loff_t new_size)
342{ 361{
343 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
344 struct replay_entry *r; 362 struct replay_entry *r;
345 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
365
346 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
347 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
348 368
349 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
350 while (*p) {
351 parent = *p;
352 r = rb_entry(parent, struct replay_entry, rb);
353 if (sqnum < r->sqnum) {
354 p = &(*p)->rb_left;
355 continue;
356 } else if (sqnum > r->sqnum) {
357 p = &(*p)->rb_right;
358 continue;
359 }
360 ubifs_err("duplicate sqnum in replay");
361 return -EINVAL;
362 }
363
364 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 369 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
365 if (!r) 370 if (!r)
366 return -ENOMEM; 371 return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
370 r->lnum = lnum; 375 r->lnum = lnum;
371 r->offs = offs; 376 r->offs = offs;
372 r->len = len; 377 r->len = len;
378 r->deletion = !!deletion;
373 r->sqnum = sqnum; 379 r->sqnum = sqnum;
374 r->flags = (deletion ? REPLAY_DELETION : 0); 380 key_copy(c, key, &r->key);
375 r->old_size = old_size; 381 r->old_size = old_size;
376 r->new_size = new_size; 382 r->new_size = new_size;
377 key_copy(c, key, &r->key);
378 383
379 rb_link_node(&r->rb, parent, p); 384 list_add_tail(&r->list, &c->replay_list);
380 rb_insert_color(&r->rb, &c->replay_tree);
381 return 0; 385 return 0;
382} 386}
383 387
384/** 388/**
385 * insert_dent - insert a directory entry node into the replay tree. 389 * insert_dent - insert a directory entry node into the replay list.
386 * @c: UBIFS file-system description object 390 * @c: UBIFS file-system description object
387 * @lnum: node logical eraseblock number 391 * @lnum: node logical eraseblock number
388 * @offs: node offset 392 * @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
394 * @deletion: non-zero if this is a deletion 398 * @deletion: non-zero if this is a deletion
395 * @used: number of bytes in use in a LEB 399 * @used: number of bytes in use in a LEB
396 * 400 *
397 * This function inserts a scanned directory entry node to the replay tree. 401 * This function inserts a scanned directory entry node or an extended
398 * Returns zero in case of success and a negative error code in case of 402 * attribute entry to the replay list. Returns zero in case of success and a
399 * failure. 403 * negative error code in case of failure.
400 *
401 * This function is also used for extended attribute entries because they are
402 * implemented as directory entry nodes.
403 */ 404 */
404static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, 405static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
405 union ubifs_key *key, const char *name, int nlen, 406 union ubifs_key *key, const char *name, int nlen,
406 unsigned long long sqnum, int deletion, int *used) 407 unsigned long long sqnum, int deletion, int *used)
407{ 408{
408 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
412 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
413 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
414 415
415 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
416 while (*p) {
417 parent = *p;
418 r = rb_entry(parent, struct replay_entry, rb);
419 if (sqnum < r->sqnum) {
420 p = &(*p)->rb_left;
421 continue;
422 }
423 if (sqnum > r->sqnum) {
424 p = &(*p)->rb_right;
425 continue;
426 }
427 ubifs_err("duplicate sqnum in replay");
428 return -EINVAL;
429 }
430
431 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 416 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
432 if (!r) 417 if (!r)
433 return -ENOMEM; 418 return -ENOMEM;
419
434 nbuf = kmalloc(nlen + 1, GFP_KERNEL); 420 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
435 if (!nbuf) { 421 if (!nbuf) {
436 kfree(r); 422 kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
442 r->lnum = lnum; 428 r->lnum = lnum;
443 r->offs = offs; 429 r->offs = offs;
444 r->len = len; 430 r->len = len;
431 r->deletion = !!deletion;
445 r->sqnum = sqnum; 432 r->sqnum = sqnum;
433 key_copy(c, key, &r->key);
446 r->nm.len = nlen; 434 r->nm.len = nlen;
447 memcpy(nbuf, name, nlen); 435 memcpy(nbuf, name, nlen);
448 nbuf[nlen] = '\0'; 436 nbuf[nlen] = '\0';
449 r->nm.name = nbuf; 437 r->nm.name = nbuf;
450 r->flags = (deletion ? REPLAY_DELETION : 0);
451 key_copy(c, key, &r->key);
452 438
453 ubifs_assert(!*p); 439 list_add_tail(&r->list, &c->replay_list);
454 rb_link_node(&r->rb, parent, p);
455 rb_insert_color(&r->rb, &c->replay_tree);
456 return 0; 440 return 0;
457} 441}
458 442
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
489} 473}
490 474
491/** 475/**
476 * is_last_bud - check if the bud is the last in the journal head.
477 * @c: UBIFS file-system description object
478 * @bud: bud description object
479 *
480 * This function checks if bud @bud is the last bud in its journal head. This
481 * information is then used by 'replay_bud()' to decide whether the bud can
482 * have corruptions or not. Indeed, only last buds can be corrupted by power
483 * cuts. Returns %1 if this is the last bud, and %0 if not.
484 */
485static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
486{
487 struct ubifs_jhead *jh = &c->jheads[bud->jhead];
488 struct ubifs_bud *next;
489 uint32_t data;
490 int err;
491
492 if (list_is_last(&bud->list, &jh->buds_list))
493 return 1;
494
495 /*
496 * The following is a quirk to make sure we work correctly with UBIFS
497 * images used with older UBIFS.
498 *
499 * Normally, the last bud will be the last in the journal head's list
 500 * of buds. However, there is one exception if the UBIFS image belongs
501 * to older UBIFS. This is fairly unlikely: one would need to use old
502 * UBIFS, then have a power cut exactly at the right point, and then
503 * try to mount this image with new UBIFS.
504 *
505 * The exception is: it is possible to have 2 buds A and B, A goes
506 * before B, and B is the last, bud B is contains no data, and bud A is
507 * corrupted at the end. The reason is that in older versions when the
508 * journal code switched the next bud (from A to B), it first added a
509 * log reference node for the new bud (B), and only after this it
510 * synchronized the write-buffer of current bud (A). But later this was
511 * changed and UBIFS started to always synchronize the write-buffer of
512 * the bud (A) before writing the log reference for the new bud (B).
513 *
514 * But because older UBIFS always synchronized A's write-buffer before
 515 * writing to B, we can recognize this exceptional situation by
516 * checking the contents of bud B - if it is empty, then A can be
517 * treated as the last and we can recover it.
518 *
519 * TODO: remove this piece of code in a couple of years (today it is
520 * 16.05.2011).
521 */
522 next = list_entry(bud->list.next, struct ubifs_bud, list);
523 if (!list_is_last(&next->list, &jh->buds_list))
524 return 0;
525
526 err = ubi_read(c->ubi, next->lnum, (char *)&data,
527 next->start, 4);
528 if (err)
529 return 0;
530
531 return data == 0xFFFFFFFF;
532}
533
534/**
492 * replay_bud - replay a bud logical eraseblock. 535 * replay_bud - replay a bud logical eraseblock.
493 * @c: UBIFS file-system description object 536 * @c: UBIFS file-system description object
494 * @lnum: bud logical eraseblock number to replay 537 * @b: bud entry which describes the bud
495 * @offs: bud start offset
496 * @jhead: journal head to which this bud belongs
497 * @free: amount of free space in the bud is returned here
498 * @dirty: amount of dirty space from padding and deletion nodes is returned
499 * here
500 * 538 *
501 * This function returns zero in case of success and a negative error code in 539 * This function replays bud @bud, recovers it if needed, and adds all nodes
502 * case of failure. 540 * from this bud to the replay list. Returns zero in case of success and a
541 * negative error code in case of failure.
503 */ 542 */
504static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, 543static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
505 int *free, int *dirty)
506{ 544{
507 int err = 0, used = 0; 545 int is_last = is_last_bud(c, b->bud);
546 int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
508 struct ubifs_scan_leb *sleb; 547 struct ubifs_scan_leb *sleb;
509 struct ubifs_scan_node *snod; 548 struct ubifs_scan_node *snod;
510 struct ubifs_bud *bud;
511 549
512 dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); 550 dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
513 if (c->need_recovery) 551 lnum, b->bud->jhead, offs, is_last);
514 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 552
553 if (c->need_recovery && is_last)
554 /*
555 * Recover only last LEBs in the journal heads, because power
556 * cuts may cause corruptions only in these LEBs, because only
557 * these LEBs could possibly be written to at the power cut
558 * time.
559 */
560 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
561 b->bud->jhead != GCHD);
515 else 562 else
516 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); 563 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
517 if (IS_ERR(sleb)) 564 if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 goto out; 674 goto out;
628 } 675 }
629 676
630 bud = ubifs_search_bud(c, lnum); 677 ubifs_assert(ubifs_search_bud(c, lnum));
631 if (!bud)
632 BUG();
633
634 ubifs_assert(sleb->endpt - offs >= used); 678 ubifs_assert(sleb->endpt - offs >= used);
635 ubifs_assert(sleb->endpt % c->min_io_size == 0); 679 ubifs_assert(sleb->endpt % c->min_io_size == 0);
636 680
637 *dirty = sleb->endpt - offs - used; 681 b->dirty = sleb->endpt - offs - used;
638 *free = c->leb_size - sleb->endpt; 682 b->free = c->leb_size - sleb->endpt;
 683 dbg_mnt("bud LEB %d replayed: dirty %d, free %d", lnum, b->dirty, b->free);
639 684
640out: 685out:
641 ubifs_scan_destroy(sleb); 686 ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
649} 694}
650 695
651/** 696/**
652 * insert_ref_node - insert a reference node to the replay tree.
653 * @c: UBIFS file-system description object
654 * @lnum: node logical eraseblock number
655 * @offs: node offset
656 * @sqnum: sequence number
657 * @free: amount of free space in bud
658 * @dirty: amount of dirty space from padding and deletion nodes
659 * @jhead: journal head number for the bud
660 *
661 * This function inserts a reference node to the replay tree and returns zero
662 * in case of success or a negative error code in case of failure.
663 */
664static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
665 unsigned long long sqnum, int free, int dirty,
666 int jhead)
667{
668 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
669 struct replay_entry *r;
670
671 dbg_mnt("add ref LEB %d:%d", lnum, offs);
672 while (*p) {
673 parent = *p;
674 r = rb_entry(parent, struct replay_entry, rb);
675 if (sqnum < r->sqnum) {
676 p = &(*p)->rb_left;
677 continue;
678 } else if (sqnum > r->sqnum) {
679 p = &(*p)->rb_right;
680 continue;
681 }
682 ubifs_err("duplicate sqnum in replay tree");
683 return -EINVAL;
684 }
685
686 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
687 if (!r)
688 return -ENOMEM;
689
690 r->lnum = lnum;
691 r->offs = offs;
692 r->sqnum = sqnum;
693 r->flags = REPLAY_REF;
694 r->free = free;
695 r->dirty = dirty;
696 r->jhead = jhead;
697
698 rb_link_node(&r->rb, parent, p);
699 rb_insert_color(&r->rb, &c->replay_tree);
700 return 0;
701}
702
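The per-bud free/dirty accounting that REPLAY_REF entries used to carry is now stored on the bud entries themselves (@b->free and @b->dirty, filled in by replay_bud() above) and applied by set_buds_lprops(), called from ubifs_replay_journal() below. Its body is not part of this hunk; a plausible sketch, with set_bud_lprops() as an assumed per-bud helper:

	static int set_buds_lprops(struct ubifs_info *c)
	{
		struct bud_entry *b;
		int err;

		/* Transfer the free/dirty space of each replayed bud to lprops */
		list_for_each_entry(b, &c->replay_buds, list) {
			err = set_bud_lprops(c, b); /* assumed helper */
			if (err)
				return err;
		}
		return 0;
	}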
703/**
704 * replay_buds - replay all buds. 697 * replay_buds - replay all buds.
705 * @c: UBIFS file-system description object 698 * @c: UBIFS file-system description object
706 * 699 *
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
710static int replay_buds(struct ubifs_info *c) 703static int replay_buds(struct ubifs_info *c)
711{ 704{
712 struct bud_entry *b; 705 struct bud_entry *b;
713 int err, uninitialized_var(free), uninitialized_var(dirty); 706 int err;
707 unsigned long long prev_sqnum = 0;
714 708
715 list_for_each_entry(b, &c->replay_buds, list) { 709 list_for_each_entry(b, &c->replay_buds, list) {
716 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, 710 err = replay_bud(c, b);
717 &free, &dirty);
718 if (err)
719 return err;
720 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
721 free, dirty, b->bud->jhead);
722 if (err) 711 if (err)
723 return err; 712 return err;
713
714 ubifs_assert(b->sqnum > prev_sqnum);
715 prev_sqnum = b->sqnum;
724 } 716 }
725 717
726 return 0; 718 return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
1060 if (err) 1052 if (err)
1061 goto out; 1053 goto out;
1062 1054
1063 err = apply_replay_tree(c); 1055 err = apply_replay_list(c);
1056 if (err)
1057 goto out;
1058
1059 err = set_buds_lprops(c);
1064 if (err) 1060 if (err)
1065 goto out; 1061 goto out;
1066 1062
1067 /* 1063 /*
1068 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable 1064 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
1069 * to roughly estimate index growth. Things like @c->min_idx_lebs 1065 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
1070 * depend on it. This means we have to initialize it to make sure 1066 * depend on it. This means we have to initialize it to make sure
1071 * budgeting works properly. 1067 * budgeting works properly.
1072 */ 1068 */
1073 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); 1069 c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1074 c->budg_uncommitted_idx *= c->max_idx_node_sz; 1070 c->bi.uncommitted_idx *= c->max_idx_node_sz;
1075 1071
1076 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1072 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1077 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1073 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1078 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1074 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1079 (unsigned long)c->highest_inum); 1075 (unsigned long)c->highest_inum);
1080out: 1076out:
1081 destroy_replay_tree(c); 1077 destroy_replay_list(c);
1082 destroy_bud_list(c); 1078 destroy_bud_list(c);
1083 c->replaying = 0; 1079 c->replaying = 0;
1084 return err; 1080 return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
475 * @c: UBIFS file-system description object 475 * @c: UBIFS file-system description object
476 * 476 *
477 * This function returns a pointer to the superblock node or a negative error 477 * This function returns a pointer to the superblock node or a negative error
 478 * code. 478 * code. Note, the user of this function is responsible for kfree()'ing the
479 * returned superblock buffer.
479 */ 480 */
480struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) 481struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
481{ 482{
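Since the callee returns a kmalloc()'ed buffer, every caller must free it. The pattern, as used by ubifs_fixup_free_space() later in this diff:

	struct ubifs_sb_node *sup;
	int err;

	sup = ubifs_read_sb_node(c);
	if (IS_ERR(sup))
		return PTR_ERR(sup);

	/* ... modify the superblock node ... */

	err = ubifs_write_sb_node(c, sup);
	kfree(sup);	/* the caller owns the returned buffer */
	return err;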
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
616 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 617 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
617 memcpy(&c->uuid, &sup->uuid, 16); 618 memcpy(&c->uuid, &sup->uuid, 16);
618 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 619 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
620 c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
619 621
620 /* Automatically increase file system size to the maximum size */ 622 /* Automatically increase file system size to the maximum size */
621 c->old_leb_cnt = c->leb_cnt; 623 c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
650 kfree(sup); 652 kfree(sup);
651 return err; 653 return err;
652} 654}
655
656/**
657 * fixup_leb - fixup/unmap an LEB containing free space.
658 * @c: UBIFS file-system description object
659 * @lnum: the LEB number to fix up
660 * @len: number of used bytes in LEB (starting at offset 0)
661 *
662 * This function reads the contents of the given LEB number @lnum, then fixes
 663 * it up, so that empty min. I/O units at the end of the LEB are actually erased on
664 * flash (rather than being just all-0xff real data). If the LEB is completely
665 * empty, it is simply unmapped.
666 */
667static int fixup_leb(struct ubifs_info *c, int lnum, int len)
668{
669 int err;
670
671 ubifs_assert(len >= 0);
672 ubifs_assert(len % c->min_io_size == 0);
673 ubifs_assert(len < c->leb_size);
674
675 if (len == 0) {
676 dbg_mnt("unmap empty LEB %d", lnum);
677 return ubi_leb_unmap(c->ubi, lnum);
678 }
679
680 dbg_mnt("fixup LEB %d, data len %d", lnum, len);
681 err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
682 if (err)
683 return err;
684
685 return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
686}
687
688/**
689 * fixup_free_space - find & remap all LEBs containing free space.
690 * @c: UBIFS file-system description object
691 *
 692 * This function walks through all LEBs in the filesystem and fixes up those
693 * containing free/empty space.
694 */
695static int fixup_free_space(struct ubifs_info *c)
696{
697 int lnum, err = 0;
698 struct ubifs_lprops *lprops;
699
700 ubifs_get_lprops(c);
701
702 /* Fixup LEBs in the master area */
703 for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
704 err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
705 if (err)
706 goto out;
707 }
708
709 /* Unmap unused log LEBs */
710 lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
711 while (lnum != c->ltail_lnum) {
712 err = fixup_leb(c, lnum, 0);
713 if (err)
714 goto out;
715 lnum = ubifs_next_log_lnum(c, lnum);
716 }
717
718 /* Fixup the current log head */
719 err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
720 if (err)
721 goto out;
722
723 /* Fixup LEBs in the LPT area */
724 for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
725 int free = c->ltab[lnum - c->lpt_first].free;
726
727 if (free > 0) {
728 err = fixup_leb(c, lnum, c->leb_size - free);
729 if (err)
730 goto out;
731 }
732 }
733
734 /* Unmap LEBs in the orphans area */
735 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
736 err = fixup_leb(c, lnum, 0);
737 if (err)
738 goto out;
739 }
740
741 /* Fixup LEBs in the main area */
742 for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
743 lprops = ubifs_lpt_lookup(c, lnum);
744 if (IS_ERR(lprops)) {
745 err = PTR_ERR(lprops);
746 goto out;
747 }
748
749 if (lprops->free > 0) {
750 err = fixup_leb(c, lnum, c->leb_size - lprops->free);
751 if (err)
752 goto out;
753 }
754 }
755
756out:
757 ubifs_release_lprops(c);
758 return err;
759}
760
761/**
762 * ubifs_fixup_free_space - find & fix all LEBs with free space.
763 * @c: UBIFS file-system description object
764 *
765 * This function fixes up LEBs containing free space on first mount, if the
766 * appropriate flag was set when the FS was created. Each LEB with one or more
 767 * empty min. I/O units (i.e. free-space-count > 0) is re-written, to make sure
768 * the free space is actually erased. E.g., this is necessary for some NAND
769 * chips, since the free space may have been programmed like real "0xff" data
770 * (generating a non-0xff ECC), causing future writes to the not-really-erased
771 * NAND pages to behave badly. After the space is fixed up, the superblock flag
772 * is cleared, so that this is skipped for all future mounts.
773 */
774int ubifs_fixup_free_space(struct ubifs_info *c)
775{
776 int err;
777 struct ubifs_sb_node *sup;
778
779 ubifs_assert(c->space_fixup);
780 ubifs_assert(!c->ro_mount);
781
782 ubifs_msg("start fixing up free space");
783
784 err = fixup_free_space(c);
785 if (err)
786 return err;
787
788 sup = ubifs_read_sb_node(c);
789 if (IS_ERR(sup))
790 return PTR_ERR(sup);
791
792 /* Free-space fixup is no longer required */
793 c->space_fixup = 0;
794 sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
795
796 err = ubifs_write_sb_node(c, sup);
797 kfree(sup);
798 if (err)
799 return err;
800
801 ubifs_msg("free space fixup complete");
802 return err;
803}
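The fixup is requested at format time: the formatter only needs to set the flag in the superblock node it writes, and the first R/W mount clears it again as shown above. A sketch of the format-time side; how mkfs.ubifs exposes this option is outside this patch:

	/* At format time: request a one-time free-space fixup on first mount */
	sup->flags |= cpu_to_le32(UBIFS_FLG_SPACE_FIXUP);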
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
375 ubifs_release_dirty_inode_budget(c, ui); 375 ubifs_release_dirty_inode_budget(c, ui);
376 else { 376 else {
377 /* We've deleted something - clean the "no space" flags */ 377 /* We've deleted something - clean the "no space" flags */
378 c->nospace = c->nospace_rp = 0; 378 c->bi.nospace = c->bi.nospace_rp = 0;
379 smp_wmb(); 379 smp_wmb();
380 } 380 }
381done: 381done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
694 * be compressed and direntries are of the maximum size. 694 * be compressed and direntries are of the maximum size.
695 * 695 *
696 * Note, data, which may be stored in inodes is budgeted separately, so 696 * Note, data, which may be stored in inodes is budgeted separately, so
697 * it is not included into 'c->inode_budget'. 697 * it is not included into 'c->bi.inode_budget'.
698 */ 698 */
699 c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; 699 c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
700 c->inode_budget = UBIFS_INO_NODE_SZ; 700 c->bi.inode_budget = UBIFS_INO_NODE_SZ;
701 c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; 701 c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
702 702
703 /* 703 /*
704 * When the amount of flash space used by buds becomes 704 * When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
742{ 742{
743 long long tmp64; 743 long long tmp64;
744 744
745 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 745 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
746 c->report_rp_size = ubifs_reported_space(c, c->rp_size); 746 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
747 747
748 /* 748 /*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
1144{ 1144{
1145 ubifs_assert(c->dark_wm > 0); 1145 ubifs_assert(c->dark_wm > 0);
1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { 1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1147 ubifs_err("insufficient free space to mount in read/write mode"); 1147 ubifs_err("insufficient free space to mount in R/W mode");
1148 dbg_dump_budg(c); 1148 dbg_dump_budg(c, &c->bi);
1149 dbg_dump_lprops(c); 1149 dbg_dump_lprops(c);
1150 return -ENOSPC; 1150 return -ENOSPC;
1151 } 1151 }
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
1304 if (err) 1304 if (err)
1305 goto out_lpt; 1305 goto out_lpt;
1306 1306
1307 err = dbg_check_idx_size(c, c->old_idx_sz); 1307 err = dbg_check_idx_size(c, c->bi.old_idx_sz);
1308 if (err) 1308 if (err)
1309 goto out_lpt; 1309 goto out_lpt;
1310 1310
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
1313 goto out_journal; 1313 goto out_journal;
1314 1314
1315 /* Calculate 'min_idx_lebs' after journal replay */ 1315 /* Calculate 'min_idx_lebs' after journal replay */
1316 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1316 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1317 1317
1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); 1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1319 if (err) 1319 if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
1396 } else 1396 } else
1397 ubifs_assert(c->lst.taken_empty_lebs > 0); 1397 ubifs_assert(c->lst.taken_empty_lebs > 0);
1398 1398
1399 if (!c->ro_mount && c->space_fixup) {
1400 err = ubifs_fixup_free_space(c);
1401 if (err)
1402 goto out_infos;
1403 }
1404
1399 err = dbg_check_filesystem(c); 1405 err = dbg_check_filesystem(c);
1400 if (err) 1406 if (err)
1401 goto out_infos; 1407 goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
1442 c->main_lebs, c->main_first, c->leb_cnt - 1); 1448 c->main_lebs, c->main_first, c->leb_cnt - 1);
1443 dbg_msg("index LEBs: %d", c->lst.idx_lebs); 1449 dbg_msg("index LEBs: %d", c->lst.idx_lebs);
1444 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", 1450 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
1445 c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); 1451 c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
1452 c->bi.old_idx_sz >> 20);
1446 dbg_msg("key hash type: %d", c->key_hash_type); 1453 dbg_msg("key hash type: %d", c->key_hash_type);
1447 dbg_msg("tree fanout: %d", c->fanout); 1454 dbg_msg("tree fanout: %d", c->fanout);
1448 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1455 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1463 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1464 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", 1465 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1466 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); 1467 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1461 dbg_msg("dead watermark: %d", c->dead_wm); 1468 dbg_msg("dead watermark: %d", c->dead_wm);
1462 dbg_msg("dark watermark: %d", c->dark_wm); 1469 dbg_msg("dark watermark: %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1584 } 1591 }
1585 sup->leb_cnt = cpu_to_le32(c->leb_cnt); 1592 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
1586 err = ubifs_write_sb_node(c, sup); 1593 err = ubifs_write_sb_node(c, sup);
1594 kfree(sup);
1587 if (err) 1595 if (err)
1588 goto out; 1596 goto out;
1589 } 1597 }
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1684 */ 1692 */
1685 err = dbg_check_space_info(c); 1693 err = dbg_check_space_info(c);
1686 } 1694 }
1695
1696 if (c->space_fixup) {
1697 err = ubifs_fixup_free_space(c);
1698 if (err)
1699 goto out;
1700 }
1701
1687 mutex_unlock(&c->umount_mutex); 1702 mutex_unlock(&c->umount_mutex);
1688 return err; 1703 return err;
1689 1704
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
1766 * to write them back because of I/O errors. 1781 * to write them back because of I/O errors.
1767 */ 1782 */
1768 if (!c->ro_error) { 1783 if (!c->ro_error) {
1769 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); 1784 ubifs_assert(c->bi.idx_growth == 0);
1770 ubifs_assert(c->budg_idx_growth == 0); 1785 ubifs_assert(c->bi.dd_growth == 0);
1771 ubifs_assert(c->budg_dd_growth == 0); 1786 ubifs_assert(c->bi.data_growth == 0);
1772 ubifs_assert(c->budg_data_growth == 0);
1773 } 1787 }
1774 1788
1775 /* 1789 /*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2557 if (err) { 2557 if (err) {
2558 /* Ensure the znode is dirtied */ 2558 /* Ensure the znode is dirtied */
2559 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2559 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2560 znode = dirty_cow_bottom_up(c, znode); 2560 znode = dirty_cow_bottom_up(c, znode);
2561 if (IS_ERR(znode)) { 2561 if (IS_ERR(znode)) {
2562 err = PTR_ERR(znode); 2562 err = PTR_ERR(znode);
2563 goto out_unlock; 2563 goto out_unlock;
2564 } 2564 }
2565 } 2565 }
2566 err = tnc_delete(c, znode, n); 2566 err = tnc_delete(c, znode, n);
2567 } 2567 }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
377 c->gap_lebs = NULL; 377 c->gap_lebs = NULL;
378 return err; 378 return err;
379 } 379 }
380 if (!dbg_force_in_the_gaps_enabled) { 380 if (dbg_force_in_the_gaps_enabled()) {
381 /* 381 /*
382 * Do not print scary warnings if the debugging 382 * Do not print scary warnings if the debugging
383 * option which forces in-the-gaps is enabled. 383 * option which forces in-the-gaps is enabled.
384 */ 384 */
385 ubifs_err("out of space"); 385 ubifs_warn("out of space");
386 spin_lock(&c->space_lock); 386 dbg_dump_budg(c, &c->bi);
387 dbg_dump_budg(c);
388 spin_unlock(&c->space_lock);
389 dbg_dump_lprops(c); 387 dbg_dump_lprops(c);
390 } 388 }
391 /* Try to commit anyway */ 389 /* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
796 spin_lock(&c->space_lock); 794 spin_lock(&c->space_lock);
797 /* 795 /*
798 * Although we have not finished committing yet, update size of the 796 * Although we have not finished committing yet, update size of the
799 * committed index ('c->old_idx_sz') and zero out the index growth 797 * committed index ('c->bi.old_idx_sz') and zero out the index growth
800 * budget. It is OK to do this now, because we've reserved all the 798 * budget. It is OK to do this now, because we've reserved all the
 801 * space which is needed to commit the index, and it is safe for the 799 * space which is needed to commit the index, and it is safe for the
802 * budgeting subsystem to assume the index is already committed, 800 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 801 * even though it is not.
804 */ 802 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); 803 ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
806 c->old_idx_sz = c->calc_idx_sz; 804 c->bi.old_idx_sz = c->calc_idx_sz;
807 c->budg_uncommitted_idx = 0; 805 c->bi.uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 806 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
809 spin_unlock(&c->space_lock); 807 spin_unlock(&c->space_lock);
810 mutex_unlock(&c->tnc_mutex); 808 mutex_unlock(&c->tnc_mutex);
811 809
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
408 * Superblock flags. 408 * Superblock flags.
409 * 409 *
 410 * UBIFS_FLG_BIGLPT: "big" LPT model is used if set 410 * UBIFS_FLG_BIGLPT: "big" LPT model is used if set
411 * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
411 */ 412 */
412enum { 413enum {
413 UBIFS_FLG_BIGLPT = 0x02, 414 UBIFS_FLG_BIGLPT = 0x02,
415 UBIFS_FLG_SPACE_FIXUP = 0x04,
414}; 416};
415 417
416/** 418/**
@@ -434,7 +436,7 @@ struct ubifs_ch {
434 __u8 node_type; 436 __u8 node_type;
435 __u8 group_type; 437 __u8 group_type;
436 __u8 padding[2]; 438 __u8 padding[2];
437} __attribute__ ((packed)); 439} __packed;
438 440
439/** 441/**
440 * union ubifs_dev_desc - device node descriptor. 442 * union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
448union ubifs_dev_desc { 450union ubifs_dev_desc {
449 __le32 new; 451 __le32 new;
450 __le64 huge; 452 __le64 huge;
451} __attribute__ ((packed)); 453} __packed;
452 454
453/** 455/**
454 * struct ubifs_ino_node - inode node. 456 * struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
509 __le16 compr_type; 511 __le16 compr_type;
510 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ 512 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
511 __u8 data[]; 513 __u8 data[];
512} __attribute__ ((packed)); 514} __packed;
513 515
514/** 516/**
515 * struct ubifs_dent_node - directory entry node. 517 * struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
534 __le16 nlen; 536 __le16 nlen;
535 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ 537 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
536 __u8 name[]; 538 __u8 name[];
537} __attribute__ ((packed)); 539} __packed;
538 540
539/** 541/**
540 * struct ubifs_data_node - data node. 542 * struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
555 __le16 compr_type; 557 __le16 compr_type;
556 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ 558 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
557 __u8 data[]; 559 __u8 data[];
558} __attribute__ ((packed)); 560} __packed;
559 561
560/** 562/**
561 * struct ubifs_trun_node - truncation node. 563 * struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
575 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ 577 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
576 __le64 old_size; 578 __le64 old_size;
577 __le64 new_size; 579 __le64 new_size;
578} __attribute__ ((packed)); 580} __packed;
579 581
580/** 582/**
581 * struct ubifs_pad_node - padding node. 583 * struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
586struct ubifs_pad_node { 588struct ubifs_pad_node {
587 struct ubifs_ch ch; 589 struct ubifs_ch ch;
588 __le32 pad_len; 590 __le32 pad_len;
589} __attribute__ ((packed)); 591} __packed;
590 592
591/** 593/**
592 * struct ubifs_sb_node - superblock node. 594 * struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
644 __u8 uuid[16]; 646 __u8 uuid[16];
645 __le32 ro_compat_version; 647 __le32 ro_compat_version;
646 __u8 padding2[3968]; 648 __u8 padding2[3968];
647} __attribute__ ((packed)); 649} __packed;
648 650
649/** 651/**
650 * struct ubifs_mst_node - master node. 652 * struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
711 __le32 idx_lebs; 713 __le32 idx_lebs;
712 __le32 leb_cnt; 714 __le32 leb_cnt;
713 __u8 padding[344]; 715 __u8 padding[344];
714} __attribute__ ((packed)); 716} __packed;
715 717
716/** 718/**
717 * struct ubifs_ref_node - logical eraseblock reference node. 719 * struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
727 __le32 offs; 729 __le32 offs;
728 __le32 jhead; 730 __le32 jhead;
729 __u8 padding[28]; 731 __u8 padding[28];
730} __attribute__ ((packed)); 732} __packed;
731 733
732/** 734/**
733 * struct ubifs_branch - key/reference/length branch 735 * struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
741 __le32 offs; 743 __le32 offs;
742 __le32 len; 744 __le32 len;
743 __u8 key[]; 745 __u8 key[];
744} __attribute__ ((packed)); 746} __packed;
745 747
746/** 748/**
747 * struct ubifs_idx_node - indexing node. 749 * struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
755 __le16 child_cnt; 757 __le16 child_cnt;
756 __le16 level; 758 __le16 level;
757 __u8 branches[]; 759 __u8 branches[];
758} __attribute__ ((packed)); 760} __packed;
759 761
760/** 762/**
761 * struct ubifs_cs_node - commit start node. 763 * struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
765struct ubifs_cs_node { 767struct ubifs_cs_node {
766 struct ubifs_ch ch; 768 struct ubifs_ch ch;
767 __le64 cmt_no; 769 __le64 cmt_no;
768} __attribute__ ((packed)); 770} __packed;
769 771
770/** 772/**
771 * struct ubifs_orph_node - orphan node. 773 * struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
777 struct ubifs_ch ch; 779 struct ubifs_ch ch;
778 __le64 cmt_no; 780 __le64 cmt_no;
779 __le64 inos[]; 781 __le64 inos[];
780} __attribute__ ((packed)); 782} __packed;
781 783
782#endif /* __UBIFS_MEDIA_H__ */ 784#endif /* __UBIFS_MEDIA_H__ */
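The __attribute__ ((packed)) conversion above is purely mechanical: __packed is the kernel's shorthand macro for the same GCC attribute:

	/* include/linux/compiler-gcc.h */
	#define __packed	__attribute__((packed))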
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
391 * make sure @inode->i_size is always changed under @ui_mutex, because it 391 * make sure @inode->i_size is always changed under @ui_mutex, because it
392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock 392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
393 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 393 * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
394 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 394 * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
395 * could consider to rework locking and base it on "shadow" fields. 395 * could consider to rework locking and base it on "shadow" fields.
396 */ 396 */
397struct ubifs_inode { 397struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
937 unsigned int compr_type:2; 937 unsigned int compr_type:2;
938}; 938};
939 939
940/**
941 * struct ubifs_budg_info - UBIFS budgeting information.
942 * @idx_growth: amount of bytes budgeted for index growth
943 * @data_growth: amount of bytes budgeted for cached data
944 * @dd_growth: amount of bytes budgeted for cached data that will make
945 * other data dirty
 946 * @uncommitted_idx: amount of bytes that were budgeted for growth of the index, but
947 * which still have to be taken into account because the index
948 * has not been committed so far
949 * @old_idx_sz: size of index on flash
950 * @min_idx_lebs: minimum number of LEBs required for the index
951 * @nospace: non-zero if the file-system does not have flash space (used as
952 * optimization)
953 * @nospace_rp: the same as @nospace, but additionally means that even reserved
954 * pool is full
 955 * @page_budget: budget for a page (constant, never changed after mount)
 956 * @inode_budget: budget for an inode (constant, never changed after mount)
 957 * @dent_budget: budget for a directory entry (constant, never changed after
958 * mount)
959 */
960struct ubifs_budg_info {
961 long long idx_growth;
962 long long data_growth;
963 long long dd_growth;
964 long long uncommitted_idx;
965 unsigned long long old_idx_sz;
966 int min_idx_lebs;
967 unsigned int nospace:1;
968 unsigned int nospace_rp:1;
969 int page_budget;
970 int inode_budget;
971 int dent_budget;
972};
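All fields of struct ubifs_budg_info are serialized by @c->space_lock (see the rewritten struct ubifs_info comment below), so updates follow the usual spinlock pattern, e.g. as in the ubifs_tnc_start_commit() hunk above:

	spin_lock(&c->space_lock);
	c->bi.old_idx_sz = c->calc_idx_sz;
	c->bi.uncommitted_idx = 0;
	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
	spin_unlock(&c->space_lock);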
973
940struct ubifs_debug_info; 974struct ubifs_debug_info;
941 975
942/** 976/**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
980 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 1014 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
981 * 1015 *
982 * @big_lpt: flag that LPT is too big to write whole during commit 1016 * @big_lpt: flag that LPT is too big to write whole during commit
1017 * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
983 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 1018 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
984 * recovery) 1019 * recovery)
985 * @bulk_read: enable bulk-reads 1020 * @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
1057 * @dirty_zn_cnt: number of dirty znodes 1092 * @dirty_zn_cnt: number of dirty znodes
1058 * @clean_zn_cnt: number of clean znodes 1093 * @clean_zn_cnt: number of clean znodes
1059 * 1094 *
1060 * @budg_idx_growth: amount of bytes budgeted for index growth 1095 * @space_lock: protects @bi and @lst
1061 * @budg_data_growth: amount of bytes budgeted for cached data 1096 * @lst: lprops statistics
1062 * @budg_dd_growth: amount of bytes budgeted for cached data that will make 1097 * @bi: budgeting information
1063 * other data dirty
1064 * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
1065 * but which still have to be taken into account because
1066 * the index has not been committed so far
1067 * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
1068 * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
1069 * @nospace, and @nospace_rp;
1070 * @min_idx_lebs: minimum number of LEBs required for the index
1071 * @old_idx_sz: size of index on flash
1072 * @calc_idx_sz: temporary variable which is used to calculate new index size 1098 * @calc_idx_sz: temporary variable which is used to calculate new index size
1073 * (contains accurate new index size at end of TNC commit start) 1099 * (contains accurate new index size at end of TNC commit start)
1074 * @lst: lprops statistics
1075 * @nospace: non-zero if the file-system does not have flash space (used as
1076 * optimization)
1077 * @nospace_rp: the same as @nospace, but additionally means that even reserved
1078 * pool is full
1079 *
1080 * @page_budget: budget for a page
1081 * @inode_budget: budget for an inode
1082 * @dent_budget: budget for a directory entry
1083 * 1100 *
1084 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash 1101 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
1085 * I/O unit 1102 * I/O unit
1086 * @mst_node_alsz: master node aligned size 1103 * @mst_node_alsz: master node aligned size
1087 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary 1104 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
1088 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 1105 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
1189 * @replaying: %1 during journal replay 1206 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting 1207 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode 1208 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1192 * @replay_tree: temporary tree used during journal replay
1193 * @replay_list: temporary list used during journal replay 1209 * @replay_list: temporary list used during journal replay
1194 * @replay_buds: list of buds to replay 1210 * @replay_buds: list of buds to replay
1195 * @cs_sqnum: sequence number of first node in the log (commit start node) 1211 * @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
1238 wait_queue_head_t cmt_wq; 1254 wait_queue_head_t cmt_wq;
1239 1255
1240 unsigned int big_lpt:1; 1256 unsigned int big_lpt:1;
1257 unsigned int space_fixup:1;
1241 unsigned int no_chk_data_crc:1; 1258 unsigned int no_chk_data_crc:1;
1242 unsigned int bulk_read:1; 1259 unsigned int bulk_read:1;
1243 unsigned int default_compr:2; 1260 unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
1308 atomic_long_t dirty_zn_cnt; 1325 atomic_long_t dirty_zn_cnt;
1309 atomic_long_t clean_zn_cnt; 1326 atomic_long_t clean_zn_cnt;
1310 1327
1311 long long budg_idx_growth;
1312 long long budg_data_growth;
1313 long long budg_dd_growth;
1314 long long budg_uncommitted_idx;
1315 spinlock_t space_lock; 1328 spinlock_t space_lock;
1316 int min_idx_lebs;
1317 unsigned long long old_idx_sz;
1318 unsigned long long calc_idx_sz;
1319 struct ubifs_lp_stats lst; 1329 struct ubifs_lp_stats lst;
1320 unsigned int nospace:1; 1330 struct ubifs_budg_info bi;
1321 unsigned int nospace_rp:1; 1331 unsigned long long calc_idx_sz;
1322
1323 int page_budget;
1324 int inode_budget;
1325 int dent_budget;
1326 1332
1327 int ref_node_alsz; 1333 int ref_node_alsz;
1328 int mst_node_alsz; 1334 int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
1430 unsigned int replaying:1; 1436 unsigned int replaying:1;
1431 unsigned int mounting:1; 1437 unsigned int mounting:1;
1432 unsigned int remounting_rw:1; 1438 unsigned int remounting_rw:1;
1433 struct rb_root replay_tree;
1434 struct list_head replay_list; 1439 struct list_head replay_list;
1435 struct list_head replay_buds; 1440 struct list_head replay_buds;
1436 unsigned long long cs_sqnum; 1441 unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
1628int ubifs_read_superblock(struct ubifs_info *c); 1633int ubifs_read_superblock(struct ubifs_info *c);
1629struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); 1634struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
1630int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); 1635int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
1636int ubifs_fixup_free_space(struct ubifs_info *c);
1631 1637
1632/* replay.c */ 1638/* replay.c */
1633int ubifs_validate_entry(struct ubifs_info *c, 1639int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
80 SECURITY_XATTR, 80 SECURITY_XATTR,
81}; 81};
82 82
83static const struct inode_operations none_inode_operations; 83static const struct inode_operations empty_iops;
84static const struct file_operations none_file_operations; 84static const struct file_operations empty_fops;
85 85
86/** 86/**
87 * create_xattr - create an extended attribute. 87 * create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
131 131
132 /* Re-define all operations to be "nothing" */ 132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &empty_aops; 133 inode->i_mapping->a_ops = &empty_aops;
134 inode->i_op = &none_inode_operations; 134 inode->i_op = &empty_iops;
135 inode->i_fop = &none_file_operations; 135 inode->i_fop = &empty_fops;
136 136
137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; 137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
138 ui = ubifs_inode(inode); 138 ui = ubifs_inode(inode);