aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig5
-rw-r--r--fs/9p/vfs_inode_dotl.c11
-rw-r--r--fs/Kconfig18
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/block_dev.c17
-rw-r--r--fs/ceph/addr.c5
-rw-r--r--fs/ceph/caps.c61
-rw-r--r--fs/ceph/dir.c7
-rw-r--r--fs/ceph/export.c25
-rw-r--r--fs/ceph/mds_client.c7
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/dcache.c8
-rw-r--r--fs/dlm/config.c9
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c182
-rw-r--r--fs/dlm/lock.h1
-rw-r--r--fs/dlm/lockspace.c6
-rw-r--r--fs/dlm/plock.c65
-rw-r--r--fs/dlm/user.c1
-rw-r--r--fs/drop_caches.c5
-rw-r--r--fs/exec.c13
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext3/namei.c80
-rw-r--r--fs/fat/cache.c7
-rw-r--r--fs/fat/dir.c32
-rw-r--r--fs/fat/fat.h15
-rw-r--r--fs/fat/fatent.c4
-rw-r--r--fs/fat/inode.c74
-rw-r--r--fs/fat/misc.c44
-rw-r--r--fs/fat/namei_msdos.c4
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fscache/operation.c10
-rw-r--r--fs/fscache/page.c13
-rw-r--r--fs/gfs2/glock.c5
-rw-r--r--fs/gfs2/quota.c12
-rw-r--r--fs/gfs2/quota.h4
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c9
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/journal.c16
-rw-r--r--fs/jbd/transaction.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/mbcache.c10
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/partitions/check.c8
-rw-r--r--fs/proc/internal.h8
-rw-r--r--fs/proc/task_mmu.c204
-rw-r--r--fs/quota/dquot.c5
-rw-r--r--fs/splice.c33
-rw-r--r--fs/timerfd.c102
-rw-r--r--fs/ubifs/budget.c104
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c167
-rw-r--r--fs/ubifs/debug.h178
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/file.c28
-rw-r--r--fs/ubifs/find.c10
-rw-r--r--fs/ubifs/gc.c71
-rw-r--r--fs/ubifs/io.c33
-rw-r--r--fs/ubifs/journal.c29
-rw-r--r--fs/ubifs/log.c28
-rw-r--r--fs/ubifs/lprops.c115
-rw-r--r--fs/ubifs/lpt_commit.c55
-rw-r--r--fs/ubifs/master.c8
-rw-r--r--fs/ubifs/misc.h17
-rw-r--r--fs/ubifs/orphan.c3
-rw-r--r--fs/ubifs/recovery.c354
-rw-r--r--fs/ubifs/replay.c468
-rw-r--r--fs/ubifs/sb.c153
-rw-r--r--fs/ubifs/super.c46
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/tnc_commit.c18
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h86
-rw-r--r--fs/ubifs/xattr.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c26
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h76
-rw-r--r--fs/xfs/quota/xfs_qm.c6
-rw-r--r--fs/xfs/xfs_ag.h1
-rw-r--r--fs/xfs/xfs_alloc.c844
-rw-r--r--fs/xfs/xfs_alloc.h15
-rw-r--r--fs/xfs/xfs_alloc_btree.c13
-rw-r--r--fs/xfs/xfs_dfrag.c6
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_log.c15
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c5
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c75
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_trans.c6
-rw-r--r--fs/xfs/xfs_types.h2
104 files changed, 2745 insertions, 1637 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
1config 9P_FS 1config 9P_FS
2 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" 2 tristate "Plan 9 Resource Sharing Support (9P2000)"
3 depends on INET && NET_9P && EXPERIMENTAL 3 depends on INET && NET_9P
4 help 4 help
5 If you say Y here, you will get experimental support for 5 If you say Y here, you will get experimental support for
6 Plan 9 resource sharing via the 9P2000 protocol. 6 Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
10 If unsure, say N. 10 If unsure, say N.
11 11
12if 9P_FS 12if 9P_FS
13
14config 9P_FSCACHE 13config 9P_FSCACHE
15 bool "Enable 9P client caching support (EXPERIMENTAL)" 14 bool "Enable 9P client caching support (EXPERIMENTAL)"
16 depends on EXPERIMENTAL 15 depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
259 if (IS_ERR(inode_fid)) { 259 if (IS_ERR(inode_fid)) {
260 err = PTR_ERR(inode_fid); 260 err = PTR_ERR(inode_fid);
261 mutex_unlock(&v9inode->v_mutex); 261 mutex_unlock(&v9inode->v_mutex);
262 goto error; 262 goto err_clunk_old_fid;
263 } 263 }
264 v9inode->writeback_fid = (void *) inode_fid; 264 v9inode->writeback_fid = (void *) inode_fid;
265 } 265 }
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
267 /* Since we are opening a file, assign the open fid to the file */ 267 /* Since we are opening a file, assign the open fid to the file */
268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
269 if (IS_ERR(filp)) { 269 if (IS_ERR(filp)) {
270 p9_client_clunk(ofid); 270 err = PTR_ERR(filp);
271 return PTR_ERR(filp); 271 goto err_clunk_old_fid;
272 } 272 }
273 filp->private_data = ofid; 273 filp->private_data = ofid;
274#ifdef CONFIG_9P_FSCACHE 274#ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
278 return 0; 278 return 0;
279 279
280error: 280error:
281 if (ofid)
282 p9_client_clunk(ofid);
283 if (fid) 281 if (fid)
284 p9_client_clunk(fid); 282 p9_client_clunk(fid);
283err_clunk_old_fid:
284 if (ofid)
285 p9_client_clunk(ofid);
285 return err; 286 return err;
286} 287}
287 288
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..979992dcb386 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -121,9 +121,25 @@ config TMPFS
121 121
122 See <file:Documentation/filesystems/tmpfs.txt> for details. 122 See <file:Documentation/filesystems/tmpfs.txt> for details.
123 123
124config TMPFS_XATTR
125 bool "Tmpfs extended attributes"
126 depends on TMPFS
127 default n
128 help
129 Extended attributes are name:value pairs associated with inodes by
130 the kernel or by users (see the attr(5) manual page, or visit
131 <http://acl.bestbits.at/> for details).
132
133 Currently this enables support for the trusted.* and
134 security.* namespaces.
135
136 If unsure, say N.
137
138 You need this for POSIX ACL support on tmpfs.
139
124config TMPFS_POSIX_ACL 140config TMPFS_POSIX_ACL
125 bool "Tmpfs POSIX Access Control Lists" 141 bool "Tmpfs POSIX Access Control Lists"
126 depends on TMPFS 142 depends on TMPFS_XATTR
127 select GENERIC_ACL 143 select GENERIC_ACL
128 help 144 help
129 POSIX Access Control Lists (ACLs) support permissions for users and 145 POSIX Access Control Lists (ACLs) support permissions for users and
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
820 int res; 820 int res;
821 char buf[16]; 821 char buf[16];
822 822
823 memset(&bprm, 0, sizeof(bprm));
824
823 /* Create the file name */ 825 /* Create the file name */
824 sprintf(buf, "/lib/lib%d.so", id); 826 sprintf(buf, "/lib/lib%d.so", id);
825 827
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
835 if (!bprm.cred) 837 if (!bprm.cred)
836 goto out; 838 goto out;
837 839
840 /* We don't really care about recalculating credentials at this point
841 * as we're past the point of no return and are dealing with shared
842 * libraries.
843 */
844 bprm.cred_prepared = 1;
845
838 res = prepare_binprm(&bprm); 846 res = prepare_binprm(&bprm);
839 847
840 if (!IS_ERR_VALUE(res)) 848 if (!IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bf9c7a720371..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1238 res = __blkdev_get(bdev, mode, 0); 1238 res = __blkdev_get(bdev, mode, 0);
1239 1239
1240 if (whole) { 1240 if (whole) {
1241 struct gendisk *disk = whole->bd_disk;
1242
1241 /* finish claiming */ 1243 /* finish claiming */
1242 mutex_lock(&bdev->bd_mutex); 1244 mutex_lock(&bdev->bd_mutex);
1243 spin_lock(&bdev_lock); 1245 spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1264 spin_unlock(&bdev_lock); 1266 spin_unlock(&bdev_lock);
1265 1267
1266 /* 1268 /*
1267 * Block event polling for write claims. Any write 1269 * Block event polling for write claims if requested. Any
1268 * holder makes the write_holder state stick until all 1270 * write holder makes the write_holder state stick until
1269 * are released. This is good enough and tracking 1271 * all are released. This is good enough and tracking
1270 * individual writeable reference is too fragile given 1272 * individual writeable reference is too fragile given the
1271 * the way @mode is used in blkdev_get/put(). 1273 * way @mode is used in blkdev_get/put().
1272 */ 1274 */
1273 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { 1275 if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
1276 !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1274 bdev->bd_write_holder = true; 1277 bdev->bd_write_holder = true;
1275 disk_block_events(bdev->bd_disk); 1278 disk_block_events(disk);
1276 } 1279 }
1277 1280
1278 mutex_unlock(&bdev->bd_mutex); 1281 mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
848 op->payload_len = cpu_to_le32(len); 848 op->payload_len = cpu_to_le32(len);
849 req->r_request->hdr.data_len = cpu_to_le32(len); 849 req->r_request->hdr.data_len = cpu_to_le32(len);
850 850
851 ceph_osdc_start_request(&fsc->client->osdc, req, true); 851 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
852 BUG_ON(rc);
852 req = NULL; 853 req = NULL;
853 854
854 /* continue? */ 855 /* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
880out: 881out:
881 if (req) 882 if (req)
882 ceph_osdc_put_request(req); 883 ceph_osdc_put_request(req);
883 if (rc > 0)
884 rc = 0; /* vfs expects us to return 0 */
885 ceph_put_snap_context(snapc); 884 ceph_put_snap_context(snapc);
886 dout("writepages done, rc = %d\n", rc); 885 dout("writepages done, rc = %d\n", rc);
887 return rc; 886 return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
569 list_add_tail(&cap->session_caps, &session->s_caps); 569 list_add_tail(&cap->session_caps, &session->s_caps);
570 session->s_nr_caps++; 570 session->s_nr_caps++;
571 spin_unlock(&session->s_cap_lock); 571 spin_unlock(&session->s_cap_lock);
572 } 572 } else if (new_cap)
573 ceph_put_cap(mdsc, new_cap);
573 574
574 if (!ci->i_snap_realm) { 575 if (!ci->i_snap_realm) {
575 /* 576 /*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2634 struct ceph_mds_session *session, 2635 struct ceph_mds_session *session,
2635 int *open_target_sessions) 2636 int *open_target_sessions)
2636{ 2637{
2638 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2637 struct ceph_inode_info *ci = ceph_inode(inode); 2639 struct ceph_inode_info *ci = ceph_inode(inode);
2638 int mds = session->s_mds; 2640 int mds = session->s_mds;
2639 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2641 unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2670 * export targets, so that we get the matching IMPORT 2672 * export targets, so that we get the matching IMPORT
2671 */ 2673 */
2672 *open_target_sessions = 1; 2674 *open_target_sessions = 1;
2675
2676 /*
2677 * we can't flush dirty caps that we've seen the
2678 * EXPORT but no IMPORT for
2679 */
2680 spin_lock(&mdsc->cap_dirty_lock);
2681 if (!list_empty(&ci->i_dirty_item)) {
2682 dout(" moving %p to cap_dirty_migrating\n",
2683 inode);
2684 list_move(&ci->i_dirty_item,
2685 &mdsc->cap_dirty_migrating);
2686 }
2687 spin_unlock(&mdsc->cap_dirty_lock);
2673 } 2688 }
2674 __ceph_remove_cap(cap); 2689 __ceph_remove_cap(cap);
2675 } 2690 }
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2707 ci->i_cap_exporting_issued = 0; 2722 ci->i_cap_exporting_issued = 0;
2708 ci->i_cap_exporting_mseq = 0; 2723 ci->i_cap_exporting_mseq = 0;
2709 ci->i_cap_exporting_mds = -1; 2724 ci->i_cap_exporting_mds = -1;
2725
2726 spin_lock(&mdsc->cap_dirty_lock);
2727 if (!list_empty(&ci->i_dirty_item)) {
2728 dout(" moving %p back to cap_dirty\n", inode);
2729 list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
2730 }
2731 spin_unlock(&mdsc->cap_dirty_lock);
2710 } else { 2732 } else {
2711 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", 2733 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2712 inode, ci, mds, mseq); 2734 inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2910 */ 2932 */
2911void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) 2933void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2912{ 2934{
2913 struct ceph_inode_info *ci, *nci = NULL; 2935 struct ceph_inode_info *ci;
2914 struct inode *inode, *ninode = NULL; 2936 struct inode *inode;
2915 struct list_head *p, *n;
2916 2937
2917 dout("flush_dirty_caps\n"); 2938 dout("flush_dirty_caps\n");
2918 spin_lock(&mdsc->cap_dirty_lock); 2939 spin_lock(&mdsc->cap_dirty_lock);
2919 list_for_each_safe(p, n, &mdsc->cap_dirty) { 2940 while (!list_empty(&mdsc->cap_dirty)) {
2920 if (nci) { 2941 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
2921 ci = nci; 2942 i_dirty_item);
2922 inode = ninode; 2943 inode = igrab(&ci->vfs_inode);
2923 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; 2944 dout("flush_dirty_caps %p\n", inode);
2924 dout("flush_dirty_caps inode %p (was next inode)\n",
2925 inode);
2926 } else {
2927 ci = list_entry(p, struct ceph_inode_info,
2928 i_dirty_item);
2929 inode = igrab(&ci->vfs_inode);
2930 BUG_ON(!inode);
2931 dout("flush_dirty_caps inode %p\n", inode);
2932 }
2933 if (n != &mdsc->cap_dirty) {
2934 nci = list_entry(n, struct ceph_inode_info,
2935 i_dirty_item);
2936 ninode = igrab(&nci->vfs_inode);
2937 BUG_ON(!ninode);
2938 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2939 dout("flush_dirty_caps next inode %p, noflush\n",
2940 ninode);
2941 } else {
2942 nci = NULL;
2943 ninode = NULL;
2944 }
2945 spin_unlock(&mdsc->cap_dirty_lock); 2945 spin_unlock(&mdsc->cap_dirty_lock);
2946 if (inode) { 2946 if (inode) {
2947 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, 2947 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2951 spin_lock(&mdsc->cap_dirty_lock); 2951 spin_lock(&mdsc->cap_dirty_lock);
2952 } 2952 }
2953 spin_unlock(&mdsc->cap_dirty_lock); 2953 spin_unlock(&mdsc->cap_dirty_lock);
2954 dout("flush_dirty_caps done\n");
2954} 2955}
2955 2956
2956/* 2957/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
360 rinfo = &fi->last_readdir->r_reply_info; 360 rinfo = &fi->last_readdir->r_reply_info;
361 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 361 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
362 rinfo->dir_nr, off, fi->offset); 362 rinfo->dir_nr, off, fi->offset);
363 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { 363 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
364 u64 pos = ceph_make_fpos(frag, off); 364 u64 pos = ceph_make_fpos(frag, off);
365 struct ceph_mds_reply_inode *in = 365 struct ceph_mds_reply_inode *in =
366 rinfo->dir_in[off - fi->offset].in; 366 rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1066 struct inode *inode = file->f_dentry->d_inode; 1066 struct inode *inode = file->f_dentry->d_inode;
1067 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1068 int left; 1068 int left;
1069 const int bufsize = 1024;
1069 1070
1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1071 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1071 return -EISDIR; 1072 return -EISDIR;
1072 1073
1073 if (!cf->dir_info) { 1074 if (!cf->dir_info) {
1074 cf->dir_info = kmalloc(1024, GFP_NOFS); 1075 cf->dir_info = kmalloc(bufsize, GFP_NOFS);
1075 if (!cf->dir_info) 1076 if (!cf->dir_info)
1076 return -ENOMEM; 1077 return -ENOMEM;
1077 cf->dir_info_len = 1078 cf->dir_info_len =
1078 sprintf(cf->dir_info, 1079 snprintf(cf->dir_info, bufsize,
1079 "entries: %20lld\n" 1080 "entries: %20lld\n"
1080 " files: %20lld\n" 1081 " files: %20lld\n"
1081 " subdirs: %20lld\n" 1082 " subdirs: %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
86static struct dentry *__fh_to_dentry(struct super_block *sb, 86static struct dentry *__fh_to_dentry(struct super_block *sb,
87 struct ceph_nfs_fh *fh) 87 struct ceph_nfs_fh *fh)
88{ 88{
89 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
89 struct inode *inode; 90 struct inode *inode;
90 struct dentry *dentry; 91 struct dentry *dentry;
91 struct ceph_vino vino; 92 struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
95 vino.ino = fh->ino; 96 vino.ino = fh->ino;
96 vino.snap = CEPH_NOSNAP; 97 vino.snap = CEPH_NOSNAP;
97 inode = ceph_find_inode(sb, vino); 98 inode = ceph_find_inode(sb, vino);
98 if (!inode) 99 if (!inode) {
99 return ERR_PTR(-ESTALE); 100 struct ceph_mds_request *req;
101
102 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
103 USE_ANY_MDS);
104 if (IS_ERR(req))
105 return ERR_CAST(req);
106
107 req->r_ino1 = vino;
108 req->r_num_caps = 1;
109 err = ceph_mdsc_do_request(mdsc, NULL, req);
110 inode = req->r_target_inode;
111 if (inode)
112 igrab(inode);
113 ceph_mdsc_put_request(req);
114 if (!inode)
115 return ERR_PTR(-ESTALE);
116 }
100 117
101 dentry = d_obtain_alias(inode); 118 dentry = d_obtain_alias(inode);
102 if (IS_ERR(dentry)) { 119 if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
148 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); 165 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
149 req->r_num_caps = 1; 166 req->r_num_caps = 1;
150 err = ceph_mdsc_do_request(mdsc, NULL, req); 167 err = ceph_mdsc_do_request(mdsc, NULL, req);
168 inode = req->r_target_inode;
169 if (inode)
170 igrab(inode);
151 ceph_mdsc_put_request(req); 171 ceph_mdsc_put_request(req);
152 inode = ceph_find_inode(sb, vino);
153 if (!inode) 172 if (!inode)
154 return ERR_PTR(err ? err : -ESTALE); 173 return ERR_PTR(err ? err : -ESTALE);
155 } 174 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
578 if (dir) { 578 if (dir) {
579 struct ceph_inode_info *ci = ceph_inode(dir); 579 struct ceph_inode_info *ci = ceph_inode(dir);
580 580
581 ihold(dir);
581 spin_lock(&ci->i_unsafe_lock); 582 spin_lock(&ci->i_unsafe_lock);
582 req->r_unsafe_dir = dir; 583 req->r_unsafe_dir = dir;
583 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); 584 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
598 spin_lock(&ci->i_unsafe_lock); 599 spin_lock(&ci->i_unsafe_lock);
599 list_del_init(&req->r_unsafe_dir_item); 600 list_del_init(&req->r_unsafe_dir_item);
600 spin_unlock(&ci->i_unsafe_lock); 601 spin_unlock(&ci->i_unsafe_lock);
602
603 iput(req->r_unsafe_dir);
604 req->r_unsafe_dir = NULL;
601 } 605 }
602 606
603 ceph_mdsc_put_request(req); 607 ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2691{ 2695{
2692 struct super_block *sb = mdsc->fsc->sb; 2696 struct super_block *sb = mdsc->fsc->sb;
2693 struct inode *inode; 2697 struct inode *inode;
2694 struct ceph_inode_info *ci;
2695 struct dentry *parent, *dentry; 2698 struct dentry *parent, *dentry;
2696 struct ceph_dentry_info *di; 2699 struct ceph_dentry_info *di;
2697 int mds = session->s_mds; 2700 int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2728 dout("handle_lease no inode %llx\n", vino.ino); 2731 dout("handle_lease no inode %llx\n", vino.ino);
2729 goto release; 2732 goto release;
2730 } 2733 }
2731 ci = ceph_inode(inode);
2732 2734
2733 /* dentry */ 2735 /* dentry */
2734 parent = d_find_alias(inode); 2736 parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3002 spin_lock_init(&mdsc->snap_flush_lock); 3004 spin_lock_init(&mdsc->snap_flush_lock);
3003 mdsc->cap_flush_seq = 0; 3005 mdsc->cap_flush_seq = 0;
3004 INIT_LIST_HEAD(&mdsc->cap_dirty); 3006 INIT_LIST_HEAD(&mdsc->cap_dirty);
3007 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3005 mdsc->num_cap_flushing = 0; 3008 mdsc->num_cap_flushing = 0;
3006 spin_lock_init(&mdsc->cap_dirty_lock); 3009 spin_lock_init(&mdsc->cap_dirty_lock);
3007 init_waitqueue_head(&mdsc->cap_flushing_wq); 3010 init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
278 278
279 u64 cap_flush_seq; 279 u64 cap_flush_seq;
280 struct list_head cap_dirty; /* inodes with dirty caps */ 280 struct list_head cap_dirty; /* inodes with dirty caps */
281 struct list_head cap_dirty_migrating; /* ...that are migration... */
281 int num_cap_flushing; /* # caps we are flushing */ 282 int num_cap_flushing; /* # caps we are flushing */
282 spinlock_t cap_dirty_lock; /* protects above items */ 283 spinlock_t cap_dirty_lock; /* protects above items */
283 wait_queue_head_t cap_flushing_wq; 284 wait_queue_head_t cap_flushing_wq;
diff --git a/fs/dcache.c b/fs/dcache.c
index 18b2a1f10ed8..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1220,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
1220EXPORT_SYMBOL(shrink_dcache_parent); 1220EXPORT_SYMBOL(shrink_dcache_parent);
1221 1221
1222/* 1222/*
1223 * Scan `nr' dentries and return the number which remain. 1223 * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
1224 * 1224 *
1225 * We need to avoid reentering the filesystem if the caller is performing a 1225 * We need to avoid reentering the filesystem if the caller is performing a
1226 * GFP_NOFS allocation attempt. One example deadlock is: 1226 * GFP_NOFS allocation attempt. One example deadlock is:
@@ -1231,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
1231 * 1231 *
1232 * In this case we return -1 to tell the caller that we baled. 1232 * In this case we return -1 to tell the caller that we baled.
1233 */ 1233 */
1234static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1234static int shrink_dcache_memory(struct shrinker *shrink,
1235 struct shrink_control *sc)
1235{ 1236{
1237 int nr = sc->nr_to_scan;
1238 gfp_t gfp_mask = sc->gfp_mask;
1239
1236 if (nr) { 1240 if (nr) {
1237 if (!(gfp_mask & __GFP_FS)) 1241 if (!(gfp_mask & __GFP_FS))
1238 return -1; 1242 return -1;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
100 unsigned int cl_log_debug; 100 unsigned int cl_log_debug;
101 unsigned int cl_protocol; 101 unsigned int cl_protocol;
102 unsigned int cl_timewarn_cs; 102 unsigned int cl_timewarn_cs;
103 unsigned int cl_waitwarn_us;
103}; 104};
104 105
105enum { 106enum {
@@ -114,6 +115,7 @@ enum {
114 CLUSTER_ATTR_LOG_DEBUG, 115 CLUSTER_ATTR_LOG_DEBUG,
115 CLUSTER_ATTR_PROTOCOL, 116 CLUSTER_ATTR_PROTOCOL,
116 CLUSTER_ATTR_TIMEWARN_CS, 117 CLUSTER_ATTR_TIMEWARN_CS,
118 CLUSTER_ATTR_WAITWARN_US,
117}; 119};
118 120
119struct cluster_attribute { 121struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
166CLUSTER_ATTR(log_debug, 0); 168CLUSTER_ATTR(log_debug, 0);
167CLUSTER_ATTR(protocol, 0); 169CLUSTER_ATTR(protocol, 0);
168CLUSTER_ATTR(timewarn_cs, 1); 170CLUSTER_ATTR(timewarn_cs, 1);
171CLUSTER_ATTR(waitwarn_us, 0);
169 172
170static struct configfs_attribute *cluster_attrs[] = { 173static struct configfs_attribute *cluster_attrs[] = {
171 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 174 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
179 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, 182 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
180 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, 183 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
181 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, 184 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
185 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
182 NULL, 186 NULL,
183}; 187};
184 188
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
439 cl->cl_log_debug = dlm_config.ci_log_debug; 443 cl->cl_log_debug = dlm_config.ci_log_debug;
440 cl->cl_protocol = dlm_config.ci_protocol; 444 cl->cl_protocol = dlm_config.ci_protocol;
441 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; 445 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
446 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
442 447
443 space_list = &sps->ss_group; 448 space_list = &sps->ss_group;
444 comm_list = &cms->cs_group; 449 comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
986#define DEFAULT_LOG_DEBUG 0 991#define DEFAULT_LOG_DEBUG 0
987#define DEFAULT_PROTOCOL 0 992#define DEFAULT_PROTOCOL 0
988#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 993#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
994#define DEFAULT_WAITWARN_US 0
989 995
990struct dlm_config_info dlm_config = { 996struct dlm_config_info dlm_config = {
991 .ci_tcp_port = DEFAULT_TCP_PORT, 997 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
998 .ci_scan_secs = DEFAULT_SCAN_SECS, 1004 .ci_scan_secs = DEFAULT_SCAN_SECS,
999 .ci_log_debug = DEFAULT_LOG_DEBUG, 1005 .ci_log_debug = DEFAULT_LOG_DEBUG,
1000 .ci_protocol = DEFAULT_PROTOCOL, 1006 .ci_protocol = DEFAULT_PROTOCOL,
1001 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS 1007 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
1008 .ci_waitwarn_us = DEFAULT_WAITWARN_US
1002}; 1009};
1003 1010
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
28 int ci_log_debug; 28 int ci_log_debug;
29 int ci_protocol; 29 int ci_protocol;
30 int ci_timewarn_cs; 30 int ci_timewarn_cs;
31 int ci_waitwarn_us;
31}; 32};
32 33
33extern struct dlm_config_info dlm_config; 34extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
209#define DLM_IFL_WATCH_TIMEWARN 0x00400000 209#define DLM_IFL_WATCH_TIMEWARN 0x00400000
210#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 210#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
211#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 211#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
212#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
212#define DLM_IFL_USER 0x00000001 213#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002 214#define DLM_IFL_ORPHAN 0x00000002
214 215
@@ -245,6 +246,7 @@ struct dlm_lkb {
245 246
246 int8_t lkb_wait_type; /* type of reply waiting for */ 247 int8_t lkb_wait_type; /* type of reply waiting for */
247 int8_t lkb_wait_count; 248 int8_t lkb_wait_count;
249 int lkb_wait_nodeid; /* for debugging */
248 250
249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 251 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
250 struct list_head lkb_statequeue; /* rsb g/c/w list */ 252 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
254 struct list_head lkb_ownqueue; /* list of locks for a process */ 256 struct list_head lkb_ownqueue; /* list of locks for a process */
255 struct list_head lkb_time_list; 257 struct list_head lkb_time_list;
256 ktime_t lkb_timestamp; 258 ktime_t lkb_timestamp;
259 ktime_t lkb_wait_time;
257 unsigned long lkb_timeout_cs; 260 unsigned long lkb_timeout_cs;
258 261
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; 262 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
799 return -1; 799 return -1;
800} 800}
801 801
/*
 * nodeid_warned - record a node id in the per-scan "already warned" table.
 * @nodeid:    node id to look up / record
 * @num_nodes: number of slots in @warned
 * @warned:    table of node ids, zero-filled by the caller; a zero slot
 *             marks the end of the used entries
 *
 * Walks @warned from the start.  If @nodeid is found before the first
 * empty (zero) slot, returns 1.  Otherwise the id is stored in the first
 * empty slot and 0 is returned.  If the table is full and does not
 * contain @nodeid, 0 is returned and nothing is recorded.
 *
 * NOTE(review): because zero doubles as the "empty" marker, a @nodeid of
 * 0 is never remembered and always yields 0 -- presumably node ids are
 * non-zero here; confirm against callers.
 */
static int nodeid_warned(int nodeid, int num_nodes, int *warned)
{
	int i;

	for (i = 0; i < num_nodes; i++) {
		if (!warned[i]) {
			/* first free slot: remember this node */
			warned[i] = nodeid;
			return 0;
		}
		if (warned[i] == nodeid)
			return 1;
	}
	return 0;
}
816
817void dlm_scan_waiters(struct dlm_ls *ls)
818{
819 struct dlm_lkb *lkb;
820 ktime_t zero = ktime_set(0, 0);
821 s64 us;
822 s64 debug_maxus = 0;
823 u32 debug_scanned = 0;
824 u32 debug_expired = 0;
825 int num_nodes = 0;
826 int *warned = NULL;
827
828 if (!dlm_config.ci_waitwarn_us)
829 return;
830
831 mutex_lock(&ls->ls_waiters_mutex);
832
833 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
834 if (ktime_equal(lkb->lkb_wait_time, zero))
835 continue;
836
837 debug_scanned++;
838
839 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
840
841 if (us < dlm_config.ci_waitwarn_us)
842 continue;
843
844 lkb->lkb_wait_time = zero;
845
846 debug_expired++;
847 if (us > debug_maxus)
848 debug_maxus = us;
849
850 if (!num_nodes) {
851 num_nodes = ls->ls_num_nodes;
852 warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int));
853 if (warned)
854 memset(warned, 0, num_nodes * sizeof(int));
855 }
856 if (!warned)
857 continue;
858 if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
859 continue;
860
861 log_error(ls, "waitwarn %x %lld %d us check connection to "
862 "node %d", lkb->lkb_id, (long long)us,
863 dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
864 }
865 mutex_unlock(&ls->ls_waiters_mutex);
866
867 if (warned)
868 kfree(warned);
869
870 if (debug_expired)
871 log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
872 debug_scanned, debug_expired,
873 dlm_config.ci_waitwarn_us, (long long)debug_maxus);
874}
875
802/* add/remove lkb from global waiters list of lkb's waiting for 876/* add/remove lkb from global waiters list of lkb's waiting for
803 a reply from a remote node */ 877 a reply from a remote node */
804 878
805static int add_to_waiters(struct dlm_lkb *lkb, int mstype) 879static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
806{ 880{
807 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 881 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
808 int error = 0; 882 int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
842 916
843 lkb->lkb_wait_count++; 917 lkb->lkb_wait_count++;
844 lkb->lkb_wait_type = mstype; 918 lkb->lkb_wait_type = mstype;
919 lkb->lkb_wait_time = ktime_get();
920 lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
845 hold_lkb(lkb); 921 hold_lkb(lkb);
846 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 922 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
847 out: 923 out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
961 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1037 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
962 int error; 1038 int error;
963 1039
964 if (ms != &ls->ls_stub_ms) 1040 if (ms->m_flags != DLM_IFL_STUB_MS)
965 mutex_lock(&ls->ls_waiters_mutex); 1041 mutex_lock(&ls->ls_waiters_mutex);
966 error = _remove_from_waiters(lkb, ms->m_type, ms); 1042 error = _remove_from_waiters(lkb, ms->m_type, ms);
967 if (ms != &ls->ls_stub_ms) 1043 if (ms->m_flags != DLM_IFL_STUB_MS)
968 mutex_unlock(&ls->ls_waiters_mutex); 1044 mutex_unlock(&ls->ls_waiters_mutex);
969 return error; 1045 return error;
970} 1046}
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
1157 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1233 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1158 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); 1234 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1159 mutex_unlock(&ls->ls_timeout_mutex); 1235 mutex_unlock(&ls->ls_timeout_mutex);
1236
1237 if (!dlm_config.ci_waitwarn_us)
1238 return;
1239
1240 mutex_lock(&ls->ls_waiters_mutex);
1241 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
1242 if (ktime_to_us(lkb->lkb_wait_time))
1243 lkb->lkb_wait_time = ktime_get();
1244 }
1245 mutex_unlock(&ls->ls_waiters_mutex);
1160} 1246}
1161 1247
1162/* lkb is master or local copy */ 1248/* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1376 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become 1462 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
1377 compatible with other granted locks */ 1463 compatible with other granted locks */
1378 1464
1379static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms) 1465static void munge_demoted(struct dlm_lkb *lkb)
1380{ 1466{
1381 if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
1382 log_print("munge_demoted %x invalid reply type %d",
1383 lkb->lkb_id, ms->m_type);
1384 return;
1385 }
1386
1387 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { 1467 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
1388 log_print("munge_demoted %x invalid modes gr %d rq %d", 1468 log_print("munge_demoted %x invalid modes gr %d rq %d",
1389 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); 1469 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2844 struct dlm_mhandle *mh; 2924 struct dlm_mhandle *mh;
2845 int to_nodeid, error; 2925 int to_nodeid, error;
2846 2926
2847 error = add_to_waiters(lkb, mstype); 2927 to_nodeid = r->res_nodeid;
2928
2929 error = add_to_waiters(lkb, mstype, to_nodeid);
2848 if (error) 2930 if (error)
2849 return error; 2931 return error;
2850 2932
2851 to_nodeid = r->res_nodeid;
2852
2853 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); 2933 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2854 if (error) 2934 if (error)
2855 goto fail; 2935 goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2880 /* down conversions go without a reply from the master */ 2960 /* down conversions go without a reply from the master */
2881 if (!error && down_conversion(lkb)) { 2961 if (!error && down_conversion(lkb)) {
2882 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); 2962 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
2963 r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
2883 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 2964 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
2884 r->res_ls->ls_stub_ms.m_result = 0; 2965 r->res_ls->ls_stub_ms.m_result = 0;
2885 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2886 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); 2966 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2887 } 2967 }
2888 2968
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2951 struct dlm_mhandle *mh; 3031 struct dlm_mhandle *mh;
2952 int to_nodeid, error; 3032 int to_nodeid, error;
2953 3033
2954 error = add_to_waiters(lkb, DLM_MSG_LOOKUP); 3034 to_nodeid = dlm_dir_nodeid(r);
3035
3036 error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
2955 if (error) 3037 if (error)
2956 return error; 3038 return error;
2957 3039
2958 to_nodeid = dlm_dir_nodeid(r);
2959
2960 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); 3040 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2961 if (error) 3041 if (error)
2962 goto fail; 3042 goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
3070 3150
3071static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3151static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3072{ 3152{
3153 if (ms->m_flags == DLM_IFL_STUB_MS)
3154 return;
3155
3073 lkb->lkb_sbflags = ms->m_sbflags; 3156 lkb->lkb_sbflags = ms->m_sbflags;
3074 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | 3157 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
3075 (ms->m_flags & 0x0000FFFF); 3158 (ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3612 /* convert was queued on remote master */ 3695 /* convert was queued on remote master */
3613 receive_flags_reply(lkb, ms); 3696 receive_flags_reply(lkb, ms);
3614 if (is_demoted(lkb)) 3697 if (is_demoted(lkb))
3615 munge_demoted(lkb, ms); 3698 munge_demoted(lkb);
3616 del_lkb(r, lkb); 3699 del_lkb(r, lkb);
3617 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 3700 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3618 add_timeout(lkb); 3701 add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3622 /* convert was granted on remote master */ 3705 /* convert was granted on remote master */
3623 receive_flags_reply(lkb, ms); 3706 receive_flags_reply(lkb, ms);
3624 if (is_demoted(lkb)) 3707 if (is_demoted(lkb))
3625 munge_demoted(lkb, ms); 3708 munge_demoted(lkb);
3626 grant_lock_pc(r, lkb, ms); 3709 grant_lock_pc(r, lkb, ms);
3627 queue_cast(r, lkb, 0); 3710 queue_cast(r, lkb, 0);
3628 break; 3711 break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
3996 dlm_put_lockspace(ls); 4079 dlm_put_lockspace(ls);
3997} 4080}
3998 4081
3999static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) 4082static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
4083 struct dlm_message *ms_stub)
4000{ 4084{
4001 if (middle_conversion(lkb)) { 4085 if (middle_conversion(lkb)) {
4002 hold_lkb(lkb); 4086 hold_lkb(lkb);
4003 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 4087 memset(ms_stub, 0, sizeof(struct dlm_message));
4004 ls->ls_stub_ms.m_result = -EINPROGRESS; 4088 ms_stub->m_flags = DLM_IFL_STUB_MS;
4005 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4089 ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
4006 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4090 ms_stub->m_result = -EINPROGRESS;
4007 _receive_convert_reply(lkb, &ls->ls_stub_ms); 4091 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4092 _receive_convert_reply(lkb, ms_stub);
4008 4093
4009 /* Same special case as in receive_rcom_lock_args() */ 4094 /* Same special case as in receive_rcom_lock_args() */
4010 lkb->lkb_grmode = DLM_LOCK_IV; 4095 lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
4045void dlm_recover_waiters_pre(struct dlm_ls *ls) 4130void dlm_recover_waiters_pre(struct dlm_ls *ls)
4046{ 4131{
4047 struct dlm_lkb *lkb, *safe; 4132 struct dlm_lkb *lkb, *safe;
4133 struct dlm_message *ms_stub;
4048 int wait_type, stub_unlock_result, stub_cancel_result; 4134 int wait_type, stub_unlock_result, stub_cancel_result;
4049 4135
4136 ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message));
4137 if (!ms_stub) {
4138 log_error(ls, "dlm_recover_waiters_pre no mem");
4139 return;
4140 }
4141
4050 mutex_lock(&ls->ls_waiters_mutex); 4142 mutex_lock(&ls->ls_waiters_mutex);
4051 4143
4052 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { 4144 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
4053 log_debug(ls, "pre recover waiter lkid %x type %d flags %x", 4145
4054 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags); 4146 /* exclude debug messages about unlocks because there can be so
4147 many and they aren't very interesting */
4148
4149 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
4150 log_debug(ls, "recover_waiter %x nodeid %d "
4151 "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
4152 lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
4153 }
4055 4154
4056 /* all outstanding lookups, regardless of destination will be 4155 /* all outstanding lookups, regardless of destination will be
4057 resent after recovery is done */ 4156 resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4097 break; 4196 break;
4098 4197
4099 case DLM_MSG_CONVERT: 4198 case DLM_MSG_CONVERT:
4100 recover_convert_waiter(ls, lkb); 4199 recover_convert_waiter(ls, lkb, ms_stub);
4101 break; 4200 break;
4102 4201
4103 case DLM_MSG_UNLOCK: 4202 case DLM_MSG_UNLOCK:
4104 hold_lkb(lkb); 4203 hold_lkb(lkb);
4105 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; 4204 memset(ms_stub, 0, sizeof(struct dlm_message));
4106 ls->ls_stub_ms.m_result = stub_unlock_result; 4205 ms_stub->m_flags = DLM_IFL_STUB_MS;
4107 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4206 ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
4108 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4207 ms_stub->m_result = stub_unlock_result;
4109 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 4208 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4209 _receive_unlock_reply(lkb, ms_stub);
4110 dlm_put_lkb(lkb); 4210 dlm_put_lkb(lkb);
4111 break; 4211 break;
4112 4212
4113 case DLM_MSG_CANCEL: 4213 case DLM_MSG_CANCEL:
4114 hold_lkb(lkb); 4214 hold_lkb(lkb);
4115 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; 4215 memset(ms_stub, 0, sizeof(struct dlm_message));
4116 ls->ls_stub_ms.m_result = stub_cancel_result; 4216 ms_stub->m_flags = DLM_IFL_STUB_MS;
4117 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 4217 ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
4118 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; 4218 ms_stub->m_result = stub_cancel_result;
4119 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 4219 ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
4220 _receive_cancel_reply(lkb, ms_stub);
4120 dlm_put_lkb(lkb); 4221 dlm_put_lkb(lkb);
4121 break; 4222 break;
4122 4223
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4127 schedule(); 4228 schedule();
4128 } 4229 }
4129 mutex_unlock(&ls->ls_waiters_mutex); 4230 mutex_unlock(&ls->ls_waiters_mutex);
4231 kfree(ms_stub);
4130} 4232}
4131 4233
4132static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) 4234static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
4191 ou = is_overlap_unlock(lkb); 4293 ou = is_overlap_unlock(lkb);
4192 err = 0; 4294 err = 0;
4193 4295
4194 log_debug(ls, "recover_waiters_post %x type %d flags %x %s", 4296 log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
4195 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); 4297 lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
4196 4298
4197 /* At this point we assume that we won't get a reply to any 4299 /* At this point we assume that we won't get a reply to any
4198 previous op or overlap op on this lock. First, do a big 4300 previous op or overlap op on this lock. First, do a big
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls); 24void dlm_scan_rsbs(struct dlm_ls *ls);
25int dlm_lock_recovery_try(struct dlm_ls *ls); 25int dlm_lock_recovery_try(struct dlm_ls *ls);
26void dlm_unlock_recovery(struct dlm_ls *ls); 26void dlm_unlock_recovery(struct dlm_ls *ls);
27void dlm_scan_waiters(struct dlm_ls *ls);
27void dlm_scan_timeout(struct dlm_ls *ls); 28void dlm_scan_timeout(struct dlm_ls *ls);
28void dlm_adjust_timeouts(struct dlm_ls *ls); 29void dlm_adjust_timeouts(struct dlm_ls *ls);
29 30
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
243static int dlm_scand(void *data) 243static int dlm_scand(void *data)
244{ 244{
245 struct dlm_ls *ls; 245 struct dlm_ls *ls;
246 int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
247 246
248 while (!kthread_should_stop()) { 247 while (!kthread_should_stop()) {
249 ls = find_ls_to_scan(); 248 ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
252 ls->ls_scan_time = jiffies; 251 ls->ls_scan_time = jiffies;
253 dlm_scan_rsbs(ls); 252 dlm_scan_rsbs(ls);
254 dlm_scan_timeout(ls); 253 dlm_scan_timeout(ls);
254 dlm_scan_waiters(ls);
255 dlm_unlock_recovery(ls); 255 dlm_unlock_recovery(ls);
256 } else { 256 } else {
257 ls->ls_scan_time += HZ; 257 ls->ls_scan_time += HZ;
258 } 258 }
259 } else { 259 continue;
260 schedule_timeout_interruptible(timeout_jiffies);
261 } 260 }
261 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
262 } 262 }
263 return 0; 263 return 0;
264} 264}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
71 wake_up(&send_wq); 71 wake_up(&send_wq);
72} 72}
73 73
74/* If a process was killed while waiting for the only plock on a file,
75 locks_remove_posix will not see any lock on the file so it won't
76 send an unlock-close to us to pass on to userspace to clean up the
77 abandoned waiter. So, we have to insert the unlock-close when the
78 lock call is interrupted. */
79
80static void do_unlock_close(struct dlm_ls *ls, u64 number,
81 struct file *file, struct file_lock *fl)
82{
83 struct plock_op *op;
84
85 op = kzalloc(sizeof(*op), GFP_NOFS);
86 if (!op)
87 return;
88
89 op->info.optype = DLM_PLOCK_OP_UNLOCK;
90 op->info.pid = fl->fl_pid;
91 op->info.fsid = ls->ls_global_id;
92 op->info.number = number;
93 op->info.start = 0;
94 op->info.end = OFFSET_MAX;
95 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
96 op->info.owner = (__u64) fl->fl_pid;
97 else
98 op->info.owner = (__u64)(long) fl->fl_owner;
99
100 op->info.flags |= DLM_PLOCK_FL_CLOSE;
101 send_op(op);
102}
103
74int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, 104int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
75 int cmd, struct file_lock *fl) 105 int cmd, struct file_lock *fl)
76{ 106{
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
114 144
115 send_op(op); 145 send_op(op);
116 146
117 if (xop->callback == NULL) 147 if (xop->callback == NULL) {
118 wait_event(recv_wq, (op->done != 0)); 148 rv = wait_event_killable(recv_wq, (op->done != 0));
119 else { 149 if (rv == -ERESTARTSYS) {
150 log_debug(ls, "dlm_posix_lock: wait killed %llx",
151 (unsigned long long)number);
152 spin_lock(&ops_lock);
153 list_del(&op->list);
154 spin_unlock(&ops_lock);
155 kfree(xop);
156 do_unlock_close(ls, number, file, fl);
157 goto out;
158 }
159 } else {
120 rv = FILE_LOCK_DEFERRED; 160 rv = FILE_LOCK_DEFERRED;
121 goto out; 161 goto out;
122 } 162 }
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
233 else 273 else
234 op->info.owner = (__u64)(long) fl->fl_owner; 274 op->info.owner = (__u64)(long) fl->fl_owner;
235 275
276 if (fl->fl_flags & FL_CLOSE) {
277 op->info.flags |= DLM_PLOCK_FL_CLOSE;
278 send_op(op);
279 rv = 0;
280 goto out;
281 }
282
236 send_op(op); 283 send_op(op);
237 wait_event(recv_wq, (op->done != 0)); 284 wait_event(recv_wq, (op->done != 0));
238 285
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
334 spin_lock(&ops_lock); 381 spin_lock(&ops_lock);
335 if (!list_empty(&send_list)) { 382 if (!list_empty(&send_list)) {
336 op = list_entry(send_list.next, struct plock_op, list); 383 op = list_entry(send_list.next, struct plock_op, list);
337 list_move(&op->list, &recv_list); 384 if (op->info.flags & DLM_PLOCK_FL_CLOSE)
385 list_del(&op->list);
386 else
387 list_move(&op->list, &recv_list);
338 memcpy(&info, &op->info, sizeof(info)); 388 memcpy(&info, &op->info, sizeof(info));
339 } 389 }
340 spin_unlock(&ops_lock); 390 spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
342 if (!op) 392 if (!op)
343 return -EAGAIN; 393 return -EAGAIN;
344 394
395 /* there is no need to get a reply from userspace for unlocks
396 that were generated by the vfs cleaning up for a close
397 (the process did not make an unlock call). */
398
399 if (op->info.flags & DLM_PLOCK_FL_CLOSE)
400 kfree(op);
401
345 if (copy_to_user(u, &info, sizeof(info))) 402 if (copy_to_user(u, &info, sizeof(info)))
346 return -EFAULT; 403 return -EFAULT;
347 return sizeof(info); 404 return sizeof(info);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
611 611
612 out_sig: 612 out_sig:
613 sigprocmask(SIG_SETMASK, &tmpsig, NULL); 613 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
614 recalc_sigpending();
615 out_free: 614 out_free:
616 kfree(kbuf); 615 kfree(kbuf);
617 return error; 616 return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
40static void drop_slab(void) 40static void drop_slab(void)
41{ 41{
42 int nr_objects; 42 int nr_objects;
43 struct shrink_control shrink = {
44 .gfp_mask = GFP_KERNEL,
45 };
43 46
44 do { 47 do {
45 nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); 48 nr_objects = shrink_slab(&shrink, 1000, 1000);
46 } while (nr_objects > 10); 49 } while (nr_objects > 10);
47} 50}
48 51
diff --git a/fs/exec.c b/fs/exec.c
index c016896dcbb2..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
200 200
201#ifdef CONFIG_STACK_GROWSUP 201#ifdef CONFIG_STACK_GROWSUP
202 if (write) { 202 if (write) {
203 ret = expand_stack_downwards(bprm->vma, pos); 203 ret = expand_downwards(bprm->vma, pos);
204 if (ret < 0) 204 if (ret < 0)
205 return NULL; 205 return NULL;
206 } 206 }
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
600 unsigned long length = old_end - old_start; 600 unsigned long length = old_end - old_start;
601 unsigned long new_start = old_start - shift; 601 unsigned long new_start = old_start - shift;
602 unsigned long new_end = old_end - shift; 602 unsigned long new_end = old_end - shift;
603 struct mmu_gather *tlb; 603 struct mmu_gather tlb;
604 604
605 BUG_ON(new_start > new_end); 605 BUG_ON(new_start > new_end);
606 606
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
626 return -ENOMEM; 626 return -ENOMEM;
627 627
628 lru_add_drain(); 628 lru_add_drain();
629 tlb = tlb_gather_mmu(mm, 0); 629 tlb_gather_mmu(&tlb, mm, 0);
630 if (new_end > old_start) { 630 if (new_end > old_start) {
631 /* 631 /*
632 * when the old and new regions overlap clear from new_end. 632 * when the old and new regions overlap clear from new_end.
633 */ 633 */
634 free_pgd_range(tlb, new_end, old_end, new_end, 634 free_pgd_range(&tlb, new_end, old_end, new_end,
635 vma->vm_next ? vma->vm_next->vm_start : 0); 635 vma->vm_next ? vma->vm_next->vm_start : 0);
636 } else { 636 } else {
637 /* 637 /*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
640 * have constraints on va-space that make this illegal (IA64) - 640 * have constraints on va-space that make this illegal (IA64) -
641 * for the others its just a little faster. 641 * for the others its just a little faster.
642 */ 642 */
643 free_pgd_range(tlb, old_start, old_end, new_end, 643 free_pgd_range(&tlb, old_start, old_end, new_end,
644 vma->vm_next ? vma->vm_next->vm_start : 0); 644 vma->vm_next ? vma->vm_next->vm_start : 0);
645 } 645 }
646 tlb_finish_mmu(tlb, new_end, old_end); 646 tlb_finish_mmu(&tlb, new_end, old_end);
647 647
648 /* 648 /*
649 * Shrink the vma to just the new range. Always succeeds. 649 * Shrink the vma to just the new range. Always succeeds.
@@ -1051,6 +1051,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
1051 task_unlock(tsk); 1051 task_unlock(tsk);
1052 return buf; 1052 return buf;
1053} 1053}
1054EXPORT_SYMBOL_GPL(get_task_comm);
1054 1055
1055void set_task_comm(struct task_struct *tsk, char *buf) 1056void set_task_comm(struct task_struct *tsk, char *buf)
1056{ 1057{
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
898 brelse(bh); 898 brelse(bh);
899 899
900 if (!sb_set_blocksize(sb, blocksize)) { 900 if (!sb_set_blocksize(sb, blocksize)) {
901 ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); 901 ext2_msg(sb, KERN_ERR,
902 "error: bad blocksize %d", blocksize);
902 goto failed_sbi; 903 goto failed_sbi;
903 } 904 }
904 905
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1416 frame->at = entries; 1416 frame->at = entries;
1417 frame->bh = bh; 1417 frame->bh = bh;
1418 bh = bh2; 1418 bh = bh2;
1419 /*
1420 * Mark buffers dirty here so that if do_split() fails we write a
1421 * consistent set of buffers to disk.
1422 */
1423 ext3_journal_dirty_metadata(handle, frame->bh);
1424 ext3_journal_dirty_metadata(handle, bh);
1419 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1425 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1420 dx_release (frames); 1426 if (!de) {
1421 if (!(de)) 1427 ext3_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1422 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1423 1432
1424 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1425} 1434}
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
2189 handle_t *handle; 2198 handle_t *handle;
2190 struct inode * inode; 2199 struct inode * inode;
2191 int l, err, retries = 0; 2200 int l, err, retries = 0;
2201 int credits;
2192 2202
2193 l = strlen(symname)+1; 2203 l = strlen(symname)+1;
2194 if (l > dir->i_sb->s_blocksize) 2204 if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
2196 2206
2197 dquot_initialize(dir); 2207 dquot_initialize(dir);
2198 2208
2209 if (l > EXT3_N_BLOCKS * 4) {
2210 /*
2211 * For non-fast symlinks, we just allocate inode and put it on
2212 * orphan list in the first transaction => we need bitmap,
2213 * group descriptor, sb, inode block, quota blocks.
2214 */
2215 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2216 } else {
2217 /*
2218 * Fast symlink. We have to add entry to directory
2219 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
2220 * allocate new inode (bitmap, group descriptor, inode block,
2221 * quota blocks, sb is already counted in previous macros).
2222 */
2223 credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2224 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2225 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2226 }
2199retry: 2227retry:
2200 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2228 handle = ext3_journal_start(dir, credits);
2201 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2202 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2203 if (IS_ERR(handle)) 2229 if (IS_ERR(handle))
2204 return PTR_ERR(handle); 2230 return PTR_ERR(handle);
2205 2231
@@ -2211,21 +2237,45 @@ retry:
2211 if (IS_ERR(inode)) 2237 if (IS_ERR(inode))
2212 goto out_stop; 2238 goto out_stop;
2213 2239
2214 if (l > sizeof (EXT3_I(inode)->i_data)) { 2240 if (l > EXT3_N_BLOCKS * 4) {
2215 inode->i_op = &ext3_symlink_inode_operations; 2241 inode->i_op = &ext3_symlink_inode_operations;
2216 ext3_set_aops(inode); 2242 ext3_set_aops(inode);
2217 /* 2243 /*
2218 * page_symlink() calls into ext3_prepare/commit_write. 2244 * We cannot call page_symlink() with transaction started
2219 * We have a transaction open. All is sweetness. It also sets 2245 * because it calls into ext3_write_begin() which acquires page
2220 * i_size in generic_commit_write(). 2246 * lock which ranks below transaction start (and it can also
2247 * wait for journal commit if we are running out of space). So
2248 * we have to stop transaction now and restart it when symlink
2249 * contents is written.
2250 *
2251 * To keep fs consistent in case of crash, we have to put inode
2252 * to orphan list in the mean time.
2221 */ 2253 */
2254 drop_nlink(inode);
2255 err = ext3_orphan_add(handle, inode);
2256 ext3_journal_stop(handle);
2257 if (err)
2258 goto err_drop_inode;
2222 err = __page_symlink(inode, symname, l, 1); 2259 err = __page_symlink(inode, symname, l, 1);
2260 if (err)
2261 goto err_drop_inode;
2262 /*
2263 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
2264 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2265 */
2266 handle = ext3_journal_start(dir,
2267 EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2268 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2269 if (IS_ERR(handle)) {
2270 err = PTR_ERR(handle);
2271 goto err_drop_inode;
2272 }
2273 inc_nlink(inode);
2274 err = ext3_orphan_del(handle, inode);
2223 if (err) { 2275 if (err) {
2276 ext3_journal_stop(handle);
2224 drop_nlink(inode); 2277 drop_nlink(inode);
2225 unlock_new_inode(inode); 2278 goto err_drop_inode;
2226 ext3_mark_inode_dirty(handle, inode);
2227 iput (inode);
2228 goto out_stop;
2229 } 2279 }
2230 } else { 2280 } else {
2231 inode->i_op = &ext3_fast_symlink_inode_operations; 2281 inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
2239 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 2289 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2240 goto retry; 2290 goto retry;
2241 return err; 2291 return err;
2292err_drop_inode:
2293 unlock_new_inode(inode);
2294 iput(inode);
2295 return err;
2242} 2296}
2243 2297
2244static int ext3_link (struct dentry * old_dentry, 2298static int ext3_link (struct dentry * old_dentry,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index ae8200f84e39..1cc7038e273d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
151 spin_unlock(&MSDOS_I(inode)->cache_lru_lock); 151 spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
152 152
153 tmp = fat_cache_alloc(inode); 153 tmp = fat_cache_alloc(inode);
154 if (!tmp) {
155 spin_lock(&MSDOS_I(inode)->cache_lru_lock);
156 MSDOS_I(inode)->nr_caches--;
157 spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
158 return;
159 }
160
154 spin_lock(&MSDOS_I(inode)->cache_lru_lock); 161 spin_lock(&MSDOS_I(inode)->cache_lru_lock);
155 cache = fat_cache_merge(inode, new); 162 cache = fat_cache_merge(inode, new);
156 if (cache != NULL) { 163 if (cache != NULL) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ee42b9e0b16a..4ad64732cbce 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -98,7 +98,7 @@ next:
98 98
99 *bh = sb_bread(sb, phys); 99 *bh = sb_bread(sb, phys);
100 if (*bh == NULL) { 100 if (*bh == NULL) {
101 printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n", 101 fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed",
102 (llu)phys); 102 (llu)phys);
103 /* skip this block */ 103 /* skip this block */
104 *pos = (iblock + 1) << sb->s_blocksize_bits; 104 *pos = (iblock + 1) << sb->s_blocksize_bits;
@@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
136 * but ignore that right now. 136 * but ignore that right now.
137 * Ahem... Stack smashing in ring 0 isn't fun. Fixed. 137 * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
138 */ 138 */
139static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, 139static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
140 int uni_xlate, struct nls_table *nls) 140 const wchar_t *uni, int len, struct nls_table *nls)
141{ 141{
142 int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
142 const wchar_t *ip; 143 const wchar_t *ip;
143 wchar_t ec; 144 wchar_t ec;
144 unsigned char *op; 145 unsigned char *op;
@@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
166 } 167 }
167 168
168 if (unlikely(*ip)) { 169 if (unlikely(*ip)) {
169 printk(KERN_WARNING "FAT: filename was truncated while " 170 fat_msg(sb, KERN_WARNING, "filename was truncated while "
170 "converting."); 171 "converting.");
171 } 172 }
172 173
173 *op = 0; 174 *op = 0;
174 return (op - ascii); 175 return (op - ascii);
175} 176}
176 177
177static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, 178static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
178 unsigned char *buf, int size) 179 unsigned char *buf, int size)
179{ 180{
181 struct msdos_sb_info *sbi = MSDOS_SB(sb);
180 if (sbi->options.utf8) 182 if (sbi->options.utf8)
181 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, 183 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
182 UTF16_HOST_ENDIAN, buf, size); 184 UTF16_HOST_ENDIAN, buf, size);
183 else 185 else
184 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, 186 return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
185 sbi->nls_io);
186} 187}
187 188
188static inline int 189static inline int
@@ -419,7 +420,7 @@ parse_record:
419 420
420 /* Compare shortname */ 421 /* Compare shortname */
421 bufuname[last_u] = 0x0000; 422 bufuname[last_u] = 0x0000;
422 len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); 423 len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
423 if (fat_name_match(sbi, name, name_len, bufname, len)) 424 if (fat_name_match(sbi, name, name_len, bufname, len))
424 goto found; 425 goto found;
425 426
@@ -428,7 +429,7 @@ parse_record:
428 int size = PATH_MAX - FAT_MAX_UNI_SIZE; 429 int size = PATH_MAX - FAT_MAX_UNI_SIZE;
429 430
430 /* Compare longname */ 431 /* Compare longname */
431 len = fat_uni_to_x8(sbi, unicode, longname, size); 432 len = fat_uni_to_x8(sb, unicode, longname, size);
432 if (fat_name_match(sbi, name, name_len, longname, len)) 433 if (fat_name_match(sbi, name, name_len, longname, len))
433 goto found; 434 goto found;
434 } 435 }
@@ -545,7 +546,7 @@ parse_record:
545 if (nr_slots) { 546 if (nr_slots) {
546 void *longname = unicode + FAT_MAX_UNI_CHARS; 547 void *longname = unicode + FAT_MAX_UNI_CHARS;
547 int size = PATH_MAX - FAT_MAX_UNI_SIZE; 548 int size = PATH_MAX - FAT_MAX_UNI_SIZE;
548 int len = fat_uni_to_x8(sbi, unicode, longname, size); 549 int len = fat_uni_to_x8(sb, unicode, longname, size);
549 550
550 fill_name = longname; 551 fill_name = longname;
551 fill_len = len; 552 fill_len = len;
@@ -621,7 +622,7 @@ parse_record:
621 622
622 if (isvfat) { 623 if (isvfat) {
623 bufuname[j] = 0x0000; 624 bufuname[j] = 0x0000;
624 i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); 625 i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
625 } 626 }
626 if (nr_slots) { 627 if (nr_slots) {
627 /* hack for fat_ioctl_filldir() */ 628 /* hack for fat_ioctl_filldir() */
@@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
979 980
980int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) 981int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
981{ 982{
983 struct super_block *sb = dir->i_sb;
982 struct msdos_dir_entry *de; 984 struct msdos_dir_entry *de;
983 struct buffer_head *bh; 985 struct buffer_head *bh;
984 int err = 0, nr_slots; 986 int err = 0, nr_slots;
@@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1013 */ 1015 */
1014 err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); 1016 err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
1015 if (err) { 1017 if (err) {
1016 printk(KERN_WARNING 1018 fat_msg(sb, KERN_WARNING,
1017 "FAT: Couldn't remove the long name slots\n"); 1019 "Couldn't remove the long name slots");
1018 } 1020 }
1019 } 1021 }
1020 1022
@@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
1265 if (sbi->fat_bits != 32) 1267 if (sbi->fat_bits != 32)
1266 goto error; 1268 goto error;
1267 } else if (MSDOS_I(dir)->i_start == 0) { 1269 } else if (MSDOS_I(dir)->i_start == 0) {
1268 printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n", 1270 fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
1269 MSDOS_I(dir)->i_pos); 1271 MSDOS_I(dir)->i_pos);
1270 err = -EIO; 1272 err = -EIO;
1271 goto error; 1273 goto error;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index f50408901f7e..8276cc282dec 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, 322 int isvfat, void (*setup)(struct super_block *));
323 int isvfat, void (*setup)(struct super_block *));
324 323
325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
326 struct inode *i2); 325 struct inode *i2);
327/* fat/misc.c */ 326/* fat/misc.c */
328extern void 327extern void
329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) 328__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
329 __attribute__ ((format (printf, 3, 4))) __cold;
330#define fat_fs_error(sb, fmt, args...) \
331 __fat_fs_error(sb, 1, fmt , ## args)
332#define fat_fs_error_ratelimit(sb, fmt, args...) \
333 __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
334void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold; 335 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
335extern int fat_clusters_flush(struct super_block *sb); 336extern int fat_clusters_flush(struct super_block *sb);
336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 337extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 338extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index b47d2c9f4fa1..2e81ac0df7e2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
95err_brelse: 95err_brelse:
96 brelse(bhs[0]); 96 brelse(bhs[0]);
97err: 97err:
98 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr); 98 fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
99 return -EIO; 99 return -EIO;
100} 100}
101 101
@@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode; 108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
109 fatent->bhs[0] = sb_bread(sb, blocknr); 109 fatent->bhs[0] = sb_bread(sb, blocknr);
110 if (!fatent->bhs[0]) { 110 if (!fatent->bhs[0]) {
111 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", 111 fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
112 (llu)blocknr); 112 (llu)blocknr);
113 return -EIO; 113 return -EIO;
114 } 114 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8d68690bdcf1..cb8d8391ac0b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
581 buf->f_bavail = sbi->free_clusters; 581 buf->f_bavail = sbi->free_clusters;
582 buf->f_fsid.val[0] = (u32)id; 582 buf->f_fsid.val[0] = (u32)id;
583 buf->f_fsid.val[1] = (u32)(id >> 32); 583 buf->f_fsid.val[1] = (u32)(id >> 32);
584 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12; 584 buf->f_namelen =
585 (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
585 586
586 return 0; 587 return 0;
587} 588}
@@ -619,8 +620,8 @@ retry:
619 620
620 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 621 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
621 if (!bh) { 622 if (!bh) {
622 printk(KERN_ERR "FAT: unable to read inode block " 623 fat_msg(sb, KERN_ERR, "unable to read inode block "
623 "for updating (i_pos %lld)\n", i_pos); 624 "for updating (i_pos %lld)", i_pos);
624 return -EIO; 625 return -EIO;
625 } 626 }
626 spin_lock(&sbi->inode_hash_lock); 627 spin_lock(&sbi->inode_hash_lock);
@@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = {
976 {Opt_err, NULL} 977 {Opt_err, NULL}
977}; 978};
978 979
979static int parse_options(char *options, int is_vfat, int silent, int *debug, 980static int parse_options(struct super_block *sb, char *options, int is_vfat,
980 struct fat_mount_options *opts) 981 int silent, int *debug, struct fat_mount_options *opts)
981{ 982{
982 char *p; 983 char *p;
983 substring_t args[MAX_OPT_ARGS]; 984 substring_t args[MAX_OPT_ARGS];
@@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1168 1169
1169 /* obsolete mount options */ 1170 /* obsolete mount options */
1170 case Opt_obsolate: 1171 case Opt_obsolate:
1171 printk(KERN_INFO "FAT: \"%s\" option is obsolete, " 1172 fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
1172 "not supported now\n", p); 1173 "not supported now", p);
1173 break; 1174 break;
1174 /* unknown option */ 1175 /* unknown option */
1175 default: 1176 default:
1176 if (!silent) { 1177 if (!silent) {
1177 printk(KERN_ERR 1178 fat_msg(sb, KERN_ERR,
1178 "FAT: Unrecognized mount option \"%s\" " 1179 "Unrecognized mount option \"%s\" "
1179 "or missing value\n", p); 1180 "or missing value", p);
1180 } 1181 }
1181 return -EINVAL; 1182 return -EINVAL;
1182 } 1183 }
@@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1185out: 1186out:
1186 /* UTF-8 doesn't provide FAT semantics */ 1187 /* UTF-8 doesn't provide FAT semantics */
1187 if (!strcmp(opts->iocharset, "utf8")) { 1188 if (!strcmp(opts->iocharset, "utf8")) {
1188 printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" 1189 fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
1189 " for FAT filesystems, filesystem will be " 1190 " for FAT filesystems, filesystem will be "
1190 "case sensitive!\n"); 1191 "case sensitive!\n");
1191 } 1192 }
@@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode)
1238/* 1239/*
1239 * Read the super block of an MS-DOS FS. 1240 * Read the super block of an MS-DOS FS.
1240 */ 1241 */
1241int fat_fill_super(struct super_block *sb, void *data, int silent, 1242int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1242 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1243 void (*setup)(struct super_block *)) 1243 void (*setup)(struct super_block *))
1244{ 1244{
1245 struct inode *root_inode = NULL, *fat_inode = NULL; 1245 struct inode *root_inode = NULL, *fat_inode = NULL;
@@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1268 sb->s_magic = MSDOS_SUPER_MAGIC; 1268 sb->s_magic = MSDOS_SUPER_MAGIC;
1269 sb->s_op = &fat_sops; 1269 sb->s_op = &fat_sops;
1270 sb->s_export_op = &fat_export_ops; 1270 sb->s_export_op = &fat_export_ops;
1271 sbi->dir_ops = fs_dir_inode_ops;
1272 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, 1271 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1273 DEFAULT_RATELIMIT_BURST); 1272 DEFAULT_RATELIMIT_BURST);
1274 1273
1275 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1274 error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
1276 if (error) 1275 if (error)
1277 goto out_fail; 1276 goto out_fail;
1278 1277
@@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1282 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1283 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
1284 if (bh == NULL) { 1283 if (bh == NULL) {
1285 printk(KERN_ERR "FAT: unable to read boot sector\n"); 1284 fat_msg(sb, KERN_ERR, "unable to read boot sector");
1286 goto out_fail; 1285 goto out_fail;
1287 } 1286 }
1288 1287
1289 b = (struct fat_boot_sector *) bh->b_data; 1288 b = (struct fat_boot_sector *) bh->b_data;
1290 if (!b->reserved) { 1289 if (!b->reserved) {
1291 if (!silent) 1290 if (!silent)
1292 printk(KERN_ERR "FAT: bogus number of reserved sectors\n"); 1291 fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
1293 brelse(bh); 1292 brelse(bh);
1294 goto out_invalid; 1293 goto out_invalid;
1295 } 1294 }
1296 if (!b->fats) { 1295 if (!b->fats) {
1297 if (!silent) 1296 if (!silent)
1298 printk(KERN_ERR "FAT: bogus number of FAT structure\n"); 1297 fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
1299 brelse(bh); 1298 brelse(bh);
1300 goto out_invalid; 1299 goto out_invalid;
1301 } 1300 }
@@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1308 media = b->media; 1307 media = b->media;
1309 if (!fat_valid_media(media)) { 1308 if (!fat_valid_media(media)) {
1310 if (!silent) 1309 if (!silent)
1311 printk(KERN_ERR "FAT: invalid media value (0x%02x)\n", 1310 fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
1312 media); 1311 media);
1313 brelse(bh); 1312 brelse(bh);
1314 goto out_invalid; 1313 goto out_invalid;
@@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1318 || (logical_sector_size < 512) 1317 || (logical_sector_size < 512)
1319 || (logical_sector_size > 4096)) { 1318 || (logical_sector_size > 4096)) {
1320 if (!silent) 1319 if (!silent)
1321 printk(KERN_ERR "FAT: bogus logical sector size %u\n", 1320 fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
1322 logical_sector_size); 1321 logical_sector_size);
1323 brelse(bh); 1322 brelse(bh);
1324 goto out_invalid; 1323 goto out_invalid;
@@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1326 sbi->sec_per_clus = b->sec_per_clus; 1325 sbi->sec_per_clus = b->sec_per_clus;
1327 if (!is_power_of_2(sbi->sec_per_clus)) { 1326 if (!is_power_of_2(sbi->sec_per_clus)) {
1328 if (!silent) 1327 if (!silent)
1329 printk(KERN_ERR "FAT: bogus sectors per cluster %u\n", 1328 fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
1330 sbi->sec_per_clus); 1329 sbi->sec_per_clus);
1331 brelse(bh); 1330 brelse(bh);
1332 goto out_invalid; 1331 goto out_invalid;
1333 } 1332 }
1334 1333
1335 if (logical_sector_size < sb->s_blocksize) { 1334 if (logical_sector_size < sb->s_blocksize) {
1336 printk(KERN_ERR "FAT: logical sector size too small for device" 1335 fat_msg(sb, KERN_ERR, "logical sector size too small for device"
1337 " (logical sector size = %u)\n", logical_sector_size); 1336 " (logical sector size = %u)", logical_sector_size);
1338 brelse(bh); 1337 brelse(bh);
1339 goto out_fail; 1338 goto out_fail;
1340 } 1339 }
@@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1342 brelse(bh); 1341 brelse(bh);
1343 1342
1344 if (!sb_set_blocksize(sb, logical_sector_size)) { 1343 if (!sb_set_blocksize(sb, logical_sector_size)) {
1345 printk(KERN_ERR "FAT: unable to set blocksize %u\n", 1344 fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
1346 logical_sector_size); 1345 logical_sector_size);
1347 goto out_fail; 1346 goto out_fail;
1348 } 1347 }
1349 bh = sb_bread(sb, 0); 1348 bh = sb_bread(sb, 0);
1350 if (bh == NULL) { 1349 if (bh == NULL) {
1351 printk(KERN_ERR "FAT: unable to read boot sector" 1350 fat_msg(sb, KERN_ERR, "unable to read boot sector"
1352 " (logical sector size = %lu)\n", 1351 " (logical sector size = %lu)",
1353 sb->s_blocksize); 1352 sb->s_blocksize);
1354 goto out_fail; 1353 goto out_fail;
1355 } 1354 }
@@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1385 1384
1386 fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); 1385 fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
1387 if (fsinfo_bh == NULL) { 1386 if (fsinfo_bh == NULL) {
1388 printk(KERN_ERR "FAT: bread failed, FSINFO block" 1387 fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
1389 " (sector = %lu)\n", sbi->fsinfo_sector); 1388 " (sector = %lu)", sbi->fsinfo_sector);
1390 brelse(bh); 1389 brelse(bh);
1391 goto out_fail; 1390 goto out_fail;
1392 } 1391 }
1393 1392
1394 fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; 1393 fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
1395 if (!IS_FSINFO(fsinfo)) { 1394 if (!IS_FSINFO(fsinfo)) {
1396 printk(KERN_WARNING "FAT: Invalid FSINFO signature: " 1395 fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
1397 "0x%08x, 0x%08x (sector = %lu)\n", 1396 "0x%08x, 0x%08x (sector = %lu)",
1398 le32_to_cpu(fsinfo->signature1), 1397 le32_to_cpu(fsinfo->signature1),
1399 le32_to_cpu(fsinfo->signature2), 1398 le32_to_cpu(fsinfo->signature2),
1400 sbi->fsinfo_sector); 1399 sbi->fsinfo_sector);
@@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1415 sbi->dir_entries = get_unaligned_le16(&b->dir_entries); 1414 sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
1416 if (sbi->dir_entries & (sbi->dir_per_block - 1)) { 1415 if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
1417 if (!silent) 1416 if (!silent)
1418 printk(KERN_ERR "FAT: bogus directroy-entries per block" 1417 fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
1419 " (%u)\n", sbi->dir_entries); 1418 " (%u)", sbi->dir_entries);
1420 brelse(bh); 1419 brelse(bh);
1421 goto out_invalid; 1420 goto out_invalid;
1422 } 1421 }
@@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1438 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); 1437 total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
1439 if (total_clusters > MAX_FAT(sb)) { 1438 if (total_clusters > MAX_FAT(sb)) {
1440 if (!silent) 1439 if (!silent)
1441 printk(KERN_ERR "FAT: count of clusters too big (%u)\n", 1440 fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
1442 total_clusters); 1441 total_clusters);
1443 brelse(bh); 1442 brelse(bh);
1444 goto out_invalid; 1443 goto out_invalid;
@@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1471 sprintf(buf, "cp%d", sbi->options.codepage); 1470 sprintf(buf, "cp%d", sbi->options.codepage);
1472 sbi->nls_disk = load_nls(buf); 1471 sbi->nls_disk = load_nls(buf);
1473 if (!sbi->nls_disk) { 1472 if (!sbi->nls_disk) {
1474 printk(KERN_ERR "FAT: codepage %s not found\n", buf); 1473 fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
1475 goto out_fail; 1474 goto out_fail;
1476 } 1475 }
1477 1476
@@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1479 if (sbi->options.isvfat) { 1478 if (sbi->options.isvfat) {
1480 sbi->nls_io = load_nls(sbi->options.iocharset); 1479 sbi->nls_io = load_nls(sbi->options.iocharset);
1481 if (!sbi->nls_io) { 1480 if (!sbi->nls_io) {
1482 printk(KERN_ERR "FAT: IO charset %s not found\n", 1481 fat_msg(sb, KERN_ERR, "IO charset %s not found",
1483 sbi->options.iocharset); 1482 sbi->options.iocharset);
1484 goto out_fail; 1483 goto out_fail;
1485 } 1484 }
@@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1503 insert_inode_hash(root_inode); 1502 insert_inode_hash(root_inode);
1504 sb->s_root = d_alloc_root(root_inode); 1503 sb->s_root = d_alloc_root(root_inode);
1505 if (!sb->s_root) { 1504 if (!sb->s_root) {
1506 printk(KERN_ERR "FAT: get root inode failed\n"); 1505 fat_msg(sb, KERN_ERR, "get root inode failed");
1507 goto out_fail; 1506 goto out_fail;
1508 } 1507 }
1509 1508
@@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1512out_invalid: 1511out_invalid:
1513 error = -EINVAL; 1512 error = -EINVAL;
1514 if (!silent) 1513 if (!silent)
1515 printk(KERN_INFO "VFS: Can't find a valid FAT filesystem" 1514 fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
1516 " on dev %s.\n", sb->s_id);
1517 1515
1518out_fail: 1516out_fail:
1519 if (fat_inode) 1517 if (fat_inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 970e682ea754..6d93360ca0cc 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,30 +20,46 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) 23void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
26 va_list args; 26 va_list args;
27 struct va_format vaf;
27 28
28 if (report) { 29 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
30
31 printk(KERN_ERR " ");
32 va_start(args, fmt); 30 va_start(args, fmt);
33 vprintk(fmt, args); 31 vaf.fmt = fmt;
32 vaf.va = &args;
33 printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
34 va_end(args); 34 va_end(args);
35 printk("\n");
36 } 35 }
37 36
38 if (opts->errors == FAT_ERRORS_PANIC) 37 if (opts->errors == FAT_ERRORS_PANIC)
39 panic("FAT: fs panic from previous error\n"); 38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
41 s->s_flags |= MS_RDONLY; 40 sb->s_flags |= MS_RDONLY;
42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); 41 printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
42 "set read-only\n", sb->s_id);
43 } 43 }
44} 44}
45EXPORT_SYMBOL_GPL(__fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
46 46
47/**
48 * fat_msg() - print preformated FAT specific messages. Every thing what is
49 * not fat_fs_error() should be fat_msg().
50 */
51void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
52{
53 struct va_format vaf;
54 va_list args;
55
56 va_start(args, fmt);
57 vaf.fmt = fmt;
58 vaf.va = &args;
59 printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
60 va_end(args);
61}
62
47/* Flushes the number of free clusters on FAT32 */ 63/* Flushes the number of free clusters on FAT32 */
48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 64/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
49int fat_clusters_flush(struct super_block *sb) 65int fat_clusters_flush(struct super_block *sb)
@@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb)
57 73
58 bh = sb_bread(sb, sbi->fsinfo_sector); 74 bh = sb_bread(sb, sbi->fsinfo_sector);
59 if (bh == NULL) { 75 if (bh == NULL) {
60 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 76 fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
61 return -EIO; 77 return -EIO;
62 } 78 }
63 79
64 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 80 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
65 /* Sanity check */ 81 /* Sanity check */
66 if (!IS_FSINFO(fsinfo)) { 82 if (!IS_FSINFO(fsinfo)) {
67 printk(KERN_ERR "FAT: Invalid FSINFO signature: " 83 fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
68 "0x%08x, 0x%08x (sector = %lu)\n", 84 "0x%08x, 0x%08x (sector = %lu)",
69 le32_to_cpu(fsinfo->signature1), 85 le32_to_cpu(fsinfo->signature1),
70 le32_to_cpu(fsinfo->signature2), 86 le32_to_cpu(fsinfo->signature2),
71 sbi->fsinfo_sector); 87 sbi->fsinfo_sector);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 711499040eb6..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -659,14 +659,14 @@ static const struct inode_operations msdos_dir_inode_operations = {
659 659
660static void setup(struct super_block *sb) 660static void setup(struct super_block *sb)
661{ 661{
662 MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
662 sb->s_d_op = &msdos_dentry_operations; 663 sb->s_d_op = &msdos_dentry_operations;
663 sb->s_flags |= MS_NOATIME; 664 sb->s_flags |= MS_NOATIME;
664} 665}
665 666
666static int msdos_fill_super(struct super_block *sb, void *data, int silent) 667static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{ 668{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 669 return fat_fill_super(sb, data, silent, 0, setup);
669 0, setup);
670} 670}
671 671
672static struct dentry *msdos_mount(struct file_system_type *fs_type, 672static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index adae3fb7451a..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1065,6 +1065,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
1065 1065
1066static void setup(struct super_block *sb) 1066static void setup(struct super_block *sb)
1067{ 1067{
1068 MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
1068 if (MSDOS_SB(sb)->options.name_check != 's') 1069 if (MSDOS_SB(sb)->options.name_check != 's')
1069 sb->s_d_op = &vfat_ci_dentry_ops; 1070 sb->s_d_op = &vfat_ci_dentry_ops;
1070 else 1071 else
@@ -1073,8 +1074,7 @@ static void setup(struct super_block *sb)
1073 1074
1074static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1075static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1075{ 1076{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1077 return fat_fill_super(sb, data, silent, 1, setup);
1077 1, setup);
1078} 1078}
1079 1079
1080static struct dentry *vfat_mount(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
33 _enter("{OBJ%x OP%x,%u}", 33 _enter("{OBJ%x OP%x,%u}",
34 op->object->debug_id, op->debug_id, atomic_read(&op->usage)); 34 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
35 35
36 fscache_set_op_state(op, "EnQ");
37
38 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
39 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
40 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
66static void fscache_run_op(struct fscache_object *object, 64static void fscache_run_op(struct fscache_object *object,
67 struct fscache_operation *op) 65 struct fscache_operation *op)
68{ 66{
69 fscache_set_op_state(op, "Run");
70
71 object->n_in_progress++; 67 object->n_in_progress++;
72 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 68 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
73 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 69 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
88 84
89 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 85 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
90 86
91 fscache_set_op_state(op, "SubmitX");
92
93 spin_lock(&object->lock); 87 spin_lock(&object->lock);
94 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 88 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
95 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 89 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
194 188
195 ASSERTCMP(atomic_read(&op->usage), >, 0); 189 ASSERTCMP(atomic_read(&op->usage), >, 0);
196 190
197 fscache_set_op_state(op, "Submit");
198
199 spin_lock(&object->lock); 191 spin_lock(&object->lock);
200 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 192 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
201 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 193 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
335 if (!atomic_dec_and_test(&op->usage)) 327 if (!atomic_dec_and_test(&op->usage))
336 return; 328 return;
337 329
338 fscache_set_op_state(op, "Put");
339
340 _debug("PUT OP"); 330 _debug("PUT OP");
341 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 331 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
342 BUG(); 332 BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
155 fscache_stat(&fscache_n_attr_changed_calls); 155 fscache_stat(&fscache_n_attr_changed_calls);
156 156
157 if (fscache_object_is_active(object)) { 157 if (fscache_object_is_active(object)) {
158 fscache_set_op_state(op, "CallFS");
159 fscache_stat(&fscache_n_cop_attr_changed); 158 fscache_stat(&fscache_n_cop_attr_changed);
160 ret = object->cache->ops->attr_changed(object); 159 ret = object->cache->ops->attr_changed(object);
161 fscache_stat_d(&fscache_n_cop_attr_changed); 160 fscache_stat_d(&fscache_n_cop_attr_changed);
162 fscache_set_op_state(op, "Done");
163 if (ret < 0) 161 if (ret < 0)
164 fscache_abort_object(object); 162 fscache_abort_object(object);
165 } 163 }
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
190 188
191 fscache_operation_init(op, fscache_attr_changed_op, NULL); 189 fscache_operation_init(op, fscache_attr_changed_op, NULL);
192 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 190 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
193 fscache_set_op_name(op, "Attr");
194 191
195 spin_lock(&cookie->lock); 192 spin_lock(&cookie->lock);
196 193
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
257 op->context = context; 254 op->context = context;
258 op->start_time = jiffies; 255 op->start_time = jiffies;
259 INIT_LIST_HEAD(&op->to_do); 256 INIT_LIST_HEAD(&op->to_do);
260 fscache_set_op_name(&op->op, "Retr");
261 return op; 257 return op;
262} 258}
263 259
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
368 _leave(" = -ENOMEM"); 364 _leave(" = -ENOMEM");
369 return -ENOMEM; 365 return -ENOMEM;
370 } 366 }
371 fscache_set_op_name(&op->op, "RetrRA1");
372 367
373 spin_lock(&cookie->lock); 368 spin_lock(&cookie->lock);
374 369
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
487 op = fscache_alloc_retrieval(mapping, end_io_func, context); 482 op = fscache_alloc_retrieval(mapping, end_io_func, context);
488 if (!op) 483 if (!op)
489 return -ENOMEM; 484 return -ENOMEM;
490 fscache_set_op_name(&op->op, "RetrRAN");
491 485
492 spin_lock(&cookie->lock); 486 spin_lock(&cookie->lock);
493 487
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
589 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 583 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
590 if (!op) 584 if (!op)
591 return -ENOMEM; 585 return -ENOMEM;
592 fscache_set_op_name(&op->op, "RetrAL1");
593 586
594 spin_lock(&cookie->lock); 587 spin_lock(&cookie->lock);
595 588
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
662 655
663 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); 656 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
664 657
665 fscache_set_op_state(&op->op, "GetPage");
666
667 spin_lock(&object->lock); 658 spin_lock(&object->lock);
668 cookie = object->cookie; 659 cookie = object->cookie;
669 660
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
698 spin_unlock(&cookie->stores_lock); 689 spin_unlock(&cookie->stores_lock);
699 spin_unlock(&object->lock); 690 spin_unlock(&object->lock);
700 691
701 fscache_set_op_state(&op->op, "Store");
702 fscache_stat(&fscache_n_store_pages); 692 fscache_stat(&fscache_n_store_pages);
703 fscache_stat(&fscache_n_cop_write_page); 693 fscache_stat(&fscache_n_cop_write_page);
704 ret = object->cache->ops->write_page(op, page); 694 ret = object->cache->ops->write_page(op, page);
705 fscache_stat_d(&fscache_n_cop_write_page); 695 fscache_stat_d(&fscache_n_cop_write_page);
706 fscache_set_op_state(&op->op, "EndWrite");
707 fscache_end_page_write(object, page); 696 fscache_end_page_write(object, page);
708 if (ret < 0) { 697 if (ret < 0) {
709 fscache_set_op_state(&op->op, "Abort");
710 fscache_abort_object(object); 698 fscache_abort_object(object);
711 } else { 699 } else {
712 fscache_enqueue_operation(&op->op); 700 fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
778 fscache_operation_init(&op->op, fscache_write_op, 766 fscache_operation_init(&op->op, fscache_write_op,
779 fscache_release_write_op); 767 fscache_release_write_op);
780 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 768 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
781 fscache_set_op_name(&op->op, "Write1");
782 769
783 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 770 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
784 if (ret < 0) 771 if (ret < 0)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a2a6abbccc07..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1346,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1346} 1346}
1347 1347
1348 1348
1349static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1349static int gfs2_shrink_glock_memory(struct shrinker *shrink,
1350 struct shrink_control *sc)
1350{ 1351{
1351 struct gfs2_glock *gl; 1352 struct gfs2_glock *gl;
1352 int may_demote; 1353 int may_demote;
1353 int nr_skipped = 0; 1354 int nr_skipped = 0;
1355 int nr = sc->nr_to_scan;
1356 gfp_t gfp_mask = sc->gfp_mask;
1354 LIST_HEAD(skipped); 1357 LIST_HEAD(skipped);
1355 1358
1356 if (nr == 0) 1359 if (nr == 0)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
38 38
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/mm.h>
41#include <linux/spinlock.h> 42#include <linux/spinlock.h>
42#include <linux/completion.h> 43#include <linux/completion.h>
43#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
77static atomic_t qd_lru_count = ATOMIC_INIT(0); 78static atomic_t qd_lru_count = ATOMIC_INIT(0);
78static DEFINE_SPINLOCK(qd_lru_lock); 79static DEFINE_SPINLOCK(qd_lru_lock);
79 80
80int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 81int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
81{ 82{
82 struct gfs2_quota_data *qd; 83 struct gfs2_quota_data *qd;
83 struct gfs2_sbd *sdp; 84 struct gfs2_sbd *sdp;
85 int nr_to_scan = sc->nr_to_scan;
84 86
85 if (nr == 0) 87 if (nr_to_scan == 0)
86 goto out; 88 goto out;
87 89
88 if (!(gfp_mask & __GFP_FS)) 90 if (!(sc->gfp_mask & __GFP_FS))
89 return -1; 91 return -1;
90 92
91 spin_lock(&qd_lru_lock); 93 spin_lock(&qd_lru_lock);
92 while (nr && !list_empty(&qd_lru_list)) { 94 while (nr_to_scan && !list_empty(&qd_lru_list)) {
93 qd = list_entry(qd_lru_list.next, 95 qd = list_entry(qd_lru_list.next,
94 struct gfs2_quota_data, qd_reclaim); 96 struct gfs2_quota_data, qd_reclaim);
95 sdp = qd->qd_gl->gl_sbd; 97 sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
110 spin_unlock(&qd_lru_lock); 112 spin_unlock(&qd_lru_lock);
111 kmem_cache_free(gfs2_quotad_cachep, qd); 113 kmem_cache_free(gfs2_quotad_cachep, qd);
112 spin_lock(&qd_lru_lock); 114 spin_lock(&qd_lru_lock);
113 nr--; 115 nr_to_scan--;
114 } 116 }
115 spin_unlock(&qd_lru_lock); 117 spin_unlock(&qd_lru_lock);
116 118
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
12 12
13struct gfs2_inode; 13struct gfs2_inode;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct shrink_control;
15 16
16#define NO_QUOTA_CHANGE ((u32)-1) 17#define NO_QUOTA_CHANGE ((u32)-1)
17 18
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
51 return ret; 52 return ret;
52} 53}
53 54
54extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); 55extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
56 struct shrink_control *sc);
55extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
56 58
57#endif /* __QUOTA_DOT_H__ */ 59#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..e7a035781b7d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
412 pgoff = offset >> PAGE_SHIFT; 412 pgoff = offset >> PAGE_SHIFT;
413 413
414 i_size_write(inode, offset); 414 i_size_write(inode, offset);
415 spin_lock(&mapping->i_mmap_lock); 415 mutex_lock(&mapping->i_mmap_mutex);
416 if (!prio_tree_empty(&mapping->i_mmap)) 416 if (!prio_tree_empty(&mapping->i_mmap))
417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
418 spin_unlock(&mapping->i_mmap_lock); 418 mutex_unlock(&mapping->i_mmap_mutex);
419 truncate_hugepages(inode, offset); 419 truncate_hugepages(inode, offset);
420 return 0; 420 return 0;
421} 421}
diff --git a/fs/inode.c b/fs/inode.c
index 05f4fa521325..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -326,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
326 memset(mapping, 0, sizeof(*mapping)); 326 memset(mapping, 0, sizeof(*mapping));
327 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 327 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
328 spin_lock_init(&mapping->tree_lock); 328 spin_lock_init(&mapping->tree_lock);
329 spin_lock_init(&mapping->i_mmap_lock); 329 mutex_init(&mapping->i_mmap_mutex);
330 INIT_LIST_HEAD(&mapping->private_list); 330 INIT_LIST_HEAD(&mapping->private_list);
331 spin_lock_init(&mapping->private_lock); 331 spin_lock_init(&mapping->private_lock);
332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
334 mutex_init(&mapping->unmap_mutex);
335} 334}
336EXPORT_SYMBOL(address_space_init_once); 335EXPORT_SYMBOL(address_space_init_once);
337 336
@@ -752,8 +751,12 @@ static void prune_icache(int nr_to_scan)
752 * This function is passed the number of inodes to scan, and it returns the 751 * This function is passed the number of inodes to scan, and it returns the
753 * total number of remaining possibly-reclaimable inodes. 752 * total number of remaining possibly-reclaimable inodes.
754 */ 753 */
755static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 754static int shrink_icache_memory(struct shrinker *shrink,
755 struct shrink_control *sc)
756{ 756{
757 int nr = sc->nr_to_scan;
758 gfp_t gfp_mask = sc->gfp_mask;
759
757 if (nr) { 760 if (nr) {
758 /* 761 /*
759 * Nasty deadlock avoidance. We may hold various FS locks, 762 * Nasty deadlock avoidance. We may hold various FS locks,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
302 * all outstanding updates to complete. 302 * all outstanding updates to complete.
303 */ 303 */
304 304
305#ifdef COMMIT_STATS
306 spin_lock(&journal->j_list_lock);
307 summarise_journal_usage(journal);
308 spin_unlock(&journal->j_list_lock);
309#endif
310
311 /* Do we need to erase the effects of a prior journal_flush? */ 305 /* Do we need to erase the effects of a prior journal_flush? */
312 if (journal->j_flags & JFS_FLUSHED) { 306 if (journal->j_flags & JFS_FLUSHED) {
313 jbd_debug(3, "super block updated\n"); 307 jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
722 required. */ 716 required. */
723 JBUFFER_TRACE(jh, "file as BJ_Forget"); 717 JBUFFER_TRACE(jh, "file as BJ_Forget");
724 journal_file_buffer(jh, commit_transaction, BJ_Forget); 718 journal_file_buffer(jh, commit_transaction, BJ_Forget);
725 /* Wake up any transactions which were waiting for this 719 /*
726 IO to complete */ 720 * Wake up any transactions which were waiting for this
721 * IO to complete. The barrier must be here so that changes
722 * by journal_file_buffer() take effect before wake_up_bit()
723 * does the waitqueue check.
724 */
725 smp_mb();
727 wake_up_bit(&bh->b_state, BH_Unshadow); 726 wake_up_bit(&bh->b_state, BH_Unshadow);
728 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 727 JBUFFER_TRACE(jh, "brelse shadowed buffer");
729 __brelse(bh); 728 __brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
437int __log_start_commit(journal_t *journal, tid_t target) 437int __log_start_commit(journal_t *journal, tid_t target)
438{ 438{
439 /* 439 /*
440 * Are we already doing a recent enough commit? 440 * The only transaction we can possibly wait upon is the
441 * currently running transaction (if it exists). Otherwise,
442 * the target tid must be an old one.
441 */ 443 */
442 if (!tid_geq(journal->j_commit_request, target)) { 444 if (journal->j_running_transaction &&
445 journal->j_running_transaction->t_tid == target) {
443 /* 446 /*
444 * We want a new commit: OK, mark the request and wakeup the 447 * We want a new commit: OK, mark the request and wakeup the
445 * commit thread. We do _not_ do the commit ourselves. 448 * commit thread. We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
451 journal->j_commit_sequence); 454 journal->j_commit_sequence);
452 wake_up(&journal->j_wait_commit); 455 wake_up(&journal->j_wait_commit);
453 return 1; 456 return 1;
454 } 457 } else if (!tid_geq(journal->j_commit_request, target))
458 /* This should never happen, but if it does, preserve
459 the evidence before kjournald goes into a loop and
460 increments j_commit_sequence beyond all recognition. */
461 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
462 journal->j_commit_request, journal->j_commit_sequence,
463 target, journal->j_running_transaction ?
464 journal->j_running_transaction->t_tid : 0);
455 return 0; 465 return 0;
456} 466}
457 467
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
266 * This function is visible to journal users (like ext3fs), so is not 266 * This function is visible to journal users (like ext3fs), so is not
267 * called with the journal already locked. 267 * called with the journal already locked.
268 * 268 *
269 * Return a pointer to a newly allocated handle, or NULL on failure 269 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
270 * on failure.
270 */ 271 */
271handle_t *journal_start(journal_t *journal, int nblocks) 272handle_t *journal_start(journal_t *journal, int nblocks)
272{ 273{
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..29148a81c783 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
338 * all outstanding updates to complete. 338 * all outstanding updates to complete.
339 */ 339 */
340 340
341#ifdef COMMIT_STATS
342 spin_lock(&journal->j_list_lock);
343 summarise_journal_usage(journal);
344 spin_unlock(&journal->j_list_lock);
345#endif
346
347 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 341 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
348 if (journal->j_flags & JBD2_FLUSHED) { 342 if (journal->j_flags & JBD2_FLUSHED) {
349 jbd_debug(3, "super block updated\n"); 343 jbd_debug(3, "super block updated\n");
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
90 * What the mbcache registers as to get shrunk dynamically. 90 * What the mbcache registers as to get shrunk dynamically.
91 */ 91 */
92 92
93static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); 93static int mb_cache_shrink_fn(struct shrinker *shrink,
94 struct shrink_control *sc);
94 95
95static struct shrinker mb_cache_shrinker = { 96static struct shrinker mb_cache_shrinker = {
96 .shrink = mb_cache_shrink_fn, 97 .shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
156 * gets low. 157 * gets low.
157 * 158 *
158 * @shrink: (ignored) 159 * @shrink: (ignored)
159 * @nr_to_scan: Number of objects to scan 160 * @sc: shrink_control passed from reclaim
160 * @gfp_mask: (ignored)
161 * 161 *
162 * Returns the number of objects which are present in the cache. 162 * Returns the number of objects which are present in the cache.
163 */ 163 */
164static int 164static int
165mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 165mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
166{ 166{
167 LIST_HEAD(free_list); 167 LIST_HEAD(free_list);
168 struct mb_cache *cache; 168 struct mb_cache *cache;
169 struct mb_cache_entry *entry, *tmp; 169 struct mb_cache_entry *entry, *tmp;
170 int count = 0; 170 int count = 0;
171 int nr_to_scan = sc->nr_to_scan;
172 gfp_t gfp_mask = sc->gfp_mask;
171 173
172 mb_debug("trying to free %d entries", nr_to_scan); 174 mb_debug("trying to free %d entries", nr_to_scan);
173 spin_lock(&mb_cache_spinlock); 175 spin_lock(&mb_cache_spinlock);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
461#endif 461#endif
462 struct ncp_entry_info finfo; 462 struct ncp_entry_info finfo;
463 463
464 data.wdog_pid = NULL; 464 memset(&data, 0, sizeof(data));
465 server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); 465 server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
466 if (!server) 466 if (!server)
467 return -ENOMEM; 467 return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
496 struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; 496 struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
497 497
498 data.flags = md->flags; 498 data.flags = md->flags;
499 data.int_flags = 0;
500 data.mounted_uid = md->mounted_uid; 499 data.mounted_uid = md->mounted_uid;
501 data.wdog_pid = find_get_pid(md->wdog_pid); 500 data.wdog_pid = find_get_pid(md->wdog_pid);
502 data.ncp_fd = md->ncp_fd; 501 data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
507 data.file_mode = md->file_mode; 506 data.file_mode = md->file_mode;
508 data.dir_mode = md->dir_mode; 507 data.dir_mode = md->dir_mode;
509 data.info_fd = -1; 508 data.info_fd = -1;
510 data.mounted_vol[0] = 0;
511 } 509 }
512 break; 510 break;
513 default: 511 default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
2042 } 2042 }
2043} 2043}
2044 2044
2045int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 2045int nfs_access_cache_shrinker(struct shrinker *shrink,
2046 struct shrink_control *sc)
2046{ 2047{
2047 LIST_HEAD(head); 2048 LIST_HEAD(head);
2048 struct nfs_inode *nfsi, *next; 2049 struct nfs_inode *nfsi, *next;
2049 struct nfs_access_entry *cache; 2050 struct nfs_access_entry *cache;
2051 int nr_to_scan = sc->nr_to_scan;
2052 gfp_t gfp_mask = sc->gfp_mask;
2050 2053
2051 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 2054 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2052 return (nr_to_scan == 0) ? 0 : -1; 2055 return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
234 234
235/* dir.c */ 235/* dir.c */
236extern int nfs_access_cache_shrinker(struct shrinker *shrink, 236extern int nfs_access_cache_shrinker(struct shrinker *shrink,
237 int nr_to_scan, gfp_t gfp_mask); 237 struct shrink_control *sc);
238 238
239/* inode.c */ 239/* inode.c */
240extern struct workqueue_struct *nfsiod_workqueue; 240extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
255 struct device_attribute *attr, char *buf) 255 struct device_attribute *attr, char *buf)
256{ 256{
257 struct hd_struct *p = dev_to_part(dev); 257 struct hd_struct *p = dev_to_part(dev);
258 return sprintf(buf, "%u\n", p->discard_alignment); 258 struct gendisk *disk = dev_to_disk(dev);
259
260 return sprintf(buf, "%u\n",
261 queue_limit_discard_alignment(&disk->queue->limits,
262 p->start_sect));
259} 263}
260 264
261ssize_t part_stat_show(struct device *dev, 265ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
449 p->start_sect = start; 453 p->start_sect = start;
450 p->alignment_offset = 454 p->alignment_offset =
451 queue_limit_alignment_offset(&disk->queue->limits, start); 455 queue_limit_alignment_offset(&disk->queue->limits, start);
452 p->discard_alignment =
453 queue_limit_discard_alignment(&disk->queue->limits, start);
454 p->nr_sects = len; 456 p->nr_sects = len;
455 p->partno = partno; 457 p->partno = partno;
456 p->policy = get_disk_ro(disk); 458 p->policy = get_disk_ro(disk);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..3763b436e69d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
62extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
63 63
64struct proc_maps_private {
65 struct pid *pid;
66 struct task_struct *task;
67#ifdef CONFIG_MMU
68 struct vm_area_struct *tail_vma;
69#endif
70};
71
64void proc_init_inodecache(void); 72void proc_init_inodecache(void);
65 73
66static inline struct pid *proc_pid(struct inode *inode) 74static inline struct pid *proc_pid(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..2c9db29ea358 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
858#endif /* CONFIG_PROC_PAGE_MONITOR */ 858#endif /* CONFIG_PROC_PAGE_MONITOR */
859 859
860#ifdef CONFIG_NUMA 860#ifdef CONFIG_NUMA
861extern int show_numa_map(struct seq_file *m, void *v); 861
862struct numa_maps {
863 struct vm_area_struct *vma;
864 unsigned long pages;
865 unsigned long anon;
866 unsigned long active;
867 unsigned long writeback;
868 unsigned long mapcount_max;
869 unsigned long dirty;
870 unsigned long swapcache;
871 unsigned long node[MAX_NUMNODES];
872};
873
874struct numa_maps_private {
875 struct proc_maps_private proc_maps;
876 struct numa_maps md;
877};
878
879static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
880{
881 int count = page_mapcount(page);
882
883 md->pages++;
884 if (pte_dirty || PageDirty(page))
885 md->dirty++;
886
887 if (PageSwapCache(page))
888 md->swapcache++;
889
890 if (PageActive(page) || PageUnevictable(page))
891 md->active++;
892
893 if (PageWriteback(page))
894 md->writeback++;
895
896 if (PageAnon(page))
897 md->anon++;
898
899 if (count > md->mapcount_max)
900 md->mapcount_max = count;
901
902 md->node[page_to_nid(page)]++;
903}
904
905static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
906 unsigned long end, struct mm_walk *walk)
907{
908 struct numa_maps *md;
909 spinlock_t *ptl;
910 pte_t *orig_pte;
911 pte_t *pte;
912
913 md = walk->private;
914 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
915 do {
916 struct page *page;
917 int nid;
918
919 if (!pte_present(*pte))
920 continue;
921
922 page = vm_normal_page(md->vma, addr, *pte);
923 if (!page)
924 continue;
925
926 if (PageReserved(page))
927 continue;
928
929 nid = page_to_nid(page);
930 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
931 continue;
932
933 gather_stats(page, md, pte_dirty(*pte));
934
935 } while (pte++, addr += PAGE_SIZE, addr != end);
936 pte_unmap_unlock(orig_pte, ptl);
937 return 0;
938}
939#ifdef CONFIG_HUGETLB_PAGE
940static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
941 unsigned long addr, unsigned long end, struct mm_walk *walk)
942{
943 struct numa_maps *md;
944 struct page *page;
945
946 if (pte_none(*pte))
947 return 0;
948
949 page = pte_page(*pte);
950 if (!page)
951 return 0;
952
953 md = walk->private;
954 gather_stats(page, md, pte_dirty(*pte));
955 return 0;
956}
957
958#else
959static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
960 unsigned long addr, unsigned long end, struct mm_walk *walk)
961{
962 return 0;
963}
964#endif
965
966/*
967 * Display pages allocated per node and memory policy via /proc.
968 */
969static int show_numa_map(struct seq_file *m, void *v)
970{
971 struct numa_maps_private *numa_priv = m->private;
972 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
973 struct vm_area_struct *vma = v;
974 struct numa_maps *md = &numa_priv->md;
975 struct file *file = vma->vm_file;
976 struct mm_struct *mm = vma->vm_mm;
977 struct mm_walk walk = {};
978 struct mempolicy *pol;
979 int n;
980 char buffer[50];
981
982 if (!mm)
983 return 0;
984
985 /* Ensure we start with an empty set of numa_maps statistics. */
986 memset(md, 0, sizeof(*md));
987
988 md->vma = vma;
989
990 walk.hugetlb_entry = gather_hugetbl_stats;
991 walk.pmd_entry = gather_pte_stats;
992 walk.private = md;
993 walk.mm = mm;
994
995 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
996 mpol_to_str(buffer, sizeof(buffer), pol, 0);
997 mpol_cond_put(pol);
998
999 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1000
1001 if (file) {
1002 seq_printf(m, " file=");
1003 seq_path(m, &file->f_path, "\n\t= ");
1004 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1005 seq_printf(m, " heap");
1006 } else if (vma->vm_start <= mm->start_stack &&
1007 vma->vm_end >= mm->start_stack) {
1008 seq_printf(m, " stack");
1009 }
1010
1011 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1012
1013 if (!md->pages)
1014 goto out;
1015
1016 if (md->anon)
1017 seq_printf(m, " anon=%lu", md->anon);
1018
1019 if (md->dirty)
1020 seq_printf(m, " dirty=%lu", md->dirty);
1021
1022 if (md->pages != md->anon && md->pages != md->dirty)
1023 seq_printf(m, " mapped=%lu", md->pages);
1024
1025 if (md->mapcount_max > 1)
1026 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1027
1028 if (md->swapcache)
1029 seq_printf(m, " swapcache=%lu", md->swapcache);
1030
1031 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1032 seq_printf(m, " active=%lu", md->active);
1033
1034 if (md->writeback)
1035 seq_printf(m, " writeback=%lu", md->writeback);
1036
1037 for_each_node_state(n, N_HIGH_MEMORY)
1038 if (md->node[n])
1039 seq_printf(m, " N%d=%lu", n, md->node[n]);
1040out:
1041 seq_putc(m, '\n');
1042
1043 if (m->count < m->size)
1044 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1045 return 0;
1046}
862 1047
863static const struct seq_operations proc_pid_numa_maps_op = { 1048static const struct seq_operations proc_pid_numa_maps_op = {
864 .start = m_start, 1049 .start = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
869 1054
870static int numa_maps_open(struct inode *inode, struct file *file) 1055static int numa_maps_open(struct inode *inode, struct file *file)
871{ 1056{
872 return do_maps_open(inode, file, &proc_pid_numa_maps_op); 1057 struct numa_maps_private *priv;
1058 int ret = -ENOMEM;
1059 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1060 if (priv) {
1061 priv->proc_maps.pid = proc_pid(inode);
1062 ret = seq_open(file, &proc_pid_numa_maps_op);
1063 if (!ret) {
1064 struct seq_file *m = file->private_data;
1065 m->private = priv;
1066 } else {
1067 kfree(priv);
1068 }
1069 }
1070 return ret;
873} 1071}
874 1072
875const struct file_operations proc_numa_maps_operations = { 1073const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
878 .llseek = seq_lseek, 1076 .llseek = seq_lseek,
879 .release = seq_release_private, 1077 .release = seq_release_private,
880}; 1078};
881#endif 1079#endif /* CONFIG_NUMA */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
691 * This is called from kswapd when we think we need some 691 * This is called from kswapd when we think we need some
692 * more memory 692 * more memory
693 */ 693 */
694static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 694static int shrink_dqcache_memory(struct shrinker *shrink,
695 struct shrink_control *sc)
695{ 696{
697 int nr = sc->nr_to_scan;
698
696 if (nr) { 699 if (nr) {
697 spin_lock(&dq_list_lock); 700 spin_lock(&dq_list_lock);
698 prune_dqcache(nr); 701 prune_dqcache(nr);
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
162 .get = generic_pipe_buf_get, 162 .get = generic_pipe_buf_get,
163}; 163};
164 164
165static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
166{
167 smp_mb();
168 if (waitqueue_active(&pipe->wait))
169 wake_up_interruptible(&pipe->wait);
170 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
171}
172
165/** 173/**
166 * splice_to_pipe - fill passed data into a pipe 174 * splice_to_pipe - fill passed data into a pipe
167 * @pipe: pipe to fill 175 * @pipe: pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
247 255
248 pipe_unlock(pipe); 256 pipe_unlock(pipe);
249 257
250 if (do_wakeup) { 258 if (do_wakeup)
251 smp_mb(); 259 wakeup_pipe_readers(pipe);
252 if (waitqueue_active(&pipe->wait))
253 wake_up_interruptible(&pipe->wait);
254 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
255 }
256 260
257 while (page_nr < spd_pages) 261 while (page_nr < spd_pages)
258 spd->spd_release(spd, page_nr++); 262 spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
1892 /* 1896 /*
1893 * If we put data in the output pipe, wakeup any potential readers. 1897 * If we put data in the output pipe, wakeup any potential readers.
1894 */ 1898 */
1895 if (ret > 0) { 1899 if (ret > 0)
1896 smp_mb(); 1900 wakeup_pipe_readers(opipe);
1897 if (waitqueue_active(&opipe->wait)) 1901
1898 wake_up_interruptible(&opipe->wait);
1899 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1900 }
1901 if (input_wakeup) 1902 if (input_wakeup)
1902 wakeup_pipe_writers(ipipe); 1903 wakeup_pipe_writers(ipipe);
1903 1904
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1976 /* 1977 /*
1977 * If we put data in the output pipe, wakeup any potential readers. 1978 * If we put data in the output pipe, wakeup any potential readers.
1978 */ 1979 */
1979 if (ret > 0) { 1980 if (ret > 0)
1980 smp_mb(); 1981 wakeup_pipe_readers(opipe);
1981 if (waitqueue_active(&opipe->wait))
1982 wake_up_interruptible(&opipe->wait);
1983 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1984 }
1985 1982
1986 return ret; 1983 return ret;
1987} 1984}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 8c4fc1425b3e..f67acbdda5e8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,16 +22,24 @@
22#include <linux/anon_inodes.h> 22#include <linux/anon_inodes.h>
23#include <linux/timerfd.h> 23#include <linux/timerfd.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/rcupdate.h>
25 26
26struct timerfd_ctx { 27struct timerfd_ctx {
27 struct hrtimer tmr; 28 struct hrtimer tmr;
28 ktime_t tintv; 29 ktime_t tintv;
30 ktime_t moffs;
29 wait_queue_head_t wqh; 31 wait_queue_head_t wqh;
30 u64 ticks; 32 u64 ticks;
31 int expired; 33 int expired;
32 int clockid; 34 int clockid;
35 struct rcu_head rcu;
36 struct list_head clist;
37 bool might_cancel;
33}; 38};
34 39
40static LIST_HEAD(cancel_list);
41static DEFINE_SPINLOCK(cancel_lock);
42
35/* 43/*
36 * This gets called when the timer event triggers. We set the "expired" 44 * This gets called when the timer event triggers. We set the "expired"
37 * flag, but we do not re-arm the timer (in case it's necessary, 45 * flag, but we do not re-arm the timer (in case it's necessary,
@@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
51 return HRTIMER_NORESTART; 59 return HRTIMER_NORESTART;
52} 60}
53 61
62/*
63 * Called when the clock was set to cancel the timers in the cancel
64 * list.
65 */
66void timerfd_clock_was_set(void)
67{
68 ktime_t moffs = ktime_get_monotonic_offset();
69 struct timerfd_ctx *ctx;
70 unsigned long flags;
71
72 rcu_read_lock();
73 list_for_each_entry_rcu(ctx, &cancel_list, clist) {
74 if (!ctx->might_cancel)
75 continue;
76 spin_lock_irqsave(&ctx->wqh.lock, flags);
77 if (ctx->moffs.tv64 != moffs.tv64) {
78 ctx->moffs.tv64 = KTIME_MAX;
79 wake_up_locked(&ctx->wqh);
80 }
81 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
82 }
83 rcu_read_unlock();
84}
85
86static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
87{
88 if (ctx->might_cancel) {
89 ctx->might_cancel = false;
90 spin_lock(&cancel_lock);
91 list_del_rcu(&ctx->clist);
92 spin_unlock(&cancel_lock);
93 }
94}
95
96static bool timerfd_canceled(struct timerfd_ctx *ctx)
97{
98 if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
99 return false;
100 ctx->moffs = ktime_get_monotonic_offset();
101 return true;
102}
103
104static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
105{
106 if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
107 (flags & TFD_TIMER_CANCEL_ON_SET)) {
108 if (!ctx->might_cancel) {
109 ctx->might_cancel = true;
110 spin_lock(&cancel_lock);
111 list_add_rcu(&ctx->clist, &cancel_list);
112 spin_unlock(&cancel_lock);
113 }
114 } else if (ctx->might_cancel) {
115 timerfd_remove_cancel(ctx);
116 }
117}
118
54static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) 119static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
55{ 120{
56 ktime_t remaining; 121 ktime_t remaining;
@@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
59 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; 124 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
60} 125}
61 126
62static void timerfd_setup(struct timerfd_ctx *ctx, int flags, 127static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
63 const struct itimerspec *ktmr) 128 const struct itimerspec *ktmr)
64{ 129{
65 enum hrtimer_mode htmode; 130 enum hrtimer_mode htmode;
66 ktime_t texp; 131 ktime_t texp;
132 int clockid = ctx->clockid;
67 133
68 htmode = (flags & TFD_TIMER_ABSTIME) ? 134 htmode = (flags & TFD_TIMER_ABSTIME) ?
69 HRTIMER_MODE_ABS: HRTIMER_MODE_REL; 135 HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
@@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
72 ctx->expired = 0; 138 ctx->expired = 0;
73 ctx->ticks = 0; 139 ctx->ticks = 0;
74 ctx->tintv = timespec_to_ktime(ktmr->it_interval); 140 ctx->tintv = timespec_to_ktime(ktmr->it_interval);
75 hrtimer_init(&ctx->tmr, ctx->clockid, htmode); 141 hrtimer_init(&ctx->tmr, clockid, htmode);
76 hrtimer_set_expires(&ctx->tmr, texp); 142 hrtimer_set_expires(&ctx->tmr, texp);
77 ctx->tmr.function = timerfd_tmrproc; 143 ctx->tmr.function = timerfd_tmrproc;
78 if (texp.tv64 != 0) 144 if (texp.tv64 != 0) {
79 hrtimer_start(&ctx->tmr, texp, htmode); 145 hrtimer_start(&ctx->tmr, texp, htmode);
146 if (timerfd_canceled(ctx))
147 return -ECANCELED;
148 }
149 return 0;
80} 150}
81 151
82static int timerfd_release(struct inode *inode, struct file *file) 152static int timerfd_release(struct inode *inode, struct file *file)
83{ 153{
84 struct timerfd_ctx *ctx = file->private_data; 154 struct timerfd_ctx *ctx = file->private_data;
85 155
156 timerfd_remove_cancel(ctx);
86 hrtimer_cancel(&ctx->tmr); 157 hrtimer_cancel(&ctx->tmr);
87 kfree(ctx); 158 kfree_rcu(ctx, rcu);
88 return 0; 159 return 0;
89} 160}
90 161
@@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
118 res = -EAGAIN; 189 res = -EAGAIN;
119 else 190 else
120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); 191 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
192
193 /*
194 * If clock has changed, we do not care about the
195 * ticks and we do not rearm the timer. Userspace must
196 * reevaluate anyway.
197 */
198 if (timerfd_canceled(ctx)) {
199 ctx->ticks = 0;
200 ctx->expired = 0;
201 res = -ECANCELED;
202 }
203
121 if (ctx->ticks) { 204 if (ctx->ticks) {
122 ticks = ctx->ticks; 205 ticks = ctx->ticks;
206
123 if (ctx->expired && ctx->tintv.tv64) { 207 if (ctx->expired && ctx->tintv.tv64) {
124 /* 208 /*
125 * If tintv.tv64 != 0, this is a periodic timer that 209 * If tintv.tv64 != 0, this is a periodic timer that
@@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
183 init_waitqueue_head(&ctx->wqh); 267 init_waitqueue_head(&ctx->wqh);
184 ctx->clockid = clockid; 268 ctx->clockid = clockid;
185 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 269 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
270 ctx->moffs = ktime_get_monotonic_offset();
186 271
187 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 272 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
188 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); 273 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
199 struct file *file; 284 struct file *file;
200 struct timerfd_ctx *ctx; 285 struct timerfd_ctx *ctx;
201 struct itimerspec ktmr, kotmr; 286 struct itimerspec ktmr, kotmr;
287 int ret;
202 288
203 if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) 289 if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
204 return -EFAULT; 290 return -EFAULT;
@@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
213 return PTR_ERR(file); 299 return PTR_ERR(file);
214 ctx = file->private_data; 300 ctx = file->private_data;
215 301
302 timerfd_setup_cancel(ctx, flags);
303
216 /* 304 /*
217 * We need to stop the existing timer before reprogramming 305 * We need to stop the existing timer before reprogramming
218 * it to the new values. 306 * it to the new values.
@@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
240 /* 328 /*
241 * Re-program the timer to the new value ... 329 * Re-program the timer to the new value ...
242 */ 330 */
243 timerfd_setup(ctx, flags, &ktmr); 331 ret = timerfd_setup(ctx, flags, &ktmr);
244 332
245 spin_unlock_irq(&ctx->wqh.lock); 333 spin_unlock_irq(&ctx->wqh.lock);
246 fput(file); 334 fput(file);
247 if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) 335 if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
248 return -EFAULT; 336 return -EFAULT;
249 337
250 return 0; 338 return ret;
251} 339}
252 340
253SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) 341SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
106 long long liab; 106 long long liab;
107 107
108 spin_lock(&c->space_lock); 108 spin_lock(&c->space_lock);
109 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; 109 liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
110 spin_unlock(&c->space_lock); 110 spin_unlock(&c->space_lock);
111 return liab; 111 return liab;
112} 112}
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
180 int idx_lebs; 180 int idx_lebs;
181 long long idx_size; 181 long long idx_size;
182 182
183 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 183 idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
184 /* And make sure we have thrice the index size of space reserved */ 184 /* And make sure we have thrice the index size of space reserved */
185 idx_size += idx_size << 1; 185 idx_size += idx_size << 1;
186 /* 186 /*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
292 * budgeted index space to the size of the current index, multiplies this by 3, 292 * budgeted index space to the size of the current index, multiplies this by 3,
293 * and makes sure this does not exceed the amount of free LEBs. 293 * and makes sure this does not exceed the amount of free LEBs.
294 * 294 *
295 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 295 * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
296 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 296 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
297 * be large, because UBIFS does not do any index consolidation as long as 297 * be large, because UBIFS does not do any index consolidation as long as
298 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 298 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
299 * will contain a lot of dirt. 299 * will contain a lot of dirt.
300 * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, 300 * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
301 * the index may be consolidated to take up to @c->min_idx_lebs LEBs. 301 * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
302 * 302 *
303 * This function returns zero in case of success, and %-ENOSPC in case of 303 * This function returns zero in case of success, and %-ENOSPC in case of
304 * failure. 304 * failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
343 c->lst.taken_empty_lebs; 343 c->lst.taken_empty_lebs;
344 if (unlikely(rsvd_idx_lebs > lebs)) { 344 if (unlikely(rsvd_idx_lebs > lebs)) {
345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " 345 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
346 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, 346 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
347 rsvd_idx_lebs); 347 rsvd_idx_lebs);
348 return -ENOSPC; 348 return -ENOSPC;
349 } 349 }
350 350
351 available = ubifs_calc_available(c, min_idx_lebs); 351 available = ubifs_calc_available(c, min_idx_lebs);
352 outstanding = c->budg_data_growth + c->budg_dd_growth; 352 outstanding = c->bi.data_growth + c->bi.dd_growth;
353 353
354 if (unlikely(available < outstanding)) { 354 if (unlikely(available < outstanding)) {
355 dbg_budg("out of data space: available %lld, outstanding %lld", 355 dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
360 if (available - outstanding <= c->rp_size && !can_use_rp(c)) 360 if (available - outstanding <= c->rp_size && !can_use_rp(c))
361 return -ENOSPC; 361 return -ENOSPC;
362 362
363 c->min_idx_lebs = min_idx_lebs; 363 c->bi.min_idx_lebs = min_idx_lebs;
364 return 0; 364 return 0;
365} 365}
366 366
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
393{ 393{
394 int data_growth; 394 int data_growth;
395 395
396 data_growth = req->new_ino ? c->inode_budget : 0; 396 data_growth = req->new_ino ? c->bi.inode_budget : 0;
397 if (req->new_page) 397 if (req->new_page)
398 data_growth += c->page_budget; 398 data_growth += c->bi.page_budget;
399 if (req->new_dent) 399 if (req->new_dent)
400 data_growth += c->dent_budget; 400 data_growth += c->bi.dent_budget;
401 data_growth += req->new_ino_d; 401 data_growth += req->new_ino_d;
402 return data_growth; 402 return data_growth;
403} 403}
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
413{ 413{
414 int dd_growth; 414 int dd_growth;
415 415
416 dd_growth = req->dirtied_page ? c->page_budget : 0; 416 dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
417 417
418 if (req->dirtied_ino) 418 if (req->dirtied_ino)
419 dd_growth += c->inode_budget << (req->dirtied_ino - 1); 419 dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
420 if (req->mod_dent) 420 if (req->mod_dent)
421 dd_growth += c->dent_budget; 421 dd_growth += c->bi.dent_budget;
422 dd_growth += req->dirtied_ino_d; 422 dd_growth += req->dirtied_ino_d;
423 return dd_growth; 423 return dd_growth;
424} 424}
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
460 460
461again: 461again:
462 spin_lock(&c->space_lock); 462 spin_lock(&c->space_lock);
463 ubifs_assert(c->budg_idx_growth >= 0); 463 ubifs_assert(c->bi.idx_growth >= 0);
464 ubifs_assert(c->budg_data_growth >= 0); 464 ubifs_assert(c->bi.data_growth >= 0);
465 ubifs_assert(c->budg_dd_growth >= 0); 465 ubifs_assert(c->bi.dd_growth >= 0);
466 466
467 if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { 467 if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
468 dbg_budg("no space"); 468 dbg_budg("no space");
469 spin_unlock(&c->space_lock); 469 spin_unlock(&c->space_lock);
470 return -ENOSPC; 470 return -ENOSPC;
471 } 471 }
472 472
473 c->budg_idx_growth += idx_growth; 473 c->bi.idx_growth += idx_growth;
474 c->budg_data_growth += data_growth; 474 c->bi.data_growth += data_growth;
475 c->budg_dd_growth += dd_growth; 475 c->bi.dd_growth += dd_growth;
476 476
477 err = do_budget_space(c); 477 err = do_budget_space(c);
478 if (likely(!err)) { 478 if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
484 } 484 }
485 485
486 /* Restore the old values */ 486 /* Restore the old values */
487 c->budg_idx_growth -= idx_growth; 487 c->bi.idx_growth -= idx_growth;
488 c->budg_data_growth -= data_growth; 488 c->bi.data_growth -= data_growth;
489 c->budg_dd_growth -= dd_growth; 489 c->bi.dd_growth -= dd_growth;
490 spin_unlock(&c->space_lock); 490 spin_unlock(&c->space_lock);
491 491
492 if (req->fast) { 492 if (req->fast) {
@@ -506,9 +506,9 @@ again:
506 goto again; 506 goto again;
507 } 507 }
508 dbg_budg("FS is full, -ENOSPC"); 508 dbg_budg("FS is full, -ENOSPC");
509 c->nospace = 1; 509 c->bi.nospace = 1;
510 if (can_use_rp(c) || c->rp_size == 0) 510 if (can_use_rp(c) || c->rp_size == 0)
511 c->nospace_rp = 1; 511 c->bi.nospace_rp = 1;
512 smp_wmb(); 512 smp_wmb();
513 } else 513 } else
514 ubifs_err("cannot budget space, error %d", err); 514 ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
523 * This function releases the space budgeted by 'ubifs_budget_space()'. Note, 523 * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
524 * since the index changes (which were budgeted for in @req->idx_growth) will 524 * since the index changes (which were budgeted for in @req->idx_growth) will
525 * only be written to the media on commit, this function moves the index budget 525 * only be written to the media on commit, this function moves the index budget
526 * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be 526 * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
527 * zeroed by the commit operation. 527 * by the commit operation.
528 */ 528 */
529void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) 529void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
530{ 530{
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
553 if (!req->data_growth && !req->dd_growth) 553 if (!req->data_growth && !req->dd_growth)
554 return; 554 return;
555 555
556 c->nospace = c->nospace_rp = 0; 556 c->bi.nospace = c->bi.nospace_rp = 0;
557 smp_wmb(); 557 smp_wmb();
558 558
559 spin_lock(&c->space_lock); 559 spin_lock(&c->space_lock);
560 c->budg_idx_growth -= req->idx_growth; 560 c->bi.idx_growth -= req->idx_growth;
561 c->budg_uncommitted_idx += req->idx_growth; 561 c->bi.uncommitted_idx += req->idx_growth;
562 c->budg_data_growth -= req->data_growth; 562 c->bi.data_growth -= req->data_growth;
563 c->budg_dd_growth -= req->dd_growth; 563 c->bi.dd_growth -= req->dd_growth;
564 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 564 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
565 565
566 ubifs_assert(c->budg_idx_growth >= 0); 566 ubifs_assert(c->bi.idx_growth >= 0);
567 ubifs_assert(c->budg_data_growth >= 0); 567 ubifs_assert(c->bi.data_growth >= 0);
568 ubifs_assert(c->budg_dd_growth >= 0); 568 ubifs_assert(c->bi.dd_growth >= 0);
569 ubifs_assert(c->min_idx_lebs < c->main_lebs); 569 ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
570 ubifs_assert(!(c->budg_idx_growth & 7)); 570 ubifs_assert(!(c->bi.idx_growth & 7));
571 ubifs_assert(!(c->budg_data_growth & 7)); 571 ubifs_assert(!(c->bi.data_growth & 7));
572 ubifs_assert(!(c->budg_dd_growth & 7)); 572 ubifs_assert(!(c->bi.dd_growth & 7));
573 spin_unlock(&c->space_lock); 573 spin_unlock(&c->space_lock);
574} 574}
575 575
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
586{ 586{
587 spin_lock(&c->space_lock); 587 spin_lock(&c->space_lock);
588 /* Release the index growth reservation */ 588 /* Release the index growth reservation */
589 c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; 589 c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
590 /* Release the data growth reservation */ 590 /* Release the data growth reservation */
591 c->budg_data_growth -= c->page_budget; 591 c->bi.data_growth -= c->bi.page_budget;
592 /* Increase the dirty data growth reservation instead */ 592 /* Increase the dirty data growth reservation instead */
593 c->budg_dd_growth += c->page_budget; 593 c->bi.dd_growth += c->bi.page_budget;
594 /* And re-calculate the indexing space reservation */ 594 /* And re-calculate the indexing space reservation */
595 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 595 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
596 spin_unlock(&c->space_lock); 596 spin_unlock(&c->space_lock);
597} 597}
598 598
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
612 612
613 memset(&req, 0, sizeof(struct ubifs_budget_req)); 613 memset(&req, 0, sizeof(struct ubifs_budget_req));
614 /* The "no space" flags will be cleared because dd_growth is > 0 */ 614 /* The "no space" flags will be cleared because dd_growth is > 0 */
615 req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); 615 req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
616 ubifs_release_budget(c, &req); 616 ubifs_release_budget(c, &req);
617} 617}
618 618
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
682 int rsvd_idx_lebs, lebs; 682 int rsvd_idx_lebs, lebs;
683 long long available, outstanding, free; 683 long long available, outstanding, free;
684 684
685 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); 685 ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
686 outstanding = c->budg_data_growth + c->budg_dd_growth; 686 outstanding = c->bi.data_growth + c->bi.dd_growth;
687 available = ubifs_calc_available(c, c->min_idx_lebs); 687 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
688 688
689 /* 689 /*
690 * When reporting free space to user-space, UBIFS guarantees that it is 690 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
697 * Note, the calculations below are similar to what we have in 697 * Note, the calculations below are similar to what we have in
698 * 'do_budget_space()', so refer there for comments. 698 * 'do_budget_space()', so refer there for comments.
699 */ 699 */
700 if (c->min_idx_lebs > c->lst.idx_lebs) 700 if (c->bi.min_idx_lebs > c->lst.idx_lebs)
701 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 701 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
702 else 702 else
703 rsvd_idx_lebs = 0; 703 rsvd_idx_lebs = 0;
704 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 704 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
182 c->mst_node->root_len = cpu_to_le32(zroot.len); 182 c->mst_node->root_len = cpu_to_le32(zroot.len);
183 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); 183 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
184 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); 184 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
185 c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); 185 c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz);
186 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); 186 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
187 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); 187 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
188 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); 188 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
38 37
39#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
40 39
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 42static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 43static char dbg_key_buf1[128];
45 44
46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags; 45unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 46unsigned int ubifs_tst_flags;
49 47
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
51module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); 48module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
52module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); 49module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
53 50
54MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
55MODULE_PARM_DESC(debug_chks, "Debug check flags"); 51MODULE_PARM_DESC(debug_chks, "Debug check flags");
56MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); 52MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
57 53
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
317 printk(KERN_DEBUG "\tflags %#x\n", sup_flags); 313 printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
318 printk(KERN_DEBUG "\t big_lpt %u\n", 314 printk(KERN_DEBUG "\t big_lpt %u\n",
319 !!(sup_flags & UBIFS_FLG_BIGLPT)); 315 !!(sup_flags & UBIFS_FLG_BIGLPT));
316 printk(KERN_DEBUG "\t space_fixup %u\n",
317 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
320 printk(KERN_DEBUG "\tmin_io_size %u\n", 318 printk(KERN_DEBUG "\tmin_io_size %u\n",
321 le32_to_cpu(sup->min_io_size)); 319 le32_to_cpu(sup->min_io_size));
322 printk(KERN_DEBUG "\tleb_size %u\n", 320 printk(KERN_DEBUG "\tleb_size %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
602 spin_unlock(&dbg_lock); 600 spin_unlock(&dbg_lock);
603} 601}
604 602
605void dbg_dump_budg(struct ubifs_info *c) 603void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
606{ 604{
607 int i; 605 int i;
608 struct rb_node *rb; 606 struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
610 struct ubifs_gced_idx_leb *idx_gc; 608 struct ubifs_gced_idx_leb *idx_gc;
611 long long available, outstanding, free; 609 long long available, outstanding, free;
612 610
613 ubifs_assert(spin_is_locked(&c->space_lock)); 611 spin_lock(&c->space_lock);
614 spin_lock(&dbg_lock); 612 spin_lock(&dbg_lock);
615 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 613 printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
616 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 614 "total budget sum %lld\n", current->pid,
617 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); 615 bi->data_growth + bi->dd_growth,
618 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " 616 bi->data_growth + bi->dd_growth + bi->idx_growth);
619 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, 617 printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
620 c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, 618 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
621 c->freeable_cnt); 619 bi->idx_growth);
622 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " 620 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
623 "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, 621 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
624 c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); 622 bi->uncommitted_idx);
623 printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
624 bi->page_budget, bi->inode_budget, bi->dent_budget);
625 printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
626 bi->nospace, bi->nospace_rp);
627 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
628 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
629
630 if (bi != &c->bi)
631 /*
632 * If we are dumping saved budgeting data, do not print
633 * additional information which is about the current state, not
634 * the old one which corresponded to the saved budgeting data.
635 */
636 goto out_unlock;
637
638 printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
639 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
625 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " 640 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
626 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), 641 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
627 atomic_long_read(&c->dirty_zn_cnt), 642 atomic_long_read(&c->dirty_zn_cnt),
628 atomic_long_read(&c->clean_zn_cnt)); 643 atomic_long_read(&c->clean_zn_cnt));
629 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
630 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
631 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 644 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
632 c->gc_lnum, c->ihead_lnum); 645 c->gc_lnum, c->ihead_lnum);
646
633 /* If we are in R/O mode, journal heads do not exist */ 647 /* If we are in R/O mode, journal heads do not exist */
634 if (c->jheads) 648 if (c->jheads)
635 for (i = 0; i < c->jhead_cnt; i++) 649 for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
648 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 662 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
649 663
650 /* Print budgeting predictions */ 664 /* Print budgeting predictions */
651 available = ubifs_calc_available(c, c->min_idx_lebs); 665 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
652 outstanding = c->budg_data_growth + c->budg_dd_growth; 666 outstanding = c->bi.data_growth + c->bi.dd_growth;
653 free = ubifs_get_free_space_nolock(c); 667 free = ubifs_get_free_space_nolock(c);
654 printk(KERN_DEBUG "Budgeting predictions:\n"); 668 printk(KERN_DEBUG "Budgeting predictions:\n");
655 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 669 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
656 available, outstanding, free); 670 available, outstanding, free);
671out_unlock:
657 spin_unlock(&dbg_lock); 672 spin_unlock(&dbg_lock);
673 spin_unlock(&c->space_lock);
658} 674}
659 675
660void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) 676void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
729 if (bud->lnum == lp->lnum) { 745 if (bud->lnum == lp->lnum) {
730 int head = 0; 746 int head = 0;
731 for (i = 0; i < c->jhead_cnt; i++) { 747 for (i = 0; i < c->jhead_cnt; i++) {
732 if (lp->lnum == c->jheads[i].wbuf.lnum) { 748 /*
749 * Note, if we are in R/O mode or in the middle
750 * of mounting/re-mounting, the write-buffers do
751 * not exist.
752 */
753 if (c->jheads &&
754 lp->lnum == c->jheads[i].wbuf.lnum) {
733 printk(KERN_CONT ", jhead %s", 755 printk(KERN_CONT ", jhead %s",
734 dbg_jhead(i)); 756 dbg_jhead(i));
735 head = 1; 757 head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
976 998
977 spin_lock(&c->space_lock); 999 spin_lock(&c->space_lock);
978 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); 1000 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
1001 memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
1002 d->saved_idx_gc_cnt = c->idx_gc_cnt;
979 1003
980 /* 1004 /*
981 * We use a dirty hack here and zero out @c->freeable_cnt, because it 1005 * We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
1042out: 1066out:
1043 ubifs_msg("saved lprops statistics dump"); 1067 ubifs_msg("saved lprops statistics dump");
1044 dbg_dump_lstats(&d->saved_lst); 1068 dbg_dump_lstats(&d->saved_lst);
1045 ubifs_get_lp_stats(c, &lst); 1069 ubifs_msg("saved budgeting info dump");
1046 1070 dbg_dump_budg(c, &d->saved_bi);
1071 ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
1047 ubifs_msg("current lprops statistics dump"); 1072 ubifs_msg("current lprops statistics dump");
1073 ubifs_get_lp_stats(c, &lst);
1048 dbg_dump_lstats(&lst); 1074 dbg_dump_lstats(&lst);
1049 1075 ubifs_msg("current budgeting info dump");
1050 spin_lock(&c->space_lock); 1076 dbg_dump_budg(c, &c->bi);
1051 dbg_dump_budg(c);
1052 spin_unlock(&c->space_lock);
1053 dump_stack(); 1077 dump_stack();
1054 return -EINVAL; 1078 return -EINVAL;
1055} 1079}
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
1793 struct rb_node **p, *parent = NULL; 1817 struct rb_node **p, *parent = NULL;
1794 struct fsck_inode *fscki; 1818 struct fsck_inode *fscki;
1795 ino_t inum = key_inum_flash(c, &ino->key); 1819 ino_t inum = key_inum_flash(c, &ino->key);
1820 struct inode *inode;
1821 struct ubifs_inode *ui;
1796 1822
1797 p = &fsckd->inodes.rb_node; 1823 p = &fsckd->inodes.rb_node;
1798 while (*p) { 1824 while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
1816 if (!fscki) 1842 if (!fscki)
1817 return ERR_PTR(-ENOMEM); 1843 return ERR_PTR(-ENOMEM);
1818 1844
1845 inode = ilookup(c->vfs_sb, inum);
1846
1819 fscki->inum = inum; 1847 fscki->inum = inum;
1820 fscki->nlink = le32_to_cpu(ino->nlink); 1848 /*
1821 fscki->size = le64_to_cpu(ino->size); 1849 * If the inode is present in the VFS inode cache, use it instead of
1822 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); 1850 * the on-flash inode which might be out-of-date. E.g., the size might
1823 fscki->xattr_sz = le32_to_cpu(ino->xattr_size); 1851 * be out-of-date. If we do not do this, the following may happen, for
1824 fscki->xattr_nms = le32_to_cpu(ino->xattr_names); 1852 * example:
1825 fscki->mode = le32_to_cpu(ino->mode); 1853 * 1. A power cut happens
1854 * 2. We mount the file-system R/O, the replay process fixes up the
1855 * inode size in the VFS cache, but not on-flash.
1856 * 3. 'check_leaf()' fails because it hits a data node beyond inode
1857 * size.
1858 */
1859 if (!inode) {
1860 fscki->nlink = le32_to_cpu(ino->nlink);
1861 fscki->size = le64_to_cpu(ino->size);
1862 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
1863 fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
1864 fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
1865 fscki->mode = le32_to_cpu(ino->mode);
1866 } else {
1867 ui = ubifs_inode(inode);
1868 fscki->nlink = inode->i_nlink;
1869 fscki->size = inode->i_size;
1870 fscki->xattr_cnt = ui->xattr_cnt;
1871 fscki->xattr_sz = ui->xattr_size;
1872 fscki->xattr_nms = ui->xattr_names;
1873 fscki->mode = inode->i_mode;
1874 iput(inode);
1875 }
1876
1826 if (S_ISDIR(fscki->mode)) { 1877 if (S_ISDIR(fscki->mode)) {
1827 fscki->calc_sz = UBIFS_INO_NODE_SZ; 1878 fscki->calc_sz = UBIFS_INO_NODE_SZ;
1828 fscki->calc_cnt = 2; 1879 fscki->calc_cnt = 2;
1829 } 1880 }
1881
1830 rb_link_node(&fscki->rb, parent, p); 1882 rb_link_node(&fscki->rb, parent, p);
1831 rb_insert_color(&fscki->rb, &fsckd->inodes); 1883 rb_insert_color(&fscki->rb, &fsckd->inodes);
1884
1832 return fscki; 1885 return fscki;
1833} 1886}
1834 1887
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2421 hashb = key_block(c, &sb->key); 2474 hashb = key_block(c, &sb->key);
2422 2475
2423 if (hasha > hashb) { 2476 if (hasha > hashb) {
2424 ubifs_err("larger hash %u goes before %u", hasha, hashb); 2477 ubifs_err("larger hash %u goes before %u",
2478 hasha, hashb);
2425 goto error_dump; 2479 goto error_dump;
2426 } 2480 }
2427 } 2481 }
@@ -2437,14 +2491,12 @@ error_dump:
2437 return 0; 2491 return 0;
2438} 2492}
2439 2493
2440static int invocation_cnt;
2441
2442int dbg_force_in_the_gaps(void) 2494int dbg_force_in_the_gaps(void)
2443{ 2495{
2444 if (!dbg_force_in_the_gaps_enabled) 2496 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2445 return 0; 2497 return 0;
2446 /* Force in-the-gaps every 8th commit */ 2498
2447 return !((invocation_cnt++) & 0x7); 2499 return !(random32() & 7);
2448} 2500}
2449 2501
2450/* Failure mode for recovery testing */ 2502/* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2632 int len, int check) 2684 int len, int check)
2633{ 2685{
2634 if (in_failure_mode(desc)) 2686 if (in_failure_mode(desc))
2635 return -EIO; 2687 return -EROFS;
2636 return ubi_leb_read(desc, lnum, buf, offset, len, check); 2688 return ubi_leb_read(desc, lnum, buf, offset, len, check);
2637} 2689}
2638 2690
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2642 int err, failing; 2694 int err, failing;
2643 2695
2644 if (in_failure_mode(desc)) 2696 if (in_failure_mode(desc))
2645 return -EIO; 2697 return -EROFS;
2646 failing = do_fail(desc, lnum, 1); 2698 failing = do_fail(desc, lnum, 1);
2647 if (failing) 2699 if (failing)
2648 cut_data(buf, len); 2700 cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2650 if (err) 2702 if (err)
2651 return err; 2703 return err;
2652 if (failing) 2704 if (failing)
2653 return -EIO; 2705 return -EROFS;
2654 return 0; 2706 return 0;
2655} 2707}
2656 2708
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
2660 int err; 2712 int err;
2661 2713
2662 if (do_fail(desc, lnum, 1)) 2714 if (do_fail(desc, lnum, 1))
2663 return -EIO; 2715 return -EROFS;
2664 err = ubi_leb_change(desc, lnum, buf, len, dtype); 2716 err = ubi_leb_change(desc, lnum, buf, len, dtype);
2665 if (err) 2717 if (err)
2666 return err; 2718 return err;
2667 if (do_fail(desc, lnum, 1)) 2719 if (do_fail(desc, lnum, 1))
2668 return -EIO; 2720 return -EROFS;
2669 return 0; 2721 return 0;
2670} 2722}
2671 2723
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
2674 int err; 2726 int err;
2675 2727
2676 if (do_fail(desc, lnum, 0)) 2728 if (do_fail(desc, lnum, 0))
2677 return -EIO; 2729 return -EROFS;
2678 err = ubi_leb_erase(desc, lnum); 2730 err = ubi_leb_erase(desc, lnum);
2679 if (err) 2731 if (err)
2680 return err; 2732 return err;
2681 if (do_fail(desc, lnum, 0)) 2733 if (do_fail(desc, lnum, 0))
2682 return -EIO; 2734 return -EROFS;
2683 return 0; 2735 return 0;
2684} 2736}
2685 2737
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
2688 int err; 2740 int err;
2689 2741
2690 if (do_fail(desc, lnum, 0)) 2742 if (do_fail(desc, lnum, 0))
2691 return -EIO; 2743 return -EROFS;
2692 err = ubi_leb_unmap(desc, lnum); 2744 err = ubi_leb_unmap(desc, lnum);
2693 if (err) 2745 if (err)
2694 return err; 2746 return err;
2695 if (do_fail(desc, lnum, 0)) 2747 if (do_fail(desc, lnum, 0))
2696 return -EIO; 2748 return -EROFS;
2697 return 0; 2749 return 0;
2698} 2750}
2699 2751
2700int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) 2752int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
2701{ 2753{
2702 if (in_failure_mode(desc)) 2754 if (in_failure_mode(desc))
2703 return -EIO; 2755 return -EROFS;
2704 return ubi_is_mapped(desc, lnum); 2756 return ubi_is_mapped(desc, lnum);
2705} 2757}
2706 2758
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2709 int err; 2761 int err;
2710 2762
2711 if (do_fail(desc, lnum, 0)) 2763 if (do_fail(desc, lnum, 0))
2712 return -EIO; 2764 return -EROFS;
2713 err = ubi_leb_map(desc, lnum, dtype); 2765 err = ubi_leb_map(desc, lnum, dtype);
2714 if (err) 2766 if (err)
2715 return err; 2767 return err;
2716 if (do_fail(desc, lnum, 0)) 2768 if (do_fail(desc, lnum, 0))
2717 return -EIO; 2769 return -EROFS;
2718 return 0; 2770 return 0;
2719} 2771}
2720 2772
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
2784static int open_debugfs_file(struct inode *inode, struct file *file) 2836static int open_debugfs_file(struct inode *inode, struct file *file)
2785{ 2837{
2786 file->private_data = inode->i_private; 2838 file->private_data = inode->i_private;
2787 return 0; 2839 return nonseekable_open(inode, file);
2788} 2840}
2789 2841
2790static ssize_t write_debugfs_file(struct file *file, const char __user *buf, 2842static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2795 2847
2796 if (file->f_path.dentry == d->dfs_dump_lprops) 2848 if (file->f_path.dentry == d->dfs_dump_lprops)
2797 dbg_dump_lprops(c); 2849 dbg_dump_lprops(c);
2798 else if (file->f_path.dentry == d->dfs_dump_budg) { 2850 else if (file->f_path.dentry == d->dfs_dump_budg)
2799 spin_lock(&c->space_lock); 2851 dbg_dump_budg(c, &c->bi);
2800 dbg_dump_budg(c); 2852 else if (file->f_path.dentry == d->dfs_dump_tnc) {
2801 spin_unlock(&c->space_lock);
2802 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2803 mutex_lock(&c->tnc_mutex); 2853 mutex_lock(&c->tnc_mutex);
2804 dbg_dump_tnc(c); 2854 dbg_dump_tnc(c);
2805 mutex_unlock(&c->tnc_mutex); 2855 mutex_unlock(&c->tnc_mutex);
2806 } else 2856 } else
2807 return -EINVAL; 2857 return -EINVAL;
2808 2858
2809 *ppos += count;
2810 return count; 2859 return count;
2811} 2860}
2812 2861
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
2814 .open = open_debugfs_file, 2863 .open = open_debugfs_file,
2815 .write = write_debugfs_file, 2864 .write = write_debugfs_file,
2816 .owner = THIS_MODULE, 2865 .owner = THIS_MODULE,
2817 .llseek = default_llseek, 2866 .llseek = no_llseek,
2818}; 2867};
2819 2868
2820/** 2869/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG 32#ifdef CONFIG_UBIFS_FS_DEBUG
33 33
34#include <linux/random.h>
35
34/** 36/**
35 * ubifs_debug_info - per-FS debugging information. 37 * ubifs_debug_info - per-FS debugging information.
36 * @old_zroot: old index root - used by 'dbg_check_old_index()' 38 * @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
50 * @new_ihead_offs: used by debugging to check @c->ihead_offs 52 * @new_ihead_offs: used by debugging to check @c->ihead_offs
51 * 53 *
52 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') 54 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
53 * @saved_free: saved free space (used by 'dbg_save_space_info()') 55 * @saved_bi: saved budgeting information
56 * @saved_free: saved amount of free space
57 * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
54 * 58 *
55 * dfs_dir_name: name of debugfs directory containing this file-system's files 59 * @dfs_dir_name: name of debugfs directory containing this file-system's files
56 * dfs_dir: direntry object of the file-system debugfs directory 60 * @dfs_dir: direntry object of the file-system debugfs directory
57 * dfs_dump_lprops: "dump lprops" debugfs knob 61 * @dfs_dump_lprops: "dump lprops" debugfs knob
58 * dfs_dump_budg: "dump budgeting information" debugfs knob 62 * @dfs_dump_budg: "dump budgeting information" debugfs knob
59 * dfs_dump_tnc: "dump TNC" debugfs knob 63 * @dfs_dump_tnc: "dump TNC" debugfs knob
60 */ 64 */
61struct ubifs_debug_info { 65struct ubifs_debug_info {
62 struct ubifs_zbranch old_zroot; 66 struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
76 int new_ihead_offs; 80 int new_ihead_offs;
77 81
78 struct ubifs_lp_stats saved_lst; 82 struct ubifs_lp_stats saved_lst;
83 struct ubifs_budg_info saved_bi;
79 long long saved_free; 84 long long saved_free;
85 int saved_idx_gc_cnt;
80 86
81 char dfs_dir_name[100]; 87 char dfs_dir_name[100];
82 struct dentry *dfs_dir; 88 struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
101 } \ 107 } \
102} while (0) 108} while (0)
103 109
104#define dbg_dump_stack() do { \ 110#define dbg_dump_stack() dump_stack()
105 if (!dbg_failure_mode) \
106 dump_stack(); \
107} while (0)
108
109/* Generic debugging messages */
110#define dbg_msg(fmt, ...) do { \
111 spin_lock(&dbg_lock); \
112 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
113 __func__, ##__VA_ARGS__); \
114 spin_unlock(&dbg_lock); \
115} while (0)
116
117#define dbg_do_msg(typ, fmt, ...) do { \
118 if (ubifs_msg_flags & typ) \
119 dbg_msg(fmt, ##__VA_ARGS__); \
120} while (0)
121 111
122#define dbg_err(fmt, ...) do { \ 112#define dbg_err(fmt, ...) do { \
123 spin_lock(&dbg_lock); \ 113 spin_lock(&dbg_lock); \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
137#define DBGKEY(key) dbg_key_str0(c, (key)) 127#define DBGKEY(key) dbg_key_str0(c, (key))
138#define DBGKEY1(key) dbg_key_str1(c, (key)) 128#define DBGKEY1(key) dbg_key_str1(c, (key))
139 129
140/* General messages */ 130#define ubifs_dbg_msg(type, fmt, ...) do { \
141#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) 131 spin_lock(&dbg_lock); \
132 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
133 spin_unlock(&dbg_lock); \
134} while (0)
142 135
136/* Just a debugging messages not related to any specific UBIFS subsystem */
137#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
138/* General messages */
139#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
143/* Additional journal messages */ 140/* Additional journal messages */
144#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) 141#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
145
146/* Additional TNC messages */ 142/* Additional TNC messages */
147#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) 143#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
148
149/* Additional lprops messages */ 144/* Additional lprops messages */
150#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) 145#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
151
152/* Additional LEB find messages */ 146/* Additional LEB find messages */
153#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) 147#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
154
155/* Additional mount messages */ 148/* Additional mount messages */
156#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) 149#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
157
158/* Additional I/O messages */ 150/* Additional I/O messages */
159#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) 151#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
160
161/* Additional commit messages */ 152/* Additional commit messages */
162#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) 153#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
163
164/* Additional budgeting messages */ 154/* Additional budgeting messages */
165#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) 155#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
166
167/* Additional log messages */ 156/* Additional log messages */
168#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) 157#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
169
170/* Additional gc messages */ 158/* Additional gc messages */
171#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) 159#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
172
173/* Additional scan messages */ 160/* Additional scan messages */
174#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) 161#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
175
176/* Additional recovery messages */ 162/* Additional recovery messages */
177#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 163#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
178
179/*
180 * Debugging message type flags.
181 *
182 * UBIFS_MSG_GEN: general messages
183 * UBIFS_MSG_JNL: journal messages
184 * UBIFS_MSG_MNT: mount messages
185 * UBIFS_MSG_CMT: commit messages
186 * UBIFS_MSG_FIND: LEB find messages
187 * UBIFS_MSG_BUDG: budgeting messages
188 * UBIFS_MSG_GC: garbage collection messages
189 * UBIFS_MSG_TNC: TNC messages
190 * UBIFS_MSG_LP: lprops messages
191 * UBIFS_MSG_IO: I/O messages
192 * UBIFS_MSG_LOG: log messages
193 * UBIFS_MSG_SCAN: scan messages
194 * UBIFS_MSG_RCVRY: recovery messages
195 */
196enum {
197 UBIFS_MSG_GEN = 0x1,
198 UBIFS_MSG_JNL = 0x2,
199 UBIFS_MSG_MNT = 0x4,
200 UBIFS_MSG_CMT = 0x8,
201 UBIFS_MSG_FIND = 0x10,
202 UBIFS_MSG_BUDG = 0x20,
203 UBIFS_MSG_GC = 0x40,
204 UBIFS_MSG_TNC = 0x80,
205 UBIFS_MSG_LP = 0x100,
206 UBIFS_MSG_IO = 0x200,
207 UBIFS_MSG_LOG = 0x400,
208 UBIFS_MSG_SCAN = 0x800,
209 UBIFS_MSG_RCVRY = 0x1000,
210};
211 164
212/* 165/*
213 * Debugging check flags. 166 * Debugging check flags.
@@ -233,11 +186,9 @@ enum {
233/* 186/*
234 * Special testing flags. 187 * Special testing flags.
235 * 188 *
236 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
237 * UBIFS_TST_RCVRY: failure mode for recovery testing 189 * UBIFS_TST_RCVRY: failure mode for recovery testing
238 */ 190 */
239enum { 191enum {
240 UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
241 UBIFS_TST_RCVRY = 0x4, 192 UBIFS_TST_RCVRY = 0x4,
242}; 193};
243 194
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
262 int offs); 213 int offs);
263void dbg_dump_budget_req(const struct ubifs_budget_req *req); 214void dbg_dump_budget_req(const struct ubifs_budget_req *req);
264void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 215void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
265void dbg_dump_budg(struct ubifs_info *c); 216void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
266void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); 217void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
267void dbg_dump_lprops(struct ubifs_info *c); 218void dbg_dump_lprops(struct ubifs_info *c);
268void dbg_dump_lpt_info(struct ubifs_info *c); 219void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
304int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); 255int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
305 256
306/* Force the use of in-the-gaps method for testing */ 257/* Force the use of in-the-gaps method for testing */
307 258static inline int dbg_force_in_the_gaps_enabled(void)
308#define dbg_force_in_the_gaps_enabled \ 259{
309 (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) 260 return ubifs_chk_flags & UBIFS_CHK_GEN;
310 261}
311int dbg_force_in_the_gaps(void); 262int dbg_force_in_the_gaps(void);
312 263
313/* Failure mode for recovery testing */ 264/* Failure mode for recovery testing */
314
315#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 265#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
316 266
317#ifndef UBIFS_DBG_PRESERVE_UBI 267#ifndef UBIFS_DBG_PRESERVE_UBI
318
319#define ubi_leb_read dbg_leb_read 268#define ubi_leb_read dbg_leb_read
320#define ubi_leb_write dbg_leb_write 269#define ubi_leb_write dbg_leb_write
321#define ubi_leb_change dbg_leb_change 270#define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
323#define ubi_leb_unmap dbg_leb_unmap 272#define ubi_leb_unmap dbg_leb_unmap
324#define ubi_is_mapped dbg_is_mapped 273#define ubi_is_mapped dbg_is_mapped
325#define ubi_leb_map dbg_leb_map 274#define ubi_leb_map dbg_leb_map
326
327#endif 275#endif
328 276
329int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, 277int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
370 __func__, __LINE__, current->pid); \ 318 __func__, __LINE__, current->pid); \
371} while (0) 319} while (0)
372 320
373#define dbg_err(fmt, ...) do { \ 321#define dbg_err(fmt, ...) do { \
374 if (0) \ 322 if (0) \
375 ubifs_err(fmt, ##__VA_ARGS__); \ 323 ubifs_err(fmt, ##__VA_ARGS__); \
376} while (0) 324} while (0)
377 325
378#define dbg_msg(fmt, ...) do { \ 326#define ubifs_dbg_msg(fmt, ...) do { \
379 if (0) \ 327 if (0) \
380 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ 328 pr_debug(fmt "\n", ##__VA_ARGS__); \
381 current->pid, __func__, ##__VA_ARGS__); \
382} while (0) 329} while (0)
383 330
384#define dbg_dump_stack() 331#define dbg_dump_stack()
385#define ubifs_assert_cmt_locked(c) 332#define ubifs_assert_cmt_locked(c)
386 333
387#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 334#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
388#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 335#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
389#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 336#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
390#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 337#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
391#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 338#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
392#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 339#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
393#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 340#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
394#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 341#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
395#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 342#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
396#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 343#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
397#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 344#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
398#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 345#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
399#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) 346#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
347#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
400 348
401#define DBGKEY(key) ((char *)(key)) 349#define DBGKEY(key) ((char *)(key))
402#define DBGKEY1(key) ((char *)(key)) 350#define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
420dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } 368dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
421static inline void 369static inline void
422dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } 370dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
423static inline void dbg_dump_budg(struct ubifs_info *c) { return; } 371static inline void
372dbg_dump_budg(struct ubifs_info *c,
373 const struct ubifs_budg_info *bi) { return; }
424static inline void dbg_dump_lprop(const struct ubifs_info *c, 374static inline void dbg_dump_lprop(const struct ubifs_info *c,
425 const struct ubifs_lprops *lp) { return; } 375 const struct ubifs_lprops *lp) { return; }
426static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } 376static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
482 struct list_head *head) { return 0; } 432 struct list_head *head) { return 0; }
483 433
484static inline int dbg_force_in_the_gaps(void) { return 0; } 434static inline int dbg_force_in_the_gaps(void) { return 0; }
485#define dbg_force_in_the_gaps_enabled 0 435#define dbg_force_in_the_gaps_enabled() 0
486#define dbg_failure_mode 0 436#define dbg_failure_mode 0
487 437
488static inline int dbg_debugfs_init(void) { return 0; } 438static inline int dbg_debugfs_init(void) { return 0; }
489static inline void dbg_debugfs_exit(void) { return; } 439static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
603 ubifs_release_budget(c, &req); 603 ubifs_release_budget(c, &req);
604 else { 604 else {
605 /* We've deleted something - clean the "no space" flags */ 605 /* We've deleted something - clean the "no space" flags */
606 c->nospace = c->nospace_rp = 0; 606 c->bi.nospace = c->bi.nospace_rp = 0;
607 smp_wmb(); 607 smp_wmb();
608 } 608 }
609 return 0; 609 return 0;
@@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
693 ubifs_release_budget(c, &req); 693 ubifs_release_budget(c, &req);
694 else { 694 else {
695 /* We've deleted something - clean the "no space" flags */ 695 /* We've deleted something - clean the "no space" flags */
696 c->nospace = c->nospace_rp = 0; 696 c->bi.nospace = c->bi.nospace_rp = 0;
697 smp_wmb(); 697 smp_wmb();
698 } 698 }
699 return 0; 699 return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
212 */ 212 */
213static void release_existing_page_budget(struct ubifs_info *c) 213static void release_existing_page_budget(struct ubifs_info *c)
214{ 214{
215 struct ubifs_budget_req req = { .dd_growth = c->page_budget}; 215 struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
216 216
217 ubifs_release_budget(c, &req); 217 ubifs_release_budget(c, &req);
218} 218}
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
971 * the page locked, and it locks @ui_mutex. However, write-back does take inode 971 * the page locked, and it locks @ui_mutex. However, write-back does take inode
972 * @i_mutex, which means other VFS operations may be run on this inode at the 972 * @i_mutex, which means other VFS operations may be run on this inode at the
973 * same time. And the problematic one is truncation to smaller size, from where 973 * same time. And the problematic one is truncation to smaller size, from where
974 * we have to call 'truncate_setsize()', which first changes @inode->i_size, then 974 * we have to call 'truncate_setsize()', which first changes @inode->i_size,
975 * drops the truncated pages. And while dropping the pages, it takes the page 975 * then drops the truncated pages. And while dropping the pages, it takes the
976 * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with 976 * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
977 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 977 * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
978 * means that @inode->i_size is changed while @ui_mutex is unlocked. 978 * This means that @inode->i_size is changed while @ui_mutex is unlocked.
979 * 979 *
980 * XXX(truncate): with the new truncate sequence this is not true anymore, 980 * XXX(truncate): with the new truncate sequence this is not true anymore,
981 * and the calls to truncate_setsize can be move around freely. They should 981 * and the calls to truncate_setsize can be move around freely. They should
@@ -1189,7 +1189,7 @@ out_budg:
1189 if (budgeted) 1189 if (budgeted)
1190 ubifs_release_budget(c, &req); 1190 ubifs_release_budget(c, &req);
1191 else { 1191 else {
1192 c->nospace = c->nospace_rp = 0; 1192 c->bi.nospace = c->bi.nospace_rp = 0;
1193 smp_wmb(); 1193 smp_wmb();
1194 } 1194 }
1195 return err; 1195 return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
1312 1312
1313 dbg_gen("syncing inode %lu", inode->i_ino); 1313 dbg_gen("syncing inode %lu", inode->i_ino);
1314 1314
1315 if (inode->i_sb->s_flags & MS_RDONLY) 1315 if (c->ro_mount)
1316 /*
1317 * For some really strange reasons VFS does not filter out
1318 * 'fsync()' for R/O mounted file-systems as per 2.6.39.
1319 */
1316 return 0; 1320 return 0;
1317 1321
1318 /* 1322 /*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1432} 1436}
1433 1437
1434/* 1438/*
1435 * mmap()d file has taken write protection fault and is being made 1439 * mmap()d file has taken write protection fault and is being made writable.
1436 * writable. UBIFS must ensure page is budgeted for. 1440 * UBIFS must ensure page is budgeted for.
1437 */ 1441 */
1438static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1442static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
1443 struct vm_fault *vmf)
1439{ 1444{
1440 struct page *page = vmf->page; 1445 struct page *page = vmf->page;
1441 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1446 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1536{ 1541{
1537 int err; 1542 int err;
1538 1543
1539 /* 'generic_file_mmap()' takes care of NOMMU case */
1540 err = generic_file_mmap(file, vma); 1544 err = generic_file_mmap(file, vma);
1541 if (err) 1545 if (err)
1542 return err; 1546 return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
252 * But if the index takes fewer LEBs than it is reserved for it, 252 * But if the index takes fewer LEBs than it is reserved for it,
253 * this function must avoid picking those reserved LEBs. 253 * this function must avoid picking those reserved LEBs.
254 */ 254 */
255 if (c->min_idx_lebs >= c->lst.idx_lebs) { 255 if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
256 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 256 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
257 exclude_index = 1; 257 exclude_index = 1;
258 } 258 }
259 spin_unlock(&c->space_lock); 259 spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
276 pick_free = 0; 276 pick_free = 0;
277 } else { 277 } else {
278 spin_lock(&c->space_lock); 278 spin_lock(&c->space_lock);
279 exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); 279 exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
280 spin_unlock(&c->space_lock); 280 spin_unlock(&c->space_lock);
281 } 281 }
282 282
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
501 501
502 /* Check if there are enough empty LEBs for commit */ 502 /* Check if there are enough empty LEBs for commit */
503 spin_lock(&c->space_lock); 503 spin_lock(&c->space_lock);
504 if (c->min_idx_lebs > c->lst.idx_lebs) 504 if (c->bi.min_idx_lebs > c->lst.idx_lebs)
505 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; 505 rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
506 else 506 else
507 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
100 if (err) 100 if (err)
101 return err; 101 return err;
102 102
103 err = ubifs_wbuf_sync_nolock(wbuf);
104 if (err)
105 return err;
106
103 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); 107 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
104 if (err) 108 if (err)
105 return err; 109 return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
118 * This function compares data nodes @a and @b. Returns %1 if @a has greater 122 * This function compares data nodes @a and @b. Returns %1 if @a has greater
119 * inode or block number, and %-1 otherwise. 123 * inode or block number, and %-1 otherwise.
120 */ 124 */
121int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 125static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
122{ 126{
123 ino_t inuma, inumb; 127 ino_t inuma, inumb;
124 struct ubifs_info *c = priv; 128 struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
161 * first and sorted by length in descending order. Directory entry nodes go 165 * first and sorted by length in descending order. Directory entry nodes go
162 * after inode nodes and are sorted in ascending hash valuer order. 166 * after inode nodes and are sorted in ascending hash valuer order.
163 */ 167 */
164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 168static int nondata_nodes_cmp(void *priv, struct list_head *a,
169 struct list_head *b)
165{ 170{
166 ino_t inuma, inumb; 171 ino_t inuma, inumb;
167 struct ubifs_info *c = priv; 172 struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
473 ubifs_assert(c->gc_lnum != lnum); 478 ubifs_assert(c->gc_lnum != lnum);
474 ubifs_assert(wbuf->lnum != lnum); 479 ubifs_assert(wbuf->lnum != lnum);
475 480
481 if (lp->free + lp->dirty == c->leb_size) {
482 /* Special case - a free LEB */
483 dbg_gc("LEB %d is free, return it", lp->lnum);
484 ubifs_assert(!(lp->flags & LPROPS_INDEX));
485
486 if (lp->free != c->leb_size) {
487 /*
488 * Write buffers must be sync'd before unmapping
489 * freeable LEBs, because one of them may contain data
490 * which obsoletes something in 'lp->pnum'.
491 */
492 err = gc_sync_wbufs(c);
493 if (err)
494 return err;
495 err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
496 0, 0, 0, 0);
497 if (err)
498 return err;
499 }
500 err = ubifs_leb_unmap(c, lp->lnum);
501 if (err)
502 return err;
503
504 if (c->gc_lnum == -1) {
505 c->gc_lnum = lnum;
506 return LEB_RETAINED;
507 }
508
509 return LEB_FREED;
510 }
511
476 /* 512 /*
477 * We scan the entire LEB even though we only really need to scan up to 513 * We scan the entire LEB even though we only really need to scan up to
478 * (c->leb_size - lp->free). 514 * (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
682 "(min. space %d)", lp.lnum, lp.free, lp.dirty, 718 "(min. space %d)", lp.lnum, lp.free, lp.dirty,
683 lp.free + lp.dirty, min_space); 719 lp.free + lp.dirty, min_space);
684 720
685 if (lp.free + lp.dirty == c->leb_size) {
686 /* An empty LEB was returned */
687 dbg_gc("LEB %d is free, return it", lp.lnum);
688 /*
689 * ubifs_find_dirty_leb() doesn't return freeable index
690 * LEBs.
691 */
692 ubifs_assert(!(lp.flags & LPROPS_INDEX));
693 if (lp.free != c->leb_size) {
694 /*
695 * Write buffers must be sync'd before
696 * unmapping freeable LEBs, because one of them
697 * may contain data which obsoletes something
698 * in 'lp.pnum'.
699 */
700 ret = gc_sync_wbufs(c);
701 if (ret)
702 goto out;
703 ret = ubifs_change_one_lp(c, lp.lnum,
704 c->leb_size, 0, 0, 0,
705 0);
706 if (ret)
707 goto out;
708 }
709 ret = ubifs_leb_unmap(c, lp.lnum);
710 if (ret)
711 goto out;
712 ret = lp.lnum;
713 break;
714 }
715
716 space_before = c->leb_size - wbuf->offs - wbuf->used; 721 space_before = c->leb_size - wbuf->offs - wbuf->used;
717 if (wbuf->lnum == -1) 722 if (wbuf->lnum == -1)
718 space_before = 0; 723 space_before = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
393 ubifs_assert(wbuf->size % c->min_io_size == 0); 393 ubifs_assert(wbuf->size % c->min_io_size == 0);
394 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size) 395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
397 397
398 if (c->ro_error) 398 if (c->ro_error)
399 return -EROFS; 399 return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
452 * @dtype: data type 452 * @dtype: data type
453 * 453 *
454 * This function targets the write-buffer to logical eraseblock @lnum:@offs. 454 * This function targets the write-buffer to logical eraseblock @lnum:@offs.
455 * The write-buffer is synchronized if it is not empty. Returns zero in case of 455 * The write-buffer has to be empty. Returns zero in case of success and a
456 * success and a negative error code in case of failure. 456 * negative error code in case of failure.
457 */ 457 */
458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, 458int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
459 int dtype) 459 int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
465 ubifs_assert(offs >= 0 && offs <= c->leb_size); 465 ubifs_assert(offs >= 0 && offs <= c->leb_size);
466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); 466 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
467 ubifs_assert(lnum != wbuf->lnum); 467 ubifs_assert(lnum != wbuf->lnum);
468 468 ubifs_assert(wbuf->used == 0);
469 if (wbuf->used > 0) {
470 int err = ubifs_wbuf_sync_nolock(wbuf);
471
472 if (err)
473 return err;
474 }
475 469
476 spin_lock(&wbuf->lock); 470 spin_lock(&wbuf->lock);
477 wbuf->lnum = lnum; 471 wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
573int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) 567int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
574{ 568{
575 struct ubifs_info *c = wbuf->c; 569 struct ubifs_info *c = wbuf->c;
576 int err, written, n, aligned_len = ALIGN(len, 8), offs; 570 int err, written, n, aligned_len = ALIGN(len, 8);
577 571
578 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, 572 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
579 dbg_ntype(((struct ubifs_ch *)buf)->node_type), 573 dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 582 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
589 ubifs_assert(!c->ro_media && !c->ro_mount); 583 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size) 584 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); 585 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
592 586
593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 587 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
594 err = -ENOSPC; 588 err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
636 goto exit; 630 goto exit;
637 } 631 }
638 632
639 offs = wbuf->offs;
640 written = 0; 633 written = 0;
641 634
642 if (wbuf->used) { 635 if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
653 if (err) 646 if (err)
654 goto out; 647 goto out;
655 648
656 offs += wbuf->size; 649 wbuf->offs += wbuf->size;
657 len -= wbuf->avail; 650 len -= wbuf->avail;
658 aligned_len -= wbuf->avail; 651 aligned_len -= wbuf->avail;
659 written += wbuf->avail; 652 written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
672 if (err) 665 if (err)
673 goto out; 666 goto out;
674 667
675 offs += wbuf->size; 668 wbuf->offs += wbuf->size;
676 len -= wbuf->size; 669 len -= wbuf->size;
677 aligned_len -= wbuf->size; 670 aligned_len -= wbuf->size;
678 written += wbuf->size; 671 written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
687 n = aligned_len >> c->max_write_shift; 680 n = aligned_len >> c->max_write_shift;
688 if (n) { 681 if (n) {
689 n <<= c->max_write_shift; 682 n <<= c->max_write_shift;
690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 683 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 684 wbuf->offs);
692 wbuf->dtype); 685 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
686 wbuf->offs, n, wbuf->dtype);
693 if (err) 687 if (err)
694 goto out; 688 goto out;
695 offs += n; 689 wbuf->offs += n;
696 aligned_len -= n; 690 aligned_len -= n;
697 len -= n; 691 len -= n;
698 written += n; 692 written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
707 */ 701 */
708 memcpy(wbuf->buf, buf + written, len); 702 memcpy(wbuf->buf, buf + written, len);
709 703
710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size) 704 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size; 705 wbuf->size = c->max_write_size;
713 else 706 else
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
141 * LEB with some empty space. 141 * LEB with some empty space.
142 */ 142 */
143 lnum = ubifs_find_free_space(c, len, &offs, squeeze); 143 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
144 if (lnum >= 0) { 144 if (lnum >= 0)
145 /* Found an LEB, add it to the journal head */
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err)
148 goto out_return;
149 /* A new bud was successfully allocated and added to the log */
150 goto out; 145 goto out;
151 }
152 146
153 err = lnum; 147 err = lnum;
154 if (err != -ENOSPC) 148 if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
203 return 0; 197 return 0;
204 } 198 }
205 199
206 err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
207 if (err)
208 goto out_return;
209 offs = 0; 200 offs = 0;
210 201
211out: 202out:
203 /*
204 * Make sure we synchronize the write-buffer before we add the new bud
205 * to the log. Otherwise we may have a power cut after the log
206 * reference node for the last bud (@lnum) is written but before the
207 * write-buffer data are written to the next-to-last bud
208 * (@wbuf->lnum). And the effect would be that the recovery would see
209 * that there is corruption in the next-to-last bud.
210 */
211 err = ubifs_wbuf_sync_nolock(wbuf);
212 if (err)
213 goto out_return;
214 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
215 if (err)
216 goto out_return;
212 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); 217 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
213 if (err) 218 if (err)
214 goto out_unlock; 219 goto out_unlock;
@@ -380,10 +385,8 @@ out:
380 if (err == -ENOSPC) { 385 if (err == -ENOSPC) {
381 /* These are some budgeting problems, print useful information */ 386 /* These are some budgeting problems, print useful information */
382 down_write(&c->commit_sem); 387 down_write(&c->commit_sem);
383 spin_lock(&c->space_lock);
384 dbg_dump_stack(); 388 dbg_dump_stack();
385 dbg_dump_budg(c); 389 dbg_dump_budg(c, &c->bi);
386 spin_unlock(&c->space_lock);
387 dbg_dump_lprops(c); 390 dbg_dump_lprops(c);
388 cmt_retries = dbg_check_lprops(c); 391 cmt_retries = dbg_check_lprops(c);
389 up_write(&c->commit_sem); 392 up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
100} 100}
101 101
102/** 102/**
103 * next_log_lnum - switch to the next log LEB.
104 * @c: UBIFS file-system description object
105 * @lnum: current log LEB
106 */
107static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
108{
109 lnum += 1;
110 if (lnum > c->log_last)
111 lnum = UBIFS_LOG_LNUM;
112
113 return lnum;
114}
115
116/**
117 * empty_log_bytes - calculate amount of empty space in the log. 103 * empty_log_bytes - calculate amount of empty space in the log.
118 * @c: UBIFS file-system description object 104 * @c: UBIFS file-system description object
119 */ 105 */
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
257 ref->jhead = cpu_to_le32(jhead); 243 ref->jhead = cpu_to_le32(jhead);
258 244
259 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { 245 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
260 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 246 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
261 c->lhead_offs = 0; 247 c->lhead_offs = 0;
262 } 248 }
263 249
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
425 411
426 /* Switch to the next log LEB */ 412 /* Switch to the next log LEB */
427 if (c->lhead_offs) { 413 if (c->lhead_offs) {
428 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 414 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
429 c->lhead_offs = 0; 415 c->lhead_offs = 0;
430 } 416 }
431 417
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
446 432
447 c->lhead_offs += len; 433 c->lhead_offs += len;
448 if (c->lhead_offs == c->leb_size) { 434 if (c->lhead_offs == c->leb_size) {
449 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); 435 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
450 c->lhead_offs = 0; 436 c->lhead_offs = 0;
451 } 437 }
452 438
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
533 } 519 }
534 mutex_lock(&c->log_mutex); 520 mutex_lock(&c->log_mutex);
535 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; 521 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
536 lnum = next_log_lnum(c, lnum)) { 522 lnum = ubifs_next_log_lnum(c, lnum)) {
537 dbg_log("unmap log LEB %d", lnum); 523 dbg_log("unmap log LEB %d", lnum);
538 err = ubifs_leb_unmap(c, lnum); 524 err = ubifs_leb_unmap(c, lnum);
539 if (err) 525 if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
642 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); 628 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
643 if (err) 629 if (err)
644 return err; 630 return err;
645 *lnum = next_log_lnum(c, *lnum); 631 *lnum = ubifs_next_log_lnum(c, *lnum);
646 *offs = 0; 632 *offs = 0;
647 } 633 }
648 memcpy(buf + *offs, node, len); 634 memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
712 ubifs_scan_destroy(sleb); 698 ubifs_scan_destroy(sleb);
713 if (lnum == c->lhead_lnum) 699 if (lnum == c->lhead_lnum)
714 break; 700 break;
715 lnum = next_log_lnum(c, lnum); 701 lnum = ubifs_next_log_lnum(c, lnum);
716 } 702 }
717 if (offs) { 703 if (offs) {
718 int sz = ALIGN(offs, c->min_io_size); 704 int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
732 /* Unmap remaining LEBs */ 718 /* Unmap remaining LEBs */
733 lnum = write_lnum; 719 lnum = write_lnum;
734 do { 720 do {
735 lnum = next_log_lnum(c, lnum); 721 lnum = ubifs_next_log_lnum(c, lnum);
736 err = ubifs_leb_unmap(c, lnum); 722 err = ubifs_leb_unmap(c, lnum);
737 if (err) 723 if (err)
738 return err; 724 return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
1007} 1007}
1008 1008
1009/** 1009/**
1010 * struct scan_check_data - data provided to scan callback function.
1011 * @lst: LEB properties statistics
1012 * @err: error code
1013 */
1014struct scan_check_data {
1015 struct ubifs_lp_stats lst;
1016 int err;
1017};
1018
1019/**
1020 * scan_check_cb - scan callback. 1010 * scan_check_cb - scan callback.
1021 * @c: the UBIFS file-system description object 1011 * @c: the UBIFS file-system description object
1022 * @lp: LEB properties to scan 1012 * @lp: LEB properties to scan
1023 * @in_tree: whether the LEB properties are in main memory 1013 * @in_tree: whether the LEB properties are in main memory
1024 * @data: information passed to and from the caller of the scan 1014 * @lst: lprops statistics to update
1025 * 1015 *
1026 * This function returns a code that indicates whether the scan should continue 1016 * This function returns a code that indicates whether the scan should continue
1027 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree 1017 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
1030 */ 1020 */
1031static int scan_check_cb(struct ubifs_info *c, 1021static int scan_check_cb(struct ubifs_info *c,
1032 const struct ubifs_lprops *lp, int in_tree, 1022 const struct ubifs_lprops *lp, int in_tree,
1033 struct scan_check_data *data) 1023 struct ubifs_lp_stats *lst)
1034{ 1024{
1035 struct ubifs_scan_leb *sleb; 1025 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1026 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; 1027 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL; 1028 void *buf = NULL;
1040 1029
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
1044 if (cat != (lp->flags & LPROPS_CAT_MASK)) { 1033 if (cat != (lp->flags & LPROPS_CAT_MASK)) {
1045 ubifs_err("bad LEB category %d expected %d", 1034 ubifs_err("bad LEB category %d expected %d",
1046 (lp->flags & LPROPS_CAT_MASK), cat); 1035 (lp->flags & LPROPS_CAT_MASK), cat);
1047 goto out; 1036 return -EINVAL;
1048 } 1037 }
1049 } 1038 }
1050 1039
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
1078 } 1067 }
1079 if (!found) { 1068 if (!found) {
1080 ubifs_err("bad LPT list (category %d)", cat); 1069 ubifs_err("bad LPT list (category %d)", cat);
1081 goto out; 1070 return -EINVAL;
1082 } 1071 }
1083 } 1072 }
1084 } 1073 }
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
1090 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || 1079 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
1091 lp != heap->arr[lp->hpos]) { 1080 lp != heap->arr[lp->hpos]) {
1092 ubifs_err("bad LPT heap (category %d)", cat); 1081 ubifs_err("bad LPT heap (category %d)", cat);
1093 goto out; 1082 return -EINVAL;
1094 } 1083 }
1095 } 1084 }
1096 1085
1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 1086 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) { 1087 if (!buf)
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum); 1088 return -ENOMEM;
1100 goto out; 1089
1090 /*
1091 * After an unclean unmount, empty and freeable LEBs
1092 * may contain garbage - do not scan them.
1093 */
1094 if (lp->free == c->leb_size) {
1095 lst->empty_lebs += 1;
1096 lst->total_free += c->leb_size;
1097 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1098 return LPT_SCAN_CONTINUE;
1099 }
1100 if (lp->free + lp->dirty == c->leb_size &&
1101 !(lp->flags & LPROPS_INDEX)) {
1102 lst->total_free += lp->free;
1103 lst->total_dirty += lp->dirty;
1104 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1105 return LPT_SCAN_CONTINUE;
1101 } 1106 }
1102 1107
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0); 1108 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1104 if (IS_ERR(sleb)) { 1109 if (IS_ERR(sleb)) {
1105 /* 1110 ret = PTR_ERR(sleb);
1106 * After an unclean unmount, empty and freeable LEBs 1111 if (ret == -EUCLEAN) {
1107 * may contain garbage. 1112 dbg_dump_lprops(c);
1108 */ 1113 dbg_dump_budg(c, &c->bi);
1109 if (lp->free == c->leb_size) {
1110 ubifs_err("scan errors were in empty LEB "
1111 "- continuing checking");
1112 lst->empty_lebs += 1;
1113 lst->total_free += c->leb_size;
1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1117 }
1118
1119 if (lp->free + lp->dirty == c->leb_size &&
1120 !(lp->flags & LPROPS_INDEX)) {
1121 ubifs_err("scan errors were in freeable LEB "
1122 "- continuing checking");
1123 lst->total_free += lp->free;
1124 lst->total_dirty += lp->dirty;
1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1128 } 1114 }
1129 data->err = PTR_ERR(sleb); 1115 goto out;
1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1132 } 1116 }
1133 1117
1134 is_idx = -1; 1118 is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
1246 } 1230 }
1247 1231
1248 ubifs_scan_destroy(sleb); 1232 ubifs_scan_destroy(sleb);
1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf); 1233 vfree(buf);
1252 return ret; 1234 return LPT_SCAN_CONTINUE;
1253 1235
1254out_print: 1236out_print:
1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1237 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
1258 dbg_dump_leb(c, lnum); 1240 dbg_dump_leb(c, lnum);
1259out_destroy: 1241out_destroy:
1260 ubifs_scan_destroy(sleb); 1242 ubifs_scan_destroy(sleb);
1243 ret = -EINVAL;
1261out: 1244out:
1262 vfree(buf); 1245 vfree(buf);
1263 data->err = -EINVAL; 1246 return ret;
1264 return LPT_SCAN_STOP;
1265} 1247}
1266 1248
1267/** 1249/**
@@ -1278,8 +1260,7 @@ out:
1278int dbg_check_lprops(struct ubifs_info *c) 1260int dbg_check_lprops(struct ubifs_info *c)
1279{ 1261{
1280 int i, err; 1262 int i, err;
1281 struct scan_check_data data; 1263 struct ubifs_lp_stats lst;
1282 struct ubifs_lp_stats *lst = &data.lst;
1283 1264
1284 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1265 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1285 return 0; 1266 return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
1294 return err; 1275 return err;
1295 } 1276 }
1296 1277
1297 memset(lst, 0, sizeof(struct ubifs_lp_stats)); 1278 memset(&lst, 0, sizeof(struct ubifs_lp_stats));
1298
1299 data.err = 0;
1300 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, 1279 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
1301 (ubifs_lpt_scan_callback)scan_check_cb, 1280 (ubifs_lpt_scan_callback)scan_check_cb,
1302 &data); 1281 &lst);
1303 if (err && err != -ENOSPC) 1282 if (err && err != -ENOSPC)
1304 goto out; 1283 goto out;
1305 if (data.err) {
1306 err = data.err;
1307 goto out;
1308 }
1309 1284
1310 if (lst->empty_lebs != c->lst.empty_lebs || 1285 if (lst.empty_lebs != c->lst.empty_lebs ||
1311 lst->idx_lebs != c->lst.idx_lebs || 1286 lst.idx_lebs != c->lst.idx_lebs ||
1312 lst->total_free != c->lst.total_free || 1287 lst.total_free != c->lst.total_free ||
1313 lst->total_dirty != c->lst.total_dirty || 1288 lst.total_dirty != c->lst.total_dirty ||
1314 lst->total_used != c->lst.total_used) { 1289 lst.total_used != c->lst.total_used) {
1315 ubifs_err("bad overall accounting"); 1290 ubifs_err("bad overall accounting");
1316 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " 1291 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
1317 "total_free %lld, total_dirty %lld, total_used %lld", 1292 "total_free %lld, total_dirty %lld, total_used %lld",
1318 lst->empty_lebs, lst->idx_lebs, lst->total_free, 1293 lst.empty_lebs, lst.idx_lebs, lst.total_free,
1319 lst->total_dirty, lst->total_used); 1294 lst.total_dirty, lst.total_used);
1320 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " 1295 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
1321 "total_free %lld, total_dirty %lld, total_used %lld", 1296 "total_free %lld, total_dirty %lld, total_used %lld",
1322 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, 1297 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
1325 goto out; 1300 goto out;
1326 } 1301 }
1327 1302
1328 if (lst->total_dead != c->lst.total_dead || 1303 if (lst.total_dead != c->lst.total_dead ||
1329 lst->total_dark != c->lst.total_dark) { 1304 lst.total_dark != c->lst.total_dark) {
1330 ubifs_err("bad dead/dark space accounting"); 1305 ubifs_err("bad dead/dark space accounting");
1331 ubifs_err("calculated: total_dead %lld, total_dark %lld", 1306 ubifs_err("calculated: total_dead %lld, total_dark %lld",
1332 lst->total_dead, lst->total_dark); 1307 lst.total_dead, lst.total_dark);
1333 ubifs_err("read from lprops: total_dead %lld, total_dark %lld", 1308 ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
1334 c->lst.total_dead, c->lst.total_dark); 1309 c->lst.total_dead, c->lst.total_dark);
1335 err = -EINVAL; 1310 err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include "ubifs.h" 30#include "ubifs.h"
31 31
32#ifdef CONFIG_UBIFS_FS_DEBUG
33static int dbg_populate_lsave(struct ubifs_info *c);
34#else
35#define dbg_populate_lsave(c) 0
36#endif
37
32/** 38/**
33 * first_dirty_cnode - find first dirty cnode. 39 * first_dirty_cnode - find first dirty cnode.
34 * @c: UBIFS file-system description object 40 * @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
586 if (nnode->nbranch[iip].lnum) 592 if (nnode->nbranch[iip].lnum)
587 break; 593 break;
588 } 594 }
589 } while (iip >= UBIFS_LPT_FANOUT); 595 } while (iip >= UBIFS_LPT_FANOUT);
590 596
591 /* Go right */ 597 /* Go right */
592 nnode = ubifs_get_nnode(c, nnode, iip); 598 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
815 c->lpt_drty_flgs |= LSAVE_DIRTY; 821 c->lpt_drty_flgs |= LSAVE_DIRTY;
816 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); 822 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
817 } 823 }
824
825 if (dbg_populate_lsave(c))
826 return;
827
818 list_for_each_entry(lprops, &c->empty_list, list) { 828 list_for_each_entry(lprops, &c->empty_list, list) {
819 c->lsave[cnt++] = lprops->lnum; 829 c->lsave[cnt++] = lprops->lnum;
820 if (cnt >= c->lsave_cnt) 830 if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1994 current->pid); 2004 current->pid);
1995} 2005}
1996 2006
2007/**
2008 * dbg_populate_lsave - debugging version of 'populate_lsave()'
2009 * @c: UBIFS file-system description object
2010 *
2011 * This is a debugging version for 'populate_lsave()' which populates lsave
2012 * with random LEBs instead of useful LEBs, which is good for test coverage.
2013 * Returns zero if lsave has not been populated (this debugging feature is
2014 * disabled) an non-zero if lsave has been populated.
2015 */
2016static int dbg_populate_lsave(struct ubifs_info *c)
2017{
2018 struct ubifs_lprops *lprops;
2019 struct ubifs_lpt_heap *heap;
2020 int i;
2021
2022 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2023 return 0;
2024 if (random32() & 3)
2025 return 0;
2026
2027 for (i = 0; i < c->lsave_cnt; i++)
2028 c->lsave[i] = c->main_first;
2029
2030 list_for_each_entry(lprops, &c->empty_list, list)
2031 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2032 list_for_each_entry(lprops, &c->freeable_list, list)
2033 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2034 list_for_each_entry(lprops, &c->frdi_idx_list, list)
2035 c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
2036
2037 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
2038 for (i = 0; i < heap->cnt; i++)
2039 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2040 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
2041 for (i = 0; i < heap->cnt; i++)
2042 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2043 heap = &c->lpt_heap[LPROPS_FREE - 1];
2044 for (i = 0; i < heap->cnt; i++)
2045 c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
2046
2047 return 1;
2048}
2049
1997#endif /* CONFIG_UBIFS_FS_DEBUG */ 2050#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
148 } 148 }
149 149
150 main_sz = (long long)c->main_lebs * c->leb_size; 150 main_sz = (long long)c->main_lebs * c->leb_size;
151 if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { 151 if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
152 err = 9; 152 err = 9;
153 goto out; 153 goto out;
154 } 154 }
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
218 } 218 }
219 219
220 if (c->lst.total_dead + c->lst.total_dark + 220 if (c->lst.total_dead + c->lst.total_dark +
221 c->lst.total_used + c->old_idx_sz > main_sz) { 221 c->lst.total_used + c->bi.old_idx_sz > main_sz) {
222 err = 21; 222 err = 21;
223 goto out; 223 goto out;
224 } 224 }
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); 286 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); 287 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); 288 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
289 c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); 289 c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size);
290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); 290 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); 291 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); 292 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); 305 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); 306 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
307 307
308 c->calc_idx_sz = c->old_idx_sz; 308 c->calc_idx_sz = c->bi.old_idx_sz;
309 309
310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) 310 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
311 c->no_orphs = 1; 311 c->no_orphs = 1;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
340 mutex_unlock(&c->lp_mutex); 340 mutex_unlock(&c->lp_mutex);
341} 341}
342 342
343/**
344 * ubifs_next_log_lnum - switch to the next log LEB.
345 * @c: UBIFS file-system description object
346 * @lnum: current log LEB
347 *
348 * This helper function returns the log LEB number which goes next after LEB
349 * 'lnum'.
350 */
351static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
352{
353 lnum += 1;
354 if (lnum > c->log_last)
355 lnum = UBIFS_LOG_LNUM;
356
357 return lnum;
358}
359
343#endif /* __UBIFS_MISC_H__ */ 360#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 if (PTR_ERR(sleb) == -EUCLEAN) 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 676 sleb = ubifs_recover_leb(c, lnum, 0,
677 c->sbuf, 0);
677 if (IS_ERR(sleb)) { 678 if (IS_ERR(sleb)) {
678 err = PTR_ERR(sleb); 679 err = PTR_ERR(sleb);
679 break; 680 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
564} 564}
565 565
566/** 566/**
567 * drop_incomplete_group - drop nodes from an incomplete group. 567 * drop_last_node - drop the last node or group of nodes.
568 * @sleb: scanned LEB information 568 * @sleb: scanned LEB information
569 * @offs: offset of dropped nodes is returned here 569 * @offs: offset of dropped nodes is returned here
570 * @grouped: non-zero if whole group of nodes have to be dropped
570 * 571 *
571 * This function returns %1 if nodes are dropped and %0 otherwise. 572 * This is a helper function for 'ubifs_recover_leb()' which drops the last
573 * node of the scanned LEB or the last group of nodes if @grouped is not zero.
574 * This function returns %1 if a node was dropped and %0 otherwise.
572 */ 575 */
573static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) 576static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
574{ 577{
575 int dropped = 0; 578 int dropped = 0;
576 579
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
589 kfree(snod); 592 kfree(snod);
590 sleb->nodes_cnt -= 1; 593 sleb->nodes_cnt -= 1;
591 dropped = 1; 594 dropped = 1;
595 if (!grouped)
596 break;
592 } 597 }
593 return dropped; 598 return dropped;
594} 599}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
609struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 614struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
610 int offs, void *sbuf, int grouped) 615 int offs, void *sbuf, int grouped)
611{ 616{
612 int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; 617 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
613 int empty_chkd = 0, start = offs;
614 struct ubifs_scan_leb *sleb; 618 struct ubifs_scan_leb *sleb;
615 void *buf = sbuf + offs; 619 void *buf = sbuf + offs;
616 620
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
620 if (IS_ERR(sleb)) 624 if (IS_ERR(sleb))
621 return sleb; 625 return sleb;
622 626
623 if (sleb->ecc) 627 ubifs_assert(len >= 8);
624 need_clean = 1;
625
626 while (len >= 8) { 628 while (len >= 8) {
627 int ret;
628
629 dbg_scan("look at LEB %d:%d (%d bytes left)", 629 dbg_scan("look at LEB %d:%d (%d bytes left)",
630 lnum, offs, len); 630 lnum, offs, len);
631 631
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
635 * Scan quietly until there is an error from which we cannot 635 * Scan quietly until there is an error from which we cannot
636 * recover 636 * recover
637 */ 637 */
638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); 638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
639
640 if (ret == SCANNED_A_NODE) { 639 if (ret == SCANNED_A_NODE) {
641 /* A valid node, and not a padding node */ 640 /* A valid node, and not a padding node */
642 struct ubifs_ch *ch = buf; 641 struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
649 offs += node_len; 648 offs += node_len;
650 buf += node_len; 649 buf += node_len;
651 len -= node_len; 650 len -= node_len;
652 continue; 651 } else if (ret > 0) {
653 }
654
655 if (ret > 0) {
656 /* Padding bytes or a valid padding node */ 652 /* Padding bytes or a valid padding node */
657 offs += ret; 653 offs += ret;
658 buf += ret; 654 buf += ret;
659 len -= ret; 655 len -= ret;
660 continue; 656 } else if (ret == SCANNED_EMPTY_SPACE ||
661 } 657 ret == SCANNED_GARBAGE ||
662 658 ret == SCANNED_A_BAD_PAD_NODE ||
663 if (ret == SCANNED_EMPTY_SPACE) { 659 ret == SCANNED_A_CORRUPT_NODE) {
664 if (!is_empty(buf, len)) { 660 dbg_rcvry("found corruption - %d", ret);
665 if (!is_last_write(c, buf, offs))
666 break;
667 clean_buf(c, &buf, lnum, &offs, &len);
668 need_clean = 1;
669 }
670 empty_chkd = 1;
671 break; 661 break;
672 } 662 } else {
673 663 dbg_err("unexpected return value %d", ret);
674 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
675 if (is_last_write(c, buf, offs)) {
676 clean_buf(c, &buf, lnum, &offs, &len);
677 need_clean = 1;
678 empty_chkd = 1;
679 break;
680 }
681
682 if (ret == SCANNED_A_CORRUPT_NODE)
683 if (no_more_nodes(c, buf, len, lnum, offs)) {
684 clean_buf(c, &buf, lnum, &offs, &len);
685 need_clean = 1;
686 empty_chkd = 1;
687 break;
688 }
689
690 if (quiet) {
691 /* Redo the last scan but noisily */
692 quiet = 0;
693 continue;
694 }
695
696 switch (ret) {
697 case SCANNED_GARBAGE:
698 dbg_err("garbage");
699 goto corrupted;
700 case SCANNED_A_CORRUPT_NODE:
701 case SCANNED_A_BAD_PAD_NODE:
702 dbg_err("bad node");
703 goto corrupted;
704 default:
705 dbg_err("unknown");
706 err = -EINVAL; 664 err = -EINVAL;
707 goto error; 665 goto error;
708 } 666 }
709 } 667 }
710 668
711 if (!empty_chkd && !is_empty(buf, len)) { 669 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
712 if (is_last_write(c, buf, offs)) { 670 if (!is_last_write(c, buf, offs))
713 clean_buf(c, &buf, lnum, &offs, &len); 671 goto corrupted_rescan;
714 need_clean = 1; 672 } else if (ret == SCANNED_A_CORRUPT_NODE) {
715 } else { 673 if (!no_more_nodes(c, buf, len, lnum, offs))
674 goto corrupted_rescan;
675 } else if (!is_empty(buf, len)) {
676 if (!is_last_write(c, buf, offs)) {
716 int corruption = first_non_ff(buf, len); 677 int corruption = first_non_ff(buf, len);
717 678
718 /* 679 /*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
728 } 689 }
729 } 690 }
730 691
731 /* Drop nodes from incomplete group */ 692 min_io_unit = round_down(offs, c->min_io_size);
732 if (grouped && drop_incomplete_group(sleb, &offs)) { 693 if (grouped)
733 buf = sbuf + offs; 694 /*
734 len = c->leb_size - offs; 695 * If nodes are grouped, always drop the incomplete group at
735 clean_buf(c, &buf, lnum, &offs, &len); 696 * the end.
736 need_clean = 1; 697 */
737 } 698 drop_last_node(sleb, &offs, 1);
738 699
739 if (offs % c->min_io_size) { 700 /*
740 clean_buf(c, &buf, lnum, &offs, &len); 701 * While we are in the middle of the same min. I/O unit keep dropping
741 need_clean = 1; 702 * nodes. So basically, what we want is to make sure that the last min.
742 } 703 * I/O unit where we saw the corruption is dropped completely with all
704 * the uncorrupted nodes which may possibly sit there.
705 *
706 * In other words, let's name the min. I/O unit where the corruption
707 * starts B, and the previous min. I/O unit A. The below code tries to
708 * deal with a situation when half of B contains valid nodes or the end
709 * of a valid node, and the second half of B contains corrupted data or
710 * garbage. This means that UBIFS had been writing to B just before the
711 * power cut happened. I do not know how realistic is this scenario
712 * that half of the min. I/O unit had been written successfully and the
713 * other half not, but this is possible in our 'failure mode emulation'
714 * infrastructure at least.
715 *
716 * So what is the problem, why do we need to drop those nodes? Why can't
717 * we just clean-up the second half of B by putting a padding node
718 * there? We can, and this works fine with one exception which was
719 * reproduced with power cut emulation testing and happens extremely
720 * rarely. The description follows, but it is worth noting that that is
721 * only about the GC head, so we could do this trick only if the bud
722 * belongs to the GC head, but it does not seem to be worth an
723 * additional "if" statement.
724 *
725 * So, imagine the file-system is full, we run GC which is moving valid
726 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
727 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
728 * and will try to continue. Imagine that LEB X is currently the
729 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
730 * same as amount of free space in LEB X.
731 *
732 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
733 * are here trying to recover LEB Y which is the GC head LEB. We find
734 * the min. I/O unit B as described above. Then we clean-up LEB Y by
735 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
736 * fails, because it cannot find a dirty LEB which could be GC'd into
737 * LEB Y! Even LEB X does not match because the amount of valid nodes
738 * there does not fit the free space in LEB Y any more! And this is
739 * because of the padding node which we added to LEB Y. The
740 * user-visible effect of this which I once observed and analysed is
741 * that we cannot mount the file-system with -ENOSPC error.
742 *
743 * So obviously, to make sure that situation does not happen we should
744 * free min. I/O unit B in LEB Y completely and the last used min. I/O
745 * unit in LEB Y should be A. This is basically what the below code
746 * tries to do.
747 */
748 while (min_io_unit == round_down(offs, c->min_io_size) &&
749 min_io_unit != offs &&
750 drop_last_node(sleb, &offs, grouped));
751
752 buf = sbuf + offs;
753 len = c->leb_size - offs;
743 754
755 clean_buf(c, &buf, lnum, &offs, &len);
744 ubifs_end_scan(c, sleb, lnum, offs); 756 ubifs_end_scan(c, sleb, lnum, offs);
745 757
746 if (need_clean) { 758 err = fix_unclean_leb(c, sleb, start);
747 err = fix_unclean_leb(c, sleb, start); 759 if (err)
748 if (err) 760 goto error;
749 goto error;
750 }
751 761
752 return sleb; 762 return sleb;
753 763
764corrupted_rescan:
765 /* Re-scan the corrupted data with verbose messages */
766 dbg_err("corruptio %d", ret);
767 ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
754corrupted: 768corrupted:
755 ubifs_scanned_corruption(c, lnum, offs, buf); 769 ubifs_scanned_corruption(c, lnum, offs, buf);
756 err = -EUCLEAN; 770 err = -EUCLEAN;
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
1070} 1084}
1071 1085
1072/** 1086/**
1087 * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
1088 * @c: UBIFS file-system description object
1089 *
1090 * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
1091 * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
1092 * zero in case of success and a negative error code in case of failure.
1093 */
1094static int grab_empty_leb(struct ubifs_info *c)
1095{
1096 int lnum, err;
1097
1098 /*
1099 * Note, it is very important to first search for an empty LEB and then
1100 * run the commit, not vice-versa. The reason is that there might be
1101 * only one empty LEB at the moment, the one which has been the
1102 * @c->gc_lnum just before the power cut happened. During the regular
1103 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
1104 * one but GC can grab it. But at this moment this single empty LEB is
1105 * not marked as taken, so if we run commit - what happens? Right, the
1106 * commit will grab it and write the index there. Remember that the
1107 * index always expands as long as there is free space, and it only
1108 * starts consolidating when we run out of space.
1109 *
1110 * IOW, if we run commit now, we might not be able to find a free LEB
1111 * after this.
1112 */
1113 lnum = ubifs_find_free_leb_for_idx(c);
1114 if (lnum < 0) {
1115 dbg_err("could not find an empty LEB");
1116 dbg_dump_lprops(c);
1117 dbg_dump_budg(c, &c->bi);
1118 return lnum;
1119 }
1120
1121 /* Reset the index flag */
1122 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1123 LPROPS_INDEX, 0);
1124 if (err)
1125 return err;
1126
1127 c->gc_lnum = lnum;
1128 dbg_rcvry("found empty LEB %d, run commit", lnum);
1129
1130 return ubifs_run_commit(c);
1131}
1132
1133/**
1073 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. 1134 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1074 * @c: UBIFS file-system description object 1135 * @c: UBIFS file-system description object
1075 * 1136 *
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1091{ 1152{
1092 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 1153 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1093 struct ubifs_lprops lp; 1154 struct ubifs_lprops lp;
1094 int lnum, err; 1155 int err;
1156
1157 dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
1095 1158
1096 c->gc_lnum = -1; 1159 c->gc_lnum = -1;
1097 if (wbuf->lnum == -1) { 1160 if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
1098 dbg_rcvry("no GC head LEB"); 1161 return grab_empty_leb(c);
1099 goto find_free; 1162
1100 }
1101 /*
1102 * See whether the used space in the dirtiest LEB fits in the GC head
1103 * LEB.
1104 */
1105 if (wbuf->offs == c->leb_size) {
1106 dbg_rcvry("no room in GC head LEB");
1107 goto find_free;
1108 }
1109 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); 1163 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1110 if (err) { 1164 if (err) {
1111 /* 1165 if (err != -ENOSPC)
1112 * There are no dirty or empty LEBs subject to there being
1113 * enough for the index. Try to use
1114 * 'ubifs_find_free_leb_for_idx()', which will return any empty
1115 * LEBs (ignoring index requirements). If the index then
1116 * doesn't have enough LEBs the recovery commit will fail -
1117 * which is the same result anyway i.e. recovery fails. So
1118 * there is no problem ignoring index requirements and just
1119 * grabbing a free LEB since we have already established there
1120 * is not a dirty LEB we could have used instead.
1121 */
1122 if (err == -ENOSPC) {
1123 dbg_rcvry("could not find a dirty LEB");
1124 goto find_free;
1125 }
1126 return err;
1127 }
1128 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1129 lnum = lp.lnum;
1130 if (lp.free + lp.dirty == c->leb_size) {
1131 /* An empty LEB was returned */
1132 if (lp.free != c->leb_size) {
1133 err = ubifs_change_one_lp(c, lnum, c->leb_size,
1134 0, 0, 0, 0);
1135 if (err)
1136 return err;
1137 }
1138 err = ubifs_leb_unmap(c, lnum);
1139 if (err)
1140 return err; 1166 return err;
1141 c->gc_lnum = lnum; 1167
1142 dbg_rcvry("allocated LEB %d for GC", lnum); 1168 dbg_rcvry("could not find a dirty LEB");
1143 /* Run the commit */ 1169 return grab_empty_leb(c);
1144 dbg_rcvry("committing");
1145 return ubifs_run_commit(c);
1146 }
1147 /*
1148 * There was no empty LEB so the used space in the dirtiest LEB must fit
1149 * in the GC head LEB.
1150 */
1151 if (lp.free + lp.dirty < wbuf->offs) {
1152 dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
1153 lnum, wbuf->lnum, wbuf->offs);
1154 err = ubifs_return_leb(c, lnum);
1155 if (err)
1156 return err;
1157 goto find_free;
1158 } 1170 }
1171
1172 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1173 ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
1174
1159 /* 1175 /*
1160 * We run the commit before garbage collection otherwise subsequent 1176 * We run the commit before garbage collection otherwise subsequent
1161 * mounts will see the GC and orphan deletion in a different order. 1177 * mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1164 err = ubifs_run_commit(c); 1180 err = ubifs_run_commit(c);
1165 if (err) 1181 if (err)
1166 return err; 1182 return err;
1167 /* 1183
1168 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC 1184 dbg_rcvry("GC'ing LEB %d", lp.lnum);
1169 * - use locking to keep 'ubifs_assert()' happy.
1170 */
1171 dbg_rcvry("GC'ing LEB %d", lnum);
1172 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 1185 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1173 err = ubifs_garbage_collect_leb(c, &lp); 1186 err = ubifs_garbage_collect_leb(c, &lp);
1174 if (err >= 0) { 1187 if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1184 err = -EINVAL; 1197 err = -EINVAL;
1185 return err; 1198 return err;
1186 } 1199 }
1187 if (err != LEB_RETAINED) { 1200
1188 dbg_err("GC returned %d", err); 1201 ubifs_assert(err == LEB_RETAINED);
1202 if (err != LEB_RETAINED)
1189 return -EINVAL; 1203 return -EINVAL;
1190 } 1204
1191 err = ubifs_leb_unmap(c, c->gc_lnum); 1205 err = ubifs_leb_unmap(c, c->gc_lnum);
1192 if (err) 1206 if (err)
1193 return err; 1207 return err;
1194 dbg_rcvry("allocated LEB %d for GC", lnum);
1195 return 0;
1196 1208
1197find_free: 1209 dbg_rcvry("allocated LEB %d for GC", lp.lnum);
1198 /* 1210 return 0;
1199 * There is no GC head LEB or the free space in the GC head LEB is too
1200 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
1201 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
1202 */
1203 lnum = ubifs_find_free_leb_for_idx(c);
1204 if (lnum < 0) {
1205 dbg_err("could not find an empty LEB");
1206 return lnum;
1207 }
1208 /* And reset the index flag */
1209 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1210 LPROPS_INDEX, 0);
1211 if (err)
1212 return err;
1213 c->gc_lnum = lnum;
1214 dbg_rcvry("allocated LEB %d for GC", lnum);
1215 /* Run the commit */
1216 dbg_rcvry("committing");
1217 return ubifs_run_commit(c);
1218} 1211}
1219 1212
1220/** 1213/**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1456 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); 1449 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
1457 if (err) 1450 if (err)
1458 goto out; 1451 goto out;
1459 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", 1452 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
1460 (unsigned long)e->inum, lnum, offs, i_size, e->d_size); 1453 (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
1461 return 0; 1454 return 0;
1462 1455
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
1505 e->i_size = le64_to_cpu(ino->size); 1498 e->i_size = le64_to_cpu(ino->size);
1506 } 1499 }
1507 } 1500 }
1501
1508 if (e->exists && e->i_size < e->d_size) { 1502 if (e->exists && e->i_size < e->d_size) {
1509 if (!e->inode && c->ro_mount) { 1503 if (c->ro_mount) {
1510 /* Fix the inode size and pin it in memory */ 1504 /* Fix the inode size and pin it in memory */
1511 struct inode *inode; 1505 struct inode *inode;
1506 struct ubifs_inode *ui;
1507
1508 ubifs_assert(!e->inode);
1512 1509
1513 inode = ubifs_iget(c->vfs_sb, e->inum); 1510 inode = ubifs_iget(c->vfs_sb, e->inum);
1514 if (IS_ERR(inode)) 1511 if (IS_ERR(inode))
1515 return PTR_ERR(inode); 1512 return PTR_ERR(inode);
1513
1514 ui = ubifs_inode(inode);
1516 if (inode->i_size < e->d_size) { 1515 if (inode->i_size < e->d_size) {
1517 dbg_rcvry("ino %lu size %lld -> %lld", 1516 dbg_rcvry("ino %lu size %lld -> %lld",
1518 (unsigned long)e->inum, 1517 (unsigned long)e->inum,
1519 e->d_size, inode->i_size); 1518 inode->i_size, e->d_size);
1520 inode->i_size = e->d_size; 1519 inode->i_size = e->d_size;
1521 ubifs_inode(inode)->ui_size = e->d_size; 1520 ui->ui_size = e->d_size;
1521 ui->synced_i_size = e->d_size;
1522 e->inode = inode; 1522 e->inode = inode;
1523 this = rb_next(this); 1523 this = rb_next(this);
1524 continue; 1524 continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
1533 iput(e->inode); 1533 iput(e->inode);
1534 } 1534 }
1535 } 1535 }
1536
1536 this = rb_next(this); 1537 this = rb_next(this);
1537 rb_erase(&e->rb, &c->size_tree); 1538 rb_erase(&e->rb, &c->size_tree);
1538 kfree(e); 1539 kfree(e);
1539 } 1540 }
1541
1540 return 0; 1542 return 0;
1541} 1543}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
33 */ 33 */
34 34
35#include "ubifs.h" 35#include "ubifs.h"
36 36#include <linux/list_sort.h>
37/*
38 * Replay flags.
39 *
40 * REPLAY_DELETION: node was deleted
41 * REPLAY_REF: node is a reference node
42 */
43enum {
44 REPLAY_DELETION = 1,
45 REPLAY_REF = 2,
46};
47 37
48/** 38/**
49 * struct replay_entry - replay tree entry. 39 * struct replay_entry - replay list entry.
50 * @lnum: logical eraseblock number of the node 40 * @lnum: logical eraseblock number of the node
51 * @offs: node offset 41 * @offs: node offset
52 * @len: node length 42 * @len: node length
43 * @deletion: non-zero if this entry corresponds to a node deletion
53 * @sqnum: node sequence number 44 * @sqnum: node sequence number
54 * @flags: replay flags 45 * @list: links the replay list
55 * @rb: links the replay tree
56 * @key: node key 46 * @key: node key
57 * @nm: directory entry name 47 * @nm: directory entry name
58 * @old_size: truncation old size 48 * @old_size: truncation old size
59 * @new_size: truncation new size 49 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
62 * @jhead: journal head number of the bud
63 * 50 *
64 * UBIFS journal replay must compare node sequence numbers, which means it must 51 * The replay process first scans all buds and builds the replay list, then
65 * build a tree of node information to insert into the TNC. 52 * sorts the replay list in nodes sequence number order, and then inserts all
53 * the replay entries to the TNC.
66 */ 54 */
67struct replay_entry { 55struct replay_entry {
68 int lnum; 56 int lnum;
69 int offs; 57 int offs;
70 int len; 58 int len;
59 unsigned int deletion:1;
71 unsigned long long sqnum; 60 unsigned long long sqnum;
72 int flags; 61 struct list_head list;
73 struct rb_node rb;
74 union ubifs_key key; 62 union ubifs_key key;
75 union { 63 union {
76 struct qstr nm; 64 struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
78 loff_t old_size; 66 loff_t old_size;
79 loff_t new_size; 67 loff_t new_size;
80 }; 68 };
81 struct {
82 int free;
83 int dirty;
84 int jhead;
85 };
86 }; 69 };
87}; 70};
88 71
@@ -90,57 +73,64 @@ struct replay_entry {
90 * struct bud_entry - entry in the list of buds to replay. 73 * struct bud_entry - entry in the list of buds to replay.
91 * @list: next bud in the list 74 * @list: next bud in the list
92 * @bud: bud description object 75 * @bud: bud description object
93 * @free: free bytes in the bud
94 * @sqnum: reference node sequence number 76 * @sqnum: reference node sequence number
77 * @free: free bytes in the bud
78 * @dirty: dirty bytes in the bud
95 */ 79 */
96struct bud_entry { 80struct bud_entry {
97 struct list_head list; 81 struct list_head list;
98 struct ubifs_bud *bud; 82 struct ubifs_bud *bud;
99 int free;
100 unsigned long long sqnum; 83 unsigned long long sqnum;
84 int free;
85 int dirty;
101}; 86};
102 87
103/** 88/**
104 * set_bud_lprops - set free and dirty space used by a bud. 89 * set_bud_lprops - set free and dirty space used by a bud.
105 * @c: UBIFS file-system description object 90 * @c: UBIFS file-system description object
106 * @r: replay entry of bud 91 * @b: bud entry which describes the bud
92 *
93 * This function makes sure the LEB properties of bud @b are set correctly
94 * after the replay. Returns zero in case of success and a negative error code
95 * in case of failure.
107 */ 96 */
108static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) 97static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
109{ 98{
110 const struct ubifs_lprops *lp; 99 const struct ubifs_lprops *lp;
111 int err = 0, dirty; 100 int err = 0, dirty;
112 101
113 ubifs_get_lprops(c); 102 ubifs_get_lprops(c);
114 103
115 lp = ubifs_lpt_lookup_dirty(c, r->lnum); 104 lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
116 if (IS_ERR(lp)) { 105 if (IS_ERR(lp)) {
117 err = PTR_ERR(lp); 106 err = PTR_ERR(lp);
118 goto out; 107 goto out;
119 } 108 }
120 109
121 dirty = lp->dirty; 110 dirty = lp->dirty;
122 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { 111 if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
123 /* 112 /*
124 * The LEB was added to the journal with a starting offset of 113 * The LEB was added to the journal with a starting offset of
125 * zero which means the LEB must have been empty. The LEB 114 * zero which means the LEB must have been empty. The LEB
126 * property values should be lp->free == c->leb_size and 115 * property values should be @lp->free == @c->leb_size and
127 * lp->dirty == 0, but that is not the case. The reason is that 116 * @lp->dirty == 0, but that is not the case. The reason is that
128 * the LEB was garbage collected. The garbage collector resets 117 * the LEB had been garbage collected before it became the bud,
129 * the free and dirty space without recording it anywhere except 118 * and there was no commit in between. The garbage collector
130 * lprops, so if there is not a commit then lprops does not have 119 * resets the free and dirty space without recording it
131 * that information next time the file system is mounted. 120 * anywhere except lprops, so if there was no commit then
121 * lprops does not have that information.
132 * 122 *
133 * We do not need to adjust free space because the scan has told 123 * We do not need to adjust free space because the scan has told
134 * us the exact value which is recorded in the replay entry as 124 * us the exact value which is recorded in the replay entry as
135 * r->free. 125 * @b->free.
136 * 126 *
137 * However we do need to subtract from the dirty space the 127 * However we do need to subtract from the dirty space the
138 * amount of space that the garbage collector reclaimed, which 128 * amount of space that the garbage collector reclaimed, which
139 * is the whole LEB minus the amount of space that was free. 129 * is the whole LEB minus the amount of space that was free.
140 */ 130 */
141 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 131 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
142 lp->free, lp->dirty); 132 lp->free, lp->dirty);
143 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, 133 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
144 lp->free, lp->dirty); 134 lp->free, lp->dirty);
145 dirty -= c->leb_size - lp->free; 135 dirty -= c->leb_size - lp->free;
146 /* 136 /*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
152 */ 142 */
153 if (dirty != 0) 143 if (dirty != 0)
154 dbg_msg("LEB %d lp: %d free %d dirty " 144 dbg_msg("LEB %d lp: %d free %d dirty "
155 "replay: %d free %d dirty", r->lnum, lp->free, 145 "replay: %d free %d dirty", b->bud->lnum,
156 lp->dirty, r->free, r->dirty); 146 lp->free, lp->dirty, b->free, b->dirty);
157 } 147 }
158 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, 148 lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
159 lp->flags | LPROPS_TAKEN, 0); 149 lp->flags | LPROPS_TAKEN, 0);
160 if (IS_ERR(lp)) { 150 if (IS_ERR(lp)) {
161 err = PTR_ERR(lp); 151 err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
163 } 153 }
164 154
165 /* Make sure the journal head points to the latest bud */ 155 /* Make sure the journal head points to the latest bud */
166 err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, 156 err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
167 c->leb_size - r->free, UBI_SHORTTERM); 157 b->bud->lnum, c->leb_size - b->free,
158 UBI_SHORTTERM);
168 159
169out: 160out:
170 ubifs_release_lprops(c); 161 ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
172} 163}
173 164
174/** 165/**
166 * set_buds_lprops - set free and dirty space for all replayed buds.
167 * @c: UBIFS file-system description object
168 *
169 * This function sets LEB properties for all replayed buds. Returns zero in
170 * case of success and a negative error code in case of failure.
171 */
172static int set_buds_lprops(struct ubifs_info *c)
173{
174 struct bud_entry *b;
175 int err;
176
177 list_for_each_entry(b, &c->replay_buds, list) {
178 err = set_bud_lprops(c, b);
179 if (err)
180 return err;
181 }
182
183 return 0;
184}
185
186/**
175 * trun_remove_range - apply a replay entry for a truncation to the TNC. 187 * trun_remove_range - apply a replay entry for a truncation to the TNC.
176 * @c: UBIFS file-system description object 188 * @c: UBIFS file-system description object
177 * @r: replay entry of truncation 189 * @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
207 */ 219 */
208static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) 220static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
209{ 221{
210 int err, deletion = ((r->flags & REPLAY_DELETION) != 0); 222 int err;
211 223
212 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, 224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
213 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); 225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
214 226
215 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
216 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
217 229
218 if (r->flags & REPLAY_REF) 230 if (is_hash_key(c, &r->key)) {
219 err = set_bud_lprops(c, r); 231 if (r->deletion)
220 else if (is_hash_key(c, &r->key)) {
221 if (deletion)
222 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); 232 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
223 else 233 else
224 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, 234 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
225 r->len, &r->nm); 235 r->len, &r->nm);
226 } else { 236 } else {
227 if (deletion) 237 if (r->deletion)
228 switch (key_type(c, &r->key)) { 238 switch (key_type(c, &r->key)) {
229 case UBIFS_INO_KEY: 239 case UBIFS_INO_KEY:
230 { 240 {
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
247 return err; 257 return err;
248 258
249 if (c->need_recovery) 259 if (c->need_recovery)
250 err = ubifs_recover_size_accum(c, &r->key, deletion, 260 err = ubifs_recover_size_accum(c, &r->key, r->deletion,
251 r->new_size); 261 r->new_size);
252 } 262 }
253 263
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
255} 265}
256 266
257/** 267/**
258 * destroy_replay_tree - destroy the replay. 268 * replay_entries_cmp - compare 2 replay entries.
259 * @c: UBIFS file-system description object 269 * @priv: UBIFS file-system description object
270 * @a: first replay entry
271 * @b: second replay entry
260 * 272 *
261 * Destroy the replay tree. 273 * This is a comparison function for 'list_sort()' which compares 2 replay
274 * entries @a and @b by comparing their sequence numbers. Returns %1 if @a has
275 * greater sequence number and %-1 otherwise.
262 */ 276 */
263static void destroy_replay_tree(struct ubifs_info *c) 277static int replay_entries_cmp(void *priv, struct list_head *a,
278 struct list_head *b)
264{ 279{
265 struct rb_node *this = c->replay_tree.rb_node; 280 struct replay_entry *ra, *rb;
266 struct replay_entry *r; 281
267 282 cond_resched();
268 while (this) { 283 if (a == b)
269 if (this->rb_left) { 284 return 0;
270 this = this->rb_left; 285
271 continue; 286 ra = list_entry(a, struct replay_entry, list);
272 } else if (this->rb_right) { 287 rb = list_entry(b, struct replay_entry, list);
273 this = this->rb_right; 288 ubifs_assert(ra->sqnum != rb->sqnum);
274 continue; 289 if (ra->sqnum > rb->sqnum)
275 } 290 return 1;
276 r = rb_entry(this, struct replay_entry, rb); 291 return -1;
277 this = rb_parent(this);
278 if (this) {
279 if (this->rb_left == &r->rb)
280 this->rb_left = NULL;
281 else
282 this->rb_right = NULL;
283 }
284 if (is_hash_key(c, &r->key))
285 kfree(r->nm.name);
286 kfree(r);
287 }
288 c->replay_tree = RB_ROOT;
289} 292}
290 293
291/** 294/**
292 * apply_replay_tree - apply the replay tree to the TNC. 295 * apply_replay_list - apply the replay list to the TNC.
293 * @c: UBIFS file-system description object 296 * @c: UBIFS file-system description object
294 * 297 *
295 * Apply the replay tree. 298 * Apply all entries in the replay list to the TNC. Returns zero in case of
296 * Returns zero in case of success and a negative error code in case of 299 * success and a negative error code in case of failure.
297 * failure.
298 */ 300 */
299static int apply_replay_tree(struct ubifs_info *c) 301static int apply_replay_list(struct ubifs_info *c)
300{ 302{
301 struct rb_node *this = rb_first(&c->replay_tree); 303 struct replay_entry *r;
304 int err;
302 305
303 while (this) { 306 list_sort(c, &c->replay_list, &replay_entries_cmp);
304 struct replay_entry *r;
305 int err;
306 307
308 list_for_each_entry(r, &c->replay_list, list) {
307 cond_resched(); 309 cond_resched();
308 310
309 r = rb_entry(this, struct replay_entry, rb);
310 err = apply_replay_entry(c, r); 311 err = apply_replay_entry(c, r);
311 if (err) 312 if (err)
312 return err; 313 return err;
313 this = rb_next(this);
314 } 314 }
315
315 return 0; 316 return 0;
316} 317}
317 318
318/** 319/**
319 * insert_node - insert a node to the replay tree. 320 * destroy_replay_list - destroy the replay list.
321 * @c: UBIFS file-system description object
322 *
323 * Destroy the replay list.
324 */
325static void destroy_replay_list(struct ubifs_info *c)
326{
327 struct replay_entry *r, *tmp;
328
329 list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
330 if (is_hash_key(c, &r->key))
331 kfree(r->nm.name);
332 list_del(&r->list);
333 kfree(r);
334 }
335}
336
337/**
338 * insert_node - insert a node to the replay list
320 * @c: UBIFS file-system description object 339 * @c: UBIFS file-system description object
321 * @lnum: node logical eraseblock number 340 * @lnum: node logical eraseblock number
322 * @offs: node offset 341 * @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
328 * @old_size: truncation old size 347 * @old_size: truncation old size
329 * @new_size: truncation new size 348 * @new_size: truncation new size
330 * 349 *
331 * This function inserts a scanned non-direntry node to the replay tree. The 350 * This function inserts a scanned non-direntry node to the replay list. The
332 * replay tree is an RB-tree containing @struct replay_entry elements which are 351 * replay list contains @struct replay_entry elements, and we sort this list in
333 * indexed by the sequence number. The replay tree is applied at the very end 352 * sequence number order before applying it. The replay list is applied at the
334 * of the replay process. Since the tree is sorted in sequence number order, 353 * very end of the replay process. Since the list is sorted in sequence number
335 * the older modifications are applied first. This function returns zero in 354 * order, the older modifications are applied first. This function returns zero
336 * case of success and a negative error code in case of failure. 355 * in case of success and a negative error code in case of failure.
337 */ 356 */
338static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, 357static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
339 union ubifs_key *key, unsigned long long sqnum, 358 union ubifs_key *key, unsigned long long sqnum,
340 int deletion, int *used, loff_t old_size, 359 int deletion, int *used, loff_t old_size,
341 loff_t new_size) 360 loff_t new_size)
342{ 361{
343 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
344 struct replay_entry *r; 362 struct replay_entry *r;
345 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
365
346 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
347 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
348 368
349 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
350 while (*p) {
351 parent = *p;
352 r = rb_entry(parent, struct replay_entry, rb);
353 if (sqnum < r->sqnum) {
354 p = &(*p)->rb_left;
355 continue;
356 } else if (sqnum > r->sqnum) {
357 p = &(*p)->rb_right;
358 continue;
359 }
360 ubifs_err("duplicate sqnum in replay");
361 return -EINVAL;
362 }
363
364 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 369 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
365 if (!r) 370 if (!r)
366 return -ENOMEM; 371 return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
370 r->lnum = lnum; 375 r->lnum = lnum;
371 r->offs = offs; 376 r->offs = offs;
372 r->len = len; 377 r->len = len;
378 r->deletion = !!deletion;
373 r->sqnum = sqnum; 379 r->sqnum = sqnum;
374 r->flags = (deletion ? REPLAY_DELETION : 0); 380 key_copy(c, key, &r->key);
375 r->old_size = old_size; 381 r->old_size = old_size;
376 r->new_size = new_size; 382 r->new_size = new_size;
377 key_copy(c, key, &r->key);
378 383
379 rb_link_node(&r->rb, parent, p); 384 list_add_tail(&r->list, &c->replay_list);
380 rb_insert_color(&r->rb, &c->replay_tree);
381 return 0; 385 return 0;
382} 386}
383 387
384/** 388/**
385 * insert_dent - insert a directory entry node into the replay tree. 389 * insert_dent - insert a directory entry node into the replay list.
386 * @c: UBIFS file-system description object 390 * @c: UBIFS file-system description object
387 * @lnum: node logical eraseblock number 391 * @lnum: node logical eraseblock number
388 * @offs: node offset 392 * @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
394 * @deletion: non-zero if this is a deletion 398 * @deletion: non-zero if this is a deletion
395 * @used: number of bytes in use in a LEB 399 * @used: number of bytes in use in a LEB
396 * 400 *
397 * This function inserts a scanned directory entry node to the replay tree. 401 * This function inserts a scanned directory entry node or an extended
398 * Returns zero in case of success and a negative error code in case of 402 * attribute entry to the replay list. Returns zero in case of success and a
399 * failure. 403 * negative error code in case of failure.
400 *
401 * This function is also used for extended attribute entries because they are
402 * implemented as directory entry nodes.
403 */ 404 */
404static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, 405static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
405 union ubifs_key *key, const char *name, int nlen, 406 union ubifs_key *key, const char *name, int nlen,
406 unsigned long long sqnum, int deletion, int *used) 407 unsigned long long sqnum, int deletion, int *used)
407{ 408{
408 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
412 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
413 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
414 415
415 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
416 while (*p) {
417 parent = *p;
418 r = rb_entry(parent, struct replay_entry, rb);
419 if (sqnum < r->sqnum) {
420 p = &(*p)->rb_left;
421 continue;
422 }
423 if (sqnum > r->sqnum) {
424 p = &(*p)->rb_right;
425 continue;
426 }
427 ubifs_err("duplicate sqnum in replay");
428 return -EINVAL;
429 }
430
431 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); 416 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
432 if (!r) 417 if (!r)
433 return -ENOMEM; 418 return -ENOMEM;
419
434 nbuf = kmalloc(nlen + 1, GFP_KERNEL); 420 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
435 if (!nbuf) { 421 if (!nbuf) {
436 kfree(r); 422 kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
442 r->lnum = lnum; 428 r->lnum = lnum;
443 r->offs = offs; 429 r->offs = offs;
444 r->len = len; 430 r->len = len;
431 r->deletion = !!deletion;
445 r->sqnum = sqnum; 432 r->sqnum = sqnum;
433 key_copy(c, key, &r->key);
446 r->nm.len = nlen; 434 r->nm.len = nlen;
447 memcpy(nbuf, name, nlen); 435 memcpy(nbuf, name, nlen);
448 nbuf[nlen] = '\0'; 436 nbuf[nlen] = '\0';
449 r->nm.name = nbuf; 437 r->nm.name = nbuf;
450 r->flags = (deletion ? REPLAY_DELETION : 0);
451 key_copy(c, key, &r->key);
452 438
453 ubifs_assert(!*p); 439 list_add_tail(&r->list, &c->replay_list);
454 rb_link_node(&r->rb, parent, p);
455 rb_insert_color(&r->rb, &c->replay_tree);
456 return 0; 440 return 0;
457} 441}
458 442
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
489} 473}
490 474
491/** 475/**
476 * is_last_bud - check if the bud is the last in the journal head.
477 * @c: UBIFS file-system description object
478 * @bud: bud description object
479 *
480 * This function checks if bud @bud is the last bud in its journal head. This
481 * information is then used by 'replay_bud()' to decide whether the bud can
482 * have corruptions or not. Indeed, only last buds can be corrupted by power
483 * cuts. Returns %1 if this is the last bud, and %0 if not.
484 */
485static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
486{
487 struct ubifs_jhead *jh = &c->jheads[bud->jhead];
488 struct ubifs_bud *next;
489 uint32_t data;
490 int err;
491
492 if (list_is_last(&bud->list, &jh->buds_list))
493 return 1;
494
495 /*
496 * The following is a quirk to make sure we work correctly with UBIFS
497 * images used with older UBIFS.
498 *
499 * Normally, the last bud will be the last in the journal head's list
500 * of buds. However, there is one exception if the UBIFS image belongs
501 * to older UBIFS. This is fairly unlikely: one would need to use old
502 * UBIFS, then have a power cut exactly at the right point, and then
503 * try to mount this image with new UBIFS.
504 *
505 * The exception is: it is possible to have 2 buds A and B, A goes
506 * before B, and B is the last, bud B contains no data, and bud A is
507 * corrupted at the end. The reason is that in older versions when the
508 * journal code switched to the next bud (from A to B), it first added a
509 * log reference node for the new bud (B), and only after this it
510 * synchronized the write-buffer of current bud (A). But later this was
511 * changed and UBIFS started to always synchronize the write-buffer of
512 * the bud (A) before writing the log reference for the new bud (B).
513 *
514 * But because older UBIFS always synchronized A's write-buffer before
515 * writing to B, we can recognize this exceptional situation by
516 * checking the contents of bud B - if it is empty, then A can be
517 * treated as the last and we can recover it.
518 *
519 * TODO: remove this piece of code in a couple of years (today it is
520 * 16.05.2011).
521 */
522 next = list_entry(bud->list.next, struct ubifs_bud, list);
523 if (!list_is_last(&next->list, &jh->buds_list))
524 return 0;
525
526 err = ubi_read(c->ubi, next->lnum, (char *)&data,
527 next->start, 4);
528 if (err)
529 return 0;
530
531 return data == 0xFFFFFFFF;
532}
533
534/**
492 * replay_bud - replay a bud logical eraseblock. 535 * replay_bud - replay a bud logical eraseblock.
493 * @c: UBIFS file-system description object 536 * @c: UBIFS file-system description object
494 * @lnum: bud logical eraseblock number to replay 537 * @b: bud entry which describes the bud
495 * @offs: bud start offset
496 * @jhead: journal head to which this bud belongs
497 * @free: amount of free space in the bud is returned here
498 * @dirty: amount of dirty space from padding and deletion nodes is returned
499 * here
500 * 538 *
501 * This function returns zero in case of success and a negative error code in 539 * This function replays the bud described by @b, recovers it if needed, and adds all nodes
502 * case of failure. 540 * from this bud to the replay list. Returns zero in case of success and a
541 * negative error code in case of failure.
503 */ 542 */
504static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, 543static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
505 int *free, int *dirty)
506{ 544{
507 int err = 0, used = 0; 545 int is_last = is_last_bud(c, b->bud);
546 int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
508 struct ubifs_scan_leb *sleb; 547 struct ubifs_scan_leb *sleb;
509 struct ubifs_scan_node *snod; 548 struct ubifs_scan_node *snod;
510 struct ubifs_bud *bud;
511 549
512 dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); 550 dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
513 if (c->need_recovery) 551 lnum, b->bud->jhead, offs, is_last);
514 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 552
553 if (c->need_recovery && is_last)
554 /*
555 * Recover only last LEBs in the journal heads, because power
556 * cuts may cause corruptions only in these LEBs, because only
557 * these LEBs could possibly be written to at the power cut
558 * time.
559 */
560 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
561 b->bud->jhead != GCHD);
515 else 562 else
516 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); 563 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
517 if (IS_ERR(sleb)) 564 if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 goto out; 674 goto out;
628 } 675 }
629 676
630 bud = ubifs_search_bud(c, lnum); 677 ubifs_assert(ubifs_search_bud(c, lnum));
631 if (!bud)
632 BUG();
633
634 ubifs_assert(sleb->endpt - offs >= used); 678 ubifs_assert(sleb->endpt - offs >= used);
635 ubifs_assert(sleb->endpt % c->min_io_size == 0); 679 ubifs_assert(sleb->endpt % c->min_io_size == 0);
636 680
637 *dirty = sleb->endpt - offs - used; 681 b->dirty = sleb->endpt - offs - used;
638 *free = c->leb_size - sleb->endpt; 682 b->free = c->leb_size - sleb->endpt;
683 dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
639 684
640out: 685out:
641 ubifs_scan_destroy(sleb); 686 ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
649} 694}
650 695
651/** 696/**
652 * insert_ref_node - insert a reference node to the replay tree.
653 * @c: UBIFS file-system description object
654 * @lnum: node logical eraseblock number
655 * @offs: node offset
656 * @sqnum: sequence number
657 * @free: amount of free space in bud
658 * @dirty: amount of dirty space from padding and deletion nodes
659 * @jhead: journal head number for the bud
660 *
661 * This function inserts a reference node to the replay tree and returns zero
662 * in case of success or a negative error code in case of failure.
663 */
664static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
665 unsigned long long sqnum, int free, int dirty,
666 int jhead)
667{
668 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
669 struct replay_entry *r;
670
671 dbg_mnt("add ref LEB %d:%d", lnum, offs);
672 while (*p) {
673 parent = *p;
674 r = rb_entry(parent, struct replay_entry, rb);
675 if (sqnum < r->sqnum) {
676 p = &(*p)->rb_left;
677 continue;
678 } else if (sqnum > r->sqnum) {
679 p = &(*p)->rb_right;
680 continue;
681 }
682 ubifs_err("duplicate sqnum in replay tree");
683 return -EINVAL;
684 }
685
686 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
687 if (!r)
688 return -ENOMEM;
689
690 r->lnum = lnum;
691 r->offs = offs;
692 r->sqnum = sqnum;
693 r->flags = REPLAY_REF;
694 r->free = free;
695 r->dirty = dirty;
696 r->jhead = jhead;
697
698 rb_link_node(&r->rb, parent, p);
699 rb_insert_color(&r->rb, &c->replay_tree);
700 return 0;
701}
702
703/**
704 * replay_buds - replay all buds. 697 * replay_buds - replay all buds.
705 * @c: UBIFS file-system description object 698 * @c: UBIFS file-system description object
706 * 699 *
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
710static int replay_buds(struct ubifs_info *c) 703static int replay_buds(struct ubifs_info *c)
711{ 704{
712 struct bud_entry *b; 705 struct bud_entry *b;
713 int err, uninitialized_var(free), uninitialized_var(dirty); 706 int err;
707 unsigned long long prev_sqnum = 0;
714 708
715 list_for_each_entry(b, &c->replay_buds, list) { 709 list_for_each_entry(b, &c->replay_buds, list) {
716 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, 710 err = replay_bud(c, b);
717 &free, &dirty);
718 if (err)
719 return err;
720 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
721 free, dirty, b->bud->jhead);
722 if (err) 711 if (err)
723 return err; 712 return err;
713
714 ubifs_assert(b->sqnum > prev_sqnum);
715 prev_sqnum = b->sqnum;
724 } 716 }
725 717
726 return 0; 718 return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
1060 if (err) 1052 if (err)
1061 goto out; 1053 goto out;
1062 1054
1063 err = apply_replay_tree(c); 1055 err = apply_replay_list(c);
1056 if (err)
1057 goto out;
1058
1059 err = set_buds_lprops(c);
1064 if (err) 1060 if (err)
1065 goto out; 1061 goto out;
1066 1062
1067 /* 1063 /*
1068 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable 1064 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
1069 * to roughly estimate index growth. Things like @c->min_idx_lebs 1065 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
1070 * depend on it. This means we have to initialize it to make sure 1066 * depend on it. This means we have to initialize it to make sure
1071 * budgeting works properly. 1067 * budgeting works properly.
1072 */ 1068 */
1073 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); 1069 c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1074 c->budg_uncommitted_idx *= c->max_idx_node_sz; 1070 c->bi.uncommitted_idx *= c->max_idx_node_sz;
1075 1071
1076 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1072 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1077 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1073 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1078 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1074 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1079 (unsigned long)c->highest_inum); 1075 (unsigned long)c->highest_inum);
1080out: 1076out:
1081 destroy_replay_tree(c); 1077 destroy_replay_list(c);
1082 destroy_bud_list(c); 1078 destroy_bud_list(c);
1083 c->replaying = 0; 1079 c->replaying = 0;
1084 return err; 1080 return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
475 * @c: UBIFS file-system description object 475 * @c: UBIFS file-system description object
476 * 476 *
477 * This function returns a pointer to the superblock node or a negative error 477 * This function returns a pointer to the superblock node or a negative error
478 * code. 478 * code. Note, the user of this function is responsible for kfree()'ing the
479 * returned superblock buffer.
479 */ 480 */
480struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) 481struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
481{ 482{
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
616 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 617 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
617 memcpy(&c->uuid, &sup->uuid, 16); 618 memcpy(&c->uuid, &sup->uuid, 16);
618 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 619 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
620 c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
619 621
620 /* Automatically increase file system size to the maximum size */ 622 /* Automatically increase file system size to the maximum size */
621 c->old_leb_cnt = c->leb_cnt; 623 c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
650 kfree(sup); 652 kfree(sup);
651 return err; 653 return err;
652} 654}
655
656/**
657 * fixup_leb - fixup/unmap an LEB containing free space.
658 * @c: UBIFS file-system description object
659 * @lnum: the LEB number to fix up
660 * @len: number of used bytes in LEB (starting at offset 0)
661 *
662 * This function reads the contents of the given LEB number @lnum, then fixes
663 * it up, so that empty min. I/O units in the end of LEB are actually erased on
664 * flash (rather than being just all-0xff real data). If the LEB is completely
665 * empty, it is simply unmapped.
666 */
667static int fixup_leb(struct ubifs_info *c, int lnum, int len)
668{
669 int err;
670
671 ubifs_assert(len >= 0);
672 ubifs_assert(len % c->min_io_size == 0);
673 ubifs_assert(len < c->leb_size);
674
675 if (len == 0) {
676 dbg_mnt("unmap empty LEB %d", lnum);
677 return ubi_leb_unmap(c->ubi, lnum);
678 }
679
680 dbg_mnt("fixup LEB %d, data len %d", lnum, len);
681 err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
682 if (err)
683 return err;
684
685 return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
686}
687
688/**
689 * fixup_free_space - find & remap all LEBs containing free space.
690 * @c: UBIFS file-system description object
691 *
692 * This function walks through all LEBs in the filesystem and fixes up those
693 * containing free/empty space.
694 */
695static int fixup_free_space(struct ubifs_info *c)
696{
697 int lnum, err = 0;
698 struct ubifs_lprops *lprops;
699
700 ubifs_get_lprops(c);
701
702 /* Fixup LEBs in the master area */
703 for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
704 err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
705 if (err)
706 goto out;
707 }
708
709 /* Unmap unused log LEBs */
710 lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
711 while (lnum != c->ltail_lnum) {
712 err = fixup_leb(c, lnum, 0);
713 if (err)
714 goto out;
715 lnum = ubifs_next_log_lnum(c, lnum);
716 }
717
718 /* Fixup the current log head */
719 err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
720 if (err)
721 goto out;
722
723 /* Fixup LEBs in the LPT area */
724 for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
725 int free = c->ltab[lnum - c->lpt_first].free;
726
727 if (free > 0) {
728 err = fixup_leb(c, lnum, c->leb_size - free);
729 if (err)
730 goto out;
731 }
732 }
733
734 /* Unmap LEBs in the orphans area */
735 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
736 err = fixup_leb(c, lnum, 0);
737 if (err)
738 goto out;
739 }
740
741 /* Fixup LEBs in the main area */
742 for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
743 lprops = ubifs_lpt_lookup(c, lnum);
744 if (IS_ERR(lprops)) {
745 err = PTR_ERR(lprops);
746 goto out;
747 }
748
749 if (lprops->free > 0) {
750 err = fixup_leb(c, lnum, c->leb_size - lprops->free);
751 if (err)
752 goto out;
753 }
754 }
755
756out:
757 ubifs_release_lprops(c);
758 return err;
759}
760
761/**
762 * ubifs_fixup_free_space - find & fix all LEBs with free space.
763 * @c: UBIFS file-system description object
764 *
765 * This function fixes up LEBs containing free space on first mount, if the
766 * appropriate flag was set when the FS was created. Each LEB with one or more
767 * empty min. I/O units (i.e. free-space-count > 0) is re-written, to make sure
768 * the free space is actually erased. E.g., this is necessary for some NAND
769 * chips, since the free space may have been programmed like real "0xff" data
770 * (generating a non-0xff ECC), causing future writes to the not-really-erased
771 * NAND pages to behave badly. After the space is fixed up, the superblock flag
772 * is cleared, so that this is skipped for all future mounts.
773 */
774int ubifs_fixup_free_space(struct ubifs_info *c)
775{
776 int err;
777 struct ubifs_sb_node *sup;
778
779 ubifs_assert(c->space_fixup);
780 ubifs_assert(!c->ro_mount);
781
782 ubifs_msg("start fixing up free space");
783
784 err = fixup_free_space(c);
785 if (err)
786 return err;
787
788 sup = ubifs_read_sb_node(c);
789 if (IS_ERR(sup))
790 return PTR_ERR(sup);
791
792 /* Free-space fixup is no longer required */
793 c->space_fixup = 0;
794 sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
795
796 err = ubifs_write_sb_node(c, sup);
797 kfree(sup);
798 if (err)
799 return err;
800
801 ubifs_msg("free space fixup complete");
802 return err;
803}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
375 ubifs_release_dirty_inode_budget(c, ui); 375 ubifs_release_dirty_inode_budget(c, ui);
376 else { 376 else {
377 /* We've deleted something - clean the "no space" flags */ 377 /* We've deleted something - clean the "no space" flags */
378 c->nospace = c->nospace_rp = 0; 378 c->bi.nospace = c->bi.nospace_rp = 0;
379 smp_wmb(); 379 smp_wmb();
380 } 380 }
381done: 381done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
694 * be compressed and direntries are of the maximum size. 694 * be compressed and direntries are of the maximum size.
695 * 695 *
696 * Note, data, which may be stored in inodes is budgeted separately, so 696 * Note, data, which may be stored in inodes is budgeted separately, so
697 * it is not included into 'c->inode_budget'. 697 * it is not included into 'c->bi.inode_budget'.
698 */ 698 */
699 c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; 699 c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
700 c->inode_budget = UBIFS_INO_NODE_SZ; 700 c->bi.inode_budget = UBIFS_INO_NODE_SZ;
701 c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; 701 c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
702 702
703 /* 703 /*
704 * When the amount of flash space used by buds becomes 704 * When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
742{ 742{
743 long long tmp64; 743 long long tmp64;
744 744
745 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 745 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
746 c->report_rp_size = ubifs_reported_space(c, c->rp_size); 746 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
747 747
748 /* 748 /*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
1144{ 1144{
1145 ubifs_assert(c->dark_wm > 0); 1145 ubifs_assert(c->dark_wm > 0);
1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { 1146 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1147 ubifs_err("insufficient free space to mount in read/write mode"); 1147 ubifs_err("insufficient free space to mount in R/W mode");
1148 dbg_dump_budg(c); 1148 dbg_dump_budg(c, &c->bi);
1149 dbg_dump_lprops(c); 1149 dbg_dump_lprops(c);
1150 return -ENOSPC; 1150 return -ENOSPC;
1151 } 1151 }
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
1304 if (err) 1304 if (err)
1305 goto out_lpt; 1305 goto out_lpt;
1306 1306
1307 err = dbg_check_idx_size(c, c->old_idx_sz); 1307 err = dbg_check_idx_size(c, c->bi.old_idx_sz);
1308 if (err) 1308 if (err)
1309 goto out_lpt; 1309 goto out_lpt;
1310 1310
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
1313 goto out_journal; 1313 goto out_journal;
1314 1314
1315 /* Calculate 'min_idx_lebs' after journal replay */ 1315 /* Calculate 'min_idx_lebs' after journal replay */
1316 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1316 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1317 1317
1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); 1318 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1319 if (err) 1319 if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
1396 } else 1396 } else
1397 ubifs_assert(c->lst.taken_empty_lebs > 0); 1397 ubifs_assert(c->lst.taken_empty_lebs > 0);
1398 1398
1399 if (!c->ro_mount && c->space_fixup) {
1400 err = ubifs_fixup_free_space(c);
1401 if (err)
1402 goto out_infos;
1403 }
1404
1399 err = dbg_check_filesystem(c); 1405 err = dbg_check_filesystem(c);
1400 if (err) 1406 if (err)
1401 goto out_infos; 1407 goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
1442 c->main_lebs, c->main_first, c->leb_cnt - 1); 1448 c->main_lebs, c->main_first, c->leb_cnt - 1);
1443 dbg_msg("index LEBs: %d", c->lst.idx_lebs); 1449 dbg_msg("index LEBs: %d", c->lst.idx_lebs);
1444 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", 1450 dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
1445 c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); 1451 c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
1452 c->bi.old_idx_sz >> 20);
1446 dbg_msg("key hash type: %d", c->key_hash_type); 1453 dbg_msg("key hash type: %d", c->key_hash_type);
1447 dbg_msg("tree fanout: %d", c->fanout); 1454 dbg_msg("tree fanout: %d", c->fanout);
1448 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1455 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1463 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1464 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", 1465 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1466 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); 1467 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1461 dbg_msg("dead watermark: %d", c->dead_wm); 1468 dbg_msg("dead watermark: %d", c->dead_wm);
1462 dbg_msg("dark watermark: %d", c->dark_wm); 1469 dbg_msg("dark watermark: %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1584 } 1591 }
1585 sup->leb_cnt = cpu_to_le32(c->leb_cnt); 1592 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
1586 err = ubifs_write_sb_node(c, sup); 1593 err = ubifs_write_sb_node(c, sup);
1594 kfree(sup);
1587 if (err) 1595 if (err)
1588 goto out; 1596 goto out;
1589 } 1597 }
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1684 */ 1692 */
1685 err = dbg_check_space_info(c); 1693 err = dbg_check_space_info(c);
1686 } 1694 }
1695
1696 if (c->space_fixup) {
1697 err = ubifs_fixup_free_space(c);
1698 if (err)
1699 goto out;
1700 }
1701
1687 mutex_unlock(&c->umount_mutex); 1702 mutex_unlock(&c->umount_mutex);
1688 return err; 1703 return err;
1689 1704
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
1766 * to write them back because of I/O errors. 1781 * to write them back because of I/O errors.
1767 */ 1782 */
1768 if (!c->ro_error) { 1783 if (!c->ro_error) {
1769 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); 1784 ubifs_assert(c->bi.idx_growth == 0);
1770 ubifs_assert(c->budg_idx_growth == 0); 1785 ubifs_assert(c->bi.dd_growth == 0);
1771 ubifs_assert(c->budg_dd_growth == 0); 1786 ubifs_assert(c->bi.data_growth == 0);
1772 ubifs_assert(c->budg_data_growth == 0);
1773 } 1787 }
1774 1788
1775 /* 1789 /*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2557 if (err) { 2557 if (err) {
2558 /* Ensure the znode is dirtied */ 2558 /* Ensure the znode is dirtied */
2559 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2559 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2560 znode = dirty_cow_bottom_up(c, znode); 2560 znode = dirty_cow_bottom_up(c, znode);
2561 if (IS_ERR(znode)) { 2561 if (IS_ERR(znode)) {
2562 err = PTR_ERR(znode); 2562 err = PTR_ERR(znode);
2563 goto out_unlock; 2563 goto out_unlock;
2564 } 2564 }
2565 } 2565 }
2566 err = tnc_delete(c, znode, n); 2566 err = tnc_delete(c, znode, n);
2567 } 2567 }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
377 c->gap_lebs = NULL; 377 c->gap_lebs = NULL;
378 return err; 378 return err;
379 } 379 }
380 if (!dbg_force_in_the_gaps_enabled) { 380 if (dbg_force_in_the_gaps_enabled()) {
381 /* 381 /*
382 * Do not print scary warnings if the debugging 382 * Do not print scary warnings if the debugging
383 * option which forces in-the-gaps is enabled. 383 * option which forces in-the-gaps is enabled.
384 */ 384 */
385 ubifs_err("out of space"); 385 ubifs_warn("out of space");
386 spin_lock(&c->space_lock); 386 dbg_dump_budg(c, &c->bi);
387 dbg_dump_budg(c);
388 spin_unlock(&c->space_lock);
389 dbg_dump_lprops(c); 387 dbg_dump_lprops(c);
390 } 388 }
391 /* Try to commit anyway */ 389 /* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
796 spin_lock(&c->space_lock); 794 spin_lock(&c->space_lock);
797 /* 795 /*
798 * Although we have not finished committing yet, update size of the 796 * Although we have not finished committing yet, update size of the
799 * committed index ('c->old_idx_sz') and zero out the index growth 797 * committed index ('c->bi.old_idx_sz') and zero out the index growth
800 * budget. It is OK to do this now, because we've reserved all the 798 * budget. It is OK to do this now, because we've reserved all the
801 * space which is needed to commit the index, and it is safe for the 799 * space which is needed to commit the index, and it is safe for the
802 * budgeting subsystem to assume the index is already committed, 800 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 801 * even though it is not.
804 */ 802 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); 803 ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
806 c->old_idx_sz = c->calc_idx_sz; 804 c->bi.old_idx_sz = c->calc_idx_sz;
807 c->budg_uncommitted_idx = 0; 805 c->bi.uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 806 c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
809 spin_unlock(&c->space_lock); 807 spin_unlock(&c->space_lock);
810 mutex_unlock(&c->tnc_mutex); 808 mutex_unlock(&c->tnc_mutex);
811 809
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
408 * Superblock flags. 408 * Superblock flags.
409 * 409 *
410 * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set 410 * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
411 * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
411 */ 412 */
412enum { 413enum {
413 UBIFS_FLG_BIGLPT = 0x02, 414 UBIFS_FLG_BIGLPT = 0x02,
415 UBIFS_FLG_SPACE_FIXUP = 0x04,
414}; 416};
415 417
416/** 418/**
@@ -434,7 +436,7 @@ struct ubifs_ch {
434 __u8 node_type; 436 __u8 node_type;
435 __u8 group_type; 437 __u8 group_type;
436 __u8 padding[2]; 438 __u8 padding[2];
437} __attribute__ ((packed)); 439} __packed;
438 440
439/** 441/**
440 * union ubifs_dev_desc - device node descriptor. 442 * union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
448union ubifs_dev_desc { 450union ubifs_dev_desc {
449 __le32 new; 451 __le32 new;
450 __le64 huge; 452 __le64 huge;
451} __attribute__ ((packed)); 453} __packed;
452 454
453/** 455/**
454 * struct ubifs_ino_node - inode node. 456 * struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
509 __le16 compr_type; 511 __le16 compr_type;
510 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ 512 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
511 __u8 data[]; 513 __u8 data[];
512} __attribute__ ((packed)); 514} __packed;
513 515
514/** 516/**
515 * struct ubifs_dent_node - directory entry node. 517 * struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
534 __le16 nlen; 536 __le16 nlen;
535 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ 537 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
536 __u8 name[]; 538 __u8 name[];
537} __attribute__ ((packed)); 539} __packed;
538 540
539/** 541/**
540 * struct ubifs_data_node - data node. 542 * struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
555 __le16 compr_type; 557 __le16 compr_type;
556 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ 558 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
557 __u8 data[]; 559 __u8 data[];
558} __attribute__ ((packed)); 560} __packed;
559 561
560/** 562/**
561 * struct ubifs_trun_node - truncation node. 563 * struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
575 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ 577 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
576 __le64 old_size; 578 __le64 old_size;
577 __le64 new_size; 579 __le64 new_size;
578} __attribute__ ((packed)); 580} __packed;
579 581
580/** 582/**
581 * struct ubifs_pad_node - padding node. 583 * struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
586struct ubifs_pad_node { 588struct ubifs_pad_node {
587 struct ubifs_ch ch; 589 struct ubifs_ch ch;
588 __le32 pad_len; 590 __le32 pad_len;
589} __attribute__ ((packed)); 591} __packed;
590 592
591/** 593/**
592 * struct ubifs_sb_node - superblock node. 594 * struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
644 __u8 uuid[16]; 646 __u8 uuid[16];
645 __le32 ro_compat_version; 647 __le32 ro_compat_version;
646 __u8 padding2[3968]; 648 __u8 padding2[3968];
647} __attribute__ ((packed)); 649} __packed;
648 650
649/** 651/**
650 * struct ubifs_mst_node - master node. 652 * struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
711 __le32 idx_lebs; 713 __le32 idx_lebs;
712 __le32 leb_cnt; 714 __le32 leb_cnt;
713 __u8 padding[344]; 715 __u8 padding[344];
714} __attribute__ ((packed)); 716} __packed;
715 717
716/** 718/**
717 * struct ubifs_ref_node - logical eraseblock reference node. 719 * struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
727 __le32 offs; 729 __le32 offs;
728 __le32 jhead; 730 __le32 jhead;
729 __u8 padding[28]; 731 __u8 padding[28];
730} __attribute__ ((packed)); 732} __packed;
731 733
732/** 734/**
733 * struct ubifs_branch - key/reference/length branch 735 * struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
741 __le32 offs; 743 __le32 offs;
742 __le32 len; 744 __le32 len;
743 __u8 key[]; 745 __u8 key[];
744} __attribute__ ((packed)); 746} __packed;
745 747
746/** 748/**
747 * struct ubifs_idx_node - indexing node. 749 * struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
755 __le16 child_cnt; 757 __le16 child_cnt;
756 __le16 level; 758 __le16 level;
757 __u8 branches[]; 759 __u8 branches[];
758} __attribute__ ((packed)); 760} __packed;
759 761
760/** 762/**
761 * struct ubifs_cs_node - commit start node. 763 * struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
765struct ubifs_cs_node { 767struct ubifs_cs_node {
766 struct ubifs_ch ch; 768 struct ubifs_ch ch;
767 __le64 cmt_no; 769 __le64 cmt_no;
768} __attribute__ ((packed)); 770} __packed;
769 771
770/** 772/**
771 * struct ubifs_orph_node - orphan node. 773 * struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
777 struct ubifs_ch ch; 779 struct ubifs_ch ch;
778 __le64 cmt_no; 780 __le64 cmt_no;
779 __le64 inos[]; 781 __le64 inos[];
780} __attribute__ ((packed)); 782} __packed;
781 783
782#endif /* __UBIFS_MEDIA_H__ */ 784#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 389 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 390 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
391 * make sure @inode->i_size is always changed under @ui_mutex, because it 391 * make sure @inode->i_size is always changed under @ui_mutex, because it
392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock 392 * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
393 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 393 * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
394 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 394 * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
395 * could consider to rework locking and base it on "shadow" fields. 395 * could consider to rework locking and base it on "shadow" fields.
396 */ 396 */
397struct ubifs_inode { 397struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
937 unsigned int compr_type:2; 937 unsigned int compr_type:2;
938}; 938};
939 939
940/**
941 * struct ubifs_budg_info - UBIFS budgeting information.
942 * @idx_growth: amount of bytes budgeted for index growth
943 * @data_growth: amount of bytes budgeted for cached data
944 * @dd_growth: amount of bytes budgeted for cached data that will make
945 * other data dirty
946 * @uncommitted_idx: amount of bytes budgeted for growth of the index, but
947 * which still have to be taken into account because the index
948 * has not been committed so far
949 * @old_idx_sz: size of index on flash
950 * @min_idx_lebs: minimum number of LEBs required for the index
951 * @nospace: non-zero if the file-system does not have flash space (used as
952 * optimization)
953 * @nospace_rp: the same as @nospace, but additionally means that even reserved
954 * pool is full
955 * @page_budget: budget for a page (constant, never changed after mount)
956 * @inode_budget: budget for an inode (constant, never changed after mount)
957 * @dent_budget: budget for a directory entry (constant, never changed after
958 * mount)
959 */
960struct ubifs_budg_info {
961 long long idx_growth;
962 long long data_growth;
963 long long dd_growth;
964 long long uncommitted_idx;
965 unsigned long long old_idx_sz;
966 int min_idx_lebs;
967 unsigned int nospace:1;
968 unsigned int nospace_rp:1;
969 int page_budget;
970 int inode_budget;
971 int dent_budget;
972};
973
940struct ubifs_debug_info; 974struct ubifs_debug_info;
941 975
942/** 976/**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
980 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 1014 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
981 * 1015 *
982 * @big_lpt: flag that LPT is too big to write whole during commit 1016 * @big_lpt: flag that LPT is too big to write whole during commit
1017 * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
983 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 1018 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
984 * recovery) 1019 * recovery)
985 * @bulk_read: enable bulk-reads 1020 * @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
1057 * @dirty_zn_cnt: number of dirty znodes 1092 * @dirty_zn_cnt: number of dirty znodes
1058 * @clean_zn_cnt: number of clean znodes 1093 * @clean_zn_cnt: number of clean znodes
1059 * 1094 *
1060 * @budg_idx_growth: amount of bytes budgeted for index growth 1095 * @space_lock: protects @bi and @lst
1061 * @budg_data_growth: amount of bytes budgeted for cached data 1096 * @lst: lprops statistics
1062 * @budg_dd_growth: amount of bytes budgeted for cached data that will make 1097 * @bi: budgeting information
1063 * other data dirty
1064 * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
1065 * but which still have to be taken into account because
1066 * the index has not been committed so far
1067 * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
1068 * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
1069 * @nospace, and @nospace_rp;
1070 * @min_idx_lebs: minimum number of LEBs required for the index
1071 * @old_idx_sz: size of index on flash
1072 * @calc_idx_sz: temporary variable which is used to calculate new index size 1098 * @calc_idx_sz: temporary variable which is used to calculate new index size
1073 * (contains accurate new index size at end of TNC commit start) 1099 * (contains accurate new index size at end of TNC commit start)
1074 * @lst: lprops statistics
1075 * @nospace: non-zero if the file-system does not have flash space (used as
1076 * optimization)
1077 * @nospace_rp: the same as @nospace, but additionally means that even reserved
1078 * pool is full
1079 *
1080 * @page_budget: budget for a page
1081 * @inode_budget: budget for an inode
1082 * @dent_budget: budget for a directory entry
1083 * 1100 *
1084 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash 1101 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
1085 * I/O unit 1102 * I/O unit
1086 * @mst_node_alsz: master node aligned size 1103 * @mst_node_alsz: master node aligned size
1087 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary 1104 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
1088 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 1105 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
1189 * @replaying: %1 during journal replay 1206 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting 1207 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode 1208 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1192 * @replay_tree: temporary tree used during journal replay
1193 * @replay_list: temporary list used during journal replay 1209 * @replay_list: temporary list used during journal replay
1194 * @replay_buds: list of buds to replay 1210 * @replay_buds: list of buds to replay
1195 * @cs_sqnum: sequence number of first node in the log (commit start node) 1211 * @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
1238 wait_queue_head_t cmt_wq; 1254 wait_queue_head_t cmt_wq;
1239 1255
1240 unsigned int big_lpt:1; 1256 unsigned int big_lpt:1;
1257 unsigned int space_fixup:1;
1241 unsigned int no_chk_data_crc:1; 1258 unsigned int no_chk_data_crc:1;
1242 unsigned int bulk_read:1; 1259 unsigned int bulk_read:1;
1243 unsigned int default_compr:2; 1260 unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
1308 atomic_long_t dirty_zn_cnt; 1325 atomic_long_t dirty_zn_cnt;
1309 atomic_long_t clean_zn_cnt; 1326 atomic_long_t clean_zn_cnt;
1310 1327
1311 long long budg_idx_growth;
1312 long long budg_data_growth;
1313 long long budg_dd_growth;
1314 long long budg_uncommitted_idx;
1315 spinlock_t space_lock; 1328 spinlock_t space_lock;
1316 int min_idx_lebs;
1317 unsigned long long old_idx_sz;
1318 unsigned long long calc_idx_sz;
1319 struct ubifs_lp_stats lst; 1329 struct ubifs_lp_stats lst;
1320 unsigned int nospace:1; 1330 struct ubifs_budg_info bi;
1321 unsigned int nospace_rp:1; 1331 unsigned long long calc_idx_sz;
1322
1323 int page_budget;
1324 int inode_budget;
1325 int dent_budget;
1326 1332
1327 int ref_node_alsz; 1333 int ref_node_alsz;
1328 int mst_node_alsz; 1334 int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
1430 unsigned int replaying:1; 1436 unsigned int replaying:1;
1431 unsigned int mounting:1; 1437 unsigned int mounting:1;
1432 unsigned int remounting_rw:1; 1438 unsigned int remounting_rw:1;
1433 struct rb_root replay_tree;
1434 struct list_head replay_list; 1439 struct list_head replay_list;
1435 struct list_head replay_buds; 1440 struct list_head replay_buds;
1436 unsigned long long cs_sqnum; 1441 unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
1628int ubifs_read_superblock(struct ubifs_info *c); 1633int ubifs_read_superblock(struct ubifs_info *c);
1629struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); 1634struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
1630int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); 1635int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
1636int ubifs_fixup_free_space(struct ubifs_info *c);
1631 1637
1632/* replay.c */ 1638/* replay.c */
1633int ubifs_validate_entry(struct ubifs_info *c, 1639int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
80 SECURITY_XATTR, 80 SECURITY_XATTR,
81}; 81};
82 82
83static const struct inode_operations none_inode_operations; 83static const struct inode_operations empty_iops;
84static const struct file_operations none_file_operations; 84static const struct file_operations empty_fops;
85 85
86/** 86/**
87 * create_xattr - create an extended attribute. 87 * create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
131 131
132 /* Re-define all operations to be "nothing" */ 132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &empty_aops; 133 inode->i_mapping->a_ops = &empty_aops;
134 inode->i_op = &none_inode_operations; 134 inode->i_op = &empty_iops;
135 inode->i_fop = &none_file_operations; 135 inode->i_fop = &empty_fops;
136 136
137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; 137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
138 ui = ubifs_inode(inode); 138 ui = ubifs_inode(inode);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2cfe2e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
37 36
38#include "xfs_sb.h" 37#include "xfs_sb.h"
39#include "xfs_inum.h" 38#include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
709 return bp; 708 return bp;
710} 709}
711 710
711/*
712 * Return a buffer allocated as an empty buffer and associated to external
713 * memory via xfs_buf_associate_memory() back to its empty state.
714 */
715void
716xfs_buf_set_empty(
717 struct xfs_buf *bp,
718 size_t len)
719{
720 if (bp->b_pages)
721 _xfs_buf_free_pages(bp);
722
723 bp->b_pages = NULL;
724 bp->b_page_count = 0;
725 bp->b_addr = NULL;
726 bp->b_file_offset = 0;
727 bp->b_buffer_length = bp->b_count_desired = len;
728 bp->b_bn = XFS_BUF_DADDR_NULL;
729 bp->b_flags &= ~XBF_MAPPED;
730}
731
712static inline struct page * 732static inline struct page *
713mem_to_page( 733mem_to_page(
714 void *addr) 734 void *addr)
@@ -1402,12 +1422,12 @@ restart:
1402int 1422int
1403xfs_buftarg_shrink( 1423xfs_buftarg_shrink(
1404 struct shrinker *shrink, 1424 struct shrinker *shrink,
1405 int nr_to_scan, 1425 struct shrink_control *sc)
1406 gfp_t mask)
1407{ 1426{
1408 struct xfs_buftarg *btp = container_of(shrink, 1427 struct xfs_buftarg *btp = container_of(shrink,
1409 struct xfs_buftarg, bt_shrinker); 1428 struct xfs_buftarg, bt_shrinker);
1410 struct xfs_buf *bp; 1429 struct xfs_buf *bp;
1430 int nr_to_scan = sc->nr_to_scan;
1411 LIST_HEAD(dispose); 1431 LIST_HEAD(dispose);
1412 1432
1413 if (!nr_to_scan) 1433 if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c4512645..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
178 xfs_buf_flags_t); 178 xfs_buf_flags_t);
179 179
180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
181extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
181extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); 182extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
182extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 183extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
183extern void xfs_buf_hold(xfs_buf_t *); 184extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486dfa5520..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
586 case XFS_IOC_RESVSP_32: 586 case XFS_IOC_RESVSP_32:
587 case XFS_IOC_UNRESVSP_32: 587 case XFS_IOC_UNRESVSP_32:
588 case XFS_IOC_RESVSP64_32: 588 case XFS_IOC_RESVSP64_32:
589 case XFS_IOC_UNRESVSP64_32: { 589 case XFS_IOC_UNRESVSP64_32:
590 case XFS_IOC_ZERO_RANGE_32: {
590 struct xfs_flock64 bf; 591 struct xfs_flock64 bf;
591 592
592 if (xfs_compat_flock64_copyin(&bf, arg)) 593 if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b605792a99..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) 184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) 185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) 186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
187#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
187 188
188typedef struct compat_xfs_fsop_geom_v1 { 189typedef struct compat_xfs_fsop_geom_v1 {
189 __u32 blocksize; /* filesystem (data) block size */ 190 __u32 blocksize; /* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9cbfe78..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -70,6 +70,7 @@
70#include <linux/ctype.h> 70#include <linux/ctype.h>
71#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/capability.h> 72#include <linux/capability.h>
73#include <linux/list_sort.h>
73 74
74#include <asm/page.h> 75#include <asm/page.h>
75#include <asm/div64.h> 76#include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cceb678d..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -41,23 +41,6 @@ __xfs_printk(
41 printk("%sXFS: %pV\n", level, vaf); 41 printk("%sXFS: %pV\n", level, vaf);
42} 42}
43 43
44void xfs_printk(
45 const char *level,
46 const struct xfs_mount *mp,
47 const char *fmt, ...)
48{
49 struct va_format vaf;
50 va_list args;
51
52 va_start(args, fmt);
53
54 vaf.fmt = fmt;
55 vaf.va = &args;
56
57 __xfs_printk(level, mp, &vaf);
58 va_end(args);
59}
60
61#define define_xfs_printk_level(func, kern_level) \ 44#define define_xfs_printk_level(func, kern_level) \
62void func(const struct xfs_mount *mp, const char *fmt, ...) \ 45void func(const struct xfs_mount *mp, const char *fmt, ...) \
63{ \ 46{ \
@@ -95,8 +78,7 @@ xfs_alert_tag(
95 int do_panic = 0; 78 int do_panic = 0;
96 79
97 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
98 xfs_printk(KERN_ALERT, mp, 81 xfs_alert(mp, "Transforming an alert into a BUG.");
99 "XFS: Transforming an alert into a BUG.");
100 do_panic = 1; 82 do_panic = 1;
101 } 83 }
102 84
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1b6c4e..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,9 +3,6 @@
3 3
4struct xfs_mount; 4struct xfs_mount;
5 5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) 6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3))); 7 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) 8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3))); 26 __attribute__ ((format (printf, 2, 3)));
30#else 27#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 28static inline void
29__attribute__ ((format (printf, 2, 3)))
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{ 31{
33} 32}
34#endif 33#endif
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d02299..b0aa59e51fd0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1787,10 +1787,6 @@ init_xfs_fs(void)
1787 if (error) 1787 if (error)
1788 goto out_cleanup_procfs; 1788 goto out_cleanup_procfs;
1789 1789
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1794 vfs_initquota(); 1790 vfs_initquota();
1795 1791
1796 error = register_filesystem(&xfs_fs_type); 1792 error = register_filesystem(&xfs_fs_type);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a48122d..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
267 267
268 error = xfs_iflush(ip, flags); 268 error = xfs_iflush(ip, flags);
269 269
270 /*
271 * We don't want to try again on non-blocking flushes that can't run
272 * again immediately. If an inode really must be written, then that's
273 * what the SYNC_WAIT flag is for.
274 */
275 if (error == EAGAIN) {
276 ASSERT(!(flags & SYNC_WAIT));
277 error = 0;
278 }
279
270 out_unlock: 280 out_unlock:
271 xfs_iunlock(ip, XFS_ILOCK_SHARED); 281 xfs_iunlock(ip, XFS_ILOCK_SHARED);
272 return error; 282 return error;
@@ -1022,13 +1032,14 @@ xfs_reclaim_inodes(
1022static int 1032static int
1023xfs_reclaim_inode_shrink( 1033xfs_reclaim_inode_shrink(
1024 struct shrinker *shrink, 1034 struct shrinker *shrink,
1025 int nr_to_scan, 1035 struct shrink_control *sc)
1026 gfp_t gfp_mask)
1027{ 1036{
1028 struct xfs_mount *mp; 1037 struct xfs_mount *mp;
1029 struct xfs_perag *pag; 1038 struct xfs_perag *pag;
1030 xfs_agnumber_t ag; 1039 xfs_agnumber_t ag;
1031 int reclaimable; 1040 int reclaimable;
1041 int nr_to_scan = sc->nr_to_scan;
1042 gfp_t gfp_mask = sc->gfp_mask;
1032 1043
1033 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1044 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
1034 if (nr_to_scan) { 1045 if (nr_to_scan) {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb479075..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
1151 1151
1152); 1152);
1153 1153
1154#define XFS_BUSY_SYNC \ 1154DECLARE_EVENT_CLASS(xfs_busy_class,
1155 { 0, "async" }, \
1156 { 1, "sync" }
1157
1158TRACE_EVENT(xfs_alloc_busy,
1159 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1160 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1161 TP_ARGS(trans, agno, agbno, len, sync),
1162 TP_STRUCT__entry(
1163 __field(dev_t, dev)
1164 __field(struct xfs_trans *, tp)
1165 __field(int, tid)
1166 __field(xfs_agnumber_t, agno)
1167 __field(xfs_agblock_t, agbno)
1168 __field(xfs_extlen_t, len)
1169 __field(int, sync)
1170 ),
1171 TP_fast_assign(
1172 __entry->dev = trans->t_mountp->m_super->s_dev;
1173 __entry->tp = trans;
1174 __entry->tid = trans->t_ticket->t_tid;
1175 __entry->agno = agno;
1176 __entry->agbno = agbno;
1177 __entry->len = len;
1178 __entry->sync = sync;
1179 ),
1180 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1181 MAJOR(__entry->dev), MINOR(__entry->dev),
1182 __entry->tp,
1183 __entry->tid,
1184 __entry->agno,
1185 __entry->agbno,
1186 __entry->len,
1187 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1188
1189);
1190
1191TRACE_EVENT(xfs_alloc_unbusy,
1192 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1155 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1193 xfs_agblock_t agbno, xfs_extlen_t len), 1156 xfs_agblock_t agbno, xfs_extlen_t len),
1194 TP_ARGS(mp, agno, agbno, len), 1157 TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
1210 __entry->agbno, 1173 __entry->agbno,
1211 __entry->len) 1174 __entry->len)
1212); 1175);
1176#define DEFINE_BUSY_EVENT(name) \
1177DEFINE_EVENT(xfs_busy_class, name, \
1178 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1179 xfs_agblock_t agbno, xfs_extlen_t len), \
1180 TP_ARGS(mp, agno, agbno, len))
1181DEFINE_BUSY_EVENT(xfs_alloc_busy);
1182DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
1183DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
1184DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
1185DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
1213 1186
1214#define XFS_BUSY_STATES \ 1187TRACE_EVENT(xfs_alloc_busy_trim,
1215 { 0, "missing" }, \
1216 { 1, "found" }
1217
1218TRACE_EVENT(xfs_alloc_busysearch,
1219 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1188 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1220 xfs_agblock_t agbno, xfs_extlen_t len, int found), 1189 xfs_agblock_t agbno, xfs_extlen_t len,
1221 TP_ARGS(mp, agno, agbno, len, found), 1190 xfs_agblock_t tbno, xfs_extlen_t tlen),
1191 TP_ARGS(mp, agno, agbno, len, tbno, tlen),
1222 TP_STRUCT__entry( 1192 TP_STRUCT__entry(
1223 __field(dev_t, dev) 1193 __field(dev_t, dev)
1224 __field(xfs_agnumber_t, agno) 1194 __field(xfs_agnumber_t, agno)
1225 __field(xfs_agblock_t, agbno) 1195 __field(xfs_agblock_t, agbno)
1226 __field(xfs_extlen_t, len) 1196 __field(xfs_extlen_t, len)
1227 __field(int, found) 1197 __field(xfs_agblock_t, tbno)
1198 __field(xfs_extlen_t, tlen)
1228 ), 1199 ),
1229 TP_fast_assign( 1200 TP_fast_assign(
1230 __entry->dev = mp->m_super->s_dev; 1201 __entry->dev = mp->m_super->s_dev;
1231 __entry->agno = agno; 1202 __entry->agno = agno;
1232 __entry->agbno = agbno; 1203 __entry->agbno = agbno;
1233 __entry->len = len; 1204 __entry->len = len;
1234 __entry->found = found; 1205 __entry->tbno = tbno;
1206 __entry->tlen = tlen;
1235 ), 1207 ),
1236 TP_printk("dev %d:%d agno %u agbno %u len %u %s", 1208 TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
1237 MAJOR(__entry->dev), MINOR(__entry->dev), 1209 MAJOR(__entry->dev), MINOR(__entry->dev),
1238 __entry->agno, 1210 __entry->agno,
1239 __entry->agbno, 1211 __entry->agbno,
1240 __entry->len, 1212 __entry->len,
1241 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1213 __entry->tbno,
1214 __entry->tlen)
1242); 1215);
1243 1216
1244TRACE_EVENT(xfs_trans_commit_lsn, 1217TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1418 __entry->wasfromfl, 1391 __entry->wasfromfl,
1419 __entry->isfl, 1392 __entry->isfl,
1420 __entry->userdata, 1393 __entry->userdata,
1421 __entry->firstblock) 1394 (unsigned long long)__entry->firstblock)
1422) 1395)
1423 1396
1424#define DEFINE_ALLOC_EVENT(name) \ 1397#define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1433DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); 1406DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1434DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); 1407DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1435DEFINE_ALLOC_EVENT(xfs_alloc_near_error); 1408DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1409DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
1410DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
1436DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); 1411DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1437DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); 1412DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1438DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); 1413DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1439DEFINE_ALLOC_EVENT(xfs_alloc_size_done); 1414DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1440DEFINE_ALLOC_EVENT(xfs_alloc_size_error); 1415DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1416DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
1441DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); 1417DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1442DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); 1418DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1443DEFINE_ALLOC_EVENT(xfs_alloc_small_done); 1419DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
60 60
61STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 61STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
62STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 62STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
63STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); 63STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
64 64
65static struct shrinker xfs_qm_shaker = { 65static struct shrinker xfs_qm_shaker = {
66 .shrink = xfs_qm_shake, 66 .shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
2009STATIC int 2009STATIC int
2010xfs_qm_shake( 2010xfs_qm_shake(
2011 struct shrinker *shrink, 2011 struct shrinker *shrink,
2012 int nr_to_scan, 2012 struct shrink_control *sc)
2013 gfp_t gfp_mask)
2014{ 2013{
2015 int ndqused, nfree, n; 2014 int ndqused, nfree, n;
2015 gfp_t gfp_mask = sc->gfp_mask;
2016 2016
2017 if (!kmem_shake_allow(gfp_mask)) 2017 if (!kmem_shake_allow(gfp_mask))
2018 return 0; 2018 return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc17f2d..da0a561ffba2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,6 @@ struct xfs_busy_extent {
187 xfs_agnumber_t agno; 187 xfs_agnumber_t agno;
188 xfs_agblock_t bno; 188 xfs_agblock_t bno;
189 xfs_extlen_t length; 189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191}; 190};
192 191
193/* 192/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d752eab..acdced86413c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44/*
45 * Prototypes for per-ag allocation routines
46 */
47
48STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 44STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
49STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 45STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
50STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 46STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
51STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, 47STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
52 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); 48 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
53 49STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
54/* 50 xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
55 * Internal functions.
56 */
57 51
58/* 52/*
59 * Lookup the record equal to [bno, len] in the btree given by cur. 53 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
154 xfs_extlen_t *reslen) /* result length */ 148 xfs_extlen_t *reslen) /* result length */
155{ 149{
156 xfs_agblock_t bno; 150 xfs_agblock_t bno;
157 xfs_extlen_t diff;
158 xfs_extlen_t len; 151 xfs_extlen_t len;
159 152
160 if (args->alignment > 1 && foundlen >= args->minlen) { 153 /* Trim busy sections out of found extent */
161 bno = roundup(foundbno, args->alignment); 154 xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
162 diff = bno - foundbno; 155
163 len = diff >= foundlen ? 0 : foundlen - diff; 156 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno;
159
160 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff;
164 } else { 162 } else {
165 bno = foundbno; 163 *resbno = bno;
166 len = foundlen; 164 *reslen = len;
167 } 165 }
168 *resbno = bno;
169 *reslen = len;
170} 166}
171 167
172/* 168/*
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
280 return 1; 276 return 1;
281 agf = XFS_BUF_TO_AGF(args->agbp); 277 agf = XFS_BUF_TO_AGF(args->agbp);
282 diff = be32_to_cpu(agf->agf_freeblks) 278 diff = be32_to_cpu(agf->agf_freeblks)
283 + be32_to_cpu(agf->agf_flcount)
284 - args->len - args->minleft; 279 - args->len - args->minleft;
285 if (diff >= 0) 280 if (diff >= 0)
286 return 1; 281 return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
541 if (error) 536 if (error)
542 return error; 537 return error;
543 538
544 /* 539 ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
545 * Search the busylist for these blocks and mark the 540 args->agbno, args->len));
546 * transaction as synchronous if blocks are found. This
547 * avoids the need to block due to a synchronous log
548 * force to ensure correct ordering as the synchronous
549 * transaction will guarantee that for us.
550 */
551 if (xfs_alloc_busy_search(args->mp, args->agno,
552 args->agbno, args->len))
553 xfs_trans_set_sync(args->tp);
554 } 541 }
555 542
556 if (!args->isfl) { 543 if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
577{ 564{
578 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ 565 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
579 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ 566 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
580 xfs_agblock_t end; /* end of allocated extent */
581 int error; 567 int error;
582 xfs_agblock_t fbno; /* start block of found extent */ 568 xfs_agblock_t fbno; /* start block of found extent */
583 xfs_agblock_t fend; /* end block of found extent */
584 xfs_extlen_t flen; /* length of found extent */ 569 xfs_extlen_t flen; /* length of found extent */
570 xfs_agblock_t tbno; /* start block of trimmed extent */
571 xfs_extlen_t tlen; /* length of trimmed extent */
572 xfs_agblock_t tend; /* end block of trimmed extent */
573 xfs_agblock_t end; /* end of allocated extent */
585 int i; /* success/failure of operation */ 574 int i; /* success/failure of operation */
586 xfs_agblock_t maxend; /* end of maximal extent */
587 xfs_agblock_t minend; /* end of minimal extent */
588 xfs_extlen_t rlen; /* length of returned extent */ 575 xfs_extlen_t rlen; /* length of returned extent */
589 576
590 ASSERT(args->alignment == 1); 577 ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
614 goto error0; 601 goto error0;
615 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
616 ASSERT(fbno <= args->agbno); 603 ASSERT(fbno <= args->agbno);
617 minend = args->agbno + args->minlen;
618 maxend = args->agbno + args->maxlen;
619 fend = fbno + flen;
620 604
621 /* 605 /*
622 * Give up if the freespace isn't long enough for the minimum request. 606 * Check for overlapping busy extents.
607 */
608 xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
609
610 /*
611 * Give up if the start of the extent is busy, or the freespace isn't
612 * long enough for the minimum request.
623 */ 613 */
624 if (fend < minend) 614 if (tbno > args->agbno)
615 goto not_found;
616 if (tlen < args->minlen)
617 goto not_found;
618 tend = tbno + tlen;
619 if (tend < args->agbno + args->minlen)
625 goto not_found; 620 goto not_found;
626 621
627 /* 622 /*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
630 * 625 *
631 * Fix the length according to mod and prod if given. 626 * Fix the length according to mod and prod if given.
632 */ 627 */
633 end = XFS_AGBLOCK_MIN(fend, maxend); 628 end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
634 args->len = end - args->agbno; 629 args->len = end - args->agbno;
635 xfs_alloc_fix_len(args); 630 xfs_alloc_fix_len(args);
636 if (!xfs_alloc_fix_minleft(args)) 631 if (!xfs_alloc_fix_minleft(args))
637 goto not_found; 632 goto not_found;
638 633
639 rlen = args->len; 634 rlen = args->len;
640 ASSERT(args->agbno + rlen <= fend); 635 ASSERT(args->agbno + rlen <= tend);
641 end = args->agbno + rlen; 636 end = args->agbno + rlen;
642 637
643 /* 638 /*
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
686 struct xfs_btree_cur **scur, /* searching cursor */ 681 struct xfs_btree_cur **scur, /* searching cursor */
687 xfs_agblock_t gdiff, /* difference for search comparison */ 682 xfs_agblock_t gdiff, /* difference for search comparison */
688 xfs_agblock_t *sbno, /* extent found by search */ 683 xfs_agblock_t *sbno, /* extent found by search */
689 xfs_extlen_t *slen, 684 xfs_extlen_t *slen, /* extent length */
690 xfs_extlen_t *slena, /* aligned length */ 685 xfs_agblock_t *sbnoa, /* aligned extent found by search */
686 xfs_extlen_t *slena, /* aligned extent length */
691 int dir) /* 0 = search right, 1 = search left */ 687 int dir) /* 0 = search right, 1 = search left */
692{ 688{
693 xfs_agblock_t bno;
694 xfs_agblock_t new; 689 xfs_agblock_t new;
695 xfs_agblock_t sdiff; 690 xfs_agblock_t sdiff;
696 int error; 691 int error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
708 if (error) 703 if (error)
709 goto error0; 704 goto error0;
710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 705 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); 706 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
712 707
713 /* 708 /*
714 * The good extent is closer than this one. 709 * The good extent is closer than this one.
715 */ 710 */
716 if (!dir) { 711 if (!dir) {
717 if (bno >= args->agbno + gdiff) 712 if (*sbnoa >= args->agbno + gdiff)
718 goto out_use_good; 713 goto out_use_good;
719 } else { 714 } else {
720 if (bno <= args->agbno - gdiff) 715 if (*sbnoa <= args->agbno - gdiff)
721 goto out_use_good; 716 goto out_use_good;
722 } 717 }
723 718
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
729 xfs_alloc_fix_len(args); 724 xfs_alloc_fix_len(args);
730 725
731 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 726 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
732 args->alignment, *sbno, 727 args->alignment, *sbnoa,
733 *slen, &new); 728 *slena, &new);
734 729
735 /* 730 /*
736 * Choose closer size and invalidate other cursor. 731 * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
780 xfs_agblock_t gtbnoa; /* aligned ... */ 775 xfs_agblock_t gtbnoa; /* aligned ... */
781 xfs_extlen_t gtdiff; /* difference to right side entry */ 776 xfs_extlen_t gtdiff; /* difference to right side entry */
782 xfs_extlen_t gtlen; /* length of right side entry */ 777 xfs_extlen_t gtlen; /* length of right side entry */
783 xfs_extlen_t gtlena = 0; /* aligned ... */ 778 xfs_extlen_t gtlena; /* aligned ... */
784 xfs_agblock_t gtnew; /* useful start bno of right side */ 779 xfs_agblock_t gtnew; /* useful start bno of right side */
785 int error; /* error code */ 780 int error; /* error code */
786 int i; /* result code, temporary */ 781 int i; /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
789 xfs_agblock_t ltbnoa; /* aligned ... */ 784 xfs_agblock_t ltbnoa; /* aligned ... */
790 xfs_extlen_t ltdiff; /* difference to left side entry */ 785 xfs_extlen_t ltdiff; /* difference to left side entry */
791 xfs_extlen_t ltlen; /* length of left side entry */ 786 xfs_extlen_t ltlen; /* length of left side entry */
792 xfs_extlen_t ltlena = 0; /* aligned ... */ 787 xfs_extlen_t ltlena; /* aligned ... */
793 xfs_agblock_t ltnew; /* useful start bno of left side */ 788 xfs_agblock_t ltnew; /* useful start bno of left side */
794 xfs_extlen_t rlen; /* length of returned extent */ 789 xfs_extlen_t rlen; /* length of returned extent */
790 int forced = 0;
795#if defined(DEBUG) && defined(__KERNEL__) 791#if defined(DEBUG) && defined(__KERNEL__)
796 /* 792 /*
797 * Randomly don't execute the first algorithm. 793 * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
800 796
801 dofirst = random32() & 1; 797 dofirst = random32() & 1;
802#endif 798#endif
799
800restart:
801 bno_cur_lt = NULL;
802 bno_cur_gt = NULL;
803 ltlen = 0;
804 gtlena = 0;
805 ltlena = 0;
806
803 /* 807 /*
804 * Get a cursor for the by-size btree. 808 * Get a cursor for the by-size btree.
805 */ 809 */
806 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 810 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
807 args->agno, XFS_BTNUM_CNT); 811 args->agno, XFS_BTNUM_CNT);
808 ltlen = 0; 812
809 bno_cur_lt = bno_cur_gt = NULL;
810 /* 813 /*
811 * See if there are any free extents as big as maxlen. 814 * See if there are any free extents as big as maxlen.
812 */ 815 */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
822 goto error0; 825 goto error0;
823 if (i == 0 || ltlen == 0) { 826 if (i == 0 || ltlen == 0) {
824 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 827 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
828 trace_xfs_alloc_near_noentry(args);
825 return 0; 829 return 0;
826 } 830 }
827 ASSERT(i == 1); 831 ASSERT(i == 1);
828 } 832 }
829 args->wasfromfl = 0; 833 args->wasfromfl = 0;
834
830 /* 835 /*
831 * First algorithm. 836 * First algorithm.
832 * If the requested extent is large wrt the freespaces available 837 * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
890 if (args->len < blen) 895 if (args->len < blen)
891 continue; 896 continue;
892 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 897 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
893 args->alignment, ltbno, ltlen, &ltnew); 898 args->alignment, ltbnoa, ltlena, &ltnew);
894 if (ltnew != NULLAGBLOCK && 899 if (ltnew != NULLAGBLOCK &&
895 (args->len > blen || ltdiff < bdiff)) { 900 (args->len > blen || ltdiff < bdiff)) {
896 bdiff = ltdiff; 901 bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
1042 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1047 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1043 xfs_alloc_fix_len(args); 1048 xfs_alloc_fix_len(args);
1044 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1049 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1045 args->alignment, ltbno, ltlen, &ltnew); 1050 args->alignment, ltbnoa, ltlena, &ltnew);
1046 1051
1047 error = xfs_alloc_find_best_extent(args, 1052 error = xfs_alloc_find_best_extent(args,
1048 &bno_cur_lt, &bno_cur_gt, 1053 &bno_cur_lt, &bno_cur_gt,
1049 ltdiff, &gtbno, &gtlen, &gtlena, 1054 ltdiff, &gtbno, &gtlen,
1055 &gtbnoa, &gtlena,
1050 0 /* search right */); 1056 0 /* search right */);
1051 } else { 1057 } else {
1052 ASSERT(gtlena >= args->minlen); 1058 ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
1057 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1063 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1058 xfs_alloc_fix_len(args); 1064 xfs_alloc_fix_len(args);
1059 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1065 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1060 args->alignment, gtbno, gtlen, &gtnew); 1066 args->alignment, gtbnoa, gtlena, &gtnew);
1061 1067
1062 error = xfs_alloc_find_best_extent(args, 1068 error = xfs_alloc_find_best_extent(args,
1063 &bno_cur_gt, &bno_cur_lt, 1069 &bno_cur_gt, &bno_cur_lt,
1064 gtdiff, &ltbno, &ltlen, &ltlena, 1070 gtdiff, &ltbno, &ltlen,
1071 &ltbnoa, &ltlena,
1065 1 /* search left */); 1072 1 /* search left */);
1066 } 1073 }
1067 1074
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
1073 * If we couldn't get anything, give up. 1080 * If we couldn't get anything, give up.
1074 */ 1081 */
1075 if (bno_cur_lt == NULL && bno_cur_gt == NULL) { 1082 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1083 if (!forced++) {
1084 trace_xfs_alloc_near_busy(args);
1085 xfs_log_force(args->mp, XFS_LOG_SYNC);
1086 goto restart;
1087 }
1088
1076 trace_xfs_alloc_size_neither(args); 1089 trace_xfs_alloc_size_neither(args);
1077 args->agbno = NULLAGBLOCK; 1090 args->agbno = NULLAGBLOCK;
1078 return 0; 1091 return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
1107 return 0; 1120 return 0;
1108 } 1121 }
1109 rlen = args->len; 1122 rlen = args->len;
1110 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1123 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1111 ltlen, &ltnew); 1124 ltbnoa, ltlena, &ltnew);
1112 ASSERT(ltnew >= ltbno); 1125 ASSERT(ltnew >= ltbno);
1113 ASSERT(ltnew + rlen <= ltbno + ltlen); 1126 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1114 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1127 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1115 args->agbno = ltnew; 1128 args->agbno = ltnew;
1129
1116 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1130 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1117 ltnew, rlen, XFSA_FIXUP_BNO_OK))) 1131 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1118 goto error0; 1132 goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
1155 int i; /* temp status variable */ 1169 int i; /* temp status variable */
1156 xfs_agblock_t rbno; /* returned block number */ 1170 xfs_agblock_t rbno; /* returned block number */
1157 xfs_extlen_t rlen; /* length of returned extent */ 1171 xfs_extlen_t rlen; /* length of returned extent */
1172 int forced = 0;
1158 1173
1174restart:
1159 /* 1175 /*
1160 * Allocate and initialize a cursor for the by-size btree. 1176 * Allocate and initialize a cursor for the by-size btree.
1161 */ 1177 */
1162 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1178 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1163 args->agno, XFS_BTNUM_CNT); 1179 args->agno, XFS_BTNUM_CNT);
1164 bno_cur = NULL; 1180 bno_cur = NULL;
1181
1165 /* 1182 /*
1166 * Look for an entry >= maxlen+alignment-1 blocks. 1183 * Look for an entry >= maxlen+alignment-1 blocks.
1167 */ 1184 */
1168 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1185 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1169 args->maxlen + args->alignment - 1, &i))) 1186 args->maxlen + args->alignment - 1, &i)))
1170 goto error0; 1187 goto error0;
1188
1171 /* 1189 /*
1172 * If none, then pick up the last entry in the tree unless the 1190 * If none or we have busy extents that we cannot allocate from, then
1173 * tree is empty. 1191 * we have to settle for a smaller extent. In the case that there are
1192 * no large extents, this will return the last entry in the tree unless
1193 * the tree is empty. In the case that there are only busy large
1194 * extents, this will return the largest small extent unless there
1195 * are no smaller extents available.
1174 */ 1196 */
1175 if (!i) { 1197 if (!i || forced > 1) {
1176 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, 1198 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1177 &flen, &i))) 1199 &fbno, &flen, &i);
1200 if (error)
1178 goto error0; 1201 goto error0;
1179 if (i == 0 || flen == 0) { 1202 if (i == 0 || flen == 0) {
1180 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1203 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
1182 return 0; 1205 return 0;
1183 } 1206 }
1184 ASSERT(i == 1); 1207 ASSERT(i == 1);
1208 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1209 } else {
1210 /*
1211 * Search for a non-busy extent that is large enough.
1212 * If we are at low space, don't check, or if we fall of
1213 * the end of the btree, turn off the busy check and
1214 * restart.
1215 */
1216 for (;;) {
1217 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1218 if (error)
1219 goto error0;
1220 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1221
1222 xfs_alloc_compute_aligned(args, fbno, flen,
1223 &rbno, &rlen);
1224
1225 if (rlen >= args->maxlen)
1226 break;
1227
1228 error = xfs_btree_increment(cnt_cur, 0, &i);
1229 if (error)
1230 goto error0;
1231 if (i == 0) {
1232 /*
1233 * Our only valid extents must have been busy.
1234 * Make it unbusy by forcing the log out and
1235 * retrying. If we've been here before, forcing
1236 * the log isn't making the extents available,
1237 * which means they have probably been freed in
1238 * this transaction. In that case, we have to
1239 * give up on them and we'll attempt a minlen
1240 * allocation the next time around.
1241 */
1242 xfs_btree_del_cursor(cnt_cur,
1243 XFS_BTREE_NOERROR);
1244 trace_xfs_alloc_size_busy(args);
1245 if (!forced++)
1246 xfs_log_force(args->mp, XFS_LOG_SYNC);
1247 goto restart;
1248 }
1249 }
1185 } 1250 }
1186 /* 1251
1187 * There's a freespace as big as maxlen+alignment-1, get it.
1188 */
1189 else {
1190 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1191 goto error0;
1192 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1193 }
1194 /* 1252 /*
1195 * In the first case above, we got the last entry in the 1253 * In the first case above, we got the last entry in the
1196 * by-size btree. Now we check to see if the space hits maxlen 1254 * by-size btree. Now we check to see if the space hits maxlen
1197 * once aligned; if not, we search left for something better. 1255 * once aligned; if not, we search left for something better.
1198 * This can't happen in the second case above. 1256 * This can't happen in the second case above.
1199 */ 1257 */
1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1258 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1259 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1260 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
1251 * Fix up the length. 1308 * Fix up the length.
1252 */ 1309 */
1253 args->len = rlen; 1310 args->len = rlen;
1254 xfs_alloc_fix_len(args); 1311 if (rlen < args->minlen) {
1255 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { 1312 if (!forced++) {
1256 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1313 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1257 trace_xfs_alloc_size_nominleft(args); 1314 trace_xfs_alloc_size_busy(args);
1258 args->agbno = NULLAGBLOCK; 1315 xfs_log_force(args->mp, XFS_LOG_SYNC);
1259 return 0; 1316 goto restart;
1317 }
1318 goto out_nominleft;
1260 } 1319 }
1320 xfs_alloc_fix_len(args);
1321
1322 if (!xfs_alloc_fix_minleft(args))
1323 goto out_nominleft;
1261 rlen = args->len; 1324 rlen = args->len;
1262 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1325 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1263 /* 1326 /*
@@ -1287,6 +1350,12 @@ error0:
1287 if (bno_cur) 1350 if (bno_cur)
1288 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1351 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1289 return error; 1352 return error;
1353
1354out_nominleft:
1355 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1356 trace_xfs_alloc_size_nominleft(args);
1357 args->agbno = NULLAGBLOCK;
1358 return 0;
1290} 1359}
1291 1360
1292/* 1361/*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
1326 if (error) 1395 if (error)
1327 goto error0; 1396 goto error0;
1328 if (fbno != NULLAGBLOCK) { 1397 if (fbno != NULLAGBLOCK) {
1398 xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
1399 args->userdata);
1400
1329 if (args->userdata) { 1401 if (args->userdata) {
1330 xfs_buf_t *bp; 1402 xfs_buf_t *bp;
1331 1403
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
1617 1689
1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1690 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1619 1691
1620 /*
1621 * Since blocks move to the free list without the coordination
1622 * used in xfs_bmap_finish, we can't allow block to be available
1623 * for reallocation and non-transaction writing (user data)
1624 * until we know that the transaction that moved it to the free
1625 * list is permanently on disk. We track the blocks by declaring
1626 * these blocks as "busy"; the busy list is maintained on a per-ag
1627 * basis and each transaction records which entries should be removed
1628 * when the iclog commits to disk. If a busy block is allocated,
1629 * the iclog is pushed up to the LSN that freed the block.
1630 */
1631 xfs_alloc_busy_insert(tp, agno, bno, len);
1632 return 0; 1692 return 0;
1633 1693
1634 error0: 1694 error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
1923 xfs_alloc_log_agf(tp, agbp, logflags); 1983 xfs_alloc_log_agf(tp, agbp, logflags);
1924 *bnop = bno; 1984 *bnop = bno;
1925 1985
1926 /*
1927 * As blocks are freed, they are added to the per-ag busy list and
1928 * remain there until the freeing transaction is committed to disk.
1929 * Now that we have allocated blocks, this list must be searched to see
1930 * if a block is being reused. If one is, then the freeing transaction
1931 * must be pushed to disk before this transaction.
1932 *
1933 * We do this by setting the current transaction to a sync transaction
1934 * which guarantees that the freeing transaction is on disk before this
1935 * transaction. This is done instead of a synchronous log force here so
1936 * that we don't sit and wait with the AGF locked in the transaction
1937 * during the log force.
1938 */
1939 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
1940 xfs_trans_set_sync(tp);
1941 return 0; 1986 return 0;
1942} 1987}
1943 1988
@@ -2423,105 +2468,13 @@ xfs_free_extent(
2423 } 2468 }
2424 2469
2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
2426error0: 2473error0:
2427 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2428 return error; 2475 return error;
2429} 2476}
2430 2477
2431
2432/*
2433 * AG Busy list management
2434 * The busy list contains block ranges that have been freed but whose
2435 * transactions have not yet hit disk. If any block listed in a busy
2436 * list is reused, the transaction that freed it must be forced to disk
2437 * before continuing to use the block.
2438 *
2439 * xfs_alloc_busy_insert - add to the per-ag busy list
2440 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2441 * xfs_alloc_busy_search - search for a busy extent
2442 */
2443
2444/*
2445 * Insert a new extent into the busy tree.
2446 *
2447 * The busy extent tree is indexed by the start block of the busy extent.
2448 * there can be multiple overlapping ranges in the busy extent tree but only
2449 * ever one entry at a given start block. The reason for this is that
2450 * multi-block extents can be freed, then smaller chunks of that extent
2451 * allocated and freed again before the first transaction commit is on disk.
2452 * If the exact same start block is freed a second time, we have to wait for
2453 * that busy extent to pass out of the tree before the new extent is inserted.
2454 * There are two main cases we have to handle here.
2455 *
2456 * The first case is a transaction that triggers a "free - allocate - free"
2457 * cycle. This can occur during btree manipulations as a btree block is freed
2458 * to the freelist, then allocated from the free list, then freed again. In
2459 * this case, the second extxpnet free is what triggers the duplicate and as
2460 * such the transaction IDs should match. Because the extent was allocated in
2461 * this transaction, the transaction must be marked as synchronous. This is
2462 * true for all cases where the free/alloc/free occurs in the one transaction,
2463 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2464 * This serves to catch violations of the second case quite effectively.
2465 *
2466 * The second case is where the free/alloc/free occur in different
2467 * transactions. In this case, the thread freeing the extent the second time
2468 * can't mark the extent busy immediately because it is already tracked in a
2469 * transaction that may be committing. When the log commit for the existing
2470 * busy extent completes, the busy extent will be removed from the tree. If we
2471 * allow the second busy insert to continue using that busy extent structure,
2472 * it can be freed before this transaction is safely in the log. Hence our
2473 * only option in this case is to force the log to remove the existing busy
2474 * extent from the list before we insert the new one with the current
2475 * transaction ID.
2476 *
2477 * The problem we are trying to avoid in the free-alloc-free in separate
2478 * transactions is most easily described with a timeline:
2479 *
2480 * Thread 1 Thread 2 Thread 3 xfslogd
2481 * xact alloc
2482 * free X
2483 * mark busy
2484 * commit xact
2485 * free xact
2486 * xact alloc
2487 * alloc X
2488 * busy search
2489 * mark xact sync
2490 * commit xact
2491 * free xact
2492 * force log
2493 * checkpoint starts
2494 * ....
2495 * xact alloc
2496 * free X
2497 * mark busy
2498 * finds match
2499 * *** KABOOM! ***
2500 * ....
2501 * log IO completes
2502 * unbusy X
2503 * checkpoint completes
2504 *
2505 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2506 * the checkpoint completes, and the busy extent it matched will have been
2507 * removed from the tree when it is woken. Hence it can then continue safely.
2508 *
2509 * However, to ensure this matching process is robust, we need to use the
2510 * transaction ID for identifying transaction, as delayed logging results in
2511 * the busy extent and transaction lifecycles being different. i.e. the busy
2512 * extent is active for a lot longer than the transaction. Hence the
2513 * transaction structure can be freed and reallocated, then mark the same
2514 * extent busy again in the new transaction. In this case the new transaction
2515 * will have a different tid but can have the same address, and hence we need
2516 * to check against the tid.
2517 *
2518 * Future: for delayed logging, we could avoid the log force if the extent was
2519 * first freed in the current checkpoint sequence. This, however, requires the
2520 * ability to pin the current checkpoint in memory until this transaction
2521 * commits to ensure that both the original free and the current one combine
2522 * logically into the one checkpoint. If the checkpoint sequences are
2523 * different, however, we still need to wait on a log force.
2524 */
2525void 2478void
2526xfs_alloc_busy_insert( 2479xfs_alloc_busy_insert(
2527 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert(
2533 struct xfs_busy_extent *busyp; 2486 struct xfs_busy_extent *busyp;
2534 struct xfs_perag *pag; 2487 struct xfs_perag *pag;
2535 struct rb_node **rbp; 2488 struct rb_node **rbp;
2536 struct rb_node *parent; 2489 struct rb_node *parent = NULL;
2537 int match;
2538
2539 2490
2540 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); 2491 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2541 if (!new) { 2492 if (!new) {
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert(
2544 * block, make this a synchronous transaction to insure that 2495 * block, make this a synchronous transaction to insure that
2545 * the block is not reused before this transaction commits. 2496 * the block is not reused before this transaction commits.
2546 */ 2497 */
2547 trace_xfs_alloc_busy(tp, agno, bno, len, 1); 2498 trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
2548 xfs_trans_set_sync(tp); 2499 xfs_trans_set_sync(tp);
2549 return; 2500 return;
2550 } 2501 }
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert(
2552 new->agno = agno; 2503 new->agno = agno;
2553 new->bno = bno; 2504 new->bno = bno;
2554 new->length = len; 2505 new->length = len;
2555 new->tid = xfs_log_get_trans_ident(tp);
2556
2557 INIT_LIST_HEAD(&new->list); 2506 INIT_LIST_HEAD(&new->list);
2558 2507
2559 /* trace before insert to be able to see failed inserts */ 2508 /* trace before insert to be able to see failed inserts */
2560 trace_xfs_alloc_busy(tp, agno, bno, len, 0); 2509 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
2561 2510
2562 pag = xfs_perag_get(tp->t_mountp, new->agno); 2511 pag = xfs_perag_get(tp->t_mountp, new->agno);
2563restart:
2564 spin_lock(&pag->pagb_lock); 2512 spin_lock(&pag->pagb_lock);
2565 rbp = &pag->pagb_tree.rb_node; 2513 rbp = &pag->pagb_tree.rb_node;
2566 parent = NULL; 2514 while (*rbp) {
2567 busyp = NULL;
2568 match = 0;
2569 while (*rbp && match >= 0) {
2570 parent = *rbp; 2515 parent = *rbp;
2571 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); 2516 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2572 2517
2573 if (new->bno < busyp->bno) { 2518 if (new->bno < busyp->bno) {
2574 /* may overlap, but exact start block is lower */
2575 rbp = &(*rbp)->rb_left; 2519 rbp = &(*rbp)->rb_left;
2576 if (new->bno + new->length > busyp->bno) 2520 ASSERT(new->bno + new->length <= busyp->bno);
2577 match = busyp->tid == new->tid ? 1 : -1;
2578 } else if (new->bno > busyp->bno) { 2521 } else if (new->bno > busyp->bno) {
2579 /* may overlap, but exact start block is higher */
2580 rbp = &(*rbp)->rb_right; 2522 rbp = &(*rbp)->rb_right;
2581 if (bno < busyp->bno + busyp->length) 2523 ASSERT(bno >= busyp->bno + busyp->length);
2582 match = busyp->tid == new->tid ? 1 : -1;
2583 } else { 2524 } else {
2584 match = busyp->tid == new->tid ? 1 : -1; 2525 ASSERT(0);
2585 break;
2586 } 2526 }
2587 } 2527 }
2588 if (match < 0) {
2589 /* overlap marked busy in different transaction */
2590 spin_unlock(&pag->pagb_lock);
2591 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2592 goto restart;
2593 }
2594 if (match > 0) {
2595 /*
2596 * overlap marked busy in same transaction. Update if exact
2597 * start block match, otherwise combine the busy extents into
2598 * a single range.
2599 */
2600 if (busyp->bno == new->bno) {
2601 busyp->length = max(busyp->length, new->length);
2602 spin_unlock(&pag->pagb_lock);
2603 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2604 xfs_perag_put(pag);
2605 kmem_free(new);
2606 return;
2607 }
2608 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2609 new->length = max(busyp->bno + busyp->length,
2610 new->bno + new->length) -
2611 min(busyp->bno, new->bno);
2612 new->bno = min(busyp->bno, new->bno);
2613 } else
2614 busyp = NULL;
2615 2528
2616 rb_link_node(&new->rb_node, parent, rbp); 2529 rb_link_node(&new->rb_node, parent, rbp);
2617 rb_insert_color(&new->rb_node, &pag->pagb_tree); 2530 rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2532,6 @@ restart:
2619 list_add(&new->list, &tp->t_busy); 2532 list_add(&new->list, &tp->t_busy);
2620 spin_unlock(&pag->pagb_lock); 2533 spin_unlock(&pag->pagb_lock);
2621 xfs_perag_put(pag); 2534 xfs_perag_put(pag);
2622 kmem_free(busyp);
2623} 2535}
2624 2536
2625/* 2537/*
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search(
2668 } 2580 }
2669 } 2581 }
2670 spin_unlock(&pag->pagb_lock); 2582 spin_unlock(&pag->pagb_lock);
2671 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2672 xfs_perag_put(pag); 2583 xfs_perag_put(pag);
2673 return match; 2584 return match;
2674} 2585}
2675 2586
2587/*
2588 * The found free extent [fbno, fend] overlaps part or all of the given busy
2589 * extent. If the overlap covers the beginning, the end, or all of the busy
2590 * extent, the overlapping portion can be made unbusy and used for the
2591 * allocation. We can't split a busy extent because we can't modify a
2592 * transaction/CIL context busy list, but we can update an entry's block
2593 * number or length.
2594 *
2595 * Returns true if the extent can safely be reused, or false if the search
2596 * needs to be restarted.
2597 */
2598STATIC bool
2599xfs_alloc_busy_update_extent(
2600 struct xfs_mount *mp,
2601 struct xfs_perag *pag,
2602 struct xfs_busy_extent *busyp,
2603 xfs_agblock_t fbno,
2604 xfs_extlen_t flen,
2605 bool userdata)
2606{
2607 xfs_agblock_t fend = fbno + flen;
2608 xfs_agblock_t bbno = busyp->bno;
2609 xfs_agblock_t bend = bbno + busyp->length;
2610
2611 /*
2612 * If there is a busy extent overlapping a user allocation, we have
2613 * no choice but to force the log and retry the search.
2614 *
2615 * Fortunately this does not happen during normal operation, but
2616 * only if the filesystem is very low on space and has to dip into
2617 * the AGFL for normal allocations.
2618 */
2619 if (userdata)
2620 goto out_force_log;
2621
2622 if (bbno < fbno && bend > fend) {
2623 /*
2624 * Case 1:
2625 * bbno bend
2626 * +BBBBBBBBBBBBBBBBB+
2627 * +---------+
2628 * fbno fend
2629 */
2630
2631 /*
2632 * We would have to split the busy extent to be able to track
2633 * it correctly, which we cannot do because we would have to
2634 * modify the list of busy extents attached to the transaction
2635 * or CIL context, which is immutable.
2636 *
2637 * Force out the log to clear the busy extent and retry the
2638 * search.
2639 */
2640 goto out_force_log;
2641 } else if (bbno >= fbno && bend <= fend) {
2642 /*
2643 * Case 2:
2644 * bbno bend
2645 * +BBBBBBBBBBBBBBBBB+
2646 * +-----------------+
2647 * fbno fend
2648 *
2649 * Case 3:
2650 * bbno bend
2651 * +BBBBBBBBBBBBBBBBB+
2652 * +--------------------------+
2653 * fbno fend
2654 *
2655 * Case 4:
2656 * bbno bend
2657 * +BBBBBBBBBBBBBBBBB+
2658 * +--------------------------+
2659 * fbno fend
2660 *
2661 * Case 5:
2662 * bbno bend
2663 * +BBBBBBBBBBBBBBBBB+
2664 * +-----------------------------------+
2665 * fbno fend
2666 *
2667 */
2668
2669 /*
2670 * The busy extent is fully covered by the extent we are
2671 * allocating, and can simply be removed from the rbtree.
2672 * However we cannot remove it from the immutable list
2673 * tracking busy extents in the transaction or CIL context,
2674 * so set the length to zero to mark it invalid.
2675 *
2676 * We also need to restart the busy extent search from the
2677 * tree root, because erasing the node can rearrange the
2678 * tree topology.
2679 */
2680 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2681 busyp->length = 0;
2682 return false;
2683 } else if (fend < bend) {
2684 /*
2685 * Case 6:
2686 * bbno bend
2687 * +BBBBBBBBBBBBBBBBB+
2688 * +---------+
2689 * fbno fend
2690 *
2691 * Case 7:
2692 * bbno bend
2693 * +BBBBBBBBBBBBBBBBB+
2694 * +------------------+
2695 * fbno fend
2696 *
2697 */
2698 busyp->bno = fend;
2699 } else if (bbno < fbno) {
2700 /*
2701 * Case 8:
2702 * bbno bend
2703 * +BBBBBBBBBBBBBBBBB+
2704 * +-------------+
2705 * fbno fend
2706 *
2707 * Case 9:
2708 * bbno bend
2709 * +BBBBBBBBBBBBBBBBB+
2710 * +----------------------+
2711 * fbno fend
2712 */
2713 busyp->length = fbno - busyp->bno;
2714 } else {
2715 ASSERT(0);
2716 }
2717
2718 trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
2719 return true;
2720
2721out_force_log:
2722 spin_unlock(&pag->pagb_lock);
2723 xfs_log_force(mp, XFS_LOG_SYNC);
2724 trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
2725 spin_lock(&pag->pagb_lock);
2726 return false;
2727}
2728
2729
2730/*
2731 * For a given extent [fbno, flen], make sure we can reuse it safely.
2732 */
2676void 2733void
2677xfs_alloc_busy_clear( 2734xfs_alloc_busy_reuse(
2678 struct xfs_mount *mp, 2735 struct xfs_mount *mp,
2679 struct xfs_busy_extent *busyp) 2736 xfs_agnumber_t agno,
2737 xfs_agblock_t fbno,
2738 xfs_extlen_t flen,
2739 bool userdata)
2680{ 2740{
2681 struct xfs_perag *pag; 2741 struct xfs_perag *pag;
2742 struct rb_node *rbp;
2682 2743
2683 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, 2744 ASSERT(flen > 0);
2684 busyp->length);
2685 2745
2686 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, 2746 pag = xfs_perag_get(mp, agno);
2687 busyp->length) == 1); 2747 spin_lock(&pag->pagb_lock);
2748restart:
2749 rbp = pag->pagb_tree.rb_node;
2750 while (rbp) {
2751 struct xfs_busy_extent *busyp =
2752 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2753 xfs_agblock_t bbno = busyp->bno;
2754 xfs_agblock_t bend = bbno + busyp->length;
2688 2755
2689 list_del_init(&busyp->list); 2756 if (fbno + flen <= bbno) {
2757 rbp = rbp->rb_left;
2758 continue;
2759 } else if (fbno >= bend) {
2760 rbp = rbp->rb_right;
2761 continue;
2762 }
2690 2763
2691 pag = xfs_perag_get(mp, busyp->agno); 2764 if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
2692 spin_lock(&pag->pagb_lock); 2765 userdata))
2693 rb_erase(&busyp->rb_node, &pag->pagb_tree); 2766 goto restart;
2767 }
2694 spin_unlock(&pag->pagb_lock); 2768 spin_unlock(&pag->pagb_lock);
2695 xfs_perag_put(pag); 2769 xfs_perag_put(pag);
2770}
2771
2772/*
2773 * For a given extent [fbno, flen], search the busy extent list to find a
2774 * subset of the extent that is not busy. If *rlen is smaller than
2775 * args->minlen no suitable extent could be found, and the higher level
2776 * code needs to force out the log and retry the allocation.
2777 */
2778STATIC void
2779xfs_alloc_busy_trim(
2780 struct xfs_alloc_arg *args,
2781 xfs_agblock_t bno,
2782 xfs_extlen_t len,
2783 xfs_agblock_t *rbno,
2784 xfs_extlen_t *rlen)
2785{
2786 xfs_agblock_t fbno;
2787 xfs_extlen_t flen;
2788 struct rb_node *rbp;
2789
2790 ASSERT(len > 0);
2696 2791
2792 spin_lock(&args->pag->pagb_lock);
2793restart:
2794 fbno = bno;
2795 flen = len;
2796 rbp = args->pag->pagb_tree.rb_node;
2797 while (rbp && flen >= args->minlen) {
2798 struct xfs_busy_extent *busyp =
2799 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2800 xfs_agblock_t fend = fbno + flen;
2801 xfs_agblock_t bbno = busyp->bno;
2802 xfs_agblock_t bend = bbno + busyp->length;
2803
2804 if (fend <= bbno) {
2805 rbp = rbp->rb_left;
2806 continue;
2807 } else if (fbno >= bend) {
2808 rbp = rbp->rb_right;
2809 continue;
2810 }
2811
2812 /*
2813 * If this is a metadata allocation, try to reuse the busy
2814 * extent instead of trimming the allocation.
2815 */
2816 if (!args->userdata) {
2817 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2818 busyp, fbno, flen,
2819 false))
2820 goto restart;
2821 continue;
2822 }
2823
2824 if (bbno <= fbno) {
2825 /* start overlap */
2826
2827 /*
2828 * Case 1:
2829 * bbno bend
2830 * +BBBBBBBBBBBBBBBBB+
2831 * +---------+
2832 * fbno fend
2833 *
2834 * Case 2:
2835 * bbno bend
2836 * +BBBBBBBBBBBBBBBBB+
2837 * +-------------+
2838 * fbno fend
2839 *
2840 * Case 3:
2841 * bbno bend
2842 * +BBBBBBBBBBBBBBBBB+
2843 * +-------------+
2844 * fbno fend
2845 *
2846 * Case 4:
2847 * bbno bend
2848 * +BBBBBBBBBBBBBBBBB+
2849 * +-----------------+
2850 * fbno fend
2851 *
2852 * No unbusy region in extent, return failure.
2853 */
2854 if (fend <= bend)
2855 goto fail;
2856
2857 /*
2858 * Case 5:
2859 * bbno bend
2860 * +BBBBBBBBBBBBBBBBB+
2861 * +----------------------+
2862 * fbno fend
2863 *
2864 * Case 6:
2865 * bbno bend
2866 * +BBBBBBBBBBBBBBBBB+
2867 * +--------------------------+
2868 * fbno fend
2869 *
2870 * Needs to be trimmed to:
2871 * +-------+
2872 * fbno fend
2873 */
2874 fbno = bend;
2875 } else if (bend >= fend) {
2876 /* end overlap */
2877
2878 /*
2879 * Case 7:
2880 * bbno bend
2881 * +BBBBBBBBBBBBBBBBB+
2882 * +------------------+
2883 * fbno fend
2884 *
2885 * Case 8:
2886 * bbno bend
2887 * +BBBBBBBBBBBBBBBBB+
2888 * +--------------------------+
2889 * fbno fend
2890 *
2891 * Needs to be trimmed to:
2892 * +-------+
2893 * fbno fend
2894 */
2895 fend = bbno;
2896 } else {
2897 /* middle overlap */
2898
2899 /*
2900 * Case 9:
2901 * bbno bend
2902 * +BBBBBBBBBBBBBBBBB+
2903 * +-----------------------------------+
2904 * fbno fend
2905 *
2906 * Can be trimmed to:
2907 * +-------+ OR +-------+
2908 * fbno fend fbno fend
2909 *
2910 * Backward allocation leads to significant
2911 * fragmentation of directories, which degrades
2912 * directory performance, therefore we always want to
2913 * choose the option that produces forward allocation
2914 * patterns.
2915 * Preferring the lower bno extent will make the next
2916 * request use "fend" as the start of the next
2917 * allocation; if the segment is no longer busy at
2918 * that point, we'll get a contiguous allocation, but
2919 * even if it is still busy, we will get a forward
2920 * allocation.
2921 * We try to avoid choosing the segment at "bend",
2922 * because that can lead to the next allocation
2923 * taking the segment at "fbno", which would be a
2924 * backward allocation. We only use the segment at
2925 * "bend" if it is much larger than the current
2926 * requested size, because in that case there's a
2927 * good chance subsequent allocations will be
2928 * contiguous.
2929 */
2930 if (bbno - fbno >= args->maxlen) {
2931 /* left candidate fits perfectly */
2932 fend = bbno;
2933 } else if (fend - bend >= args->maxlen * 4) {
2934 /* right candidate has enough free space */
2935 fbno = bend;
2936 } else if (bbno - fbno >= args->minlen) {
2937 /* left candidate fits minimum requirement */
2938 fend = bbno;
2939 } else {
2940 goto fail;
2941 }
2942 }
2943
2944 flen = fend - fbno;
2945 }
2946 spin_unlock(&args->pag->pagb_lock);
2947
2948 if (fbno != bno || flen != len) {
2949 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
2950 fbno, flen);
2951 }
2952 *rbno = fbno;
2953 *rlen = flen;
2954 return;
2955fail:
2956 /*
2957 * Return a zero extent length as a failure indication. All callers
2958 * re-check if the trimmed extent satisfies the minlen requirement.
2959 */
2960 spin_unlock(&args->pag->pagb_lock);
2961 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
2962 *rbno = fbno;
2963 *rlen = 0;
2964}
2965
2966static void
2967xfs_alloc_busy_clear_one(
2968 struct xfs_mount *mp,
2969 struct xfs_perag *pag,
2970 struct xfs_busy_extent *busyp)
2971{
2972 if (busyp->length) {
2973 trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
2974 busyp->length);
2975 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2976 }
2977
2978 list_del_init(&busyp->list);
2697 kmem_free(busyp); 2979 kmem_free(busyp);
2698} 2980}
2981
2982void
2983xfs_alloc_busy_clear(
2984 struct xfs_mount *mp,
2985 struct list_head *list)
2986{
2987 struct xfs_busy_extent *busyp, *n;
2988 struct xfs_perag *pag = NULL;
2989 xfs_agnumber_t agno = NULLAGNUMBER;
2990
2991 list_for_each_entry_safe(busyp, n, list, list) {
2992 if (busyp->agno != agno) {
2993 if (pag) {
2994 spin_unlock(&pag->pagb_lock);
2995 xfs_perag_put(pag);
2996 }
2997 pag = xfs_perag_get(mp, busyp->agno);
2998 spin_lock(&pag->pagb_lock);
2999 agno = busyp->agno;
3000 }
3001
3002 xfs_alloc_busy_clear_one(mp, pag, busyp);
3003 }
3004
3005 if (pag) {
3006 spin_unlock(&pag->pagb_lock);
3007 xfs_perag_put(pag);
3008 }
3009}
3010
3011/*
3012 * Callback for list_sort to sort busy extents by the AG they reside in.
3013 */
3014int
3015xfs_busy_extent_ag_cmp(
3016 void *priv,
3017 struct list_head *a,
3018 struct list_head *b)
3019{
3020 return container_of(a, struct xfs_busy_extent, list)->agno -
3021 container_of(b, struct xfs_busy_extent, list)->agno;
3022}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index d0b3bc72005b..240ad288f2f9 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -140,11 +140,24 @@ xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
140 xfs_agblock_t bno, xfs_extlen_t len); 140 xfs_agblock_t bno, xfs_extlen_t len);
141 141
142void 142void
143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
144 144
145int 145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, 146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len); 147 xfs_agblock_t bno, xfs_extlen_t len);
148
149void
150xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
151 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
152
153int
154xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
155
156static inline void xfs_alloc_busy_sort(struct list_head *list)
157{
158 list_sort(NULL, list, xfs_busy_extent_ag_cmp);
159}
160
148#endif /* __KERNEL__ */ 161#endif /* __KERNEL__ */
149 162
150/* 163/*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3916925e2584..8b469d53599f 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
95 return 0; 95 return 0;
96 } 96 }
97 97
98 xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
99
98 xfs_trans_agbtree_delta(cur->bc_tp, 1); 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
99 new->s = cpu_to_be32(bno); 101 new->s = cpu_to_be32(bno);
100 102
@@ -118,17 +120,6 @@ xfs_allocbt_free_block(
118 if (error) 120 if (error)
119 return error; 121 return error;
120 122
121 /*
122 * Since blocks move to the free list without the coordination used in
123 * xfs_bmap_finish, we can't allow block to be available for
124 * reallocation and non-transaction writing (user data) until we know
125 * that the transaction that moved it to the free list is permanently
126 * on disk. We track the blocks by declaring these blocks as "busy";
127 * the busy list is maintained on a per-ag basis and each transaction
128 * records which entries should be removed when the iclog commits to
129 * disk. If a busy block is allocated, the iclog is pushed up to the
130 * LSN that freed the block.
131 */
132 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
133 xfs_trans_agbtree_delta(cur->bc_tp, -1); 124 xfs_trans_agbtree_delta(cur->bc_tp, -1);
134 return 0; 125 return 0;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index be628677c288..9a84a85c03b1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
202 xfs_inode_t *tip, /* tmp inode */ 202 xfs_inode_t *tip, /* tmp inode */
203 xfs_swapext_t *sxp) 203 xfs_swapext_t *sxp)
204{ 204{
205 xfs_mount_t *mp; 205 xfs_mount_t *mp = ip->i_mount;
206 xfs_trans_t *tp; 206 xfs_trans_t *tp;
207 xfs_bstat_t *sbp = &sxp->sx_stat; 207 xfs_bstat_t *sbp = &sxp->sx_stat;
208 xfs_ifork_t *tempifp, *ifp, *tifp; 208 xfs_ifork_t *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
212 int taforkblks = 0; 212 int taforkblks = 0;
213 __uint64_t tmp; 213 __uint64_t tmp;
214 214
215 mp = ip->i_mount;
216
217 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 215 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
218 if (!tempifp) { 216 if (!tempifp) {
219 error = XFS_ERROR(ENOMEM); 217 error = XFS_ERROR(ENOMEM);
220 goto out; 218 goto out;
221 } 219 }
222 220
223 sbp = &sxp->sx_stat;
224
225 /* 221 /*
226 * we have to do two separate lock calls here to keep lockdep 222 * we have to do two separate lock calls here to keep lockdep
227 * happy. If we try to get all the locks in one call, lock will 223 * happy. If we try to get all the locks in one call, lock will
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d11ce613d692..c8e3349c287c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1354,7 +1354,7 @@ xfs_itruncate_start(
1354 return 0; 1354 return 0;
1355 } 1355 }
1356 last_byte = xfs_file_last_byte(ip); 1356 last_byte = xfs_file_last_byte(ip);
1357 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); 1357 trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
1358 if (last_byte > toss_start) { 1358 if (last_byte > toss_start) {
1359 if (flags & XFS_ITRUNC_DEFINITE) { 1359 if (flags & XFS_ITRUNC_DEFINITE) {
1360 xfs_tosspages(ip, toss_start, 1360 xfs_tosspages(ip, toss_start,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 576fdfe81d60..09983a3344a5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -970,7 +970,6 @@ xfs_iflush_abort(
970{ 970{
971 xfs_inode_log_item_t *iip = ip->i_itemp; 971 xfs_inode_log_item_t *iip = ip->i_itemp;
972 972
973 iip = ip->i_itemp;
974 if (iip) { 973 if (iip) {
975 struct xfs_ail *ailp = iip->ili_item.li_ailp; 974 struct xfs_ail *ailp = iip->ili_item.li_ailp;
976 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 975 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b612ce4520ae..211930246f20 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
1449 1449
1450 xlog_cil_destroy(log); 1450 xlog_cil_destroy(log);
1451 1451
1452 /*
1453 * always need to ensure that the extra buffer does not point to memory
1454 * owned by another log buffer before we free it.
1455 */
1456 xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
1457 xfs_buf_free(log->l_xbuf);
1458
1452 iclog = log->l_iclog; 1459 iclog = log->l_iclog;
1453 for (i=0; i<log->l_iclog_bufs; i++) { 1460 for (i=0; i<log->l_iclog_bufs; i++) {
1454 xfs_buf_free(iclog->ic_bp); 1461 xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
1458 } 1465 }
1459 spinlock_destroy(&log->l_icloglock); 1466 spinlock_destroy(&log->l_icloglock);
1460 1467
1461 xfs_buf_free(log->l_xbuf);
1462 log->l_mp->m_log = NULL; 1468 log->l_mp->m_log = NULL;
1463 kmem_free(log); 1469 kmem_free(log);
1464} /* xlog_dealloc_log */ 1470} /* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
3248 return ticket; 3254 return ticket;
3249} 3255}
3250 3256
3251xlog_tid_t
3252xfs_log_get_trans_ident(
3253 struct xfs_trans *tp)
3254{
3255 return tp->t_ticket->t_tid;
3256}
3257
3258/* 3257/*
3259 * Allocate and initialise a new log ticket. 3258 * Allocate and initialise a new log ticket.
3260 */ 3259 */
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3bd3291ef8d2..78c9039994af 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,8 +189,6 @@ void xlog_iodone(struct xfs_buf *);
189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
190void xfs_log_ticket_put(struct xlog_ticket *ticket); 190void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193
194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 192void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 193 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 194 xfs_lsn_t *commit_lsn, int flags);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ca59be08977..7d56e88a3f0e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -361,13 +361,12 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_busy_extent *busyp, *n;
365 364
366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 365 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
367 ctx->start_lsn, abort); 366 ctx->start_lsn, abort);
368 367
369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 368 xfs_alloc_busy_sort(&ctx->busy_extents);
370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 369 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
371 370
372 spin_lock(&ctx->cil->xc_cil_lock); 371 spin_lock(&ctx->cil->xc_cil_lock);
373 list_del(&ctx->committing); 372 list_del(&ctx->committing);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 5864850e9e34..2d3b6a498d63 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
146 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ 147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
148 148
149typedef __uint32_t xlog_tid_t;
150
149#ifdef __KERNEL__ 151#ifdef __KERNEL__
150/* 152/*
151 * Below are states for covering allocation transactions. 153 * Below are states for covering allocation transactions.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5cc464a17c93..04142caedb2b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -205,6 +205,35 @@ xlog_bread(
205} 205}
206 206
207/* 207/*
208 * Read at an offset into the buffer. Returns with the buffer in its original
209 * state regardless of the result of the read.
210 */
211STATIC int
212xlog_bread_offset(
213 xlog_t *log,
214 xfs_daddr_t blk_no, /* block to read from */
215 int nbblks, /* blocks to read */
216 xfs_buf_t *bp,
217 xfs_caddr_t offset)
218{
219 xfs_caddr_t orig_offset = XFS_BUF_PTR(bp);
220 int orig_len = bp->b_buffer_length;
221 int error, error2;
222
223 error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
224 if (error)
225 return error;
226
227 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
228
229 /* must reset buffer pointer even on error */
230 error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
231 if (error)
232 return error;
233 return error2;
234}
235
236/*
208 * Write out the buffer at the given block for the given number of blocks. 237 * Write out the buffer at the given block for the given number of blocks.
209 * The buffer is kept locked across the write and is returned locked. 238 * The buffer is kept locked across the write and is returned locked.
210 * This can only be used for synchronous log writes. 239 * This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
1229 */ 1258 */
1230 ealign = round_down(end_block, sectbb); 1259 ealign = round_down(end_block, sectbb);
1231 if (j == 0 && (start_block + endcount > ealign)) { 1260 if (j == 0 && (start_block + endcount > ealign)) {
1232 offset = XFS_BUF_PTR(bp); 1261 offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
1233 balign = BBTOB(ealign - start_block); 1262 error = xlog_bread_offset(log, ealign, sectbb,
1234 error = XFS_BUF_SET_PTR(bp, offset + balign, 1263 bp, offset);
1235 BBTOB(sectbb));
1236 if (error) 1264 if (error)
1237 break; 1265 break;
1238 1266
1239 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1240 if (error)
1241 break;
1242
1243 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1244 if (error)
1245 break;
1246 } 1267 }
1247 1268
1248 offset = xlog_align(log, start_block, endcount, bp); 1269 offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
3448 * - order is important. 3469 * - order is important.
3449 */ 3470 */
3450 wrapped_hblks = hblks - split_hblks; 3471 wrapped_hblks = hblks - split_hblks;
3451 error = XFS_BUF_SET_PTR(hbp, 3472 error = xlog_bread_offset(log, 0,
3452 offset + BBTOB(split_hblks), 3473 wrapped_hblks, hbp,
3453 BBTOB(hblks - split_hblks)); 3474 offset + BBTOB(split_hblks));
3454 if (error)
3455 goto bread_err2;
3456
3457 error = xlog_bread_noalign(log, 0,
3458 wrapped_hblks, hbp);
3459 if (error)
3460 goto bread_err2;
3461
3462 error = XFS_BUF_SET_PTR(hbp, offset,
3463 BBTOB(hblks));
3464 if (error) 3475 if (error)
3465 goto bread_err2; 3476 goto bread_err2;
3466 } 3477 }
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
3511 * _first_, then the log start (LR header end) 3522 * _first_, then the log start (LR header end)
3512 * - order is important. 3523 * - order is important.
3513 */ 3524 */
3514 error = XFS_BUF_SET_PTR(dbp, 3525 error = xlog_bread_offset(log, 0,
3515 offset + BBTOB(split_bblks), 3526 bblks - split_bblks, hbp,
3516 BBTOB(bblks - split_bblks)); 3527 offset + BBTOB(split_bblks));
3517 if (error)
3518 goto bread_err2;
3519
3520 error = xlog_bread_noalign(log, wrapped_hblks,
3521 bblks - split_bblks,
3522 dbp);
3523 if (error)
3524 goto bread_err2;
3525
3526 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3527 if (error) 3528 if (error)
3528 goto bread_err2; 3529 goto bread_err2;
3529 } 3530 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb3f9a7b24ed..b49b82363d20 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
1900 uint nmsb, 1900 uint nmsb,
1901 int rsvd) 1901 int rsvd)
1902{ 1902{
1903 xfs_mod_sb_t *msbp = &msb[0]; 1903 xfs_mod_sb_t *msbp;
1904 int error = 0; 1904 int error = 0;
1905 1905
1906 /* 1906 /*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
1910 * changes will be atomic. 1910 * changes will be atomic.
1911 */ 1911 */
1912 spin_lock(&mp->m_sb_lock); 1912 spin_lock(&mp->m_sb_lock);
1913 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1913 for (msbp = msb; msbp < (msb + nmsb); msbp++) {
1914 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || 1914 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1915 msbp->msb_field > XFS_SBS_FDBLOCKS); 1915 msbp->msb_field > XFS_SBS_FDBLOCKS);
1916 1916
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 76922793f64f..d1f24858ccc4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
608xfs_trans_free( 608xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 struct xfs_busy_extent *busyp, *n; 611 xfs_alloc_busy_sort(&tp->t_busy);
612 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
613 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
614 xfs_alloc_busy_clear(tp->t_mountp, busyp);
615 613
616 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
617 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 26d1867d8156..65584b55607d 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint32_t xlog_tid_t; /* transaction ID type */
77
78/* 76/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 77 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 78 * Disk based types: