aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c3
-rw-r--r--fs/Kconfig88
-rw-r--r--fs/Makefile2
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bio-integrity.c29
-rw-r--r--fs/bio.c297
-rw-r--r--fs/block_dev.c182
-rw-r--r--fs/cifs/cifs_spnego.c35
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h1
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c84
-rw-r--r--fs/cifs/dns_resolve.c74
-rw-r--r--fs/cifs/file.c130
-rw-r--r--fs/cifs/inode.c644
-rw-r--r--fs/cifs/misc.c8
-rw-r--r--fs/cifs/readdir.c128
-rw-r--r--fs/cifs/sess.c4
-rw-r--r--fs/cifs/transport.c3
-rw-r--r--fs/dcache.c10
-rw-r--r--fs/dlm/config.c77
-rw-r--r--fs/dlm/dlm_internal.h7
-rw-r--r--fs/dlm/lockspace.c158
-rw-r--r--fs/dlm/lockspace.h1
-rw-r--r--fs/dlm/user.c124
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext3/file.c1
-rw-r--r--fs/ext3/inode.c8
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/acl.h12
-rw-r--r--fs/ext4/balloc.c1457
-rw-r--r--fs/ext4/bitmap.c6
-rw-r--r--fs/ext4/dir.c64
-rw-r--r--fs/ext4/ext4.h133
-rw-r--r--fs/ext4/ext4_extents.h15
-rw-r--r--fs/ext4/ext4_i.h39
-rw-r--r--fs/ext4/ext4_sb.h25
-rw-r--r--fs/ext4/extents.c281
-rw-r--r--fs/ext4/file.c10
-rw-r--r--fs/ext4/fsync.c7
-rw-r--r--fs/ext4/hash.c8
-rw-r--r--fs/ext4/ialloc.c71
-rw-r--r--fs/ext4/inode.c620
-rw-r--r--fs/ext4/ioctl.c96
-rw-r--r--fs/ext4/mballoc.c220
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/migrate.c10
-rw-r--r--fs/ext4/namei.c402
-rw-r--r--fs/ext4/resize.c33
-rw-r--r--fs/ext4/super.c315
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/ext4/xattr.c14
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/fat/fatent.c14
-rw-r--r--fs/gfs2/glock.c15
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c159
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking/dlm/mount.c3
-rw-r--r--fs/gfs2/log.c21
-rw-r--r--fs/gfs2/mount.c7
-rw-r--r--fs/gfs2/ops_address.c18
-rw-r--r--fs/gfs2/ops_file.c16
-rw-r--r--fs/gfs2/ops_fstype.c578
-rw-r--r--fs/gfs2/ops_inode.c127
-rw-r--r--fs/gfs2/ops_super.c108
-rw-r--r--fs/gfs2/super.c340
-rw-r--r--fs/gfs2/super.h6
-rw-r--r--fs/gfs2/sys.c11
-rw-r--r--fs/inotify_user.c27
-rw-r--r--fs/ioctl.c277
-rw-r--r--fs/jbd2/checkpoint.c71
-rw-r--r--fs/jbd2/commit.c32
-rw-r--r--fs/jbd2/journal.c103
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/nfs/super.c6
-rw-r--r--fs/ntfs/usnjrnl.h4
-rw-r--r--fs/ocfs2/alloc.c9
-rw-r--r--fs/ocfs2/alloc.h9
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/extent_map.c346
-rw-r--r--fs/ocfs2/extent_map.h3
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/partitions/check.c272
-rw-r--r--fs/partitions/check.h4
-rw-r--r--fs/proc/array.c59
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/proc_misc.c7
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/ubifs/budget.c114
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/dir.c3
-rw-r--r--fs/ubifs/file.c20
-rw-r--r--fs/ubifs/find.c19
-rw-r--r--fs/ubifs/gc.c20
-rw-r--r--fs/ubifs/misc.h49
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/tnc.c116
-rw-r--r--fs/ubifs/ubifs-media.h2
-rw-r--r--fs/ubifs/ubifs.h14
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/ialloc.c44
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c20
-rw-r--r--fs/xfs/xfs_buf_item.c44
-rw-r--r--fs/xfs/xfs_dfrag.c9
-rw-r--r--fs/xfs/xfs_inode.c94
-rw-r--r--fs/xfs/xfs_log.c67
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c26
118 files changed, 4835 insertions, 4574 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..e83aa5ebe861 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
626 return NULL; 626 return NULL;
627 627
628error: 628error:
629 if (fid) 629 p9_client_clunk(fid);
630 p9_client_clunk(fid);
631 630
632 return ERR_PTR(result); 631 return ERR_PTR(result);
633} 632}
diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5dab9a8..f54a157a0296 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
136 If you are not using a security module that requires using 136 If you are not using a security module that requires using
137 extended attributes for file security labels, say N. 137 extended attributes for file security labels, say N.
138 138
139config EXT4DEV_FS 139config EXT4_FS
140 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" 140 tristate "The Extended 4 (ext4) filesystem"
141 depends on EXPERIMENTAL
142 select JBD2 141 select JBD2
143 select CRC16 142 select CRC16
144 help 143 help
145 Ext4dev is a predecessor filesystem of the next generation 144 This is the next generation of the ext3 filesystem.
146 extended fs ext4, based on ext3 filesystem code. It will be
147 renamed ext4 fs later, once ext4dev is mature and stabilized.
148 145
149 Unlike the change from ext2 filesystem to ext3 filesystem, 146 Unlike the change from ext2 filesystem to ext3 filesystem,
150 the on-disk format of ext4dev is not the same as ext3 any more: 147 the on-disk format of ext4 is not forwards compatible with
151 it is based on extent maps and it supports 48-bit physical block 148 ext3; it is based on extent maps and it supports 48-bit
152 numbers. These combined on-disk format changes will allow 149 physical block numbers. The ext4 filesystem also supports delayed
153 ext4dev/ext4 to handle more than 16 TB filesystem volumes -- 150 allocation, persistent preallocation, high resolution time stamps,
154 a hard limit that ext3 cannot overcome without changing the 151 and a number of other features to improve performance and speed
155 on-disk format. 152 up fsck time. For more information, please see the web pages at
156 153 http://ext4.wiki.kernel.org.
157 Other than extent maps and 48-bit block numbers, ext4dev also is 154
158 likely to have other new features such as persistent preallocation, 155 The ext4 filesystem will support mounting an ext3
159 high resolution time stamps, and larger file support etc. These 156 filesystem; while there will be some performance gains from
160 features will be added to ext4dev gradually. 157 the delayed allocation and inode table readahead, the best
158 performance gains will require enabling ext4 features in the
159 filesystem, or formating a new filesystem as an ext4
160 filesystem initially.
161 161
162 To compile this file system support as a module, choose M here. The 162 To compile this file system support as a module, choose M here. The
163 module will be called ext4dev. 163 module will be called ext4dev.
164 164
165 If unsure, say N. 165 If unsure, say N.
166 166
167config EXT4DEV_FS_XATTR 167config EXT4DEV_COMPAT
168 bool "Ext4dev extended attributes" 168 bool "Enable ext4dev compatibility"
169 depends on EXT4DEV_FS 169 depends on EXT4_FS
170 help
171 Starting with 2.6.28, the name of the ext4 filesystem was
172 renamed from ext4dev to ext4. Unfortunately there are some
173 legacy userspace programs (such as klibc's fstype) have
174 "ext4dev" hardcoded.
175
176 To enable backwards compatibility so that systems that are
177 still expecting to mount ext4 filesystems using ext4dev,
178 chose Y here. This feature will go away by 2.6.31, so
179 please arrange to get your userspace programs fixed!
180
181config EXT4_FS_XATTR
182 bool "Ext4 extended attributes"
183 depends on EXT4_FS
170 default y 184 default y
171 help 185 help
172 Extended attributes are name:value pairs associated with inodes by 186 Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
175 189
176 If unsure, say N. 190 If unsure, say N.
177 191
178 You need this for POSIX ACL support on ext4dev/ext4. 192 You need this for POSIX ACL support on ext4.
179 193
180config EXT4DEV_FS_POSIX_ACL 194config EXT4_FS_POSIX_ACL
181 bool "Ext4dev POSIX Access Control Lists" 195 bool "Ext4 POSIX Access Control Lists"
182 depends on EXT4DEV_FS_XATTR 196 depends on EXT4_FS_XATTR
183 select FS_POSIX_ACL 197 select FS_POSIX_ACL
184 help 198 help
185 POSIX Access Control Lists (ACLs) support permissions for users and 199 POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
190 204
191 If you don't know what Access Control Lists are, say N 205 If you don't know what Access Control Lists are, say N
192 206
193config EXT4DEV_FS_SECURITY 207config EXT4_FS_SECURITY
194 bool "Ext4dev Security Labels" 208 bool "Ext4 Security Labels"
195 depends on EXT4DEV_FS_XATTR 209 depends on EXT4_FS_XATTR
196 help 210 help
197 Security labels support alternative access control models 211 Security labels support alternative access control models
198 implemented by security modules like SELinux. This option 212 implemented by security modules like SELinux. This option
199 enables an extended attribute handler for file security 213 enables an extended attribute handler for file security
200 labels in the ext4dev/ext4 filesystem. 214 labels in the ext4 filesystem.
201 215
202 If you are not using a security module that requires using 216 If you are not using a security module that requires using
203 extended attributes for file security labels, say N. 217 extended attributes for file security labels, say N.
@@ -240,22 +254,22 @@ config JBD2
240 help 254 help
241 This is a generic journaling layer for block devices that support 255 This is a generic journaling layer for block devices that support
242 both 32-bit and 64-bit block numbers. It is currently used by 256 both 32-bit and 64-bit block numbers. It is currently used by
243 the ext4dev/ext4 filesystem, but it could also be used to add 257 the ext4 filesystem, but it could also be used to add
244 journal support to other file systems or block devices such 258 journal support to other file systems or block devices such
245 as RAID or LVM. 259 as RAID or LVM.
246 260
247 If you are using ext4dev/ext4, you need to say Y here. If you are not 261 If you are using ext4, you need to say Y here. If you are not
248 using ext4dev/ext4 then you will probably want to say N. 262 using ext4 then you will probably want to say N.
249 263
250 To compile this device as a module, choose M here. The module will be 264 To compile this device as a module, choose M here. The module will be
251 called jbd2. If you are compiling ext4dev/ext4 into the kernel, 265 called jbd2. If you are compiling ext4 into the kernel,
252 you cannot compile this code as a module. 266 you cannot compile this code as a module.
253 267
254config JBD2_DEBUG 268config JBD2_DEBUG
255 bool "JBD2 (ext4dev/ext4) debugging support" 269 bool "JBD2 (ext4) debugging support"
256 depends on JBD2 && DEBUG_FS 270 depends on JBD2 && DEBUG_FS
257 help 271 help
258 If you are using the ext4dev/ext4 journaled file system (or 272 If you are using the ext4 journaled file system (or
259 potentially any other filesystem/device using JBD2), this option 273 potentially any other filesystem/device using JBD2), this option
260 allows you to enable debugging output while the system is running, 274 allows you to enable debugging output while the system is running,
261 in order to help track down any problems you are having. 275 in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
270config FS_MBCACHE 284config FS_MBCACHE
271# Meta block cache for Extended Attributes (ext2/ext3/ext4) 285# Meta block cache for Extended Attributes (ext2/ext3/ext4)
272 tristate 286 tristate
273 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR 287 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
274 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y 288 default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
275 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m 289 default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
276 290
277config REISERFS_FS 291config REISERFS_FS
278 tristate "Reiserfs support" 292 tristate "Reiserfs support"
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff15..de404b00eb0c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_DLM) += dlm/
69# Do not add any filesystems before this line 69# Do not add any filesystems before this line
70obj-$(CONFIG_REISERFS_FS) += reiserfs/ 70obj-$(CONFIG_REISERFS_FS) += reiserfs/
71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
72obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev 72obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
73obj-$(CONFIG_JBD) += jbd/ 73obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 74obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/ 75obj-$(CONFIG_EXT2_FS) += ext2/
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee348..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
125 inode->i_ino); 125 inode->i_ino);
126 if (err) { 126 if (err) {
127 inode_dec_link_count(inode); 127 inode_dec_link_count(inode);
128 iput(inode);
129 mutex_unlock(&info->bfs_lock); 128 mutex_unlock(&info->bfs_lock);
129 iput(inode);
130 return err; 130 return err;
131 } 131 }
132 mutex_unlock(&info->bfs_lock); 132 mutex_unlock(&info->bfs_lock);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
107 BUG_ON(bip == NULL); 107 BUG_ON(bip == NULL);
108 108
109 /* A cloned bio doesn't own the integrity metadata */ 109 /* A cloned bio doesn't own the integrity metadata */
110 if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) 110 if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
111 && bip->bip_buf != NULL)
111 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
112 113
113 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
150} 151}
151EXPORT_SYMBOL(bio_integrity_add_page); 152EXPORT_SYMBOL(bio_integrity_add_page);
152 153
154static int bdev_integrity_enabled(struct block_device *bdev, int rw)
155{
156 struct blk_integrity *bi = bdev_get_integrity(bdev);
157
158 if (bi == NULL)
159 return 0;
160
161 if (rw == READ && bi->verify_fn != NULL &&
162 (bi->flags & INTEGRITY_FLAG_READ))
163 return 1;
164
165 if (rw == WRITE && bi->generate_fn != NULL &&
166 (bi->flags & INTEGRITY_FLAG_WRITE))
167 return 1;
168
169 return 0;
170}
171
153/** 172/**
154 * bio_integrity_enabled - Check whether integrity can be passed 173 * bio_integrity_enabled - Check whether integrity can be passed
155 * @bio: bio to check 174 * @bio: bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
313 } 332 }
314} 333}
315 334
335static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
336{
337 if (bi)
338 return bi->tuple_size;
339
340 return 0;
341}
342
316/** 343/**
317 * bio_integrity_prep - Prepare bio for integrity I/O 344 * bio_integrity_prep - Prepare bio for integrity I/O
318 * @bio: bio to prepare 345 * @bio: bio to prepare
diff --git a/fs/bio.c b/fs/bio.c
index 3cba7ae34d75..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
30 30
31static struct kmem_cache *bio_slab __read_mostly; 31static struct kmem_cache *bio_slab __read_mostly;
32 32
33mempool_t *bio_split_pool __read_mostly; 33static mempool_t *bio_split_pool __read_mostly;
34 34
35/* 35/*
36 * if you change this list, also change bvec_alloc or things will 36 * if you change this list, also change bvec_alloc or things will
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
60 struct bio_vec *bvl; 60 struct bio_vec *bvl;
61 61
62 /* 62 /*
63 * see comment near bvec_array define! 63 * If 'bs' is given, lookup the pool and do the mempool alloc.
64 * If not, this is a bio_kmalloc() allocation and just do a
65 * kzalloc() for the exact number of vecs right away.
64 */ 66 */
65 switch (nr) { 67 if (bs) {
66 case 1 : *idx = 0; break; 68 /*
67 case 2 ... 4: *idx = 1; break; 69 * see comment near bvec_array define!
68 case 5 ... 16: *idx = 2; break; 70 */
69 case 17 ... 64: *idx = 3; break; 71 switch (nr) {
70 case 65 ... 128: *idx = 4; break; 72 case 1:
71 case 129 ... BIO_MAX_PAGES: *idx = 5; break; 73 *idx = 0;
74 break;
75 case 2 ... 4:
76 *idx = 1;
77 break;
78 case 5 ... 16:
79 *idx = 2;
80 break;
81 case 17 ... 64:
82 *idx = 3;
83 break;
84 case 65 ... 128:
85 *idx = 4;
86 break;
87 case 129 ... BIO_MAX_PAGES:
88 *idx = 5;
89 break;
72 default: 90 default:
73 return NULL; 91 return NULL;
74 } 92 }
75 /*
76 * idx now points to the pool we want to allocate from
77 */
78 93
79 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 94 /*
80 if (bvl) 95 * idx now points to the pool we want to allocate from
81 memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 96 */
97 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
98 if (bvl)
99 memset(bvl, 0,
100 bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
101 } else
102 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
82 103
83 return bvl; 104 return bvl;
84} 105}
@@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
107 bio_free(bio, fs_bio_set); 128 bio_free(bio, fs_bio_set);
108} 129}
109 130
131static void bio_kmalloc_destructor(struct bio *bio)
132{
133 kfree(bio->bi_io_vec);
134 kfree(bio);
135}
136
110void bio_init(struct bio *bio) 137void bio_init(struct bio *bio)
111{ 138{
112 memset(bio, 0, sizeof(*bio)); 139 memset(bio, 0, sizeof(*bio));
113 bio->bi_flags = 1 << BIO_UPTODATE; 140 bio->bi_flags = 1 << BIO_UPTODATE;
141 bio->bi_comp_cpu = -1;
114 atomic_set(&bio->bi_cnt, 1); 142 atomic_set(&bio->bi_cnt, 1);
115} 143}
116 144
@@ -118,19 +146,25 @@ void bio_init(struct bio *bio)
118 * bio_alloc_bioset - allocate a bio for I/O 146 * bio_alloc_bioset - allocate a bio for I/O
119 * @gfp_mask: the GFP_ mask given to the slab allocator 147 * @gfp_mask: the GFP_ mask given to the slab allocator
120 * @nr_iovecs: number of iovecs to pre-allocate 148 * @nr_iovecs: number of iovecs to pre-allocate
121 * @bs: the bio_set to allocate from 149 * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
122 * 150 *
123 * Description: 151 * Description:
124 * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. 152 * bio_alloc_bioset will first try its own mempool to satisfy the allocation.
125 * If %__GFP_WAIT is set then we will block on the internal pool waiting 153 * If %__GFP_WAIT is set then we will block on the internal pool waiting
126 * for a &struct bio to become free. 154 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
155 * fall back to just using @kmalloc to allocate the required memory.
127 * 156 *
128 * allocate bio and iovecs from the memory pools specified by the 157 * allocate bio and iovecs from the memory pools specified by the
129 * bio_set structure. 158 * bio_set structure, or @kmalloc if none given.
130 **/ 159 **/
131struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 160struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
132{ 161{
133 struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); 162 struct bio *bio;
163
164 if (bs)
165 bio = mempool_alloc(bs->bio_pool, gfp_mask);
166 else
167 bio = kmalloc(sizeof(*bio), gfp_mask);
134 168
135 if (likely(bio)) { 169 if (likely(bio)) {
136 struct bio_vec *bvl = NULL; 170 struct bio_vec *bvl = NULL;
@@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
141 175
142 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 176 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
143 if (unlikely(!bvl)) { 177 if (unlikely(!bvl)) {
144 mempool_free(bio, bs->bio_pool); 178 if (bs)
179 mempool_free(bio, bs->bio_pool);
180 else
181 kfree(bio);
145 bio = NULL; 182 bio = NULL;
146 goto out; 183 goto out;
147 } 184 }
@@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
164 return bio; 201 return bio;
165} 202}
166 203
204/*
205 * Like bio_alloc(), but doesn't use a mempool backing. This means that
206 * it CAN fail, but while bio_alloc() can only be used for allocations
207 * that have a short (finite) life span, bio_kmalloc() should be used
208 * for more permanent bio allocations (like allocating some bio's for
209 * initalization or setup purposes).
210 */
211struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
212{
213 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
214
215 if (bio)
216 bio->bi_destructor = bio_kmalloc_destructor;
217
218 return bio;
219}
220
167void zero_fill_bio(struct bio *bio) 221void zero_fill_bio(struct bio *bio)
168{ 222{
169 unsigned long flags; 223 unsigned long flags;
@@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
208 return bio->bi_phys_segments; 262 return bio->bi_phys_segments;
209} 263}
210 264
211inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
212{
213 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
214 blk_recount_segments(q, bio);
215
216 return bio->bi_hw_segments;
217}
218
219/** 265/**
220 * __bio_clone - clone a bio 266 * __bio_clone - clone a bio
221 * @bio: destination bio 267 * @bio: destination bio
@@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
350 */ 396 */
351 397
352 while (bio->bi_phys_segments >= q->max_phys_segments 398 while (bio->bi_phys_segments >= q->max_phys_segments
353 || bio->bi_hw_segments >= q->max_hw_segments 399 || bio->bi_phys_segments >= q->max_hw_segments) {
354 || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
355 400
356 if (retried_segments) 401 if (retried_segments)
357 return 0; 402 return 0;
@@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
395 } 440 }
396 441
397 /* If we may be able to merge these biovecs, force a recount */ 442 /* If we may be able to merge these biovecs, force a recount */
398 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || 443 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
399 BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
400 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 444 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
401 445
402 bio->bi_vcnt++; 446 bio->bi_vcnt++;
403 bio->bi_phys_segments++; 447 bio->bi_phys_segments++;
404 bio->bi_hw_segments++;
405 done: 448 done:
406 bio->bi_size += len; 449 bio->bi_size += len;
407 return len; 450 return len;
@@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
449 492
450struct bio_map_data { 493struct bio_map_data {
451 struct bio_vec *iovecs; 494 struct bio_vec *iovecs;
452 int nr_sgvecs;
453 struct sg_iovec *sgvecs; 495 struct sg_iovec *sgvecs;
496 int nr_sgvecs;
497 int is_our_pages;
454}; 498};
455 499
456static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 500static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
457 struct sg_iovec *iov, int iov_count) 501 struct sg_iovec *iov, int iov_count,
502 int is_our_pages)
458{ 503{
459 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); 504 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
460 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 505 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
461 bmd->nr_sgvecs = iov_count; 506 bmd->nr_sgvecs = iov_count;
507 bmd->is_our_pages = is_our_pages;
462 bio->bi_private = bmd; 508 bio->bi_private = bmd;
463} 509}
464 510
@@ -493,7 +539,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
493} 539}
494 540
495static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, 541static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
496 struct sg_iovec *iov, int iov_count, int uncopy) 542 struct sg_iovec *iov, int iov_count, int uncopy,
543 int do_free_page)
497{ 544{
498 int ret = 0, i; 545 int ret = 0, i;
499 struct bio_vec *bvec; 546 struct bio_vec *bvec;
@@ -536,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
536 } 583 }
537 } 584 }
538 585
539 if (uncopy) 586 if (do_free_page)
540 __free_page(bvec->bv_page); 587 __free_page(bvec->bv_page);
541 } 588 }
542 589
@@ -553,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
553int bio_uncopy_user(struct bio *bio) 600int bio_uncopy_user(struct bio *bio)
554{ 601{
555 struct bio_map_data *bmd = bio->bi_private; 602 struct bio_map_data *bmd = bio->bi_private;
556 int ret; 603 int ret = 0;
557
558 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);
559 604
605 if (!bio_flagged(bio, BIO_NULL_MAPPED))
606 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
607 bmd->nr_sgvecs, 1, bmd->is_our_pages);
560 bio_free_map_data(bmd); 608 bio_free_map_data(bmd);
561 bio_put(bio); 609 bio_put(bio);
562 return ret; 610 return ret;
@@ -565,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
565/** 613/**
566 * bio_copy_user_iov - copy user data to bio 614 * bio_copy_user_iov - copy user data to bio
567 * @q: destination block queue 615 * @q: destination block queue
616 * @map_data: pointer to the rq_map_data holding pages (if necessary)
568 * @iov: the iovec. 617 * @iov: the iovec.
569 * @iov_count: number of elements in the iovec 618 * @iov_count: number of elements in the iovec
570 * @write_to_vm: bool indicating writing to pages or not 619 * @write_to_vm: bool indicating writing to pages or not
620 * @gfp_mask: memory allocation flags
571 * 621 *
572 * Prepares and returns a bio for indirect user io, bouncing data 622 * Prepares and returns a bio for indirect user io, bouncing data
573 * to/from kernel pages as necessary. Must be paired with 623 * to/from kernel pages as necessary. Must be paired with
574 * call bio_uncopy_user() on io completion. 624 * call bio_uncopy_user() on io completion.
575 */ 625 */
576struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, 626struct bio *bio_copy_user_iov(struct request_queue *q,
577 int iov_count, int write_to_vm) 627 struct rq_map_data *map_data,
628 struct sg_iovec *iov, int iov_count,
629 int write_to_vm, gfp_t gfp_mask)
578{ 630{
579 struct bio_map_data *bmd; 631 struct bio_map_data *bmd;
580 struct bio_vec *bvec; 632 struct bio_vec *bvec;
@@ -597,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
597 len += iov[i].iov_len; 649 len += iov[i].iov_len;
598 } 650 }
599 651
600 bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL); 652 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
601 if (!bmd) 653 if (!bmd)
602 return ERR_PTR(-ENOMEM); 654 return ERR_PTR(-ENOMEM);
603 655
604 ret = -ENOMEM; 656 ret = -ENOMEM;
605 bio = bio_alloc(GFP_KERNEL, nr_pages); 657 bio = bio_alloc(gfp_mask, nr_pages);
606 if (!bio) 658 if (!bio)
607 goto out_bmd; 659 goto out_bmd;
608 660
609 bio->bi_rw |= (!write_to_vm << BIO_RW); 661 bio->bi_rw |= (!write_to_vm << BIO_RW);
610 662
611 ret = 0; 663 ret = 0;
664 i = 0;
612 while (len) { 665 while (len) {
613 unsigned int bytes = PAGE_SIZE; 666 unsigned int bytes;
667
668 if (map_data)
669 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
670 else
671 bytes = PAGE_SIZE;
614 672
615 if (bytes > len) 673 if (bytes > len)
616 bytes = len; 674 bytes = len;
617 675
618 page = alloc_page(q->bounce_gfp | GFP_KERNEL); 676 if (map_data) {
677 if (i == map_data->nr_entries) {
678 ret = -ENOMEM;
679 break;
680 }
681 page = map_data->pages[i++];
682 } else
683 page = alloc_page(q->bounce_gfp | gfp_mask);
619 if (!page) { 684 if (!page) {
620 ret = -ENOMEM; 685 ret = -ENOMEM;
621 break; 686 break;
@@ -634,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
634 * success 699 * success
635 */ 700 */
636 if (!write_to_vm) { 701 if (!write_to_vm) {
637 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0); 702 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
638 if (ret) 703 if (ret)
639 goto cleanup; 704 goto cleanup;
640 } 705 }
641 706
642 bio_set_map_data(bmd, bio, iov, iov_count); 707 bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
643 return bio; 708 return bio;
644cleanup: 709cleanup:
645 bio_for_each_segment(bvec, bio, i) 710 if (!map_data)
646 __free_page(bvec->bv_page); 711 bio_for_each_segment(bvec, bio, i)
712 __free_page(bvec->bv_page);
647 713
648 bio_put(bio); 714 bio_put(bio);
649out_bmd: 715out_bmd:
@@ -654,29 +720,32 @@ out_bmd:
654/** 720/**
655 * bio_copy_user - copy user data to bio 721 * bio_copy_user - copy user data to bio
656 * @q: destination block queue 722 * @q: destination block queue
723 * @map_data: pointer to the rq_map_data holding pages (if necessary)
657 * @uaddr: start of user address 724 * @uaddr: start of user address
658 * @len: length in bytes 725 * @len: length in bytes
659 * @write_to_vm: bool indicating writing to pages or not 726 * @write_to_vm: bool indicating writing to pages or not
727 * @gfp_mask: memory allocation flags
660 * 728 *
661 * Prepares and returns a bio for indirect user io, bouncing data 729 * Prepares and returns a bio for indirect user io, bouncing data
662 * to/from kernel pages as necessary. Must be paired with 730 * to/from kernel pages as necessary. Must be paired with
663 * call bio_uncopy_user() on io completion. 731 * call bio_uncopy_user() on io completion.
664 */ 732 */
665struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, 733struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
666 unsigned int len, int write_to_vm) 734 unsigned long uaddr, unsigned int len,
735 int write_to_vm, gfp_t gfp_mask)
667{ 736{
668 struct sg_iovec iov; 737 struct sg_iovec iov;
669 738
670 iov.iov_base = (void __user *)uaddr; 739 iov.iov_base = (void __user *)uaddr;
671 iov.iov_len = len; 740 iov.iov_len = len;
672 741
673 return bio_copy_user_iov(q, &iov, 1, write_to_vm); 742 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
674} 743}
675 744
676static struct bio *__bio_map_user_iov(struct request_queue *q, 745static struct bio *__bio_map_user_iov(struct request_queue *q,
677 struct block_device *bdev, 746 struct block_device *bdev,
678 struct sg_iovec *iov, int iov_count, 747 struct sg_iovec *iov, int iov_count,
679 int write_to_vm) 748 int write_to_vm, gfp_t gfp_mask)
680{ 749{
681 int i, j; 750 int i, j;
682 int nr_pages = 0; 751 int nr_pages = 0;
@@ -702,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
702 if (!nr_pages) 771 if (!nr_pages)
703 return ERR_PTR(-EINVAL); 772 return ERR_PTR(-EINVAL);
704 773
705 bio = bio_alloc(GFP_KERNEL, nr_pages); 774 bio = bio_alloc(gfp_mask, nr_pages);
706 if (!bio) 775 if (!bio)
707 return ERR_PTR(-ENOMEM); 776 return ERR_PTR(-ENOMEM);
708 777
709 ret = -ENOMEM; 778 ret = -ENOMEM;
710 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 779 pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
711 if (!pages) 780 if (!pages)
712 goto out; 781 goto out;
713 782
@@ -786,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
786 * @uaddr: start of user address 855 * @uaddr: start of user address
787 * @len: length in bytes 856 * @len: length in bytes
788 * @write_to_vm: bool indicating writing to pages or not 857 * @write_to_vm: bool indicating writing to pages or not
858 * @gfp_mask: memory allocation flags
789 * 859 *
790 * Map the user space address into a bio suitable for io to a block 860 * Map the user space address into a bio suitable for io to a block
791 * device. Returns an error pointer in case of error. 861 * device. Returns an error pointer in case of error.
792 */ 862 */
793struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, 863struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
794 unsigned long uaddr, unsigned int len, int write_to_vm) 864 unsigned long uaddr, unsigned int len, int write_to_vm,
865 gfp_t gfp_mask)
795{ 866{
796 struct sg_iovec iov; 867 struct sg_iovec iov;
797 868
798 iov.iov_base = (void __user *)uaddr; 869 iov.iov_base = (void __user *)uaddr;
799 iov.iov_len = len; 870 iov.iov_len = len;
800 871
801 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm); 872 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
802} 873}
803 874
804/** 875/**
@@ -808,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
808 * @iov: the iovec. 879 * @iov: the iovec.
809 * @iov_count: number of elements in the iovec 880 * @iov_count: number of elements in the iovec
810 * @write_to_vm: bool indicating writing to pages or not 881 * @write_to_vm: bool indicating writing to pages or not
882 * @gfp_mask: memory allocation flags
811 * 883 *
812 * Map the user space address into a bio suitable for io to a block 884 * Map the user space address into a bio suitable for io to a block
813 * device. Returns an error pointer in case of error. 885 * device. Returns an error pointer in case of error.
814 */ 886 */
815struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 887struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
816 struct sg_iovec *iov, int iov_count, 888 struct sg_iovec *iov, int iov_count,
817 int write_to_vm) 889 int write_to_vm, gfp_t gfp_mask)
818{ 890{
819 struct bio *bio; 891 struct bio *bio;
820 892
821 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); 893 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
822 894 gfp_mask);
823 if (IS_ERR(bio)) 895 if (IS_ERR(bio))
824 return bio; 896 return bio;
825 897
@@ -976,48 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
976struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, 1048struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
977 gfp_t gfp_mask, int reading) 1049 gfp_t gfp_mask, int reading)
978{ 1050{
979 unsigned long kaddr = (unsigned long)data;
980 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
981 unsigned long start = kaddr >> PAGE_SHIFT;
982 const int nr_pages = end - start;
983 struct bio *bio; 1051 struct bio *bio;
984 struct bio_vec *bvec; 1052 struct bio_vec *bvec;
985 struct bio_map_data *bmd; 1053 int i;
986 int i, ret;
987 struct sg_iovec iov;
988
989 iov.iov_base = data;
990 iov.iov_len = len;
991
992 bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask);
993 if (!bmd)
994 return ERR_PTR(-ENOMEM);
995
996 ret = -ENOMEM;
997 bio = bio_alloc(gfp_mask, nr_pages);
998 if (!bio)
999 goto out_bmd;
1000
1001 while (len) {
1002 struct page *page;
1003 unsigned int bytes = PAGE_SIZE;
1004
1005 if (bytes > len)
1006 bytes = len;
1007
1008 page = alloc_page(q->bounce_gfp | gfp_mask);
1009 if (!page) {
1010 ret = -ENOMEM;
1011 goto cleanup;
1012 }
1013
1014 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
1015 ret = -EINVAL;
1016 goto cleanup;
1017 }
1018 1054
1019 len -= bytes; 1055 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1020 } 1056 if (IS_ERR(bio))
1057 return bio;
1021 1058
1022 if (!reading) { 1059 if (!reading) {
1023 void *p = data; 1060 void *p = data;
@@ -1030,20 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1030 } 1067 }
1031 } 1068 }
1032 1069
1033 bio->bi_private = bmd;
1034 bio->bi_end_io = bio_copy_kern_endio; 1070 bio->bi_end_io = bio_copy_kern_endio;
1035 1071
1036 bio_set_map_data(bmd, bio, &iov, 1);
1037 return bio; 1072 return bio;
1038cleanup:
1039 bio_for_each_segment(bvec, bio, i)
1040 __free_page(bvec->bv_page);
1041
1042 bio_put(bio);
1043out_bmd:
1044 bio_free_map_data(bmd);
1045
1046 return ERR_PTR(ret);
1047} 1073}
1048 1074
1049/* 1075/*
@@ -1230,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
1230 * split a bio - only worry about a bio with a single page 1256 * split a bio - only worry about a bio with a single page
1231 * in it's iovec 1257 * in it's iovec
1232 */ 1258 */
1233struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) 1259struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1234{ 1260{
1235 struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); 1261 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
1236 1262
1237 if (!bp) 1263 if (!bp)
1238 return bp; 1264 return bp;
@@ -1266,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1266 bp->bio2.bi_end_io = bio_pair_end_2; 1292 bp->bio2.bi_end_io = bio_pair_end_2;
1267 1293
1268 bp->bio1.bi_private = bi; 1294 bp->bio1.bi_private = bi;
1269 bp->bio2.bi_private = pool; 1295 bp->bio2.bi_private = bio_split_pool;
1270 1296
1271 if (bio_integrity(bi)) 1297 if (bio_integrity(bi))
1272 bio_integrity_split(bi, bp, first_sectors); 1298 bio_integrity_split(bi, bp, first_sectors);
@@ -1274,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1274 return bp; 1300 return bp;
1275} 1301}
1276 1302
1303/**
1304 * bio_sector_offset - Find hardware sector offset in bio
1305 * @bio: bio to inspect
1306 * @index: bio_vec index
1307 * @offset: offset in bv_page
1308 *
1309 * Return the number of hardware sectors between beginning of bio
1310 * and an end point indicated by a bio_vec index and an offset
1311 * within that vector's page.
1312 */
1313sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1314 unsigned int offset)
1315{
1316 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
1317 struct bio_vec *bv;
1318 sector_t sectors;
1319 int i;
1320
1321 sectors = 0;
1322
1323 if (index >= bio->bi_idx)
1324 index = bio->bi_vcnt - 1;
1325
1326 __bio_for_each_segment(bv, bio, i, 0) {
1327 if (i == index) {
1328 if (offset > bv->bv_offset)
1329 sectors += (offset - bv->bv_offset) / sector_sz;
1330 break;
1331 }
1332
1333 sectors += bv->bv_len / sector_sz;
1334 }
1335
1336 return sectors;
1337}
1338EXPORT_SYMBOL(bio_sector_offset);
1277 1339
1278/* 1340/*
1279 * create memory pools for biovec's in a bio_set. 1341 * create memory pools for biovec's in a bio_set.
@@ -1376,6 +1438,7 @@ static int __init init_bio(void)
1376subsys_initcall(init_bio); 1438subsys_initcall(init_bio);
1377 1439
1378EXPORT_SYMBOL(bio_alloc); 1440EXPORT_SYMBOL(bio_alloc);
1441EXPORT_SYMBOL(bio_kmalloc);
1379EXPORT_SYMBOL(bio_put); 1442EXPORT_SYMBOL(bio_put);
1380EXPORT_SYMBOL(bio_free); 1443EXPORT_SYMBOL(bio_free);
1381EXPORT_SYMBOL(bio_endio); 1444EXPORT_SYMBOL(bio_endio);
@@ -1383,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
1383EXPORT_SYMBOL(__bio_clone); 1446EXPORT_SYMBOL(__bio_clone);
1384EXPORT_SYMBOL(bio_clone); 1447EXPORT_SYMBOL(bio_clone);
1385EXPORT_SYMBOL(bio_phys_segments); 1448EXPORT_SYMBOL(bio_phys_segments);
1386EXPORT_SYMBOL(bio_hw_segments);
1387EXPORT_SYMBOL(bio_add_page); 1449EXPORT_SYMBOL(bio_add_page);
1388EXPORT_SYMBOL(bio_add_pc_page); 1450EXPORT_SYMBOL(bio_add_pc_page);
1389EXPORT_SYMBOL(bio_get_nr_vecs); 1451EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1393,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
1393EXPORT_SYMBOL(bio_copy_kern); 1455EXPORT_SYMBOL(bio_copy_kern);
1394EXPORT_SYMBOL(bio_pair_release); 1456EXPORT_SYMBOL(bio_pair_release);
1395EXPORT_SYMBOL(bio_split); 1457EXPORT_SYMBOL(bio_split);
1396EXPORT_SYMBOL(bio_split_pool);
1397EXPORT_SYMBOL(bio_copy_user); 1458EXPORT_SYMBOL(bio_copy_user);
1398EXPORT_SYMBOL(bio_uncopy_user); 1459EXPORT_SYMBOL(bio_uncopy_user);
1399EXPORT_SYMBOL(bioset_create); 1460EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..d84f0469a016 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
541 */ 541 */
542 542
543static struct kobject *bdev_get_kobj(struct block_device *bdev)
544{
545 if (bdev->bd_contains != bdev)
546 return kobject_get(&bdev->bd_part->dev.kobj);
547 else
548 return kobject_get(&bdev->bd_disk->dev.kobj);
549}
550
551static struct kobject *bdev_get_holder(struct block_device *bdev)
552{
553 if (bdev->bd_contains != bdev)
554 return kobject_get(bdev->bd_part->holder_dir);
555 else
556 return kobject_get(bdev->bd_disk->holder_dir);
557}
558
559static int add_symlink(struct kobject *from, struct kobject *to) 543static int add_symlink(struct kobject *from, struct kobject *to)
560{ 544{
561 if (!from || !to) 545 if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
604 if (!bo->hdev) 588 if (!bo->hdev)
605 goto fail_put_sdir; 589 goto fail_put_sdir;
606 590
607 bo->sdev = bdev_get_kobj(bdev); 591 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
608 if (!bo->sdev) 592 if (!bo->sdev)
609 goto fail_put_hdev; 593 goto fail_put_hdev;
610 594
611 bo->hdir = bdev_get_holder(bdev); 595 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
612 if (!bo->hdir) 596 if (!bo->hdir)
613 goto fail_put_sdev; 597 goto fail_put_sdev;
614 598
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
868 852
869EXPORT_SYMBOL(open_by_devnum); 853EXPORT_SYMBOL(open_by_devnum);
870 854
855/**
856 * flush_disk - invalidates all buffer-cache entries on a disk
857 *
858 * @bdev: struct block device to be flushed
859 *
860 * Invalidates all buffer-cache entries on a disk. It should be called
861 * when a disk has been changed -- either by a media change or online
862 * resize.
863 */
864static void flush_disk(struct block_device *bdev)
865{
866 if (__invalidate_device(bdev)) {
867 char name[BDEVNAME_SIZE] = "";
868
869 if (bdev->bd_disk)
870 disk_name(bdev->bd_disk, 0, name);
871 printk(KERN_WARNING "VFS: busy inodes on changed media or "
872 "resized disk %s\n", name);
873 }
874
875 if (!bdev->bd_disk)
876 return;
877 if (disk_partitionable(bdev->bd_disk))
878 bdev->bd_invalidated = 1;
879}
880
881/**
882 * check_disk_size_change - checks for disk size change and adjusts bdev size.
883 * @disk: struct gendisk to check
884 * @bdev: struct bdev to adjust.
885 *
886 * This routine checks to see if the bdev size does not match the disk size
887 * and adjusts it if it differs.
888 */
889void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
890{
891 loff_t disk_size, bdev_size;
892
893 disk_size = (loff_t)get_capacity(disk) << 9;
894 bdev_size = i_size_read(bdev->bd_inode);
895 if (disk_size != bdev_size) {
896 char name[BDEVNAME_SIZE];
897
898 disk_name(disk, 0, name);
899 printk(KERN_INFO
900 "%s: detected capacity change from %lld to %lld\n",
901 name, bdev_size, disk_size);
902 i_size_write(bdev->bd_inode, disk_size);
903 flush_disk(bdev);
904 }
905}
906EXPORT_SYMBOL(check_disk_size_change);
907
908/**
909 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
910 * @disk: struct gendisk to be revalidated
911 *
912 * This routine is a wrapper for lower-level driver's revalidate_disk
913 * call-backs. It is used to do common pre and post operations needed
914 * for all revalidate_disk operations.
915 */
916int revalidate_disk(struct gendisk *disk)
917{
918 struct block_device *bdev;
919 int ret = 0;
920
921 if (disk->fops->revalidate_disk)
922 ret = disk->fops->revalidate_disk(disk);
923
924 bdev = bdget_disk(disk, 0);
925 if (!bdev)
926 return ret;
927
928 mutex_lock(&bdev->bd_mutex);
929 check_disk_size_change(disk, bdev);
930 mutex_unlock(&bdev->bd_mutex);
931 bdput(bdev);
932 return ret;
933}
934EXPORT_SYMBOL(revalidate_disk);
935
871/* 936/*
872 * This routine checks whether a removable media has been changed, 937 * This routine checks whether a removable media has been changed,
873 * and invalidates all buffer-cache-entries in that case. This 938 * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
887 if (!bdops->media_changed(bdev->bd_disk)) 952 if (!bdops->media_changed(bdev->bd_disk))
888 return 0; 953 return 0;
889 954
890 if (__invalidate_device(bdev)) 955 flush_disk(bdev);
891 printk("VFS: busy inodes on changed media.\n");
892
893 if (bdops->revalidate_disk) 956 if (bdops->revalidate_disk)
894 bdops->revalidate_disk(bdev->bd_disk); 957 bdops->revalidate_disk(bdev->bd_disk);
895 if (bdev->bd_disk->minors > 1)
896 bdev->bd_invalidated = 1;
897 return 1; 958 return 1;
898} 959}
899 960
@@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
927 988
928static int do_open(struct block_device *bdev, struct file *file, int for_part) 989static int do_open(struct block_device *bdev, struct file *file, int for_part)
929{ 990{
930 struct module *owner = NULL;
931 struct gendisk *disk; 991 struct gendisk *disk;
992 struct hd_struct *part = NULL;
932 int ret; 993 int ret;
933 int part; 994 int partno;
934 int perm = 0; 995 int perm = 0;
935 996
936 if (file->f_mode & FMODE_READ) 997 if (file->f_mode & FMODE_READ)
@@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
948 1009
949 ret = -ENXIO; 1010 ret = -ENXIO;
950 file->f_mapping = bdev->bd_inode->i_mapping; 1011 file->f_mapping = bdev->bd_inode->i_mapping;
1012
951 lock_kernel(); 1013 lock_kernel();
952 disk = get_gendisk(bdev->bd_dev, &part); 1014
953 if (!disk) { 1015 disk = get_gendisk(bdev->bd_dev, &partno);
954 unlock_kernel(); 1016 if (!disk)
955 bdput(bdev); 1017 goto out_unlock_kernel;
956 return ret; 1018 part = disk_get_part(disk, partno);
957 } 1019 if (!part)
958 owner = disk->fops->owner; 1020 goto out_unlock_kernel;
959 1021
960 mutex_lock_nested(&bdev->bd_mutex, for_part); 1022 mutex_lock_nested(&bdev->bd_mutex, for_part);
961 if (!bdev->bd_openers) { 1023 if (!bdev->bd_openers) {
962 bdev->bd_disk = disk; 1024 bdev->bd_disk = disk;
1025 bdev->bd_part = part;
963 bdev->bd_contains = bdev; 1026 bdev->bd_contains = bdev;
964 if (!part) { 1027 if (!partno) {
965 struct backing_dev_info *bdi; 1028 struct backing_dev_info *bdi;
966 if (disk->fops->open) { 1029 if (disk->fops->open) {
967 ret = disk->fops->open(bdev->bd_inode, file); 1030 ret = disk->fops->open(bdev->bd_inode, file);
968 if (ret) 1031 if (ret)
969 goto out_first; 1032 goto out_clear;
970 } 1033 }
971 if (!bdev->bd_openers) { 1034 if (!bdev->bd_openers) {
972 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1035 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
978 if (bdev->bd_invalidated) 1041 if (bdev->bd_invalidated)
979 rescan_partitions(disk, bdev); 1042 rescan_partitions(disk, bdev);
980 } else { 1043 } else {
981 struct hd_struct *p;
982 struct block_device *whole; 1044 struct block_device *whole;
983 whole = bdget_disk(disk, 0); 1045 whole = bdget_disk(disk, 0);
984 ret = -ENOMEM; 1046 ret = -ENOMEM;
985 if (!whole) 1047 if (!whole)
986 goto out_first; 1048 goto out_clear;
987 BUG_ON(for_part); 1049 BUG_ON(for_part);
988 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); 1050 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
989 if (ret) 1051 if (ret)
990 goto out_first; 1052 goto out_clear;
991 bdev->bd_contains = whole; 1053 bdev->bd_contains = whole;
992 p = disk->part[part - 1];
993 bdev->bd_inode->i_data.backing_dev_info = 1054 bdev->bd_inode->i_data.backing_dev_info =
994 whole->bd_inode->i_data.backing_dev_info; 1055 whole->bd_inode->i_data.backing_dev_info;
995 if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { 1056 if (!(disk->flags & GENHD_FL_UP) ||
1057 !part || !part->nr_sects) {
996 ret = -ENXIO; 1058 ret = -ENXIO;
997 goto out_first; 1059 goto out_clear;
998 } 1060 }
999 kobject_get(&p->dev.kobj); 1061 bd_set_size(bdev, (loff_t)part->nr_sects << 9);
1000 bdev->bd_part = p;
1001 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1002 } 1062 }
1003 } else { 1063 } else {
1064 disk_put_part(part);
1004 put_disk(disk); 1065 put_disk(disk);
1005 module_put(owner); 1066 module_put(disk->fops->owner);
1067 part = NULL;
1068 disk = NULL;
1006 if (bdev->bd_contains == bdev) { 1069 if (bdev->bd_contains == bdev) {
1007 if (bdev->bd_disk->fops->open) { 1070 if (bdev->bd_disk->fops->open) {
1008 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); 1071 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
1009 if (ret) 1072 if (ret)
1010 goto out; 1073 goto out_unlock_bdev;
1011 } 1074 }
1012 if (bdev->bd_invalidated) 1075 if (bdev->bd_invalidated)
1013 rescan_partitions(bdev->bd_disk, bdev); 1076 rescan_partitions(bdev->bd_disk, bdev);
@@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1020 unlock_kernel(); 1083 unlock_kernel();
1021 return 0; 1084 return 0;
1022 1085
1023out_first: 1086 out_clear:
1024 bdev->bd_disk = NULL; 1087 bdev->bd_disk = NULL;
1088 bdev->bd_part = NULL;
1025 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1089 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1026 if (bdev != bdev->bd_contains) 1090 if (bdev != bdev->bd_contains)
1027 __blkdev_put(bdev->bd_contains, 1); 1091 __blkdev_put(bdev->bd_contains, 1);
1028 bdev->bd_contains = NULL; 1092 bdev->bd_contains = NULL;
1029 put_disk(disk); 1093 out_unlock_bdev:
1030 module_put(owner);
1031out:
1032 mutex_unlock(&bdev->bd_mutex); 1094 mutex_unlock(&bdev->bd_mutex);
1095 out_unlock_kernel:
1033 unlock_kernel(); 1096 unlock_kernel();
1034 if (ret) 1097
1035 bdput(bdev); 1098 disk_put_part(part);
1099 if (disk)
1100 module_put(disk->fops->owner);
1101 put_disk(disk);
1102 bdput(bdev);
1103
1036 return ret; 1104 return ret;
1037} 1105}
1038 1106
@@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1117 1185
1118 put_disk(disk); 1186 put_disk(disk);
1119 module_put(owner); 1187 module_put(owner);
1120 1188 disk_put_part(bdev->bd_part);
1121 if (bdev->bd_contains != bdev) { 1189 bdev->bd_part = NULL;
1122 kobject_put(&bdev->bd_part->dev.kobj);
1123 bdev->bd_part = NULL;
1124 }
1125 bdev->bd_disk = NULL; 1190 bdev->bd_disk = NULL;
1126 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1191 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1127 if (bdev != bdev->bd_contains) 1192 if (bdev != bdev->bd_contains)
@@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1197 1262
1198/** 1263/**
1199 * lookup_bdev - lookup a struct block_device by name 1264 * lookup_bdev - lookup a struct block_device by name
1265 * @pathname: special file representing the block device
1200 * 1266 *
1201 * @path: special file representing the block device 1267 * Get a reference to the blockdevice at @pathname in the current
1202 *
1203 * Get a reference to the blockdevice at @path in the current
1204 * namespace if possible and return it. Return ERR_PTR(error) 1268 * namespace if possible and return it. Return ERR_PTR(error)
1205 * otherwise. 1269 * otherwise.
1206 */ 1270 */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 117ef4bba68e..fcee9298b620 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
66 .describe = user_describe, 66 .describe = user_describe,
67}; 67};
68 68
69#define MAX_VER_STR_LEN 8 /* length of longest version string e.g. 69/* length of longest version string e.g. strlen("ver=0xFF") */
70 strlen("ver=0xFF") */ 70#define MAX_VER_STR_LEN 8
71#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg 71
72 in future could have strlen(";sec=ntlmsspi") */ 72/* length of longest security mechanism name, eg in future could have
73#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ 73 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13
75
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
77#define MAX_IPV6_ADDR_LEN 42
78
79/* strlen of "host=" */
80#define HOST_KEY_LEN 5
81
82/* strlen of ";ip4=" or ";ip6=" */
83#define IP_KEY_LEN 5
84
85/* strlen of ";uid=0x" */
86#define UID_KEY_LEN 7
87
88/* strlen of ";user=" */
89#define USER_KEY_LEN 6
90
74/* get a key struct with a SPNEGO security blob, suitable for session setup */ 91/* get a key struct with a SPNEGO security blob, suitable for session setup */
75struct key * 92struct key *
76cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 93cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
84 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress 101 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress
85 host=hostname sec=mechanism uid=0xFF user=username */ 102 host=hostname sec=mechanism uid=0xFF user=username */
86 desc_len = MAX_VER_STR_LEN + 103 desc_len = MAX_VER_STR_LEN +
87 6 /* len of "host=" */ + strlen(hostname) + 104 HOST_KEY_LEN + strlen(hostname) +
88 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + 105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
89 MAX_MECH_STR_LEN + 106 MAX_MECH_STR_LEN +
90 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + 107 UID_KEY_LEN + (sizeof(uid_t) * 2) +
91 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; 108 USER_KEY_LEN + strlen(sesInfo->userName) + 1;
92 109
93 spnego_key = ERR_PTR(-ENOMEM); 110 spnego_key = ERR_PTR(-ENOMEM);
94 description = kzalloc(desc_len, GFP_KERNEL); 111 description = kzalloc(desc_len, GFP_KERNEL);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 135c965c4137..f7b4a5cd837b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 41 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 42extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
43 struct nameidata *); 43 struct nameidata *);
44extern int cifs_unlink(struct inode *, struct dentry *); 44extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); 45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); 46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
47extern int cifs_mkdir(struct inode *, struct dentry *, int); 47extern int cifs_mkdir(struct inode *, struct dentry *, int);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8dfd6f24d488..0d22479d99b7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -309,6 +309,7 @@ struct cifs_search_info {
309 __u32 resume_key; 309 __u32 resume_key;
310 char *ntwrk_buf_start; 310 char *ntwrk_buf_start;
311 char *srch_entries_start; 311 char *srch_entries_start;
312 char *last_entry;
312 char *presume_name; 313 char *presume_name;
313 unsigned int resume_name_len; 314 unsigned int resume_name_len;
314 bool endOfSearch:1; 315 bool endOfSearch:1;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a729d083e6f4..0cff7fe986e8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
180 const FILE_BASIC_INFO *data, __u16 fid, 180 const FILE_BASIC_INFO *data, __u16 fid,
181 __u32 pid_of_opener); 181 __u32 pid_of_opener);
182extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
183 bool delete_file, __u16 fid, __u32 pid_of_opener);
182#if 0 184#if 0
183extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, 185extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
184 char *fileName, __u16 dos_attributes, 186 char *fileName, __u16 dos_attributes,
@@ -229,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
229 const struct nls_table *nls_codepage, 231 const struct nls_table *nls_codepage,
230 int remap_special_chars); 232 int remap_special_chars);
231extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 233extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
232 int netfid, char *target_name, 234 int netfid, const char *target_name,
233 const struct nls_table *nls_codepage, 235 const struct nls_table *nls_codepage,
234 int remap_special_chars); 236 int remap_special_chars);
235extern int CIFSCreateHardLink(const int xid, 237extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 994de7c90474..6f4ffe15d68d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2017,7 +2017,7 @@ renameRetry:
2017} 2017}
2018 2018
2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2020 int netfid, char *target_name, 2020 int netfid, const char *target_name,
2021 const struct nls_table *nls_codepage, int remap) 2021 const struct nls_table *nls_codepage, int remap)
2022{ 2022{
2023 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2023 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2071 remap); 2071 remap);
2072 } 2072 }
2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str); 2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; 2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
2075 byte_count += count; 2075 byte_count += count;
2076 pSMB->DataCount = cpu_to_le16(count); 2076 pSMB->DataCount = cpu_to_le16(count);
2077 pSMB->TotalDataCount = pSMB->DataCount; 2077 pSMB->TotalDataCount = pSMB->DataCount;
@@ -3614,6 +3614,8 @@ findFirstRetry:
3614 /* BB remember to free buffer if error BB */ 3614 /* BB remember to free buffer if error BB */
3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3616 if (rc == 0) { 3616 if (rc == 0) {
3617 unsigned int lnoff;
3618
3617 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3619 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3618 psrch_inf->unicode = true; 3620 psrch_inf->unicode = true;
3619 else 3621 else
@@ -3636,6 +3638,17 @@ findFirstRetry:
3636 le16_to_cpu(parms->SearchCount); 3638 le16_to_cpu(parms->SearchCount);
3637 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 3639 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
3638 psrch_inf->entries_in_buffer; 3640 psrch_inf->entries_in_buffer;
3641 lnoff = le16_to_cpu(parms->LastNameOffset);
3642 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3643 lnoff) {
3644 cERROR(1, ("ignoring corrupt resume name"));
3645 psrch_inf->last_entry = NULL;
3646 return rc;
3647 }
3648
3649 psrch_inf->last_entry = psrch_inf->srch_entries_start +
3650 lnoff;
3651
3639 *pnetfid = parms->SearchHandle; 3652 *pnetfid = parms->SearchHandle;
3640 } else { 3653 } else {
3641 cifs_buf_release(pSMB); 3654 cifs_buf_release(pSMB);
@@ -3725,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3725 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3738 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3726 3739
3727 if (rc == 0) { 3740 if (rc == 0) {
3741 unsigned int lnoff;
3742
3728 /* BB fixme add lock for file (srch_info) struct here */ 3743 /* BB fixme add lock for file (srch_info) struct here */
3729 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3744 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3730 psrch_inf->unicode = true; 3745 psrch_inf->unicode = true;
@@ -3751,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3751 le16_to_cpu(parms->SearchCount); 3766 le16_to_cpu(parms->SearchCount);
3752 psrch_inf->index_of_last_entry += 3767 psrch_inf->index_of_last_entry +=
3753 psrch_inf->entries_in_buffer; 3768 psrch_inf->entries_in_buffer;
3769 lnoff = le16_to_cpu(parms->LastNameOffset);
3770 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3771 lnoff) {
3772 cERROR(1, ("ignoring corrupt resume name"));
3773 psrch_inf->last_entry = NULL;
3774 return rc;
3775 } else
3776 psrch_inf->last_entry =
3777 psrch_inf->srch_entries_start + lnoff;
3778
3754/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3779/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
3755 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3780 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
3756 3781
@@ -4876,6 +4901,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4876 return rc; 4901 return rc;
4877} 4902}
4878 4903
4904int
4905CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4906 bool delete_file, __u16 fid, __u32 pid_of_opener)
4907{
4908 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4909 char *data_offset;
4910 int rc = 0;
4911 __u16 params, param_offset, offset, byte_count, count;
4912
4913 cFYI(1, ("Set File Disposition (via SetFileInfo)"));
4914 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4915
4916 if (rc)
4917 return rc;
4918
4919 pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
4920 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
4921
4922 params = 6;
4923 pSMB->MaxSetupCount = 0;
4924 pSMB->Reserved = 0;
4925 pSMB->Flags = 0;
4926 pSMB->Timeout = 0;
4927 pSMB->Reserved2 = 0;
4928 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4929 offset = param_offset + params;
4930
4931 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4932
4933 count = 1;
4934 pSMB->MaxParameterCount = cpu_to_le16(2);
4935 /* BB find max SMB PDU from sess */
4936 pSMB->MaxDataCount = cpu_to_le16(1000);
4937 pSMB->SetupCount = 1;
4938 pSMB->Reserved3 = 0;
4939 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
4940 byte_count = 3 /* pad */ + params + count;
4941 pSMB->DataCount = cpu_to_le16(count);
4942 pSMB->ParameterCount = cpu_to_le16(params);
4943 pSMB->TotalDataCount = pSMB->DataCount;
4944 pSMB->TotalParameterCount = pSMB->ParameterCount;
4945 pSMB->ParameterOffset = cpu_to_le16(param_offset);
4946 pSMB->DataOffset = cpu_to_le16(offset);
4947 pSMB->Fid = fid;
4948 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
4949 pSMB->Reserved4 = 0;
4950 pSMB->hdr.smb_buf_length += byte_count;
4951 pSMB->ByteCount = cpu_to_le16(byte_count);
4952 *data_offset = delete_file ? 1 : 0;
4953 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4954 if (rc)
4955 cFYI(1, ("Send error in SetFileDisposition = %d", rc));
4956
4957 return rc;
4958}
4879 4959
4880int 4960int
4881CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 4961CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index a2e0673e1b08..1e0c1bd8f2e4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,19 +29,55 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32static int dns_resolver_instantiate(struct key *key, const void *data, 32/* Checks if supplied name is IP address
33 * returns:
34 * 1 - name is IP
35 * 0 - name is not IP
36 */
37static int
38is_ip(const char *name)
39{
40 int rc;
41 struct sockaddr_in sin_server;
42 struct sockaddr_in6 sin_server6;
43
44 rc = cifs_inet_pton(AF_INET, name,
45 &sin_server.sin_addr.s_addr);
46
47 if (rc <= 0) {
48 /* not ipv4 address, try ipv6 */
49 rc = cifs_inet_pton(AF_INET6, name,
50 &sin_server6.sin6_addr.in6_u);
51 if (rc > 0)
52 return 1;
53 } else {
54 return 1;
55 }
56 /* we failed translating address */
57 return 0;
58}
59
60static int
61dns_resolver_instantiate(struct key *key, const void *data,
33 size_t datalen) 62 size_t datalen)
34{ 63{
35 int rc = 0; 64 int rc = 0;
36 char *ip; 65 char *ip;
37 66
38 ip = kmalloc(datalen+1, GFP_KERNEL); 67 ip = kmalloc(datalen + 1, GFP_KERNEL);
39 if (!ip) 68 if (!ip)
40 return -ENOMEM; 69 return -ENOMEM;
41 70
42 memcpy(ip, data, datalen); 71 memcpy(ip, data, datalen);
43 ip[datalen] = '\0'; 72 ip[datalen] = '\0';
44 73
74 /* make sure this looks like an address */
75 if (!is_ip((const char *) ip)) {
76 kfree(ip);
77 return -EINVAL;
78 }
79
80 key->type_data.x[0] = datalen;
45 rcu_assign_pointer(key->payload.data, ip); 81 rcu_assign_pointer(key->payload.data, ip);
46 82
47 return rc; 83 return rc;
@@ -62,33 +98,6 @@ struct key_type key_type_dns_resolver = {
62 .match = user_match, 98 .match = user_match,
63}; 99};
64 100
65/* Checks if supplied name is IP address
66 * returns:
67 * 1 - name is IP
68 * 0 - name is not IP
69 */
70static int is_ip(const char *name)
71{
72 int rc;
73 struct sockaddr_in sin_server;
74 struct sockaddr_in6 sin_server6;
75
76 rc = cifs_inet_pton(AF_INET, name,
77 &sin_server.sin_addr.s_addr);
78
79 if (rc <= 0) {
80 /* not ipv4 address, try ipv6 */
81 rc = cifs_inet_pton(AF_INET6, name,
82 &sin_server6.sin6_addr.in6_u);
83 if (rc > 0)
84 return 1;
85 } else {
86 return 1;
87 }
88 /* we failed translating address */
89 return 0;
90}
91
92/* Resolves server name to ip address. 101/* Resolves server name to ip address.
93 * input: 102 * input:
94 * unc - server UNC 103 * unc - server UNC
@@ -140,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
140 149
141 rkey = request_key(&key_type_dns_resolver, name, ""); 150 rkey = request_key(&key_type_dns_resolver, name, "");
142 if (!IS_ERR(rkey)) { 151 if (!IS_ERR(rkey)) {
152 len = rkey->type_data.x[0];
143 data = rkey->payload.data; 153 data = rkey->payload.data;
144 } else { 154 } else {
145 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 155 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -148,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
148 158
149skip_upcall: 159skip_upcall:
150 if (data) { 160 if (data) {
151 len = strlen(data); 161 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
152 *ip_addr = kmalloc(len+1, GFP_KERNEL);
153 if (*ip_addr) { 162 if (*ip_addr) {
154 memcpy(*ip_addr, data, len); 163 memcpy(*ip_addr, data, len + 1);
155 (*ip_addr)[len] = '\0';
156 if (!IS_ERR(rkey)) 164 if (!IS_ERR(rkey))
157 cFYI(1, ("%s: resolved: %s to %s", __func__, 165 cFYI(1, ("%s: resolved: %s to %s", __func__,
158 name, 166 name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cbefe1f1f9fe..c4a8a0605125 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
107 107
108 /* want handles we can use to read with first 108 /* want handles we can use to read with first
109 in the list so we do not have to walk the 109 in the list so we do not have to walk the
110 list to search for one in prepare_write */ 110 list to search for one in write_begin */
111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) { 111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
112 list_add_tail(&pCifsFile->flist, 112 list_add_tail(&pCifsFile->flist,
113 &pCifsInode->openFileList); 113 &pCifsInode->openFileList);
@@ -915,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
915} 915}
916 916
917static ssize_t cifs_write(struct file *file, const char *write_data, 917static ssize_t cifs_write(struct file *file, const char *write_data,
918 size_t write_size, loff_t *poffset) 918 size_t write_size, loff_t *poffset)
919{ 919{
920 int rc = 0; 920 int rc = 0;
921 unsigned int bytes_written = 0; 921 unsigned int bytes_written = 0;
@@ -1065,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1065struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1065struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1066{ 1066{
1067 struct cifsFileInfo *open_file; 1067 struct cifsFileInfo *open_file;
1068 bool any_available = false;
1068 int rc; 1069 int rc;
1069 1070
1070 /* Having a null inode here (because mapping->host was set to zero by 1071 /* Having a null inode here (because mapping->host was set to zero by
@@ -1080,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1080 read_lock(&GlobalSMBSeslock); 1081 read_lock(&GlobalSMBSeslock);
1081refind_writable: 1082refind_writable:
1082 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1083 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1083 if (open_file->closePend) 1084 if (open_file->closePend ||
1085 (!any_available && open_file->pid != current->tgid))
1084 continue; 1086 continue;
1087
1085 if (open_file->pfile && 1088 if (open_file->pfile &&
1086 ((open_file->pfile->f_flags & O_RDWR) || 1089 ((open_file->pfile->f_flags & O_RDWR) ||
1087 (open_file->pfile->f_flags & O_WRONLY))) { 1090 (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1131,6 +1134,11 @@ refind_writable:
1131 of the loop here. */ 1134 of the loop here. */
1132 } 1135 }
1133 } 1136 }
1137 /* couldn't find useable FH with same pid, try any available */
1138 if (!any_available) {
1139 any_available = true;
1140 goto refind_writable;
1141 }
1134 read_unlock(&GlobalSMBSeslock); 1142 read_unlock(&GlobalSMBSeslock);
1135 return NULL; 1143 return NULL;
1136} 1144}
@@ -1447,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1447 return rc; 1455 return rc;
1448} 1456}
1449 1457
1450static int cifs_commit_write(struct file *file, struct page *page, 1458static int cifs_write_end(struct file *file, struct address_space *mapping,
1451 unsigned offset, unsigned to) 1459 loff_t pos, unsigned len, unsigned copied,
1460 struct page *page, void *fsdata)
1452{ 1461{
1453 int xid; 1462 int rc;
1454 int rc = 0; 1463 struct inode *inode = mapping->host;
1455 struct inode *inode = page->mapping->host;
1456 loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1457 char *page_data;
1458 1464
1459 xid = GetXid(); 1465 cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
1460 cFYI(1, ("commit write for page %p up to position %lld for %d", 1466 page, pos, copied));
1461 page, position, to)); 1467
1462 spin_lock(&inode->i_lock); 1468 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
1463 if (position > inode->i_size) 1469 SetPageUptodate(page);
1464 i_size_write(inode, position);
1465 1470
1466 spin_unlock(&inode->i_lock);
1467 if (!PageUptodate(page)) { 1471 if (!PageUptodate(page)) {
1468 position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; 1472 char *page_data;
1469 /* can not rely on (or let) writepage write this data */ 1473 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1470 if (to < offset) { 1474 int xid;
1471 cFYI(1, ("Illegal offsets, can not copy from %d to %d", 1475
1472 offset, to)); 1476 xid = GetXid();
1473 FreeXid(xid);
1474 return rc;
1475 }
1476 /* this is probably better than directly calling 1477 /* this is probably better than directly calling
1477 partialpage_write since in this function the file handle is 1478 partialpage_write since in this function the file handle is
1478 known which we might as well leverage */ 1479 known which we might as well leverage */
1479 /* BB check if anything else missing out of ppw 1480 /* BB check if anything else missing out of ppw
1480 such as updating last write time */ 1481 such as updating last write time */
1481 page_data = kmap(page); 1482 page_data = kmap(page);
1482 rc = cifs_write(file, page_data + offset, to-offset, 1483 rc = cifs_write(file, page_data + offset, copied, &pos);
1483 &position); 1484 /* if (rc < 0) should we set writebehind rc? */
1484 if (rc > 0)
1485 rc = 0;
1486 /* else if (rc < 0) should we set writebehind rc? */
1487 kunmap(page); 1485 kunmap(page);
1486
1487 FreeXid(xid);
1488 } else { 1488 } else {
1489 rc = copied;
1490 pos += copied;
1489 set_page_dirty(page); 1491 set_page_dirty(page);
1490 } 1492 }
1491 1493
1492 FreeXid(xid); 1494 if (rc > 0) {
1495 spin_lock(&inode->i_lock);
1496 if (pos > inode->i_size)
1497 i_size_write(inode, pos);
1498 spin_unlock(&inode->i_lock);
1499 }
1500
1501 unlock_page(page);
1502 page_cache_release(page);
1503
1493 return rc; 1504 return rc;
1494} 1505}
1495 1506
@@ -2035,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
2035 return true; 2046 return true;
2036} 2047}
2037 2048
2038static int cifs_prepare_write(struct file *file, struct page *page, 2049static int cifs_write_begin(struct file *file, struct address_space *mapping,
2039 unsigned from, unsigned to) 2050 loff_t pos, unsigned len, unsigned flags,
2051 struct page **pagep, void **fsdata)
2040{ 2052{
2041 int rc = 0; 2053 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2042 loff_t i_size; 2054 loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
2043 loff_t offset;
2044 2055
2045 cFYI(1, ("prepare write for page %p from %d to %d", page, from, to)); 2056 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2046 if (PageUptodate(page)) 2057
2058 *pagep = __grab_cache_page(mapping, index);
2059 if (!*pagep)
2060 return -ENOMEM;
2061
2062 if (PageUptodate(*pagep))
2047 return 0; 2063 return 0;
2048 2064
2049 /* If we are writing a full page it will be up to date, 2065 /* If we are writing a full page it will be up to date,
2050 no need to read from the server */ 2066 no need to read from the server */
2051 if ((to == PAGE_CACHE_SIZE) && (from == 0)) { 2067 if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
2052 SetPageUptodate(page);
2053 return 0; 2068 return 0;
2054 }
2055 2069
2056 offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 2070 if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
2057 i_size = i_size_read(page->mapping->host); 2071 int rc;
2058 2072
2059 if ((offset >= i_size) ||
2060 ((from == 0) && (offset + to) >= i_size)) {
2061 /*
2062 * We don't need to read data beyond the end of the file.
2063 * zero it, and set the page uptodate
2064 */
2065 simple_prepare_write(file, page, from, to);
2066 SetPageUptodate(page);
2067 } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
2068 /* might as well read a page, it is fast enough */ 2073 /* might as well read a page, it is fast enough */
2069 rc = cifs_readpage_worker(file, page, &offset); 2074 rc = cifs_readpage_worker(file, *pagep, &offset);
2075
2076 /* we do not need to pass errors back
2077 e.g. if we do not have read access to the file
2078 because cifs_write_end will attempt synchronous writes
2079 -- shaggy */
2070 } else { 2080 } else {
2071 /* we could try using another file handle if there is one - 2081 /* we could try using another file handle if there is one -
2072 but how would we lock it to prevent close of that handle 2082 but how would we lock it to prevent close of that handle
2073 racing with this read? In any case 2083 racing with this read? In any case
2074 this will be written out by commit_write so is fine */ 2084 this will be written out by write_end so is fine */
2075 } 2085 }
2076 2086
2077 /* we do not need to pass errors back
2078 e.g. if we do not have read access to the file
2079 because cifs_commit_write will do the right thing. -- shaggy */
2080
2081 return 0; 2087 return 0;
2082} 2088}
2083 2089
@@ -2086,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
2086 .readpages = cifs_readpages, 2092 .readpages = cifs_readpages,
2087 .writepage = cifs_writepage, 2093 .writepage = cifs_writepage,
2088 .writepages = cifs_writepages, 2094 .writepages = cifs_writepages,
2089 .prepare_write = cifs_prepare_write, 2095 .write_begin = cifs_write_begin,
2090 .commit_write = cifs_commit_write, 2096 .write_end = cifs_write_end,
2091 .set_page_dirty = __set_page_dirty_nobuffers, 2097 .set_page_dirty = __set_page_dirty_nobuffers,
2092 /* .sync_page = cifs_sync_page, */ 2098 /* .sync_page = cifs_sync_page, */
2093 /* .direct_IO = */ 2099 /* .direct_IO = */
@@ -2102,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2102 .readpage = cifs_readpage, 2108 .readpage = cifs_readpage,
2103 .writepage = cifs_writepage, 2109 .writepage = cifs_writepage,
2104 .writepages = cifs_writepages, 2110 .writepages = cifs_writepages,
2105 .prepare_write = cifs_prepare_write, 2111 .write_begin = cifs_write_begin,
2106 .commit_write = cifs_commit_write, 2112 .write_end = cifs_write_end,
2107 .set_page_dirty = __set_page_dirty_nobuffers, 2113 .set_page_dirty = __set_page_dirty_nobuffers,
2108 /* .sync_page = cifs_sync_page, */ 2114 /* .sync_page = cifs_sync_page, */
2109 /* .direct_IO = */ 2115 /* .direct_IO = */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c548f110102..a8c833345fc9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -665,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
665 return inode; 665 return inode;
666} 666}
667 667
668int cifs_unlink(struct inode *inode, struct dentry *direntry) 668static int
669cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
670 char *full_path, __u32 dosattr)
671{
672 int rc;
673 int oplock = 0;
674 __u16 netfid;
675 __u32 netpid;
676 bool set_time = false;
677 struct cifsFileInfo *open_file;
678 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
679 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
680 struct cifsTconInfo *pTcon = cifs_sb->tcon;
681 FILE_BASIC_INFO info_buf;
682
683 if (attrs->ia_valid & ATTR_ATIME) {
684 set_time = true;
685 info_buf.LastAccessTime =
686 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
687 } else
688 info_buf.LastAccessTime = 0;
689
690 if (attrs->ia_valid & ATTR_MTIME) {
691 set_time = true;
692 info_buf.LastWriteTime =
693 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
694 } else
695 info_buf.LastWriteTime = 0;
696
697 /*
698 * Samba throws this field away, but windows may actually use it.
699 * Do not set ctime unless other time stamps are changed explicitly
700 * (i.e. by utimes()) since we would then have a mix of client and
701 * server times.
702 */
703 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
704 cFYI(1, ("CIFS - CTIME changed"));
705 info_buf.ChangeTime =
706 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
707 } else
708 info_buf.ChangeTime = 0;
709
710 info_buf.CreationTime = 0; /* don't change */
711 info_buf.Attributes = cpu_to_le32(dosattr);
712
713 /*
714 * If the file is already open for write, just use that fileid
715 */
716 open_file = find_writable_file(cifsInode);
717 if (open_file) {
718 netfid = open_file->netfid;
719 netpid = open_file->pid;
720 goto set_via_filehandle;
721 }
722
723 /*
724 * NT4 apparently returns success on this call, but it doesn't
725 * really work.
726 */
727 if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
728 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
729 &info_buf, cifs_sb->local_nls,
730 cifs_sb->mnt_cifs_flags &
731 CIFS_MOUNT_MAP_SPECIAL_CHR);
732 if (rc == 0) {
733 cifsInode->cifsAttrs = dosattr;
734 goto out;
735 } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
736 goto out;
737 }
738
739 cFYI(1, ("calling SetFileInfo since SetPathInfo for "
740 "times not supported by this server"));
741 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
742 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
743 CREATE_NOT_DIR, &netfid, &oplock,
744 NULL, cifs_sb->local_nls,
745 cifs_sb->mnt_cifs_flags &
746 CIFS_MOUNT_MAP_SPECIAL_CHR);
747
748 if (rc != 0) {
749 if (rc == -EIO)
750 rc = -EINVAL;
751 goto out;
752 }
753
754 netpid = current->tgid;
755
756set_via_filehandle:
757 rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
758 if (!rc)
759 cifsInode->cifsAttrs = dosattr;
760
761 if (open_file == NULL)
762 CIFSSMBClose(xid, pTcon, netfid);
763 else
764 atomic_dec(&open_file->wrtPending);
765out:
766 return rc;
767}
768
769/*
770 * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
771 * and rename it to a random name that hopefully won't conflict with
772 * anything else.
773 */
774static int
775cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
776{
777 int oplock = 0;
778 int rc;
779 __u16 netfid;
780 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
781 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
782 struct cifsTconInfo *tcon = cifs_sb->tcon;
783 __u32 dosattr;
784 FILE_BASIC_INFO *info_buf;
785
786 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
787 DELETE|FILE_WRITE_ATTRIBUTES,
788 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
789 &netfid, &oplock, NULL, cifs_sb->local_nls,
790 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
791 if (rc != 0)
792 goto out;
793
794 /* set ATTR_HIDDEN and clear ATTR_READONLY */
795 cifsInode = CIFS_I(inode);
796 dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
797 if (dosattr == 0)
798 dosattr |= ATTR_NORMAL;
799 dosattr |= ATTR_HIDDEN;
800
801 info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
802 if (info_buf == NULL) {
803 rc = -ENOMEM;
804 goto out_close;
805 }
806 info_buf->Attributes = cpu_to_le32(dosattr);
807 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
808 kfree(info_buf);
809 if (rc != 0)
810 goto out_close;
811 cifsInode->cifsAttrs = dosattr;
812
813 /* silly-rename the file */
814 CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
815 cifs_sb->mnt_cifs_flags &
816 CIFS_MOUNT_MAP_SPECIAL_CHR);
817
818 /* set DELETE_ON_CLOSE */
819 rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
820
821 /*
822 * some samba versions return -ENOENT when we try to set the file
823 * disposition here. Likely a samba bug, but work around it for now
824 */
825 if (rc == -ENOENT)
826 rc = 0;
827
828out_close:
829 CIFSSMBClose(xid, tcon, netfid);
830out:
831 return rc;
832}
833
834int cifs_unlink(struct inode *dir, struct dentry *dentry)
669{ 835{
670 int rc = 0; 836 int rc = 0;
671 int xid; 837 int xid;
672 struct cifs_sb_info *cifs_sb;
673 struct cifsTconInfo *pTcon;
674 char *full_path = NULL; 838 char *full_path = NULL;
675 struct cifsInodeInfo *cifsInode; 839 struct inode *inode = dentry->d_inode;
676 FILE_BASIC_INFO *pinfo_buf; 840 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
841 struct super_block *sb = dir->i_sb;
842 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
843 struct cifsTconInfo *tcon = cifs_sb->tcon;
844 struct iattr *attrs = NULL;
845 __u32 dosattr = 0, origattr = 0;
677 846
678 cFYI(1, ("cifs_unlink, inode = 0x%p", inode)); 847 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
679 848
680 xid = GetXid(); 849 xid = GetXid();
681 850
682 if (inode) 851 /* Unlink can be called from rename so we can not take the
683 cifs_sb = CIFS_SB(inode->i_sb); 852 * sb->s_vfs_rename_mutex here */
684 else 853 full_path = build_path_from_dentry(dentry);
685 cifs_sb = CIFS_SB(direntry->d_sb);
686 pTcon = cifs_sb->tcon;
687
688 /* Unlink can be called from rename so we can not grab the sem here
689 since we deadlock otherwise */
690/* mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
691 full_path = build_path_from_dentry(direntry);
692/* mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
693 if (full_path == NULL) { 854 if (full_path == NULL) {
694 FreeXid(xid); 855 FreeXid(xid);
695 return -ENOMEM; 856 return -ENOMEM;
696 } 857 }
697 858
698 if ((pTcon->ses->capabilities & CAP_UNIX) && 859 if ((tcon->ses->capabilities & CAP_UNIX) &&
699 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 860 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
700 le64_to_cpu(pTcon->fsUnixInfo.Capability))) { 861 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
701 rc = CIFSPOSIXDelFile(xid, pTcon, full_path, 862 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
702 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 863 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
703 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 864 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
704 cFYI(1, ("posix del rc %d", rc)); 865 cFYI(1, ("posix del rc %d", rc));
@@ -706,125 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
706 goto psx_del_no_retry; 867 goto psx_del_no_retry;
707 } 868 }
708 869
709 rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls, 870retry_std_delete:
871 rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
710 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 872 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
873
711psx_del_no_retry: 874psx_del_no_retry:
712 if (!rc) { 875 if (!rc) {
713 if (direntry->d_inode) 876 if (inode)
714 drop_nlink(direntry->d_inode); 877 drop_nlink(inode);
715 } else if (rc == -ENOENT) { 878 } else if (rc == -ENOENT) {
716 d_drop(direntry); 879 d_drop(dentry);
717 } else if (rc == -ETXTBSY) { 880 } else if (rc == -ETXTBSY) {
718 int oplock = 0; 881 rc = cifs_rename_pending_delete(full_path, inode, xid);
719 __u16 netfid; 882 if (rc == 0)
720 883 drop_nlink(inode);
721 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, 884 } else if (rc == -EACCES && dosattr == 0) {
722 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, 885 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
723 &netfid, &oplock, NULL, cifs_sb->local_nls, 886 if (attrs == NULL) {
724 cifs_sb->mnt_cifs_flags & 887 rc = -ENOMEM;
725 CIFS_MOUNT_MAP_SPECIAL_CHR); 888 goto out_reval;
726 if (rc == 0) {
727 CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
728 cifs_sb->local_nls,
729 cifs_sb->mnt_cifs_flags &
730 CIFS_MOUNT_MAP_SPECIAL_CHR);
731 CIFSSMBClose(xid, pTcon, netfid);
732 if (direntry->d_inode)
733 drop_nlink(direntry->d_inode);
734 } 889 }
735 } else if (rc == -EACCES) {
736 /* try only if r/o attribute set in local lookup data? */
737 pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
738 if (pinfo_buf) {
739 /* ATTRS set to normal clears r/o bit */
740 pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
741 if (!(pTcon->ses->flags & CIFS_SES_NT4))
742 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
743 pinfo_buf,
744 cifs_sb->local_nls,
745 cifs_sb->mnt_cifs_flags &
746 CIFS_MOUNT_MAP_SPECIAL_CHR);
747 else
748 rc = -EOPNOTSUPP;
749 890
750 if (rc == -EOPNOTSUPP) { 891 /* try to reset dos attributes */
751 int oplock = 0; 892 origattr = cifsInode->cifsAttrs;
752 __u16 netfid; 893 if (origattr == 0)
753 /* rc = CIFSSMBSetAttrLegacy(xid, pTcon, 894 origattr |= ATTR_NORMAL;
754 full_path, 895 dosattr = origattr & ~ATTR_READONLY;
755 (__u16)ATTR_NORMAL, 896 if (dosattr == 0)
756 cifs_sb->local_nls); 897 dosattr |= ATTR_NORMAL;
757 For some strange reason it seems that NT4 eats the 898 dosattr |= ATTR_HIDDEN;
758 old setattr call without actually setting the 899
759 attributes so on to the third attempted workaround 900 rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
760 */ 901 if (rc != 0)
761 902 goto out_reval;
762 /* BB could scan to see if we already have it open 903
763 and pass in pid of opener to function */ 904 goto retry_std_delete;
764 rc = CIFSSMBOpen(xid, pTcon, full_path,
765 FILE_OPEN, SYNCHRONIZE |
766 FILE_WRITE_ATTRIBUTES, 0,
767 &netfid, &oplock, NULL,
768 cifs_sb->local_nls,
769 cifs_sb->mnt_cifs_flags &
770 CIFS_MOUNT_MAP_SPECIAL_CHR);
771 if (rc == 0) {
772 rc = CIFSSMBSetFileInfo(xid, pTcon,
773 pinfo_buf,
774 netfid,
775 current->tgid);
776 CIFSSMBClose(xid, pTcon, netfid);
777 }
778 }
779 kfree(pinfo_buf);
780 }
781 if (rc == 0) {
782 rc = CIFSSMBDelFile(xid, pTcon, full_path,
783 cifs_sb->local_nls,
784 cifs_sb->mnt_cifs_flags &
785 CIFS_MOUNT_MAP_SPECIAL_CHR);
786 if (!rc) {
787 if (direntry->d_inode)
788 drop_nlink(direntry->d_inode);
789 } else if (rc == -ETXTBSY) {
790 int oplock = 0;
791 __u16 netfid;
792
793 rc = CIFSSMBOpen(xid, pTcon, full_path,
794 FILE_OPEN, DELETE,
795 CREATE_NOT_DIR |
796 CREATE_DELETE_ON_CLOSE,
797 &netfid, &oplock, NULL,
798 cifs_sb->local_nls,
799 cifs_sb->mnt_cifs_flags &
800 CIFS_MOUNT_MAP_SPECIAL_CHR);
801 if (rc == 0) {
802 CIFSSMBRenameOpenFile(xid, pTcon,
803 netfid, NULL,
804 cifs_sb->local_nls,
805 cifs_sb->mnt_cifs_flags &
806 CIFS_MOUNT_MAP_SPECIAL_CHR);
807 CIFSSMBClose(xid, pTcon, netfid);
808 if (direntry->d_inode)
809 drop_nlink(direntry->d_inode);
810 }
811 /* BB if rc = -ETXTBUSY goto the rename logic BB */
812 }
813 }
814 }
815 if (direntry->d_inode) {
816 cifsInode = CIFS_I(direntry->d_inode);
817 cifsInode->time = 0; /* will force revalidate to get info
818 when needed */
819 direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
820 } 905 }
906
907 /* undo the setattr if we errored out and it's needed */
908 if (rc != 0 && dosattr != 0)
909 cifs_set_file_info(inode, attrs, xid, full_path, origattr);
910
911out_reval:
821 if (inode) { 912 if (inode) {
822 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
823 cifsInode = CIFS_I(inode); 913 cifsInode = CIFS_I(inode);
824 cifsInode->time = 0; /* force revalidate of dir as well */ 914 cifsInode->time = 0; /* will force revalidate to get info
915 when needed */
916 inode->i_ctime = current_fs_time(sb);
825 } 917 }
918 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
919 cifsInode = CIFS_I(dir);
920 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
826 921
827 kfree(full_path); 922 kfree(full_path);
923 kfree(attrs);
828 FreeXid(xid); 924 FreeXid(xid);
829 return rc; 925 return rc;
830} 926}
@@ -869,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
869 965
870int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) 966int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
871{ 967{
872 int rc = 0; 968 int rc = 0, tmprc;
873 int xid; 969 int xid;
874 struct cifs_sb_info *cifs_sb; 970 struct cifs_sb_info *cifs_sb;
875 struct cifsTconInfo *pTcon; 971 struct cifsTconInfo *pTcon;
@@ -931,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
931 kfree(pInfo); 1027 kfree(pInfo);
932 goto mkdir_get_info; 1028 goto mkdir_get_info;
933 } 1029 }
1030
934 /* Is an i_ino of zero legal? */ 1031 /* Is an i_ino of zero legal? */
935 /* Are there sanity checks we can use to ensure that 1032 /* Are there sanity checks we can use to ensure that
936 the server is really filling in that field? */ 1033 the server is really filling in that field? */
@@ -1019,12 +1116,20 @@ mkdir_get_info:
1019 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1116 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1020 (mode & S_IWUGO) == 0) { 1117 (mode & S_IWUGO) == 0) {
1021 FILE_BASIC_INFO pInfo; 1118 FILE_BASIC_INFO pInfo;
1119 struct cifsInodeInfo *cifsInode;
1120 u32 dosattrs;
1121
1022 memset(&pInfo, 0, sizeof(pInfo)); 1122 memset(&pInfo, 0, sizeof(pInfo));
1023 pInfo.Attributes = cpu_to_le32(ATTR_READONLY); 1123 cifsInode = CIFS_I(newinode);
1024 CIFSSMBSetPathInfo(xid, pTcon, full_path, 1124 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
1025 &pInfo, cifs_sb->local_nls, 1125 pInfo.Attributes = cpu_to_le32(dosattrs);
1126 tmprc = CIFSSMBSetPathInfo(xid, pTcon,
1127 full_path, &pInfo,
1128 cifs_sb->local_nls,
1026 cifs_sb->mnt_cifs_flags & 1129 cifs_sb->mnt_cifs_flags &
1027 CIFS_MOUNT_MAP_SPECIAL_CHR); 1130 CIFS_MOUNT_MAP_SPECIAL_CHR);
1131 if (tmprc == 0)
1132 cifsInode->cifsAttrs = dosattrs;
1028 } 1133 }
1029 if (direntry->d_inode) { 1134 if (direntry->d_inode) {
1030 if (cifs_sb->mnt_cifs_flags & 1135 if (cifs_sb->mnt_cifs_flags &
@@ -1096,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1096 return rc; 1201 return rc;
1097} 1202}
1098 1203
1204static int
1205cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1206 struct dentry *to_dentry, const char *toPath)
1207{
1208 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1209 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1210 __u16 srcfid;
1211 int oplock, rc;
1212
1213 /* try path-based rename first */
1214 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1215 cifs_sb->mnt_cifs_flags &
1216 CIFS_MOUNT_MAP_SPECIAL_CHR);
1217
1218 /*
1219 * don't bother with rename by filehandle unless file is busy and
1220 * source Note that cross directory moves do not work with
1221 * rename by filehandle to various Windows servers.
1222 */
1223 if (rc == 0 || rc != -ETXTBSY)
1224 return rc;
1225
1226 /* open the file to be renamed -- we need DELETE perms */
1227 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
1228 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
1229 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1230 CIFS_MOUNT_MAP_SPECIAL_CHR);
1231
1232 if (rc == 0) {
1233 rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
1234 (const char *) to_dentry->d_name.name,
1235 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1236 CIFS_MOUNT_MAP_SPECIAL_CHR);
1237
1238 CIFSSMBClose(xid, pTcon, srcfid);
1239 }
1240
1241 return rc;
1242}
1243
1099int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, 1244int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
1100 struct inode *target_inode, struct dentry *target_direntry) 1245 struct inode *target_inode, struct dentry *target_direntry)
1101{ 1246{
1102 char *fromName; 1247 char *fromName = NULL;
1103 char *toName; 1248 char *toName = NULL;
1104 struct cifs_sb_info *cifs_sb_source; 1249 struct cifs_sb_info *cifs_sb_source;
1105 struct cifs_sb_info *cifs_sb_target; 1250 struct cifs_sb_info *cifs_sb_target;
1106 struct cifsTconInfo *pTcon; 1251 struct cifsTconInfo *pTcon;
1252 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1253 FILE_UNIX_BASIC_INFO *info_buf_target;
1107 int xid; 1254 int xid;
1108 int rc = 0; 1255 int rc;
1109
1110 xid = GetXid();
1111 1256
1112 cifs_sb_target = CIFS_SB(target_inode->i_sb); 1257 cifs_sb_target = CIFS_SB(target_inode->i_sb);
1113 cifs_sb_source = CIFS_SB(source_inode->i_sb); 1258 cifs_sb_source = CIFS_SB(source_inode->i_sb);
1114 pTcon = cifs_sb_source->tcon; 1259 pTcon = cifs_sb_source->tcon;
1115 1260
1261 xid = GetXid();
1262
1263 /*
1264 * BB: this might be allowed if same server, but different share.
1265 * Consider adding support for this
1266 */
1116 if (pTcon != cifs_sb_target->tcon) { 1267 if (pTcon != cifs_sb_target->tcon) {
1117 FreeXid(xid); 1268 rc = -EXDEV;
1118 return -EXDEV; /* BB actually could be allowed if same server, 1269 goto cifs_rename_exit;
1119 but different share.
1120 Might eventually add support for this */
1121 } 1270 }
1122 1271
1123 /* we already have the rename sem so we do not need to grab it again 1272 /*
1124 here to protect the path integrity */ 1273 * we already have the rename sem so we do not need to
1274 * grab it again here to protect the path integrity
1275 */
1125 fromName = build_path_from_dentry(source_direntry); 1276 fromName = build_path_from_dentry(source_direntry);
1277 if (fromName == NULL) {
1278 rc = -ENOMEM;
1279 goto cifs_rename_exit;
1280 }
1281
1126 toName = build_path_from_dentry(target_direntry); 1282 toName = build_path_from_dentry(target_direntry);
1127 if ((fromName == NULL) || (toName == NULL)) { 1283 if (toName == NULL) {
1128 rc = -ENOMEM; 1284 rc = -ENOMEM;
1129 goto cifs_rename_exit; 1285 goto cifs_rename_exit;
1130 } 1286 }
1131 1287
1132 rc = CIFSSMBRename(xid, pTcon, fromName, toName, 1288 rc = cifs_do_rename(xid, source_direntry, fromName,
1133 cifs_sb_source->local_nls, 1289 target_direntry, toName);
1134 cifs_sb_source->mnt_cifs_flags & 1290
1135 CIFS_MOUNT_MAP_SPECIAL_CHR);
1136 if (rc == -EEXIST) { 1291 if (rc == -EEXIST) {
1137 /* check if they are the same file because rename of hardlinked 1292 if (pTcon->unix_ext) {
1138 files is a noop */ 1293 /*
1139 FILE_UNIX_BASIC_INFO *info_buf_source; 1294 * Are src and dst hardlinks of same inode? We can
1140 FILE_UNIX_BASIC_INFO *info_buf_target; 1295 * only tell with unix extensions enabled
1141 1296 */
1142 info_buf_source = 1297 info_buf_source =
1143 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 1298 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
1144 if (info_buf_source != NULL) { 1299 GFP_KERNEL);
1300 if (info_buf_source == NULL)
1301 goto unlink_target;
1302
1145 info_buf_target = info_buf_source + 1; 1303 info_buf_target = info_buf_source + 1;
1146 if (pTcon->unix_ext) 1304 rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
1147 rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, 1305 info_buf_source,
1148 info_buf_source, 1306 cifs_sb_source->local_nls,
1149 cifs_sb_source->local_nls, 1307 cifs_sb_source->mnt_cifs_flags &
1150 cifs_sb_source->mnt_cifs_flags &
1151 CIFS_MOUNT_MAP_SPECIAL_CHR); 1308 CIFS_MOUNT_MAP_SPECIAL_CHR);
1152 /* else rc is still EEXIST so will fall through to 1309 if (rc != 0)
1153 unlink the target and retry rename */ 1310 goto unlink_target;
1154 if (rc == 0) { 1311
1155 rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName, 1312 rc = CIFSSMBUnixQPathInfo(xid, pTcon,
1156 info_buf_target, 1313 toName, info_buf_target,
1157 cifs_sb_target->local_nls, 1314 cifs_sb_target->local_nls,
1158 /* remap based on source sb */ 1315 /* remap based on source sb */
1159 cifs_sb_source->mnt_cifs_flags & 1316 cifs_sb_source->mnt_cifs_flags &
1160 CIFS_MOUNT_MAP_SPECIAL_CHR);
1161 }
1162 if ((rc == 0) &&
1163 (info_buf_source->UniqueId ==
1164 info_buf_target->UniqueId)) {
1165 /* do not rename since the files are hardlinked which
1166 is a noop */
1167 } else {
1168 /* we either can not tell the files are hardlinked
1169 (as with Windows servers) or files are not
1170 hardlinked so delete the target manually before
1171 renaming to follow POSIX rather than Windows
1172 semantics */
1173 cifs_unlink(target_inode, target_direntry);
1174 rc = CIFSSMBRename(xid, pTcon, fromName,
1175 toName,
1176 cifs_sb_source->local_nls,
1177 cifs_sb_source->mnt_cifs_flags
1178 & CIFS_MOUNT_MAP_SPECIAL_CHR);
1179 }
1180 kfree(info_buf_source);
1181 } /* if we can not get memory just leave rc as EEXIST */
1182 }
1183
1184 if (rc)
1185 cFYI(1, ("rename rc %d", rc));
1186
1187 if ((rc == -EIO) || (rc == -EEXIST)) {
1188 int oplock = 0;
1189 __u16 netfid;
1190
1191 /* BB FIXME Is Generic Read correct for rename? */
1192 /* if renaming directory - we should not say CREATE_NOT_DIR,
1193 need to test renaming open directory, also GENERIC_READ
1194 might not right be right access to request */
1195 rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
1196 CREATE_NOT_DIR, &netfid, &oplock, NULL,
1197 cifs_sb_source->local_nls,
1198 cifs_sb_source->mnt_cifs_flags &
1199 CIFS_MOUNT_MAP_SPECIAL_CHR);
1200 if (rc == 0) {
1201 rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
1202 cifs_sb_source->local_nls,
1203 cifs_sb_source->mnt_cifs_flags &
1204 CIFS_MOUNT_MAP_SPECIAL_CHR); 1317 CIFS_MOUNT_MAP_SPECIAL_CHR);
1205 CIFSSMBClose(xid, pTcon, netfid); 1318
1206 } 1319 if (rc == 0 && (info_buf_source->UniqueId ==
1320 info_buf_target->UniqueId))
1321 /* same file, POSIX says that this is a noop */
1322 goto cifs_rename_exit;
1323 } /* else ... BB we could add the same check for Windows by
1324 checking the UniqueId via FILE_INTERNAL_INFO */
1325unlink_target:
1326 /*
1327 * we either can not tell the files are hardlinked (as with
1328 * Windows servers) or files are not hardlinked. Delete the
1329 * target manually before renaming to follow POSIX rather than
1330 * Windows semantics
1331 */
1332 cifs_unlink(target_inode, target_direntry);
1333 rc = cifs_do_rename(xid, source_direntry, fromName,
1334 target_direntry, toName);
1207 } 1335 }
1208 1336
1209cifs_rename_exit: 1337cifs_rename_exit:
1338 kfree(info_buf_source);
1210 kfree(fromName); 1339 kfree(fromName);
1211 kfree(toName); 1340 kfree(toName);
1212 FreeXid(xid); 1341 FreeXid(xid);
@@ -1507,101 +1636,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1507} 1636}
1508 1637
1509static int 1638static int
1510cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
1511 char *full_path, __u32 dosattr)
1512{
1513 int rc;
1514 int oplock = 0;
1515 __u16 netfid;
1516 __u32 netpid;
1517 bool set_time = false;
1518 struct cifsFileInfo *open_file;
1519 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1520 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1521 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1522 FILE_BASIC_INFO info_buf;
1523
1524 if (attrs->ia_valid & ATTR_ATIME) {
1525 set_time = true;
1526 info_buf.LastAccessTime =
1527 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
1528 } else
1529 info_buf.LastAccessTime = 0;
1530
1531 if (attrs->ia_valid & ATTR_MTIME) {
1532 set_time = true;
1533 info_buf.LastWriteTime =
1534 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
1535 } else
1536 info_buf.LastWriteTime = 0;
1537
1538 /*
1539 * Samba throws this field away, but windows may actually use it.
1540 * Do not set ctime unless other time stamps are changed explicitly
1541 * (i.e. by utimes()) since we would then have a mix of client and
1542 * server times.
1543 */
1544 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
1545 cFYI(1, ("CIFS - CTIME changed"));
1546 info_buf.ChangeTime =
1547 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
1548 } else
1549 info_buf.ChangeTime = 0;
1550
1551 info_buf.CreationTime = 0; /* don't change */
1552 info_buf.Attributes = cpu_to_le32(dosattr);
1553
1554 /*
1555 * If the file is already open for write, just use that fileid
1556 */
1557 open_file = find_writable_file(cifsInode);
1558 if (open_file) {
1559 netfid = open_file->netfid;
1560 netpid = open_file->pid;
1561 goto set_via_filehandle;
1562 }
1563
1564 /*
1565 * NT4 apparently returns success on this call, but it doesn't
1566 * really work.
1567 */
1568 if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
1569 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
1570 &info_buf, cifs_sb->local_nls,
1571 cifs_sb->mnt_cifs_flags &
1572 CIFS_MOUNT_MAP_SPECIAL_CHR);
1573 if (rc != -EOPNOTSUPP && rc != -EINVAL)
1574 goto out;
1575 }
1576
1577 cFYI(1, ("calling SetFileInfo since SetPathInfo for "
1578 "times not supported by this server"));
1579 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
1580 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
1581 CREATE_NOT_DIR, &netfid, &oplock,
1582 NULL, cifs_sb->local_nls,
1583 cifs_sb->mnt_cifs_flags &
1584 CIFS_MOUNT_MAP_SPECIAL_CHR);
1585
1586 if (rc != 0) {
1587 if (rc == -EIO)
1588 rc = -EINVAL;
1589 goto out;
1590 }
1591
1592 netpid = current->tgid;
1593
1594set_via_filehandle:
1595 rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
1596 if (open_file == NULL)
1597 CIFSSMBClose(xid, pTcon, netfid);
1598 else
1599 atomic_dec(&open_file->wrtPending);
1600out:
1601 return rc;
1602}
1603
1604static int
1605cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) 1639cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1606{ 1640{
1607 int rc; 1641 int rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe3157..88786ba02d27 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
150 but it may be more efficient to always alloc same size 150 but it may be more efficient to always alloc same size
151 albeit slightly larger than necessary and maxbuffersize 151 albeit slightly larger than necessary and maxbuffersize
152 defaults to this and can not be bigger */ 152 defaults to this and can not be bigger */
153 ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp, 153 ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
154 GFP_KERNEL | GFP_NOFS);
155 154
156 /* clear the first few header bytes */ 155 /* clear the first few header bytes */
157 /* for most paths, more is cleared in header_assemble */ 156 /* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
188 but it may be more efficient to always alloc same size 187 but it may be more efficient to always alloc same size
189 albeit slightly larger than necessary and maxbuffersize 188 albeit slightly larger than necessary and maxbuffersize
190 defaults to this and can not be bigger */ 189 defaults to this and can not be bigger */
191 ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, 190 ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
192 GFP_KERNEL | GFP_NOFS);
193 if (ret_buf) { 191 if (ret_buf) {
194 /* No need to clear memory here, cleared in header assemble */ 192 /* No need to clear memory here, cleared in header assemble */
195 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ 193 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
313 buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; 311 buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
314 buffer->Pid = cpu_to_le16((__u16)current->tgid); 312 buffer->Pid = cpu_to_le16((__u16)current->tgid);
315 buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); 313 buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
316 spin_lock(&GlobalMid_Lock);
317 spin_unlock(&GlobalMid_Lock);
318 if (treeCon) { 314 if (treeCon) {
319 buffer->Tid = treeCon->tid; 315 buffer->Tid = treeCon->tid;
320 if (treeCon->ses) { 316 if (treeCon->ses) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5f40ed3473f5..765adf12d54f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
640 640
641} 641}
642 642
643static int cifs_save_resume_key(const char *current_entry,
644 struct cifsFileInfo *cifsFile)
645{
646 int rc = 0;
647 unsigned int len = 0;
648 __u16 level;
649 char *filename;
650
651 if ((cifsFile == NULL) || (current_entry == NULL))
652 return -EINVAL;
653
654 level = cifsFile->srch_inf.info_level;
655
656 if (level == SMB_FIND_FILE_UNIX) {
657 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
658
659 filename = &pFindData->FileName[0];
660 if (cifsFile->srch_inf.unicode) {
661 len = cifs_unicode_bytelen(filename);
662 } else {
663 /* BB should we make this strnlen of PATH_MAX? */
664 len = strnlen(filename, PATH_MAX);
665 }
666 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
667 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
668 FILE_DIRECTORY_INFO *pFindData =
669 (FILE_DIRECTORY_INFO *)current_entry;
670 filename = &pFindData->FileName[0];
671 len = le32_to_cpu(pFindData->FileNameLength);
672 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
673 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
674 FILE_FULL_DIRECTORY_INFO *pFindData =
675 (FILE_FULL_DIRECTORY_INFO *)current_entry;
676 filename = &pFindData->FileName[0];
677 len = le32_to_cpu(pFindData->FileNameLength);
678 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
679 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
680 SEARCH_ID_FULL_DIR_INFO *pFindData =
681 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
682 filename = &pFindData->FileName[0];
683 len = le32_to_cpu(pFindData->FileNameLength);
684 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
685 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
686 FILE_BOTH_DIRECTORY_INFO *pFindData =
687 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
688 filename = &pFindData->FileName[0];
689 len = le32_to_cpu(pFindData->FileNameLength);
690 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
691 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
692 FIND_FILE_STANDARD_INFO *pFindData =
693 (FIND_FILE_STANDARD_INFO *)current_entry;
694 filename = &pFindData->FileName[0];
695 /* one byte length, no name conversion */
696 len = (unsigned int)pFindData->FileNameLength;
697 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
698 } else {
699 cFYI(1, ("Unknown findfirst level %d", level));
700 return -EINVAL;
701 }
702 cifsFile->srch_inf.resume_name_len = len;
703 cifsFile->srch_inf.presume_name = filename;
704 return rc;
705}
706
643/* find the corresponding entry in the search */ 707/* find the corresponding entry in the search */
644/* Note that the SMB server returns search entries for . and .. which 708/* Note that the SMB server returns search entries for . and .. which
645 complicates logic here if we choose to parse for them and we do not 709 complicates logic here if we choose to parse for them and we do not
@@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
703 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 767 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
704 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 768 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
705 cFYI(1, ("calling findnext2")); 769 cFYI(1, ("calling findnext2"));
770 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
706 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 771 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
707 &cifsFile->srch_inf); 772 &cifsFile->srch_inf);
708 if (rc) 773 if (rc)
@@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
919 return rc; 984 return rc;
920} 985}
921 986
922static int cifs_save_resume_key(const char *current_entry,
923 struct cifsFileInfo *cifsFile)
924{
925 int rc = 0;
926 unsigned int len = 0;
927 __u16 level;
928 char *filename;
929
930 if ((cifsFile == NULL) || (current_entry == NULL))
931 return -EINVAL;
932
933 level = cifsFile->srch_inf.info_level;
934
935 if (level == SMB_FIND_FILE_UNIX) {
936 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
937
938 filename = &pFindData->FileName[0];
939 if (cifsFile->srch_inf.unicode) {
940 len = cifs_unicode_bytelen(filename);
941 } else {
942 /* BB should we make this strnlen of PATH_MAX? */
943 len = strnlen(filename, PATH_MAX);
944 }
945 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
946 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
947 FILE_DIRECTORY_INFO *pFindData =
948 (FILE_DIRECTORY_INFO *)current_entry;
949 filename = &pFindData->FileName[0];
950 len = le32_to_cpu(pFindData->FileNameLength);
951 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
952 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
953 FILE_FULL_DIRECTORY_INFO *pFindData =
954 (FILE_FULL_DIRECTORY_INFO *)current_entry;
955 filename = &pFindData->FileName[0];
956 len = le32_to_cpu(pFindData->FileNameLength);
957 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
958 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
959 SEARCH_ID_FULL_DIR_INFO *pFindData =
960 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
961 filename = &pFindData->FileName[0];
962 len = le32_to_cpu(pFindData->FileNameLength);
963 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
964 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
965 FILE_BOTH_DIRECTORY_INFO *pFindData =
966 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
967 filename = &pFindData->FileName[0];
968 len = le32_to_cpu(pFindData->FileNameLength);
969 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
970 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
971 FIND_FILE_STANDARD_INFO *pFindData =
972 (FIND_FILE_STANDARD_INFO *)current_entry;
973 filename = &pFindData->FileName[0];
974 /* one byte length, no name conversion */
975 len = (unsigned int)pFindData->FileNameLength;
976 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
977 } else {
978 cFYI(1, ("Unknown findfirst level %d", level));
979 return -EINVAL;
980 }
981 cifsFile->srch_inf.resume_name_len = len;
982 cifsFile->srch_inf.presume_name = filename;
983 return rc;
984}
985 987
986int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) 988int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
987{ 989{
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 252fdc0567f1..2851d5da0c8c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -624,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
624 ses, nls_cp); 624 ses, nls_cp);
625 625
626ssetup_exit: 626ssetup_exit:
627 if (spnego_key) 627 if (spnego_key) {
628 key_revoke(spnego_key);
628 key_put(spnego_key); 629 key_put(spnego_key);
630 }
629 kfree(str_area); 631 kfree(str_area);
630 if (resp_buf_type == CIFS_SMALL_BUFFER) { 632 if (resp_buf_type == CIFS_SMALL_BUFFER) {
631 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); 633 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e286db9f5ee2..bf0e6d8e382a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
50 return NULL; 50 return NULL;
51 } 51 }
52 52
53 temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp, 53 temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
54 GFP_KERNEL | GFP_NOFS);
55 if (temp == NULL) 54 if (temp == NULL)
56 return temp; 55 return temp;
57 else { 56 else {
diff --git a/fs/dcache.c b/fs/dcache.c
index 80e93956aced..e7a1a99b7464 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1395 if (dentry->d_parent != parent) 1395 if (dentry->d_parent != parent)
1396 goto next; 1396 goto next;
1397 1397
1398 /* non-existing due to RCU? */
1399 if (d_unhashed(dentry))
1400 goto next;
1401
1398 /* 1402 /*
1399 * It is safe to compare names since d_move() cannot 1403 * It is safe to compare names since d_move() cannot
1400 * change the qstr (protected by d_lock). 1404 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1410 goto next; 1414 goto next;
1411 } 1415 }
1412 1416
1413 if (!d_unhashed(dentry)) { 1417 atomic_inc(&dentry->d_count);
1414 atomic_inc(&dentry->d_count); 1418 found = dentry;
1415 found = dentry;
1416 }
1417 spin_unlock(&dentry->d_lock); 1419 spin_unlock(&dentry->d_lock);
1418 break; 1420 break;
1419next: 1421next:
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 89d2fb7b991a..fd9859f92fad 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,9 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/in.h>
18#include <linux/in6.h>
19#include <net/ipv6.h>
17#include <net/sock.h> 20#include <net/sock.h>
18 21
19#include "config.h" 22#include "config.h"
@@ -377,24 +380,24 @@ static struct config_item_type node_type = {
377 .ct_owner = THIS_MODULE, 380 .ct_owner = THIS_MODULE,
378}; 381};
379 382
380static struct dlm_cluster *to_cluster(struct config_item *i) 383static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
381{ 384{
382 return i ? container_of(to_config_group(i), struct dlm_cluster, group) : 385 return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
383 NULL; 386 NULL;
384} 387}
385 388
386static struct dlm_space *to_space(struct config_item *i) 389static struct dlm_space *config_item_to_space(struct config_item *i)
387{ 390{
388 return i ? container_of(to_config_group(i), struct dlm_space, group) : 391 return i ? container_of(to_config_group(i), struct dlm_space, group) :
389 NULL; 392 NULL;
390} 393}
391 394
392static struct dlm_comm *to_comm(struct config_item *i) 395static struct dlm_comm *config_item_to_comm(struct config_item *i)
393{ 396{
394 return i ? container_of(i, struct dlm_comm, item) : NULL; 397 return i ? container_of(i, struct dlm_comm, item) : NULL;
395} 398}
396 399
397static struct dlm_node *to_node(struct config_item *i) 400static struct dlm_node *config_item_to_node(struct config_item *i)
398{ 401{
399 return i ? container_of(i, struct dlm_node, item) : NULL; 402 return i ? container_of(i, struct dlm_node, item) : NULL;
400} 403}
@@ -450,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g,
450 453
451static void drop_cluster(struct config_group *g, struct config_item *i) 454static void drop_cluster(struct config_group *g, struct config_item *i)
452{ 455{
453 struct dlm_cluster *cl = to_cluster(i); 456 struct dlm_cluster *cl = config_item_to_cluster(i);
454 struct config_item *tmp; 457 struct config_item *tmp;
455 int j; 458 int j;
456 459
@@ -468,7 +471,7 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
468 471
469static void release_cluster(struct config_item *i) 472static void release_cluster(struct config_item *i)
470{ 473{
471 struct dlm_cluster *cl = to_cluster(i); 474 struct dlm_cluster *cl = config_item_to_cluster(i);
472 kfree(cl->group.default_groups); 475 kfree(cl->group.default_groups);
473 kfree(cl); 476 kfree(cl);
474} 477}
@@ -507,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
507 510
508static void drop_space(struct config_group *g, struct config_item *i) 511static void drop_space(struct config_group *g, struct config_item *i)
509{ 512{
510 struct dlm_space *sp = to_space(i); 513 struct dlm_space *sp = config_item_to_space(i);
511 struct config_item *tmp; 514 struct config_item *tmp;
512 int j; 515 int j;
513 516
@@ -524,7 +527,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
524 527
525static void release_space(struct config_item *i) 528static void release_space(struct config_item *i)
526{ 529{
527 struct dlm_space *sp = to_space(i); 530 struct dlm_space *sp = config_item_to_space(i);
528 kfree(sp->group.default_groups); 531 kfree(sp->group.default_groups);
529 kfree(sp); 532 kfree(sp);
530} 533}
@@ -546,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
546 549
547static void drop_comm(struct config_group *g, struct config_item *i) 550static void drop_comm(struct config_group *g, struct config_item *i)
548{ 551{
549 struct dlm_comm *cm = to_comm(i); 552 struct dlm_comm *cm = config_item_to_comm(i);
550 if (local_comm == cm) 553 if (local_comm == cm)
551 local_comm = NULL; 554 local_comm = NULL;
552 dlm_lowcomms_close(cm->nodeid); 555 dlm_lowcomms_close(cm->nodeid);
@@ -557,13 +560,13 @@ static void drop_comm(struct config_group *g, struct config_item *i)
557 560
558static void release_comm(struct config_item *i) 561static void release_comm(struct config_item *i)
559{ 562{
560 struct dlm_comm *cm = to_comm(i); 563 struct dlm_comm *cm = config_item_to_comm(i);
561 kfree(cm); 564 kfree(cm);
562} 565}
563 566
564static struct config_item *make_node(struct config_group *g, const char *name) 567static struct config_item *make_node(struct config_group *g, const char *name)
565{ 568{
566 struct dlm_space *sp = to_space(g->cg_item.ci_parent); 569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
567 struct dlm_node *nd; 570 struct dlm_node *nd;
568 571
569 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); 572 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
@@ -585,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
585 588
586static void drop_node(struct config_group *g, struct config_item *i) 589static void drop_node(struct config_group *g, struct config_item *i)
587{ 590{
588 struct dlm_space *sp = to_space(g->cg_item.ci_parent); 591 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
589 struct dlm_node *nd = to_node(i); 592 struct dlm_node *nd = config_item_to_node(i);
590 593
591 mutex_lock(&sp->members_lock); 594 mutex_lock(&sp->members_lock);
592 list_del(&nd->list); 595 list_del(&nd->list);
@@ -598,7 +601,7 @@ static void drop_node(struct config_group *g, struct config_item *i)
598 601
599static void release_node(struct config_item *i) 602static void release_node(struct config_item *i)
600{ 603{
601 struct dlm_node *nd = to_node(i); 604 struct dlm_node *nd = config_item_to_node(i);
602 kfree(nd); 605 kfree(nd);
603} 606}
604 607
@@ -632,7 +635,7 @@ void dlm_config_exit(void)
632static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, 635static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
633 char *buf) 636 char *buf)
634{ 637{
635 struct dlm_cluster *cl = to_cluster(i); 638 struct dlm_cluster *cl = config_item_to_cluster(i);
636 struct cluster_attribute *cla = 639 struct cluster_attribute *cla =
637 container_of(a, struct cluster_attribute, attr); 640 container_of(a, struct cluster_attribute, attr);
638 return cla->show ? cla->show(cl, buf) : 0; 641 return cla->show ? cla->show(cl, buf) : 0;
@@ -642,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
642 struct configfs_attribute *a, 645 struct configfs_attribute *a,
643 const char *buf, size_t len) 646 const char *buf, size_t len)
644{ 647{
645 struct dlm_cluster *cl = to_cluster(i); 648 struct dlm_cluster *cl = config_item_to_cluster(i);
646 struct cluster_attribute *cla = 649 struct cluster_attribute *cla =
647 container_of(a, struct cluster_attribute, attr); 650 container_of(a, struct cluster_attribute, attr);
648 return cla->store ? cla->store(cl, buf, len) : -EINVAL; 651 return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -651,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
651static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 654static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
652 char *buf) 655 char *buf)
653{ 656{
654 struct dlm_comm *cm = to_comm(i); 657 struct dlm_comm *cm = config_item_to_comm(i);
655 struct comm_attribute *cma = 658 struct comm_attribute *cma =
656 container_of(a, struct comm_attribute, attr); 659 container_of(a, struct comm_attribute, attr);
657 return cma->show ? cma->show(cm, buf) : 0; 660 return cma->show ? cma->show(cm, buf) : 0;
@@ -660,7 +663,7 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
660static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, 663static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
661 const char *buf, size_t len) 664 const char *buf, size_t len)
662{ 665{
663 struct dlm_comm *cm = to_comm(i); 666 struct dlm_comm *cm = config_item_to_comm(i);
664 struct comm_attribute *cma = 667 struct comm_attribute *cma =
665 container_of(a, struct comm_attribute, attr); 668 container_of(a, struct comm_attribute, attr);
666 return cma->store ? cma->store(cm, buf, len) : -EINVAL; 669 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
@@ -714,7 +717,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
714static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, 717static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
715 char *buf) 718 char *buf)
716{ 719{
717 struct dlm_node *nd = to_node(i); 720 struct dlm_node *nd = config_item_to_node(i);
718 struct node_attribute *nda = 721 struct node_attribute *nda =
719 container_of(a, struct node_attribute, attr); 722 container_of(a, struct node_attribute, attr);
720 return nda->show ? nda->show(nd, buf) : 0; 723 return nda->show ? nda->show(nd, buf) : 0;
@@ -723,7 +726,7 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
723static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, 726static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
724 const char *buf, size_t len) 727 const char *buf, size_t len)
725{ 728{
726 struct dlm_node *nd = to_node(i); 729 struct dlm_node *nd = config_item_to_node(i);
727 struct node_attribute *nda = 730 struct node_attribute *nda =
728 container_of(a, struct node_attribute, attr); 731 container_of(a, struct node_attribute, attr);
729 return nda->store ? nda->store(nd, buf, len) : -EINVAL; 732 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
@@ -768,7 +771,7 @@ static struct dlm_space *get_space(char *name)
768 i = config_group_find_item(space_list, name); 771 i = config_group_find_item(space_list, name);
769 mutex_unlock(&space_list->cg_subsys->su_mutex); 772 mutex_unlock(&space_list->cg_subsys->su_mutex);
770 773
771 return to_space(i); 774 return config_item_to_space(i);
772} 775}
773 776
774static void put_space(struct dlm_space *sp) 777static void put_space(struct dlm_space *sp)
@@ -776,6 +779,33 @@ static void put_space(struct dlm_space *sp)
776 config_item_put(&sp->group.cg_item); 779 config_item_put(&sp->group.cg_item);
777} 780}
778 781
782static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
783{
784 switch (x->ss_family) {
785 case AF_INET: {
786 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
787 struct sockaddr_in *siny = (struct sockaddr_in *)y;
788 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
789 return 0;
790 if (sinx->sin_port != siny->sin_port)
791 return 0;
792 break;
793 }
794 case AF_INET6: {
795 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
796 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
797 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
798 return 0;
799 if (sinx->sin6_port != siny->sin6_port)
800 return 0;
801 break;
802 }
803 default:
804 return 0;
805 }
806 return 1;
807}
808
779static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) 809static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780{ 810{
781 struct config_item *i; 811 struct config_item *i;
@@ -788,7 +818,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
788 mutex_lock(&clusters_root.subsys.su_mutex); 818 mutex_lock(&clusters_root.subsys.su_mutex);
789 819
790 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 820 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
791 cm = to_comm(i); 821 cm = config_item_to_comm(i);
792 822
793 if (nodeid) { 823 if (nodeid) {
794 if (cm->nodeid != nodeid) 824 if (cm->nodeid != nodeid)
@@ -797,8 +827,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
797 config_item_get(i); 827 config_item_get(i);
798 break; 828 break;
799 } else { 829 } else {
800 if (!cm->addr_count || 830 if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
801 memcmp(cm->addr[0], addr, sizeof(*addr)))
802 continue; 831 continue;
803 found = 1; 832 found = 1;
804 config_item_get(i); 833 config_item_get(i);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629c..868e4c9ef127 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
441 uint32_t ls_global_id; /* global unique lockspace ID */ 441 uint32_t ls_global_id; /* global unique lockspace ID */
442 uint32_t ls_exflags; 442 uint32_t ls_exflags;
443 int ls_lvblen; 443 int ls_lvblen;
444 int ls_count; /* reference count */ 444 int ls_count; /* refcount of processes in
445 the dlm using this ls */
446 int ls_create_count; /* create/release refcount */
445 unsigned long ls_flags; /* LSFL_ */ 447 unsigned long ls_flags; /* LSFL_ */
448 unsigned long ls_scan_time;
446 struct kobject ls_kobj; 449 struct kobject ls_kobj;
447 450
448 struct dlm_rsbtable *ls_rsbtbl; 451 struct dlm_rsbtable *ls_rsbtbl;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e96..d910501de6d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
23#include "lock.h" 23#include "lock.h"
24#include "recover.h" 24#include "recover.h"
25#include "requestqueue.h" 25#include "requestqueue.h"
26#include "user.h"
26 27
27static int ls_count; 28static int ls_count;
28static struct mutex ls_lock; 29static struct mutex ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
211 kset_unregister(dlm_kset); 212 kset_unregister(dlm_kset);
212} 213}
213 214
215static struct dlm_ls *find_ls_to_scan(void)
216{
217 struct dlm_ls *ls;
218
219 spin_lock(&lslist_lock);
220 list_for_each_entry(ls, &lslist, ls_list) {
221 if (time_after_eq(jiffies, ls->ls_scan_time +
222 dlm_config.ci_scan_secs * HZ)) {
223 spin_unlock(&lslist_lock);
224 return ls;
225 }
226 }
227 spin_unlock(&lslist_lock);
228 return NULL;
229}
230
214static int dlm_scand(void *data) 231static int dlm_scand(void *data)
215{ 232{
216 struct dlm_ls *ls; 233 struct dlm_ls *ls;
234 int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
217 235
218 while (!kthread_should_stop()) { 236 while (!kthread_should_stop()) {
219 list_for_each_entry(ls, &lslist, ls_list) { 237 ls = find_ls_to_scan();
238 if (ls) {
220 if (dlm_lock_recovery_try(ls)) { 239 if (dlm_lock_recovery_try(ls)) {
240 ls->ls_scan_time = jiffies;
221 dlm_scan_rsbs(ls); 241 dlm_scan_rsbs(ls);
222 dlm_scan_timeout(ls); 242 dlm_scan_timeout(ls);
223 dlm_unlock_recovery(ls); 243 dlm_unlock_recovery(ls);
244 } else {
245 ls->ls_scan_time += HZ;
224 } 246 }
247 } else {
248 schedule_timeout_interruptible(timeout_jiffies);
225 } 249 }
226 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
227 } 250 }
228 return 0; 251 return 0;
229} 252}
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
246 kthread_stop(scand_task); 269 kthread_stop(scand_task);
247} 270}
248 271
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id) 272struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{ 273{
268 struct dlm_ls *ls; 274 struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
327 for (;;) { 333 for (;;) {
328 spin_lock(&lslist_lock); 334 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) { 335 if (ls->ls_count == 0) {
336 WARN_ON(ls->ls_create_count != 0);
330 list_del(&ls->ls_list); 337 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock); 338 spin_unlock(&lslist_lock);
332 return; 339 return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen) 388 uint32_t flags, int lvblen)
382{ 389{
383 struct dlm_ls *ls; 390 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM; 391 int i, size, error;
385 int do_unreg = 0; 392 int do_unreg = 0;
386 393
387 if (namelen > DLM_LOCKSPACE_LEN) 394 if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
393 if (!try_module_get(THIS_MODULE)) 400 if (!try_module_get(THIS_MODULE))
394 return -EINVAL; 401 return -EINVAL;
395 402
396 ls = dlm_find_lockspace_name(name, namelen); 403 if (!dlm_user_daemon_available()) {
397 if (ls) { 404 module_put(THIS_MODULE);
398 *lockspace = ls; 405 return -EUNATCH;
406 }
407
408 error = 0;
409
410 spin_lock(&lslist_lock);
411 list_for_each_entry(ls, &lslist, ls_list) {
412 WARN_ON(ls->ls_create_count <= 0);
413 if (ls->ls_namelen != namelen)
414 continue;
415 if (memcmp(ls->ls_name, name, namelen))
416 continue;
417 if (flags & DLM_LSFL_NEWEXCL) {
418 error = -EEXIST;
419 break;
420 }
421 ls->ls_create_count++;
399 module_put(THIS_MODULE); 422 module_put(THIS_MODULE);
400 return -EEXIST; 423 error = 1; /* not an error, return 0 */
424 break;
401 } 425 }
426 spin_unlock(&lslist_lock);
427
428 if (error < 0)
429 goto out;
430 if (error)
431 goto ret_zero;
432
433 error = -ENOMEM;
402 434
403 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 435 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
404 if (!ls) 436 if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
408 ls->ls_lvblen = lvblen; 440 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0; 441 ls->ls_count = 0;
410 ls->ls_flags = 0; 442 ls->ls_flags = 0;
443 ls->ls_scan_time = jiffies;
411 444
412 if (flags & DLM_LSFL_TIMEWARN) 445 if (flags & DLM_LSFL_TIMEWARN)
413 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 446 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
418 ls->ls_allocation = GFP_KERNEL; 451 ls->ls_allocation = GFP_KERNEL;
419 452
420 /* ls_exflags are forced to match among nodes, and we don't 453 /* ls_exflags are forced to match among nodes, and we don't
421 need to require all nodes to have TIMEWARN or FS set */ 454 need to require all nodes to have some flags set */
422 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS)); 455 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
456 DLM_LSFL_NEWEXCL));
423 457
424 size = dlm_config.ci_rsbtbl_size; 458 size = dlm_config.ci_rsbtbl_size;
425 ls->ls_rsbtbl_size = size; 459 ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
510 down_write(&ls->ls_in_recovery); 544 down_write(&ls->ls_in_recovery);
511 545
512 spin_lock(&lslist_lock); 546 spin_lock(&lslist_lock);
547 ls->ls_create_count = 1;
513 list_add(&ls->ls_list, &lslist); 548 list_add(&ls->ls_list, &lslist);
514 spin_unlock(&lslist_lock); 549 spin_unlock(&lslist_lock);
515 550
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
548 dlm_create_debug_file(ls); 583 dlm_create_debug_file(ls);
549 584
550 log_debug(ls, "join complete"); 585 log_debug(ls, "join complete");
551 586 ret_zero:
552 *lockspace = ls; 587 *lockspace = ls;
553 return 0; 588 return 0;
554 589
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
635 struct dlm_lkb *lkb; 670 struct dlm_lkb *lkb;
636 struct dlm_rsb *rsb; 671 struct dlm_rsb *rsb;
637 struct list_head *head; 672 struct list_head *head;
638 int i; 673 int i, busy, rv;
639 int busy = lockspace_busy(ls); 674
675 busy = lockspace_busy(ls);
676
677 spin_lock(&lslist_lock);
678 if (ls->ls_create_count == 1) {
679 if (busy > force)
680 rv = -EBUSY;
681 else {
682 /* remove_lockspace takes ls off lslist */
683 ls->ls_create_count = 0;
684 rv = 0;
685 }
686 } else if (ls->ls_create_count > 1) {
687 rv = --ls->ls_create_count;
688 } else {
689 rv = -EINVAL;
690 }
691 spin_unlock(&lslist_lock);
640 692
641 if (busy > force) 693 if (rv) {
642 return -EBUSY; 694 log_debug(ls, "release_lockspace no remove %d", rv);
695 return rv;
696 }
697
698 dlm_device_deregister(ls);
643 699
644 if (force < 3) 700 if (force < 3 && dlm_user_daemon_available())
645 do_uevent(ls, 0); 701 do_uevent(ls, 0);
646 702
647 dlm_recoverd_stop(ls); 703 dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
720 dlm_clear_members(ls); 776 dlm_clear_members(ls);
721 dlm_clear_members_gone(ls); 777 dlm_clear_members_gone(ls);
722 kfree(ls->ls_node_array); 778 kfree(ls->ls_node_array);
779 log_debug(ls, "release_lockspace final free");
723 kobject_put(&ls->ls_kobj); 780 kobject_put(&ls->ls_kobj);
724 /* The ls structure will be freed when the kobject is done with */ 781 /* The ls structure will be freed when the kobject is done with */
725 782
726 mutex_lock(&ls_lock);
727 ls_count--;
728 if (!ls_count)
729 threads_stop();
730 mutex_unlock(&ls_lock);
731
732 module_put(THIS_MODULE); 783 module_put(THIS_MODULE);
733 return 0; 784 return 0;
734} 785}
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
750int dlm_release_lockspace(void *lockspace, int force) 801int dlm_release_lockspace(void *lockspace, int force)
751{ 802{
752 struct dlm_ls *ls; 803 struct dlm_ls *ls;
804 int error;
753 805
754 ls = dlm_find_lockspace_local(lockspace); 806 ls = dlm_find_lockspace_local(lockspace);
755 if (!ls) 807 if (!ls)
756 return -EINVAL; 808 return -EINVAL;
757 dlm_put_lockspace(ls); 809 dlm_put_lockspace(ls);
758 return release_lockspace(ls, force); 810
811 mutex_lock(&ls_lock);
812 error = release_lockspace(ls, force);
813 if (!error)
814 ls_count--;
815 else if (!ls_count)
816 threads_stop();
817 mutex_unlock(&ls_lock);
818
819 return error;
820}
821
822void dlm_stop_lockspaces(void)
823{
824 struct dlm_ls *ls;
825
826 restart:
827 spin_lock(&lslist_lock);
828 list_for_each_entry(ls, &lslist, ls_list) {
829 if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
830 continue;
831 spin_unlock(&lslist_lock);
832 log_error(ls, "no userland control daemon, stopping lockspace");
833 dlm_ls_stop(ls);
834 goto restart;
835 }
836 spin_unlock(&lslist_lock);
759} 837}
760 838
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd021..f879f87901f8 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id); 20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor); 21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls); 22void dlm_put_lockspace(struct dlm_ls *ls);
23void dlm_stop_lockspaces(void);
23 24
24#endif /* __LOCKSPACE_DOT_H__ */ 25#endif /* __LOCKSPACE_DOT_H__ */
25 26
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 34f14a14fb4e..b3832c67194a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -15,7 +15,6 @@
15#include <linux/poll.h> 15#include <linux/poll.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
19#include <linux/dlm.h> 18#include <linux/dlm.h>
20#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
21 20
@@ -27,6 +26,8 @@
27 26
28static const char name_prefix[] = "dlm"; 27static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 28static const struct file_operations device_fops;
29static atomic_t dlm_monitor_opened;
30static int dlm_monitor_unused = 1;
30 31
31#ifdef CONFIG_COMPAT 32#ifdef CONFIG_COMPAT
32 33
@@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
340 return error; 341 return error;
341} 342}
342 343
343static int create_misc_device(struct dlm_ls *ls, char *name) 344static int dlm_device_register(struct dlm_ls *ls, char *name)
344{ 345{
345 int error, len; 346 int error, len;
346 347
348 /* The device is already registered. This happens when the
349 lockspace is created multiple times from userspace. */
350 if (ls->ls_device.name)
351 return 0;
352
347 error = -ENOMEM; 353 error = -ENOMEM;
348 len = strlen(name) + strlen(name_prefix) + 2; 354 len = strlen(name) + strlen(name_prefix) + 2;
349 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 355 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +369,22 @@ fail:
363 return error; 369 return error;
364} 370}
365 371
372int dlm_device_deregister(struct dlm_ls *ls)
373{
374 int error;
375
376 /* The device is not registered. This happens when the lockspace
377 was never used from userspace, or when device_create_lockspace()
378 calls dlm_release_lockspace() after the register fails. */
379 if (!ls->ls_device.name)
380 return 0;
381
382 error = misc_deregister(&ls->ls_device);
383 if (!error)
384 kfree(ls->ls_device.name);
385 return error;
386}
387
366static int device_user_purge(struct dlm_user_proc *proc, 388static int device_user_purge(struct dlm_user_proc *proc,
367 struct dlm_purge_params *params) 389 struct dlm_purge_params *params)
368{ 390{
@@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
397 if (!ls) 419 if (!ls)
398 return -ENOENT; 420 return -ENOENT;
399 421
400 error = create_misc_device(ls, params->name); 422 error = dlm_device_register(ls, params->name);
401 dlm_put_lockspace(ls); 423 dlm_put_lockspace(ls);
402 424
403 if (error) 425 if (error)
@@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
421 if (!ls) 443 if (!ls)
422 return -ENOENT; 444 return -ENOENT;
423 445
424 /* Deregister the misc device first, so we don't have
425 * a device that's not attached to a lockspace. If
426 * dlm_release_lockspace fails then we can recreate it
427 */
428 error = misc_deregister(&ls->ls_device);
429 if (error) {
430 dlm_put_lockspace(ls);
431 goto out;
432 }
433 kfree(ls->ls_device.name);
434
435 if (params->flags & DLM_USER_LSFLG_FORCEFREE) 446 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
436 force = 2; 447 force = 2;
437 448
438 lockspace = ls->ls_local_handle; 449 lockspace = ls->ls_local_handle;
450 dlm_put_lockspace(ls);
439 451
440 /* dlm_release_lockspace waits for references to go to zero, 452 /* The final dlm_release_lockspace waits for references to go to
441 so all processes will need to close their device for the ls 453 zero, so all processes will need to close their device for the
442 before the release will procede */ 454 ls before the release will proceed. release also calls the
455 device_deregister above. Converting a positive return value
456 from release to zero means that userspace won't know when its
457 release was the final one, but it shouldn't need to know. */
443 458
444 dlm_put_lockspace(ls);
445 error = dlm_release_lockspace(lockspace, force); 459 error = dlm_release_lockspace(lockspace, force);
446 if (error) 460 if (error > 0)
447 create_misc_device(ls, ls->ls_name); 461 error = 0;
448 out:
449 return error; 462 return error;
450} 463}
451 464
@@ -623,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
623 struct dlm_user_proc *proc; 636 struct dlm_user_proc *proc;
624 struct dlm_ls *ls; 637 struct dlm_ls *ls;
625 638
626 lock_kernel();
627 ls = dlm_find_lockspace_device(iminor(inode)); 639 ls = dlm_find_lockspace_device(iminor(inode));
628 if (!ls) { 640 if (!ls)
629 unlock_kernel();
630 return -ENOENT; 641 return -ENOENT;
631 }
632 642
633 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 643 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
634 if (!proc) { 644 if (!proc) {
635 dlm_put_lockspace(ls); 645 dlm_put_lockspace(ls);
636 unlock_kernel();
637 return -ENOMEM; 646 return -ENOMEM;
638 } 647 }
639 648
@@ -645,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
645 spin_lock_init(&proc->locks_spin); 654 spin_lock_init(&proc->locks_spin);
646 init_waitqueue_head(&proc->wait); 655 init_waitqueue_head(&proc->wait);
647 file->private_data = proc; 656 file->private_data = proc;
648 unlock_kernel();
649 657
650 return 0; 658 return 0;
651} 659}
@@ -878,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
878 return 0; 886 return 0;
879} 887}
880 888
889int dlm_user_daemon_available(void)
890{
891 /* dlm_controld hasn't started (or, has started, but not
892 properly populated configfs) */
893
894 if (!dlm_our_nodeid())
895 return 0;
896
897 /* This is to deal with versions of dlm_controld that don't
898 know about the monitor device. We assume that if the
899 dlm_controld was started (above), but the monitor device
900 was never opened, that it's an old version. dlm_controld
901 should open the monitor device before populating configfs. */
902
903 if (dlm_monitor_unused)
904 return 1;
905
906 return atomic_read(&dlm_monitor_opened) ? 1 : 0;
907}
908
881static int ctl_device_open(struct inode *inode, struct file *file) 909static int ctl_device_open(struct inode *inode, struct file *file)
882{ 910{
883 cycle_kernel_lock();
884 file->private_data = NULL; 911 file->private_data = NULL;
885 return 0; 912 return 0;
886} 913}
@@ -890,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
890 return 0; 917 return 0;
891} 918}
892 919
920static int monitor_device_open(struct inode *inode, struct file *file)
921{
922 atomic_inc(&dlm_monitor_opened);
923 dlm_monitor_unused = 0;
924 return 0;
925}
926
927static int monitor_device_close(struct inode *inode, struct file *file)
928{
929 if (atomic_dec_and_test(&dlm_monitor_opened))
930 dlm_stop_lockspaces();
931 return 0;
932}
933
893static const struct file_operations device_fops = { 934static const struct file_operations device_fops = {
894 .open = device_open, 935 .open = device_open,
895 .release = device_close, 936 .release = device_close,
@@ -913,19 +954,42 @@ static struct miscdevice ctl_device = {
913 .minor = MISC_DYNAMIC_MINOR, 954 .minor = MISC_DYNAMIC_MINOR,
914}; 955};
915 956
957static const struct file_operations monitor_device_fops = {
958 .open = monitor_device_open,
959 .release = monitor_device_close,
960 .owner = THIS_MODULE,
961};
962
963static struct miscdevice monitor_device = {
964 .name = "dlm-monitor",
965 .fops = &monitor_device_fops,
966 .minor = MISC_DYNAMIC_MINOR,
967};
968
916int __init dlm_user_init(void) 969int __init dlm_user_init(void)
917{ 970{
918 int error; 971 int error;
919 972
973 atomic_set(&dlm_monitor_opened, 0);
974
920 error = misc_register(&ctl_device); 975 error = misc_register(&ctl_device);
921 if (error) 976 if (error) {
922 log_print("misc_register failed for control device"); 977 log_print("misc_register failed for control device");
978 goto out;
979 }
923 980
981 error = misc_register(&monitor_device);
982 if (error) {
983 log_print("misc_register failed for monitor device");
984 misc_deregister(&ctl_device);
985 }
986 out:
924 return error; 987 return error;
925} 988}
926 989
927void dlm_user_exit(void) 990void dlm_user_exit(void)
928{ 991{
929 misc_deregister(&ctl_device); 992 misc_deregister(&ctl_device);
993 misc_deregister(&monitor_device);
930} 994}
931 995
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e4151..35eb6a13d616 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls);
16int dlm_user_daemon_available(void);
15 17
16#endif 18#endif
diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..cecee501ce78 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
752 tsk->active_mm = mm; 752 tsk->active_mm = mm;
753 activate_mm(active_mm, mm); 753 activate_mm(active_mm, mm);
754 task_unlock(tsk); 754 task_unlock(tsk);
755 mm_update_next_owner(old_mm);
756 arch_pick_mmap_layout(mm); 755 arch_pick_mmap_layout(mm);
757 if (old_mm) { 756 if (old_mm) {
758 up_read(&old_mm->mmap_sem); 757 up_read(&old_mm->mmap_sem);
759 BUG_ON(active_mm != old_mm); 758 BUG_ON(active_mm != old_mm);
759 mm_update_next_owner(old_mm);
760 mmput(old_mm); 760 mmput(old_mm);
761 return 0; 761 return 0;
762 } 762 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33b..bae998c1e44e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
133extern int ext2_setattr (struct dentry *, struct iattr *); 133extern int ext2_setattr (struct dentry *, struct iattr *);
134extern void ext2_set_inode_flags(struct inode *inode); 134extern void ext2_set_inode_flags(struct inode *inode);
135extern void ext2_get_inode_flags(struct ext2_inode_info *); 135extern void ext2_get_inode_flags(struct ext2_inode_info *);
136extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
137 u64 start, u64 len);
136int __ext2_write_begin(struct file *file, struct address_space *mapping, 138int __ext2_write_begin(struct file *file, struct address_space *mapping,
137 loff_t pos, unsigned len, unsigned flags, 139 loff_t pos, unsigned len, unsigned flags,
138 struct page **pagep, void **fsdata); 140 struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c36293..45ed07122182 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .permission = ext2_permission,
89 .fiemap = ext2_fiemap,
89}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51f..7658b33e2653 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
31#include <linux/writeback.h> 31#include <linux/writeback.h>
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "acl.h" 36#include "acl.h"
36#include "xip.h" 37#include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
704 705
705} 706}
706 707
708int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
709 u64 start, u64 len)
710{
711 return generic_block_fiemap(inode, fieinfo, start, len,
712 ext2_get_block);
713}
714
707static int ext2_writepage(struct page *page, struct writeback_control *wbc) 715static int ext2_writepage(struct page *page, struct writeback_control *wbc)
708{ 716{
709 return block_write_full_page(page, ext2_get_block, wbc); 717 return block_write_full_page(page, ext2_get_block, wbc);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d3019..3be1e0689c9a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
134 .removexattr = generic_removexattr, 134 .removexattr = generic_removexattr,
135#endif 135#endif
136 .permission = ext3_permission, 136 .permission = ext3_permission,
137 .fiemap = ext3_fiemap,
137}; 138};
138 139
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b111..ebfec4d0148e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h>
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
41 42
@@ -981,6 +982,13 @@ out:
981 return ret; 982 return ret;
982} 983}
983 984
985int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
986 u64 start, u64 len)
987{
988 return generic_block_fiemap(inode, fieinfo, start, len,
989 ext3_get_block);
990}
991
984/* 992/*
985 * `handle' can be NULL if create is zero 993 * `handle' can be NULL if create is zero
986 */ 994 */
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2f..a8ff003a00f7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
2# Makefile for the linux ext4-filesystem routines. 2# Makefile for the linux ext4-filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
13ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o 13ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d6..cb45257a246e 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
51 } 51 }
52} 52}
53 53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl 56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */ 57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1) 58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59 59
60/* acl.c */ 60/* acl.c */
61extern int ext4_permission (struct inode *, int); 61extern int ext4_permission(struct inode *, int);
62extern int ext4_acl_chmod (struct inode *); 62extern int ext4_acl_chmod(struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); 63extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
64 64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 65#else /* CONFIG_EXT4_FS_POSIX_ACL */
66#include <linux/sched.h> 66#include <linux/sched.h>
67#define ext4_permission NULL 67#define ext4_permission NULL
68 68
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{ 77{
78 return 0; 78 return 0;
79} 79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 80#endif /* CONFIG_EXT4_FS_POSIX_ACL */
81 81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e9fa960ba6da..bd2ece228827 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
83 } 83 }
84 return used_blocks; 84 return used_blocks;
85} 85}
86
86/* Initializes an uninitialized block bitmap if given, and returns the 87/* Initializes an uninitialized block bitmap if given, and returns the
87 * number of blocks free in the group. */ 88 * number of blocks free in the group. */
88unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 89unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
132 */ 133 */
133 group_blocks = ext4_blocks_count(sbi->s_es) - 134 group_blocks = ext4_blocks_count(sbi->s_es) -
134 le32_to_cpu(sbi->s_es->s_first_data_block) - 135 le32_to_cpu(sbi->s_es->s_first_data_block) -
135 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); 136 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
136 } else { 137 } else {
137 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 138 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
138 } 139 }
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
200 * @bh: pointer to the buffer head to store the block 201 * @bh: pointer to the buffer head to store the block
201 * group descriptor 202 * group descriptor
202 */ 203 */
203struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 204struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
204 ext4_group_t block_group, 205 ext4_group_t block_group,
205 struct buffer_head ** bh) 206 struct buffer_head **bh)
206{ 207{
207 unsigned long group_desc; 208 unsigned long group_desc;
208 unsigned long offset; 209 unsigned long offset;
209 struct ext4_group_desc * desc; 210 struct ext4_group_desc *desc;
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 211 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 212
212 if (block_group >= sbi->s_groups_count) { 213 if (block_group >= sbi->s_groups_count) {
213 ext4_error (sb, "ext4_get_group_desc", 214 ext4_error(sb, "ext4_get_group_desc",
214 "block_group >= groups_count - " 215 "block_group >= groups_count - "
215 "block_group = %lu, groups_count = %lu", 216 "block_group = %lu, groups_count = %lu",
216 block_group, sbi->s_groups_count); 217 block_group, sbi->s_groups_count);
217 218
218 return NULL; 219 return NULL;
219 } 220 }
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
222 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 223 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
223 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 224 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
224 if (!sbi->s_group_desc[group_desc]) { 225 if (!sbi->s_group_desc[group_desc]) {
225 ext4_error (sb, "ext4_get_group_desc", 226 ext4_error(sb, "ext4_get_group_desc",
226 "Group descriptor not loaded - " 227 "Group descriptor not loaded - "
227 "block_group = %lu, group_desc = %lu, desc = %lu", 228 "block_group = %lu, group_desc = %lu, desc = %lu",
228 block_group, group_desc, offset); 229 block_group, group_desc, offset);
229 return NULL; 230 return NULL;
230 } 231 }
231 232
@@ -302,8 +303,8 @@ err_out:
302struct buffer_head * 303struct buffer_head *
303ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) 304ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
304{ 305{
305 struct ext4_group_desc * desc; 306 struct ext4_group_desc *desc;
306 struct buffer_head * bh = NULL; 307 struct buffer_head *bh = NULL;
307 ext4_fsblk_t bitmap_blk; 308 ext4_fsblk_t bitmap_blk;
308 309
309 desc = ext4_get_group_desc(sb, block_group, NULL); 310 desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
318 block_group, bitmap_blk); 319 block_group, bitmap_blk);
319 return NULL; 320 return NULL;
320 } 321 }
321 if (bh_uptodate_or_lock(bh)) 322 if (buffer_uptodate(bh) &&
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
322 return bh; 324 return bh;
323 325
326 lock_buffer(bh);
324 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
325 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
326 ext4_init_block_bitmap(sb, bh, block_group, desc); 329 ext4_init_block_bitmap(sb, bh, block_group, desc);
@@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
345 */ 348 */
346 return bh; 349 return bh;
347} 350}
348/*
349 * The reservation window structure operations
350 * --------------------------------------------
351 * Operations include:
352 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
353 *
354 * We use a red-black tree to represent per-filesystem reservation
355 * windows.
356 *
357 */
358
359/**
360 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
361 * @rb_root: root of per-filesystem reservation rb tree
362 * @verbose: verbose mode
363 * @fn: function which wishes to dump the reservation map
364 *
365 * If verbose is turned on, it will print the whole block reservation
366 * windows(start, end). Otherwise, it will only print out the "bad" windows,
367 * those windows that overlap with their immediate neighbors.
368 */
369#if 1
370static void __rsv_window_dump(struct rb_root *root, int verbose,
371 const char *fn)
372{
373 struct rb_node *n;
374 struct ext4_reserve_window_node *rsv, *prev;
375 int bad;
376
377restart:
378 n = rb_first(root);
379 bad = 0;
380 prev = NULL;
381
382 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
383 while (n) {
384 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
385 if (verbose)
386 printk("reservation window 0x%p "
387 "start: %llu, end: %llu\n",
388 rsv, rsv->rsv_start, rsv->rsv_end);
389 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
390 printk("Bad reservation %p (start >= end)\n",
391 rsv);
392 bad = 1;
393 }
394 if (prev && prev->rsv_end >= rsv->rsv_start) {
395 printk("Bad reservation %p (prev->end >= start)\n",
396 rsv);
397 bad = 1;
398 }
399 if (bad) {
400 if (!verbose) {
401 printk("Restarting reservation walk in verbose mode\n");
402 verbose = 1;
403 goto restart;
404 }
405 }
406 n = rb_next(n);
407 prev = rsv;
408 }
409 printk("Window map complete.\n");
410 BUG_ON(bad);
411}
412#define rsv_window_dump(root, verbose) \
413 __rsv_window_dump((root), (verbose), __func__)
414#else
415#define rsv_window_dump(root, verbose) do {} while (0)
416#endif
417
418/**
419 * goal_in_my_reservation()
420 * @rsv: inode's reservation window
421 * @grp_goal: given goal block relative to the allocation block group
422 * @group: the current allocation block group
423 * @sb: filesystem super block
424 *
425 * Test if the given goal block (group relative) is within the file's
426 * own block reservation window range.
427 *
428 * If the reservation window is outside the goal allocation group, return 0;
429 * grp_goal (given goal block) could be -1, which means no specific
430 * goal block. In this case, always return 1.
431 * If the goal block is within the reservation window, return 1;
432 * otherwise, return 0;
433 */
434static int
435goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
436 ext4_group_t group, struct super_block *sb)
437{
438 ext4_fsblk_t group_first_block, group_last_block;
439
440 group_first_block = ext4_group_first_block_no(sb, group);
441 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
442
443 if ((rsv->_rsv_start > group_last_block) ||
444 (rsv->_rsv_end < group_first_block))
445 return 0;
446 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
447 || (grp_goal + group_first_block > rsv->_rsv_end)))
448 return 0;
449 return 1;
450}
451
452/**
453 * search_reserve_window()
454 * @rb_root: root of reservation tree
455 * @goal: target allocation block
456 *
457 * Find the reserved window which includes the goal, or the previous one
458 * if the goal is not in any window.
459 * Returns NULL if there are no windows or if all windows start after the goal.
460 */
461static struct ext4_reserve_window_node *
462search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
463{
464 struct rb_node *n = root->rb_node;
465 struct ext4_reserve_window_node *rsv;
466
467 if (!n)
468 return NULL;
469
470 do {
471 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
472
473 if (goal < rsv->rsv_start)
474 n = n->rb_left;
475 else if (goal > rsv->rsv_end)
476 n = n->rb_right;
477 else
478 return rsv;
479 } while (n);
480 /*
481 * We've fallen off the end of the tree: the goal wasn't inside
482 * any particular node. OK, the previous node must be to one
483 * side of the interval containing the goal. If it's the RHS,
484 * we need to back up one.
485 */
486 if (rsv->rsv_start > goal) {
487 n = rb_prev(&rsv->rsv_node);
488 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
489 }
490 return rsv;
491}
492
493/**
494 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
495 * @sb: super block
496 * @rsv: reservation window to add
497 *
498 * Must be called with rsv_lock hold.
499 */
500void ext4_rsv_window_add(struct super_block *sb,
501 struct ext4_reserve_window_node *rsv)
502{
503 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
504 struct rb_node *node = &rsv->rsv_node;
505 ext4_fsblk_t start = rsv->rsv_start;
506
507 struct rb_node ** p = &root->rb_node;
508 struct rb_node * parent = NULL;
509 struct ext4_reserve_window_node *this;
510
511 while (*p)
512 {
513 parent = *p;
514 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
515
516 if (start < this->rsv_start)
517 p = &(*p)->rb_left;
518 else if (start > this->rsv_end)
519 p = &(*p)->rb_right;
520 else {
521 rsv_window_dump(root, 1);
522 BUG();
523 }
524 }
525
526 rb_link_node(node, parent, p);
527 rb_insert_color(node, root);
528}
529
530/**
531 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
532 * @sb: super block
533 * @rsv: reservation window to remove
534 *
535 * Mark the block reservation window as not allocated, and unlink it
536 * from the filesystem reservation window rb tree. Must be called with
537 * rsv_lock hold.
538 */
539static void rsv_window_remove(struct super_block *sb,
540 struct ext4_reserve_window_node *rsv)
541{
542 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
543 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
544 rsv->rsv_alloc_hit = 0;
545 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
546}
547
548/*
549 * rsv_is_empty() -- Check if the reservation window is allocated.
550 * @rsv: given reservation window to check
551 *
552 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
553 */
554static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
555{
556 /* a valid reservation end block could not be 0 */
557 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
558}
559
560/**
561 * ext4_init_block_alloc_info()
562 * @inode: file inode structure
563 *
564 * Allocate and initialize the reservation window structure, and
565 * link the window to the ext4 inode structure at last
566 *
567 * The reservation window structure is only dynamically allocated
568 * and linked to ext4 inode the first time the open file
569 * needs a new block. So, before every ext4_new_block(s) call, for
570 * regular files, we should check whether the reservation window
571 * structure exists or not. In the latter case, this function is called.
572 * Fail to do so will result in block reservation being turned off for that
573 * open file.
574 *
575 * This function is called from ext4_get_blocks_handle(), also called
576 * when setting the reservation window size through ioctl before the file
577 * is open for write (needs block allocation).
578 *
579 * Needs down_write(i_data_sem) protection prior to call this function.
580 */
581void ext4_init_block_alloc_info(struct inode *inode)
582{
583 struct ext4_inode_info *ei = EXT4_I(inode);
584 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
585 struct super_block *sb = inode->i_sb;
586
587 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
588 if (block_i) {
589 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
590
591 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
592 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
593
594 /*
595 * if filesystem is mounted with NORESERVATION, the goal
596 * reservation window size is set to zero to indicate
597 * block reservation is off
598 */
599 if (!test_opt(sb, RESERVATION))
600 rsv->rsv_goal_size = 0;
601 else
602 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
603 rsv->rsv_alloc_hit = 0;
604 block_i->last_alloc_logical_block = 0;
605 block_i->last_alloc_physical_block = 0;
606 }
607 ei->i_block_alloc_info = block_i;
608}
609
610/**
611 * ext4_discard_reservation()
612 * @inode: inode
613 *
614 * Discard(free) block reservation window on last file close, or truncate
615 * or at last iput().
616 *
617 * It is being called in three cases:
618 * ext4_release_file(): last writer close the file
619 * ext4_clear_inode(): last iput(), when nobody link to this file.
620 * ext4_truncate(): when the block indirect map is about to change.
621 *
622 */
623void ext4_discard_reservation(struct inode *inode)
624{
625 struct ext4_inode_info *ei = EXT4_I(inode);
626 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
627 struct ext4_reserve_window_node *rsv;
628 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
629
630 ext4_mb_discard_inode_preallocations(inode);
631
632 if (!block_i)
633 return;
634
635 rsv = &block_i->rsv_window_node;
636 if (!rsv_is_empty(&rsv->rsv_window)) {
637 spin_lock(rsv_lock);
638 if (!rsv_is_empty(&rsv->rsv_window))
639 rsv_window_remove(inode->i_sb, rsv);
640 spin_unlock(rsv_lock);
641 }
642}
643 351
644/** 352/**
645 * ext4_free_blocks_sb() -- Free given blocks and update quota 353 * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
648 * @block: start physcial block to free 356 * @block: start physcial block to free
649 * @count: number of blocks to free 357 * @count: number of blocks to free
650 * @pdquot_freed_blocks: pointer to quota 358 * @pdquot_freed_blocks: pointer to quota
359 *
360 * XXX This function is only used by the on-line resizing code, which
361 * should probably be fixed up to call the mballoc variant. There
362 * this needs to be cleaned up later; in fact, I'm not convinced this
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
651 */ 366 */
652void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
653 ext4_fsblk_t block, unsigned long count, 368 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
659 ext4_grpblk_t bit; 374 ext4_grpblk_t bit;
660 unsigned long i; 375 unsigned long i;
661 unsigned long overflow; 376 unsigned long overflow;
662 struct ext4_group_desc * desc; 377 struct ext4_group_desc *desc;
663 struct ext4_super_block * es; 378 struct ext4_super_block *es;
664 struct ext4_sb_info *sbi; 379 struct ext4_sb_info *sbi;
665 int err = 0, ret; 380 int err = 0, ret;
666 ext4_grpblk_t group_freed; 381 ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
671 if (block < le32_to_cpu(es->s_first_data_block) || 386 if (block < le32_to_cpu(es->s_first_data_block) ||
672 block + count < block || 387 block + count < block ||
673 block + count > ext4_blocks_count(es)) { 388 block + count > ext4_blocks_count(es)) {
674 ext4_error (sb, "ext4_free_blocks", 389 ext4_error(sb, "ext4_free_blocks",
675 "Freeing blocks not in datazone - " 390 "Freeing blocks not in datazone - "
676 "block = %llu, count = %lu", block, count); 391 "block = %llu, count = %lu", block, count);
677 goto error_return; 392 goto error_return;
678 } 393 }
679 394
680 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); 395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
681 396
682do_more: 397do_more:
683 overflow = 0; 398 overflow = 0;
@@ -694,7 +409,7 @@ do_more:
694 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 409 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
695 if (!bitmap_bh) 410 if (!bitmap_bh)
696 goto error_return; 411 goto error_return;
697 desc = ext4_get_group_desc (sb, block_group, &gd_bh); 412 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
698 if (!desc) 413 if (!desc)
699 goto error_return; 414 goto error_return;
700 415
@@ -703,10 +418,10 @@ do_more:
703 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
704 in_range(block + count - 1, ext4_inode_table(sb, desc), 419 in_range(block + count - 1, ext4_inode_table(sb, desc),
705 sbi->s_itb_per_group)) { 420 sbi->s_itb_per_group)) {
706 ext4_error (sb, "ext4_free_blocks", 421 ext4_error(sb, "ext4_free_blocks",
707 "Freeing blocks in system zones - " 422 "Freeing blocks in system zones - "
708 "Block = %llu, count = %lu", 423 "Block = %llu, count = %lu",
709 block, count); 424 block, count);
710 goto error_return; 425 goto error_return;
711 } 426 }
712 427
@@ -848,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
848 ext4_fsblk_t block, unsigned long count, 563 ext4_fsblk_t block, unsigned long count,
849 int metadata) 564 int metadata)
850{ 565{
851 struct super_block * sb; 566 struct super_block *sb;
852 unsigned long dquot_freed_blocks; 567 unsigned long dquot_freed_blocks;
853 568
854 /* this isn't the right place to decide whether block is metadata 569 /* this isn't the right place to decide whether block is metadata
@@ -859,748 +574,52 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
859 574
860 sb = inode->i_sb; 575 sb = inode->i_sb;
861 576
862 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) 577 ext4_mb_free_blocks(handle, inode, block, count,
863 ext4_free_blocks_sb(handle, sb, block, count, 578 metadata, &dquot_freed_blocks);
864 &dquot_freed_blocks);
865 else
866 ext4_mb_free_blocks(handle, inode, block, count,
867 metadata, &dquot_freed_blocks);
868 if (dquot_freed_blocks) 579 if (dquot_freed_blocks)
869 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 580 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
870 return; 581 return;
871} 582}
872 583
873/** 584int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
874 * ext4_test_allocatable() 585 s64 nblocks)
875 * @nr: given allocation block group
876 * @bh: bufferhead contains the bitmap of the given block group
877 *
878 * For ext4 allocations, we must not reuse any blocks which are
879 * allocated in the bitmap buffer's "last committed data" copy. This
880 * prevents deletes from freeing up the page for reuse until we have
881 * committed the delete transaction.
882 *
883 * If we didn't do this, then deleting something and reallocating it as
884 * data would allow the old block to be overwritten before the
885 * transaction committed (because we force data to disk before commit).
886 * This would lead to corruption if we crashed between overwriting the
887 * data and committing the delete.
888 *
889 * @@@ We may want to make this allocation behaviour conditional on
890 * data-writes at some point, and disable it for metadata allocations or
891 * sync-data inodes.
892 */
893static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
894{
895 int ret;
896 struct journal_head *jh = bh2jh(bh);
897
898 if (ext4_test_bit(nr, bh->b_data))
899 return 0;
900
901 jbd_lock_bh_state(bh);
902 if (!jh->b_committed_data)
903 ret = 1;
904 else
905 ret = !ext4_test_bit(nr, jh->b_committed_data);
906 jbd_unlock_bh_state(bh);
907 return ret;
908}
909
910/**
911 * bitmap_search_next_usable_block()
912 * @start: the starting block (group relative) of the search
913 * @bh: bufferhead contains the block group bitmap
914 * @maxblocks: the ending block (group relative) of the reservation
915 *
916 * The bitmap search --- search forward alternately through the actual
917 * bitmap on disk and the last-committed copy in journal, until we find a
918 * bit free in both bitmaps.
919 */
920static ext4_grpblk_t
921bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
922 ext4_grpblk_t maxblocks)
923{ 586{
924 ext4_grpblk_t next; 587 s64 free_blocks, dirty_blocks;
925 struct journal_head *jh = bh2jh(bh); 588 s64 root_blocks = 0;
926 589 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
927 while (start < maxblocks) { 590 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
928 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
929 if (next >= maxblocks)
930 return -1;
931 if (ext4_test_allocatable(next, bh))
932 return next;
933 jbd_lock_bh_state(bh);
934 if (jh->b_committed_data)
935 start = ext4_find_next_zero_bit(jh->b_committed_data,
936 maxblocks, next);
937 jbd_unlock_bh_state(bh);
938 }
939 return -1;
940}
941 591
942/** 592 free_blocks = percpu_counter_read_positive(fbc);
943 * find_next_usable_block() 593 dirty_blocks = percpu_counter_read_positive(dbc);
944 * @start: the starting block (group relative) to find next
945 * allocatable block in bitmap.
946 * @bh: bufferhead contains the block group bitmap
947 * @maxblocks: the ending block (group relative) for the search
948 *
949 * Find an allocatable block in a bitmap. We honor both the bitmap and
950 * its last-committed copy (if that exists), and perform the "most
951 * appropriate allocation" algorithm of looking for a free block near
952 * the initial goal; then for a free byte somewhere in the bitmap; then
953 * for any free bit in the bitmap.
954 */
955static ext4_grpblk_t
956find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
957 ext4_grpblk_t maxblocks)
958{
959 ext4_grpblk_t here, next;
960 char *p, *r;
961
962 if (start > 0) {
963 /*
964 * The goal was occupied; search forward for a free
965 * block within the next XX blocks.
966 *
967 * end_goal is more or less random, but it has to be
968 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
969 * next 64-bit boundary is simple..
970 */
971 ext4_grpblk_t end_goal = (start + 63) & ~63;
972 if (end_goal > maxblocks)
973 end_goal = maxblocks;
974 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
975 if (here < end_goal && ext4_test_allocatable(here, bh))
976 return here;
977 ext4_debug("Bit not found near goal\n");
978 }
979
980 here = start;
981 if (here < 0)
982 here = 0;
983
984 p = ((char *)bh->b_data) + (here >> 3);
985 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
986 next = (r - ((char *)bh->b_data)) << 3;
987
988 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
989 return next;
990
991 /*
992 * The bitmap search --- search forward alternately through the actual
993 * bitmap and the last-committed copy until we find a bit free in
994 * both
995 */
996 here = bitmap_search_next_usable_block(here, bh, maxblocks);
997 return here;
998}
999
1000/**
1001 * claim_block()
1002 * @block: the free block (group relative) to allocate
1003 * @bh: the bufferhead containts the block group bitmap
1004 *
1005 * We think we can allocate this block in this bitmap. Try to set the bit.
1006 * If that succeeds then check that nobody has allocated and then freed the
1007 * block since we saw that is was not marked in b_committed_data. If it _was_
1008 * allocated and freed then clear the bit in the bitmap again and return
1009 * zero (failure).
1010 */
1011static inline int
1012claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
1013{
1014 struct journal_head *jh = bh2jh(bh);
1015 int ret;
1016
1017 if (ext4_set_bit_atomic(lock, block, bh->b_data))
1018 return 0;
1019 jbd_lock_bh_state(bh);
1020 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
1021 ext4_clear_bit_atomic(lock, block, bh->b_data);
1022 ret = 0;
1023 } else {
1024 ret = 1;
1025 }
1026 jbd_unlock_bh_state(bh);
1027 return ret;
1028}
1029 594
1030/** 595 if (!capable(CAP_SYS_RESOURCE) &&
1031 * ext4_try_to_allocate() 596 sbi->s_resuid != current->fsuid &&
1032 * @sb: superblock 597 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1033 * @handle: handle to this transaction 598 root_blocks = ext4_r_blocks_count(sbi->s_es);
1034 * @group: given allocation block group
1035 * @bitmap_bh: bufferhead holds the block bitmap
1036 * @grp_goal: given target block within the group
1037 * @count: target number of blocks to allocate
1038 * @my_rsv: reservation window
1039 *
1040 * Attempt to allocate blocks within a give range. Set the range of allocation
1041 * first, then find the first free bit(s) from the bitmap (within the range),
1042 * and at last, allocate the blocks by claiming the found free bit as allocated.
1043 *
1044 * To set the range of this allocation:
1045 * if there is a reservation window, only try to allocate block(s) from the
1046 * file's own reservation window;
1047 * Otherwise, the allocation range starts from the give goal block, ends at
1048 * the block group's last block.
1049 *
1050 * If we failed to allocate the desired block then we may end up crossing to a
1051 * new bitmap. In that case we must release write access to the old one via
1052 * ext4_journal_release_buffer(), else we'll run out of credits.
1053 */
1054static ext4_grpblk_t
1055ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
1056 ext4_group_t group, struct buffer_head *bitmap_bh,
1057 ext4_grpblk_t grp_goal, unsigned long *count,
1058 struct ext4_reserve_window *my_rsv)
1059{
1060 ext4_fsblk_t group_first_block;
1061 ext4_grpblk_t start, end;
1062 unsigned long num = 0;
1063
1064 /* we do allocation within the reservation window if we have a window */
1065 if (my_rsv) {
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 if (my_rsv->_rsv_start >= group_first_block)
1068 start = my_rsv->_rsv_start - group_first_block;
1069 else
1070 /* reservation window cross group boundary */
1071 start = 0;
1072 end = my_rsv->_rsv_end - group_first_block + 1;
1073 if (end > EXT4_BLOCKS_PER_GROUP(sb))
1074 /* reservation window crosses group boundary */
1075 end = EXT4_BLOCKS_PER_GROUP(sb);
1076 if ((start <= grp_goal) && (grp_goal < end))
1077 start = grp_goal;
1078 else
1079 grp_goal = -1;
1080 } else {
1081 if (grp_goal > 0)
1082 start = grp_goal;
1083 else
1084 start = 0;
1085 end = EXT4_BLOCKS_PER_GROUP(sb);
1086 }
1087
1088 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
1089
1090repeat:
1091 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
1092 grp_goal = find_next_usable_block(start, bitmap_bh, end);
1093 if (grp_goal < 0)
1094 goto fail_access;
1095 if (!my_rsv) {
1096 int i;
1097
1098 for (i = 0; i < 7 && grp_goal > start &&
1099 ext4_test_allocatable(grp_goal - 1,
1100 bitmap_bh);
1101 i++, grp_goal--)
1102 ;
1103 }
1104 }
1105 start = grp_goal;
1106
1107 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1108 grp_goal, bitmap_bh)) {
1109 /*
1110 * The block was allocated by another thread, or it was
1111 * allocated and then freed by another thread
1112 */
1113 start++;
1114 grp_goal++;
1115 if (start >= end)
1116 goto fail_access;
1117 goto repeat;
1118 }
1119 num++;
1120 grp_goal++;
1121 while (num < *count && grp_goal < end
1122 && ext4_test_allocatable(grp_goal, bitmap_bh)
1123 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1124 grp_goal, bitmap_bh)) {
1125 num++;
1126 grp_goal++;
1127 }
1128 *count = num;
1129 return grp_goal - num;
1130fail_access:
1131 *count = num;
1132 return -1;
1133}
1134
1135/**
1136 * find_next_reservable_window():
1137 * find a reservable space within the given range.
1138 * It does not allocate the reservation window for now:
1139 * alloc_new_reservation() will do the work later.
1140 *
1141 * @search_head: the head of the searching list;
1142 * This is not necessarily the list head of the whole filesystem
1143 *
1144 * We have both head and start_block to assist the search
1145 * for the reservable space. The list starts from head,
1146 * but we will shift to the place where start_block is,
1147 * then start from there, when looking for a reservable space.
1148 *
1149 * @size: the target new reservation window size
1150 *
1151 * @group_first_block: the first block we consider to start
1152 * the real search from
1153 *
1154 * @last_block:
1155 * the maximum block number that our goal reservable space
1156 * could start from. This is normally the last block in this
1157 * group. The search will end when we found the start of next
1158 * possible reservable space is out of this boundary.
1159 * This could handle the cross boundary reservation window
1160 * request.
1161 *
1162 * basically we search from the given range, rather than the whole
1163 * reservation double linked list, (start_block, last_block)
1164 * to find a free region that is of my size and has not
1165 * been reserved.
1166 *
1167 */
1168static int find_next_reservable_window(
1169 struct ext4_reserve_window_node *search_head,
1170 struct ext4_reserve_window_node *my_rsv,
1171 struct super_block * sb,
1172 ext4_fsblk_t start_block,
1173 ext4_fsblk_t last_block)
1174{
1175 struct rb_node *next;
1176 struct ext4_reserve_window_node *rsv, *prev;
1177 ext4_fsblk_t cur;
1178 int size = my_rsv->rsv_goal_size;
1179
1180 /* TODO: make the start of the reservation window byte-aligned */
1181 /* cur = *start_block & ~7;*/
1182 cur = start_block;
1183 rsv = search_head;
1184 if (!rsv)
1185 return -1;
1186
1187 while (1) {
1188 if (cur <= rsv->rsv_end)
1189 cur = rsv->rsv_end + 1;
1190
1191 /* TODO?
1192 * in the case we could not find a reservable space
1193 * that is what is expected, during the re-search, we could
1194 * remember what's the largest reservable space we could have
1195 * and return that one.
1196 *
1197 * For now it will fail if we could not find the reservable
1198 * space with expected-size (or more)...
1199 */
1200 if (cur > last_block)
1201 return -1; /* fail */
1202
1203 prev = rsv;
1204 next = rb_next(&rsv->rsv_node);
1205 rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
1206 599
1207 /* 600 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1208 * Reached the last reservation, we can just append to the 601 EXT4_FREEBLOCKS_WATERMARK) {
1209 * previous one. 602 free_blocks = percpu_counter_sum(fbc);
1210 */ 603 dirty_blocks = percpu_counter_sum(dbc);
1211 if (!next) 604 if (dirty_blocks < 0) {
1212 break; 605 printk(KERN_CRIT "Dirty block accounting "
1213 606 "went wrong %lld\n",
1214 if (cur + size <= rsv->rsv_start) { 607 dirty_blocks);
1215 /*
1216 * Found a reserveable space big enough. We could
1217 * have a reservation across the group boundary here
1218 */
1219 break;
1220 } 608 }
1221 } 609 }
1222 /* 610 /* Check whether we have space after
1223 * we come here either : 611 * accounting for current dirty blocks
1224 * when we reach the end of the whole list,
1225 * and there is empty reservable space after last entry in the list.
1226 * append it to the end of the list.
1227 *
1228 * or we found one reservable space in the middle of the list,
1229 * return the reservation window that we could append to.
1230 * succeed.
1231 */ 612 */
613 if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
614 /* we don't have free space */
615 return -ENOSPC;
1232 616
1233 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) 617 /* Add the blocks to nblocks */
1234 rsv_window_remove(sb, my_rsv); 618 percpu_counter_add(dbc, nblocks);
1235
1236 /*
1237 * Let's book the whole avaliable window for now. We will check the
1238 * disk bitmap later and then, if there are free blocks then we adjust
1239 * the window size if it's larger than requested.
1240 * Otherwise, we will remove this node from the tree next time
1241 * call find_next_reservable_window.
1242 */
1243 my_rsv->rsv_start = cur;
1244 my_rsv->rsv_end = cur + size - 1;
1245 my_rsv->rsv_alloc_hit = 0;
1246
1247 if (prev != my_rsv)
1248 ext4_rsv_window_add(sb, my_rsv);
1249
1250 return 0; 619 return 0;
1251} 620}
1252 621
1253/** 622/**
1254 * alloc_new_reservation()--allocate a new reservation window
1255 *
1256 * To make a new reservation, we search part of the filesystem
1257 * reservation list (the list that inside the group). We try to
1258 * allocate a new reservation window near the allocation goal,
1259 * or the beginning of the group, if there is no goal.
1260 *
1261 * We first find a reservable space after the goal, then from
1262 * there, we check the bitmap for the first free block after
1263 * it. If there is no free block until the end of group, then the
1264 * whole group is full, we failed. Otherwise, check if the free
1265 * block is inside the expected reservable space, if so, we
1266 * succeed.
1267 * If the first free block is outside the reservable space, then
1268 * start from the first free block, we search for next available
1269 * space, and go on.
1270 *
1271 * on succeed, a new reservation will be found and inserted into the list
1272 * It contains at least one free block, and it does not overlap with other
1273 * reservation windows.
1274 *
1275 * failed: we failed to find a reservation window in this group
1276 *
1277 * @rsv: the reservation
1278 *
1279 * @grp_goal: The goal (group-relative). It is where the search for a
1280 * free reservable space should start from.
1281 * if we have a grp_goal(grp_goal >0 ), then start from there,
1282 * no grp_goal(grp_goal = -1), we start from the first block
1283 * of the group.
1284 *
1285 * @sb: the super block
1286 * @group: the group we are trying to allocate in
1287 * @bitmap_bh: the block group block bitmap
1288 *
1289 */
1290static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1291 ext4_grpblk_t grp_goal, struct super_block *sb,
1292 ext4_group_t group, struct buffer_head *bitmap_bh)
1293{
1294 struct ext4_reserve_window_node *search_head;
1295 ext4_fsblk_t group_first_block, group_end_block, start_block;
1296 ext4_grpblk_t first_free_block;
1297 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1298 unsigned long size;
1299 int ret;
1300 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1301
1302 group_first_block = ext4_group_first_block_no(sb, group);
1303 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1304
1305 if (grp_goal < 0)
1306 start_block = group_first_block;
1307 else
1308 start_block = grp_goal + group_first_block;
1309
1310 size = my_rsv->rsv_goal_size;
1311
1312 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1313 /*
1314 * if the old reservation is cross group boundary
1315 * and if the goal is inside the old reservation window,
1316 * we will come here when we just failed to allocate from
1317 * the first part of the window. We still have another part
1318 * that belongs to the next group. In this case, there is no
1319 * point to discard our window and try to allocate a new one
1320 * in this group(which will fail). we should
1321 * keep the reservation window, just simply move on.
1322 *
1323 * Maybe we could shift the start block of the reservation
1324 * window to the first block of next group.
1325 */
1326
1327 if ((my_rsv->rsv_start <= group_end_block) &&
1328 (my_rsv->rsv_end > group_end_block) &&
1329 (start_block >= my_rsv->rsv_start))
1330 return -1;
1331
1332 if ((my_rsv->rsv_alloc_hit >
1333 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1334 /*
1335 * if the previously allocation hit ratio is
1336 * greater than 1/2, then we double the size of
1337 * the reservation window the next time,
1338 * otherwise we keep the same size window
1339 */
1340 size = size * 2;
1341 if (size > EXT4_MAX_RESERVE_BLOCKS)
1342 size = EXT4_MAX_RESERVE_BLOCKS;
1343 my_rsv->rsv_goal_size= size;
1344 }
1345 }
1346
1347 spin_lock(rsv_lock);
1348 /*
1349 * shift the search start to the window near the goal block
1350 */
1351 search_head = search_reserve_window(fs_rsv_root, start_block);
1352
1353 /*
1354 * find_next_reservable_window() simply finds a reservable window
1355 * inside the given range(start_block, group_end_block).
1356 *
1357 * To make sure the reservation window has a free bit inside it, we
1358 * need to check the bitmap after we found a reservable window.
1359 */
1360retry:
1361 ret = find_next_reservable_window(search_head, my_rsv, sb,
1362 start_block, group_end_block);
1363
1364 if (ret == -1) {
1365 if (!rsv_is_empty(&my_rsv->rsv_window))
1366 rsv_window_remove(sb, my_rsv);
1367 spin_unlock(rsv_lock);
1368 return -1;
1369 }
1370
1371 /*
1372 * On success, find_next_reservable_window() returns the
1373 * reservation window where there is a reservable space after it.
1374 * Before we reserve this reservable space, we need
1375 * to make sure there is at least a free block inside this region.
1376 *
1377 * searching the first free bit on the block bitmap and copy of
1378 * last committed bitmap alternatively, until we found a allocatable
1379 * block. Search start from the start block of the reservable space
1380 * we just found.
1381 */
1382 spin_unlock(rsv_lock);
1383 first_free_block = bitmap_search_next_usable_block(
1384 my_rsv->rsv_start - group_first_block,
1385 bitmap_bh, group_end_block - group_first_block + 1);
1386
1387 if (first_free_block < 0) {
1388 /*
1389 * no free block left on the bitmap, no point
1390 * to reserve the space. return failed.
1391 */
1392 spin_lock(rsv_lock);
1393 if (!rsv_is_empty(&my_rsv->rsv_window))
1394 rsv_window_remove(sb, my_rsv);
1395 spin_unlock(rsv_lock);
1396 return -1; /* failed */
1397 }
1398
1399 start_block = first_free_block + group_first_block;
1400 /*
1401 * check if the first free block is within the
1402 * free space we just reserved
1403 */
1404 if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1405 return 0; /* success */
1406 /*
1407 * if the first free bit we found is out of the reservable space
1408 * continue search for next reservable space,
1409 * start from where the free block is,
1410 * we also shift the list head to where we stopped last time
1411 */
1412 search_head = my_rsv;
1413 spin_lock(rsv_lock);
1414 goto retry;
1415}
1416
1417/**
1418 * try_to_extend_reservation()
1419 * @my_rsv: given reservation window
1420 * @sb: super block
1421 * @size: the delta to extend
1422 *
1423 * Attempt to expand the reservation window large enough to have
1424 * required number of free blocks
1425 *
1426 * Since ext4_try_to_allocate() will always allocate blocks within
1427 * the reservation window range, if the window size is too small,
1428 * multiple blocks allocation has to stop at the end of the reservation
1429 * window. To make this more efficient, given the total number of
1430 * blocks needed and the current size of the window, we try to
1431 * expand the reservation window size if necessary on a best-effort
1432 * basis before ext4_new_blocks() tries to allocate blocks,
1433 */
1434static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1435 struct super_block *sb, int size)
1436{
1437 struct ext4_reserve_window_node *next_rsv;
1438 struct rb_node *next;
1439 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1440
1441 if (!spin_trylock(rsv_lock))
1442 return;
1443
1444 next = rb_next(&my_rsv->rsv_node);
1445
1446 if (!next)
1447 my_rsv->rsv_end += size;
1448 else {
1449 next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1450
1451 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1452 my_rsv->rsv_end += size;
1453 else
1454 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1455 }
1456 spin_unlock(rsv_lock);
1457}
1458
1459/**
1460 * ext4_try_to_allocate_with_rsv()
1461 * @sb: superblock
1462 * @handle: handle to this transaction
1463 * @group: given allocation block group
1464 * @bitmap_bh: bufferhead holds the block bitmap
1465 * @grp_goal: given target block within the group
1466 * @count: target number of blocks to allocate
1467 * @my_rsv: reservation window
1468 * @errp: pointer to store the error code
1469 *
1470 * This is the main function used to allocate a new block and its reservation
1471 * window.
1472 *
1473 * Each time when a new block allocation is need, first try to allocate from
1474 * its own reservation. If it does not have a reservation window, instead of
1475 * looking for a free bit on bitmap first, then look up the reservation list to
1476 * see if it is inside somebody else's reservation window, we try to allocate a
1477 * reservation window for it starting from the goal first. Then do the block
1478 * allocation within the reservation window.
1479 *
1480 * This will avoid keeping on searching the reservation list again and
1481 * again when somebody is looking for a free block (without
1482 * reservation), and there are lots of free blocks, but they are all
1483 * being reserved.
1484 *
1485 * We use a red-black tree for the per-filesystem reservation list.
1486 *
1487 */
1488static ext4_grpblk_t
1489ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1490 ext4_group_t group, struct buffer_head *bitmap_bh,
1491 ext4_grpblk_t grp_goal,
1492 struct ext4_reserve_window_node * my_rsv,
1493 unsigned long *count, int *errp)
1494{
1495 ext4_fsblk_t group_first_block, group_last_block;
1496 ext4_grpblk_t ret = 0;
1497 int fatal;
1498 unsigned long num = *count;
1499
1500 *errp = 0;
1501
1502 /*
1503 * Make sure we use undo access for the bitmap, because it is critical
1504 * that we do the frozen_data COW on bitmap buffers in all cases even
1505 * if the buffer is in BJ_Forget state in the committing transaction.
1506 */
1507 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1508 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1509 if (fatal) {
1510 *errp = fatal;
1511 return -1;
1512 }
1513
1514 /*
1515 * we don't deal with reservation when
1516 * filesystem is mounted without reservation
1517 * or the file is not a regular file
1518 * or last attempt to allocate a block with reservation turned on failed
1519 */
1520 if (my_rsv == NULL ) {
1521 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1522 grp_goal, count, NULL);
1523 goto out;
1524 }
1525 /*
1526 * grp_goal is a group relative block number (if there is a goal)
1527 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
1528 * first block is a filesystem wide block number
1529 * first block is the block number of the first block in this group
1530 */
1531 group_first_block = ext4_group_first_block_no(sb, group);
1532 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1533
1534 /*
1535 * Basically we will allocate a new block from inode's reservation
1536 * window.
1537 *
1538 * We need to allocate a new reservation window, if:
1539 * a) inode does not have a reservation window; or
1540 * b) last attempt to allocate a block from existing reservation
1541 * failed; or
1542 * c) we come here with a goal and with a reservation window
1543 *
1544 * We do not need to allocate a new reservation window if we come here
1545 * at the beginning with a goal and the goal is inside the window, or
1546 * we don't have a goal but already have a reservation window.
1547 * then we could go to allocate from the reservation window directly.
1548 */
1549 while (1) {
1550 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1551 !goal_in_my_reservation(&my_rsv->rsv_window,
1552 grp_goal, group, sb)) {
1553 if (my_rsv->rsv_goal_size < *count)
1554 my_rsv->rsv_goal_size = *count;
1555 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1556 group, bitmap_bh);
1557 if (ret < 0)
1558 break; /* failed */
1559
1560 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1561 grp_goal, group, sb))
1562 grp_goal = -1;
1563 } else if (grp_goal >= 0) {
1564 int curr = my_rsv->rsv_end -
1565 (grp_goal + group_first_block) + 1;
1566
1567 if (curr < *count)
1568 try_to_extend_reservation(my_rsv, sb,
1569 *count - curr);
1570 }
1571
1572 if ((my_rsv->rsv_start > group_last_block) ||
1573 (my_rsv->rsv_end < group_first_block)) {
1574 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1575 BUG();
1576 }
1577 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1578 grp_goal, &num, &my_rsv->rsv_window);
1579 if (ret >= 0) {
1580 my_rsv->rsv_alloc_hit += num;
1581 *count = num;
1582 break; /* succeed */
1583 }
1584 num = *count;
1585 }
1586out:
1587 if (ret >= 0) {
1588 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1589 "bitmap block");
1590 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1591 if (fatal) {
1592 *errp = fatal;
1593 return -1;
1594 }
1595 return ret;
1596 }
1597
1598 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1599 ext4_journal_release_buffer(handle, bitmap_bh);
1600 return ret;
1601}
1602
1603/**
1604 * ext4_has_free_blocks() 623 * ext4_has_free_blocks()
1605 * @sbi: in-core super block structure. 624 * @sbi: in-core super block structure.
1606 * @nblocks: number of neeed blocks 625 * @nblocks: number of neeed blocks
@@ -1610,29 +629,34 @@ out:
1610 * On success, return nblocks 629 * On success, return nblocks
1611 */ 630 */
1612ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 631ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1613 ext4_fsblk_t nblocks) 632 s64 nblocks)
1614{ 633{
1615 ext4_fsblk_t free_blocks; 634 s64 free_blocks, dirty_blocks;
1616 ext4_fsblk_t root_blocks = 0; 635 s64 root_blocks = 0;
636 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
637 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
1617 638
1618 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 639 free_blocks = percpu_counter_read_positive(fbc);
640 dirty_blocks = percpu_counter_read_positive(dbc);
1619 641
1620 if (!capable(CAP_SYS_RESOURCE) && 642 if (!capable(CAP_SYS_RESOURCE) &&
1621 sbi->s_resuid != current->fsuid && 643 sbi->s_resuid != current->fsuid &&
1622 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) 644 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1623 root_blocks = ext4_r_blocks_count(sbi->s_es); 645 root_blocks = ext4_r_blocks_count(sbi->s_es);
1624#ifdef CONFIG_SMP 646
1625 if (free_blocks - root_blocks < FBC_BATCH) 647 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1626 free_blocks = 648 EXT4_FREEBLOCKS_WATERMARK) {
1627 percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); 649 free_blocks = percpu_counter_sum(fbc);
1628#endif 650 dirty_blocks = percpu_counter_sum(dbc);
1629 if (free_blocks <= root_blocks) 651 }
652 if (free_blocks <= (root_blocks + dirty_blocks))
1630 /* we don't have free space */ 653 /* we don't have free space */
1631 return 0; 654 return 0;
1632 if (free_blocks - root_blocks < nblocks) 655
1633 return free_blocks - root_blocks; 656 if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
657 return free_blocks - (root_blocks + dirty_blocks);
1634 return nblocks; 658 return nblocks;
1635 } 659}
1636 660
1637 661
1638/** 662/**
@@ -1657,303 +681,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1657 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 681 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1658} 682}
1659 683
1660/**
1661 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1662 *
1663 * @handle: handle to this transaction
1664 * @inode: file inode
1665 * @goal: given target block(filesystem wide)
1666 * @count: target number of blocks to allocate
1667 * @errp: error code
1668 *
1669 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1670 * the block bitmap directly to do block allocation. It tries to
1671 * allocate block(s) from the block group contains the goal block first. If
1672 * that fails, it will try to allocate block(s) from other block groups
1673 * without any specific goal block.
1674 *
1675 * This function is called when -o nomballoc mount option is enabled
1676 *
1677 */
1678ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1679 ext4_fsblk_t goal, unsigned long *count, int *errp)
1680{
1681 struct buffer_head *bitmap_bh = NULL;
1682 struct buffer_head *gdp_bh;
1683 ext4_group_t group_no;
1684 ext4_group_t goal_group;
1685 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1686 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1687 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1688 ext4_group_t bgi; /* blockgroup iteration index */
1689 int fatal = 0, err;
1690 int performed_allocation = 0;
1691 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1692 struct super_block *sb;
1693 struct ext4_group_desc *gdp;
1694 struct ext4_super_block *es;
1695 struct ext4_sb_info *sbi;
1696 struct ext4_reserve_window_node *my_rsv = NULL;
1697 struct ext4_block_alloc_info *block_i;
1698 unsigned short windowsz = 0;
1699 ext4_group_t ngroups;
1700 unsigned long num = *count;
1701
1702 sb = inode->i_sb;
1703 if (!sb) {
1704 *errp = -ENODEV;
1705 printk("ext4_new_block: nonexistent device");
1706 return 0;
1707 }
1708
1709 sbi = EXT4_SB(sb);
1710 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1711 /*
1712 * With delalloc we already reserved the blocks
1713 */
1714 *count = ext4_has_free_blocks(sbi, *count);
1715 }
1716 if (*count == 0) {
1717 *errp = -ENOSPC;
1718 return 0; /*return with ENOSPC error */
1719 }
1720 num = *count;
1721
1722 /*
1723 * Check quota for allocation of this block.
1724 */
1725 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1726 *errp = -EDQUOT;
1727 return 0;
1728 }
1729
1730 sbi = EXT4_SB(sb);
1731 es = EXT4_SB(sb)->s_es;
1732 ext4_debug("goal=%llu.\n", goal);
1733 /*
1734 * Allocate a block from reservation only when
1735 * filesystem is mounted with reservation(default,-o reservation), and
1736 * it's a regular file, and
1737 * the desired window size is greater than 0 (One could use ioctl
1738 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1739 * reservation on that particular file)
1740 */
1741 block_i = EXT4_I(inode)->i_block_alloc_info;
1742 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1743 my_rsv = &block_i->rsv_window_node;
1744
1745 /*
1746 * First, test whether the goal block is free.
1747 */
1748 if (goal < le32_to_cpu(es->s_first_data_block) ||
1749 goal >= ext4_blocks_count(es))
1750 goal = le32_to_cpu(es->s_first_data_block);
1751 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1752 goal_group = group_no;
1753retry_alloc:
1754 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1755 if (!gdp)
1756 goto io_error;
1757
1758 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1759 /*
1760 * if there is not enough free blocks to make a new resevation
1761 * turn off reservation for this allocation
1762 */
1763 if (my_rsv && (free_blocks < windowsz)
1764 && (rsv_is_empty(&my_rsv->rsv_window)))
1765 my_rsv = NULL;
1766
1767 if (free_blocks > 0) {
1768 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1769 if (!bitmap_bh)
1770 goto io_error;
1771 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1772 group_no, bitmap_bh, grp_target_blk,
1773 my_rsv, &num, &fatal);
1774 if (fatal)
1775 goto out;
1776 if (grp_alloc_blk >= 0)
1777 goto allocated;
1778 }
1779
1780 ngroups = EXT4_SB(sb)->s_groups_count;
1781 smp_rmb();
1782
1783 /*
1784 * Now search the rest of the groups. We assume that
1785 * group_no and gdp correctly point to the last group visited.
1786 */
1787 for (bgi = 0; bgi < ngroups; bgi++) {
1788 group_no++;
1789 if (group_no >= ngroups)
1790 group_no = 0;
1791 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1792 if (!gdp)
1793 goto io_error;
1794 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1795 /*
1796 * skip this group if the number of
1797 * free blocks is less than half of the reservation
1798 * window size.
1799 */
1800 if (free_blocks <= (windowsz/2))
1801 continue;
1802
1803 brelse(bitmap_bh);
1804 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1805 if (!bitmap_bh)
1806 goto io_error;
1807 /*
1808 * try to allocate block(s) from this group, without a goal(-1).
1809 */
1810 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1811 group_no, bitmap_bh, -1, my_rsv,
1812 &num, &fatal);
1813 if (fatal)
1814 goto out;
1815 if (grp_alloc_blk >= 0)
1816 goto allocated;
1817 }
1818 /*
1819 * We may end up a bogus ealier ENOSPC error due to
1820 * filesystem is "full" of reservations, but
1821 * there maybe indeed free blocks avaliable on disk
1822 * In this case, we just forget about the reservations
1823 * just do block allocation as without reservations.
1824 */
1825 if (my_rsv) {
1826 my_rsv = NULL;
1827 windowsz = 0;
1828 group_no = goal_group;
1829 goto retry_alloc;
1830 }
1831 /* No space left on the device */
1832 *errp = -ENOSPC;
1833 goto out;
1834
1835allocated:
1836
1837 ext4_debug("using block group %lu(%d)\n",
1838 group_no, gdp->bg_free_blocks_count);
1839
1840 BUFFER_TRACE(gdp_bh, "get_write_access");
1841 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1842 if (fatal)
1843 goto out;
1844
1845 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1846
1847 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1848 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1849 in_range(ret_block, ext4_inode_table(sb, gdp),
1850 EXT4_SB(sb)->s_itb_per_group) ||
1851 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1852 EXT4_SB(sb)->s_itb_per_group)) {
1853 ext4_error(sb, "ext4_new_block",
1854 "Allocating block in system zone - "
1855 "blocks from %llu, length %lu",
1856 ret_block, num);
1857 /*
1858 * claim_block marked the blocks we allocated
1859 * as in use. So we may want to selectively
1860 * mark some of the blocks as free
1861 */
1862 goto retry_alloc;
1863 }
1864
1865 performed_allocation = 1;
1866
1867#ifdef CONFIG_JBD2_DEBUG
1868 {
1869 struct buffer_head *debug_bh;
1870
1871 /* Record bitmap buffer state in the newly allocated block */
1872 debug_bh = sb_find_get_block(sb, ret_block);
1873 if (debug_bh) {
1874 BUFFER_TRACE(debug_bh, "state when allocated");
1875 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1876 brelse(debug_bh);
1877 }
1878 }
1879 jbd_lock_bh_state(bitmap_bh);
1880 spin_lock(sb_bgl_lock(sbi, group_no));
1881 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1882 int i;
1883
1884 for (i = 0; i < num; i++) {
1885 if (ext4_test_bit(grp_alloc_blk+i,
1886 bh2jh(bitmap_bh)->b_committed_data)) {
1887 printk("%s: block was unexpectedly set in "
1888 "b_committed_data\n", __func__);
1889 }
1890 }
1891 }
1892 ext4_debug("found bit %d\n", grp_alloc_blk);
1893 spin_unlock(sb_bgl_lock(sbi, group_no));
1894 jbd_unlock_bh_state(bitmap_bh);
1895#endif
1896
1897 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1898 ext4_error(sb, "ext4_new_block",
1899 "block(%llu) >= blocks count(%llu) - "
1900 "block_group = %lu, es == %p ", ret_block,
1901 ext4_blocks_count(es), group_no, es);
1902 goto out;
1903 }
1904
1905 /*
1906 * It is up to the caller to add the new buffer to a journal
1907 * list of some description. We don't know in advance whether
1908 * the caller wants to use it as metadata or data.
1909 */
1910 spin_lock(sb_bgl_lock(sbi, group_no));
1911 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1912 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1913 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1914 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1915 spin_unlock(sb_bgl_lock(sbi, group_no));
1916 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1917 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1918
1919 if (sbi->s_log_groups_per_flex) {
1920 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1921 spin_lock(sb_bgl_lock(sbi, flex_group));
1922 sbi->s_flex_groups[flex_group].free_blocks -= num;
1923 spin_unlock(sb_bgl_lock(sbi, flex_group));
1924 }
1925
1926 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1927 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1928 if (!fatal)
1929 fatal = err;
1930
1931 sb->s_dirt = 1;
1932 if (fatal)
1933 goto out;
1934
1935 *errp = 0;
1936 brelse(bitmap_bh);
1937 DQUOT_FREE_BLOCK(inode, *count-num);
1938 *count = num;
1939 return ret_block;
1940
1941io_error:
1942 *errp = -EIO;
1943out:
1944 if (fatal) {
1945 *errp = fatal;
1946 ext4_std_error(sb, fatal);
1947 }
1948 /*
1949 * Undo the block allocation
1950 */
1951 if (!performed_allocation)
1952 DQUOT_FREE_BLOCK(inode, *count);
1953 brelse(bitmap_bh);
1954 return 0;
1955}
1956
1957#define EXT4_META_BLOCK 0x1 684#define EXT4_META_BLOCK 0x1
1958 685
1959static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, 686static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1963,10 +690,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1963 struct ext4_allocation_request ar; 690 struct ext4_allocation_request ar;
1964 ext4_fsblk_t ret; 691 ext4_fsblk_t ret;
1965 692
1966 if (!test_opt(inode->i_sb, MBALLOC)) {
1967 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1968 }
1969
1970 memset(&ar, 0, sizeof(ar)); 693 memset(&ar, 0, sizeof(ar));
1971 /* Fill with neighbour allocated blocks */ 694 /* Fill with neighbour allocated blocks */
1972 695
@@ -2008,7 +731,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2008 /* 731 /*
2009 * Account for the allocated meta blocks 732 * Account for the allocated meta blocks
2010 */ 733 */
2011 if (!(*errp)) { 734 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
2012 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 735 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2013 EXT4_I(inode)->i_allocated_meta_blocks += *count; 736 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2014 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 737 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2093,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2093 bitmap_count += x; 816 bitmap_count += x;
2094 } 817 }
2095 brelse(bitmap_bh); 818 brelse(bitmap_bh);
2096 printk("ext4_count_free_blocks: stored = %llu" 819 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
2097 ", computed = %llu, %llu\n", 820 ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
2098 ext4_free_blocks_count(es), 821 desc_count, bitmap_count);
2099 desc_count, bitmap_count);
2100 return bitmap_count; 822 return bitmap_count;
2101#else 823#else
2102 desc_count = 0; 824 desc_count = 0;
@@ -2183,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2183 905
2184 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || 906 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2185 metagroup < first_meta_bg) 907 metagroup < first_meta_bg)
2186 return ext4_bg_num_gdb_nometa(sb,group); 908 return ext4_bg_num_gdb_nometa(sb, group);
2187 909
2188 return ext4_bg_num_gdb_meta(sb,group); 910 return ext4_bg_num_gdb_meta(sb,group);
2189 911
2190} 912}
913
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea6750454..0a7a6663c190 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return 0;
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum); 28 return sum;
29} 29}
30 30
31#endif /* EXT4FS_DEBUG */ 31#endif /* EXT4FS_DEBUG */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ec8e33b45219..3ca6a2b7632d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
33}; 33};
34 34
35static int ext4_readdir(struct file *, void *, filldir_t); 35static int ext4_readdir(struct file *, void *, filldir_t);
36static int ext4_dx_readdir(struct file * filp, 36static int ext4_dx_readdir(struct file *filp,
37 void * dirent, filldir_t filldir); 37 void *dirent, filldir_t filldir);
38static int ext4_release_dir (struct inode * inode, 38static int ext4_release_dir(struct inode *inode,
39 struct file * filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry (const char * function, struct inode * dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 * de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head * bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error (dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
93 93
94static int ext4_readdir(struct file * filp, 94static int ext4_readdir(struct file *filp,
95 void * dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
102 int err; 102 int err;
103 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0;
105 106
106 sb = inode->i_sb; 107 sb = inode->i_sb;
107 108
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
148 * of recovering data when there's a bad sector 149 * of recovering data when there's a bad sector
149 */ 150 */
150 if (!bh) { 151 if (!bh) {
151 ext4_error (sb, "ext4_readdir", 152 if (!dir_has_error) {
152 "directory #%lu contains a hole at offset %lu", 153 ext4_error(sb, __func__, "directory #%lu "
153 inode->i_ino, (unsigned long)filp->f_pos); 154 "contains a hole at offset %Lu",
155 inode->i_ino,
156 (unsigned long long) filp->f_pos);
157 dir_has_error = 1;
158 }
154 /* corrupt size? Maybe no more blocks to read */ 159 /* corrupt size? Maybe no more blocks to read */
155 if (filp->f_pos > inode->i_blocks << 9) 160 if (filp->f_pos > inode->i_blocks << 9)
156 break; 161 break;
@@ -187,14 +192,14 @@ revalidate:
187 while (!error && filp->f_pos < inode->i_size 192 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) { 193 && offset < sb->s_blocksize) {
189 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 194 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext4_check_dir_entry ("ext4_readdir", inode, de, 195 if (!ext4_check_dir_entry("ext4_readdir", inode, de,
191 bh, offset)) { 196 bh, offset)) {
192 /* 197 /*
193 * On error, skip the f_pos to the next block 198 * On error, skip the f_pos to the next block
194 */ 199 */
195 filp->f_pos = (filp->f_pos | 200 filp->f_pos = (filp->f_pos |
196 (sb->s_blocksize - 1)) + 1; 201 (sb->s_blocksize - 1)) + 1;
197 brelse (bh); 202 brelse(bh);
198 ret = stored; 203 ret = stored;
199 goto out; 204 goto out;
200 } 205 }
@@ -218,12 +223,12 @@ revalidate:
218 break; 223 break;
219 if (version != filp->f_version) 224 if (version != filp->f_version)
220 goto revalidate; 225 goto revalidate;
221 stored ++; 226 stored++;
222 } 227 }
223 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
224 } 229 }
225 offset = 0; 230 offset = 0;
226 brelse (bh); 231 brelse(bh);
227 } 232 }
228out: 233out:
229 return ret; 234 return ret;
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
290 parent = rb_parent(n); 295 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash); 296 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) { 297 while (fname) {
293 struct fname * old = fname; 298 struct fname *old = fname;
294 fname = fname->next; 299 fname = fname->next;
295 kfree (old); 300 kfree(old);
296 } 301 }
297 if (!parent) 302 if (!parent)
298 root->rb_node = NULL; 303 root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
331 struct ext4_dir_entry_2 *dirent) 336 struct ext4_dir_entry_2 *dirent)
332{ 337{
333 struct rb_node **p, *parent = NULL; 338 struct rb_node **p, *parent = NULL;
334 struct fname * fname, *new_fn; 339 struct fname *fname, *new_fn;
335 struct dir_private_info *info; 340 struct dir_private_info *info;
336 int len; 341 int len;
337 342
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
388 * for all entres on the fname linked list. (Normally there is only 393 * for all entres on the fname linked list. (Normally there is only
389 * one entry on the linked list, unless there are 62 bit hash collisions.) 394 * one entry on the linked list, unless there are 62 bit hash collisions.)
390 */ 395 */
391static int call_filldir(struct file * filp, void * dirent, 396static int call_filldir(struct file *filp, void *dirent,
392 filldir_t filldir, struct fname *fname) 397 filldir_t filldir, struct fname *fname)
393{ 398{
394 struct dir_private_info *info = filp->private_data; 399 struct dir_private_info *info = filp->private_data;
395 loff_t curr_pos; 400 loff_t curr_pos;
396 struct inode *inode = filp->f_path.dentry->d_inode; 401 struct inode *inode = filp->f_path.dentry->d_inode;
397 struct super_block * sb; 402 struct super_block *sb;
398 int error; 403 int error;
399 404
400 sb = inode->i_sb; 405 sb = inode->i_sb;
401 406
402 if (!fname) { 407 if (!fname) {
403 printk("call_filldir: called with null fname?!?\n"); 408 printk(KERN_ERR "ext4: call_filldir: called with "
409 "null fname?!?\n");
404 return 0; 410 return 0;
405 } 411 }
406 curr_pos = hash2pos(fname->hash, fname->minor_hash); 412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
419 return 0; 425 return 0;
420} 426}
421 427
422static int ext4_dx_readdir(struct file * filp, 428static int ext4_dx_readdir(struct file *filp,
423 void * dirent, filldir_t filldir) 429 void *dirent, filldir_t filldir)
424{ 430{
425 struct dir_private_info *info = filp->private_data; 431 struct dir_private_info *info = filp->private_data;
426 struct inode *inode = filp->f_path.dentry->d_inode; 432 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -511,7 +517,7 @@ finished:
511 return 0; 517 return 0;
512} 518}
513 519
514static int ext4_release_dir (struct inode * inode, struct file * filp) 520static int ext4_release_dir(struct inode *inode, struct file *filp)
515{ 521{
516 if (filp->private_data) 522 if (filp->private_data)
517 ext4_htree_free_dir_info(filp->private_data); 523 ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 295003241d3d..6690a41cdd9f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
44#ifdef EXT4FS_DEBUG 44#ifdef EXT4FS_DEBUG
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __func__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk(KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
52#define ext4_debug(f, a...) do {} while (0) 52#define ext4_debug(f, a...) do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
128#else 128#else
129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
130#endif 130#endif
131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) 131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
134#else 134#else
@@ -245,7 +245,7 @@ struct flex_groups {
245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
246 246
247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
248#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 248#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
249 249
250/* 250/*
251 * Inode dynamic state flags 251 * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS 291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
292#define EXT4_IOC_GETVERSION _IOR('f', 3, long) 292#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
293#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 293#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
294#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
295#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
296#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 294#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
297#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 295#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
298#ifdef CONFIG_JBD2_DEBUG 296#ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
300#endif 298#endif
301#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 299#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
302#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 300#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
303#define EXT4_IOC_MIGRATE _IO('f', 7) 301#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
302#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
303#define EXT4_IOC_MIGRATE _IO('f', 9)
304 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
304 305
305/* 306/*
306 * ioctl commands in 32 bit emulation 307 * ioctl commands in 32 bit emulation
@@ -538,8 +539,9 @@ do { \
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
543#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
544
543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 545/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
544#ifndef _LINUX_EXT2_FS_H 546#ifndef _LINUX_EXT2_FS_H
545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 547#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -667,7 +669,7 @@ struct ext4_super_block {
667}; 669};
668 670
669#ifdef __KERNEL__ 671#ifdef __KERNEL__
670static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) 672static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
671{ 673{
672 return sb->s_fs_info; 674 return sb->s_fs_info;
673} 675}
@@ -725,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
725 */ 727 */
726 728
727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
728 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) 730 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
730 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) 732 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
732 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) 734 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -789,6 +791,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
789#define EXT4_DEF_RESUID 0 791#define EXT4_DEF_RESUID 0
790#define EXT4_DEF_RESGID 0 792#define EXT4_DEF_RESGID 0
791 793
794#define EXT4_DEF_INODE_READAHEAD_BLKS 32
795
792/* 796/*
793 * Default mount options 797 * Default mount options
794 */ 798 */
@@ -954,6 +958,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
954void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 958void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
955 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 959 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
956 960
961extern struct proc_dir_entry *ext4_proc_root;
962
963#ifdef CONFIG_PROC_FS
964extern const struct file_operations ext4_ui_proc_fops;
965
966#define EXT4_PROC_HANDLER(name, var) \
967do { \
968 proc = proc_create_data(name, mode, sbi->s_proc, \
969 &ext4_ui_proc_fops, &sbi->s_##var); \
970 if (proc == NULL) { \
971 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
972 goto err_out; \
973 } \
974} while (0)
975#else
976#define EXT4_PROC_HANDLER(name, var)
977#endif
978
957/* 979/*
958 * Function prototypes 980 * Function prototypes
959 */ 981 */
@@ -981,23 +1003,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1003extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal, 1004 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp); 1005 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, 1006extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 1007extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks); 1008 s64 nblocks);
988extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 1009extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
989 ext4_fsblk_t block, unsigned long count, int metadata); 1010 ext4_fsblk_t block, unsigned long count, int metadata);
990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 1011extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
991 ext4_fsblk_t block, unsigned long count, 1012 ext4_fsblk_t block, unsigned long count,
992 unsigned long *pdquot_freed_blocks); 1013 unsigned long *pdquot_freed_blocks);
993extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); 1014extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
994extern void ext4_check_blocks_bitmap (struct super_block *); 1015extern void ext4_check_blocks_bitmap(struct super_block *);
995extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1016extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
996 ext4_group_t block_group, 1017 ext4_group_t block_group,
997 struct buffer_head ** bh); 1018 struct buffer_head ** bh);
998extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1019extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
999extern void ext4_init_block_alloc_info(struct inode *);
1000extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
1001 1020
1002/* dir.c */ 1021/* dir.c */
1003extern int ext4_check_dir_entry(const char *, struct inode *, 1022extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1028,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1009extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1028extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1010 1029
1011/* fsync.c */ 1030/* fsync.c */
1012extern int ext4_sync_file (struct file *, struct dentry *, int); 1031extern int ext4_sync_file(struct file *, struct dentry *, int);
1013 1032
1014/* hash.c */ 1033/* hash.c */
1015extern int ext4fs_dirhash(const char *name, int len, struct 1034extern int ext4fs_dirhash(const char *name, int len, struct
1016 dx_hash_info *hinfo); 1035 dx_hash_info *hinfo);
1017 1036
1018/* ialloc.c */ 1037/* ialloc.c */
1019extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); 1038extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
1020extern void ext4_free_inode (handle_t *, struct inode *); 1039extern void ext4_free_inode(handle_t *, struct inode *);
1021extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); 1040extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1022extern unsigned long ext4_count_free_inodes (struct super_block *); 1041extern unsigned long ext4_count_free_inodes(struct super_block *);
1023extern unsigned long ext4_count_dirs (struct super_block *); 1042extern unsigned long ext4_count_dirs(struct super_block *);
1024extern void ext4_check_inodes_bitmap (struct super_block *); 1043extern void ext4_check_inodes_bitmap(struct super_block *);
1025extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 1044extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1026 1045
1027/* mballoc.c */ 1046/* mballoc.c */
1028extern long ext4_mb_stats; 1047extern long ext4_mb_stats;
@@ -1032,7 +1051,7 @@ extern int ext4_mb_release(struct super_block *);
1032extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1051extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1033 struct ext4_allocation_request *, int *); 1052 struct ext4_allocation_request *, int *);
1034extern int ext4_mb_reserve_blocks(struct super_block *, int); 1053extern int ext4_mb_reserve_blocks(struct super_block *, int);
1035extern void ext4_mb_discard_inode_preallocations(struct inode *); 1054extern void ext4_discard_preallocations(struct inode *);
1036extern int __init init_ext4_mballoc(void); 1055extern int __init init_ext4_mballoc(void);
1037extern void exit_ext4_mballoc(void); 1056extern void exit_ext4_mballoc(void);
1038extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1057extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,24 +1069,25 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1050 ext4_lblk_t, int, int *); 1069 ext4_lblk_t, int, int *);
1051struct buffer_head *ext4_bread(handle_t *, struct inode *, 1070struct buffer_head *ext4_bread(handle_t *, struct inode *,
1052 ext4_lblk_t, int, int *); 1071 ext4_lblk_t, int, int *);
1072int ext4_get_block(struct inode *inode, sector_t iblock,
1073 struct buffer_head *bh_result, int create);
1053int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 1074int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1054 ext4_lblk_t iblock, unsigned long maxblocks, 1075 ext4_lblk_t iblock, unsigned long maxblocks,
1055 struct buffer_head *bh_result, 1076 struct buffer_head *bh_result,
1056 int create, int extend_disksize); 1077 int create, int extend_disksize);
1057 1078
1058extern struct inode *ext4_iget(struct super_block *, unsigned long); 1079extern struct inode *ext4_iget(struct super_block *, unsigned long);
1059extern int ext4_write_inode (struct inode *, int); 1080extern int ext4_write_inode(struct inode *, int);
1060extern int ext4_setattr (struct dentry *, struct iattr *); 1081extern int ext4_setattr(struct dentry *, struct iattr *);
1061extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1082extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1062 struct kstat *stat); 1083 struct kstat *stat);
1063extern void ext4_delete_inode (struct inode *); 1084extern void ext4_delete_inode(struct inode *);
1064extern int ext4_sync_inode (handle_t *, struct inode *); 1085extern int ext4_sync_inode(handle_t *, struct inode *);
1065extern void ext4_discard_reservation (struct inode *);
1066extern void ext4_dirty_inode(struct inode *); 1086extern void ext4_dirty_inode(struct inode *);
1067extern int ext4_change_inode_journal_flag(struct inode *, int); 1087extern int ext4_change_inode_journal_flag(struct inode *, int);
1068extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1088extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1069extern int ext4_can_truncate(struct inode *inode); 1089extern int ext4_can_truncate(struct inode *inode);
1070extern void ext4_truncate (struct inode *); 1090extern void ext4_truncate(struct inode *);
1071extern void ext4_set_inode_flags(struct inode *); 1091extern void ext4_set_inode_flags(struct inode *);
1072extern void ext4_get_inode_flags(struct ext4_inode_info *); 1092extern void ext4_get_inode_flags(struct ext4_inode_info *);
1073extern void ext4_set_aops(struct inode *inode); 1093extern void ext4_set_aops(struct inode *inode);
@@ -1080,11 +1100,10 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1080 1100
1081/* ioctl.c */ 1101/* ioctl.c */
1082extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1102extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1083extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); 1103extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1084 1104
1085/* migrate.c */ 1105/* migrate.c */
1086extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, 1106extern int ext4_ext_migrate(struct inode *);
1087 unsigned long);
1088/* namei.c */ 1107/* namei.c */
1089extern int ext4_orphan_add(handle_t *, struct inode *); 1108extern int ext4_orphan_add(handle_t *, struct inode *);
1090extern int ext4_orphan_del(handle_t *, struct inode *); 1109extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1099,14 +1118,14 @@ extern int ext4_group_extend(struct super_block *sb,
1099 ext4_fsblk_t n_blocks_count); 1118 ext4_fsblk_t n_blocks_count);
1100 1119
1101/* super.c */ 1120/* super.c */
1102extern void ext4_error (struct super_block *, const char *, const char *, ...) 1121extern void ext4_error(struct super_block *, const char *, const char *, ...)
1103 __attribute__ ((format (printf, 3, 4))); 1122 __attribute__ ((format (printf, 3, 4)));
1104extern void __ext4_std_error (struct super_block *, const char *, int); 1123extern void __ext4_std_error(struct super_block *, const char *, int);
1105extern void ext4_abort (struct super_block *, const char *, const char *, ...) 1124extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1106 __attribute__ ((format (printf, 3, 4))); 1125 __attribute__ ((format (printf, 3, 4)));
1107extern void ext4_warning (struct super_block *, const char *, const char *, ...) 1126extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1108 __attribute__ ((format (printf, 3, 4))); 1127 __attribute__ ((format (printf, 3, 4)));
1109extern void ext4_update_dynamic_rev (struct super_block *sb); 1128extern void ext4_update_dynamic_rev(struct super_block *sb);
1110extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1129extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1111 __u32 compat); 1130 __u32 compat);
1112extern int ext4_update_rocompat_feature(handle_t *handle, 1131extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1179,7 +1198,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1179 1198
1180static inline 1199static inline
1181struct ext4_group_info *ext4_get_group_info(struct super_block *sb, 1200struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1182 ext4_group_t group) 1201 ext4_group_t group)
1183{ 1202{
1184 struct ext4_group_info ***grp_info; 1203 struct ext4_group_info ***grp_info;
1185 long indexv, indexh; 1204 long indexv, indexh;
@@ -1207,6 +1226,28 @@ do { \
1207 __ext4_std_error((sb), __func__, (errno)); \ 1226 __ext4_std_error((sb), __func__, (errno)); \
1208} while (0) 1227} while (0)
1209 1228
1229#ifdef CONFIG_SMP
1230/* Each CPU can accumulate FBC_BATCH blocks in their local
1231 * counters. So we need to make sure we have free blocks more
1232 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
1233 */
1234#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
1235#else
1236#define EXT4_FREEBLOCKS_WATERMARK 0
1237#endif
1238
1239static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1240{
1241 /*
1242 * XXX: replace with spinlock if seen contended -bzzz
1243 */
1244 down_write(&EXT4_I(inode)->i_data_sem);
1245 if (newsize > EXT4_I(inode)->i_disksize)
1246 EXT4_I(inode)->i_disksize = newsize;
1247 up_write(&EXT4_I(inode)->i_data_sem);
1248 return ;
1249}
1250
1210/* 1251/*
1211 * Inodes and files operations 1252 * Inodes and files operations
1212 */ 1253 */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index d33dc56d6986..bec7ce59fc0d 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
124#define EXT4_EXT_CACHE_GAP 1 124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2 125#define EXT4_EXT_CACHE_EXTENT 2
126 126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 struct ext4_extent *, void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
127 140
128#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
129 142
@@ -224,6 +237,8 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
224 struct ext4_extent *); 237 struct ext4_extent *);
225extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 238extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
226extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 239extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
240extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
241 ext_prepare_callback, void *);
227extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 242extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
228 struct ext4_ext_path *); 243 struct ext4_ext_path *);
229extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, 244extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e4..5c124c0ac6d3 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned long ext4_group_t;
35 35
36struct ext4_reserve_window {
37 ext4_fsblk_t _rsv_start; /* First byte reserved */
38 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
39};
40
41struct ext4_reserve_window_node {
42 struct rb_node rsv_node;
43 __u32 rsv_goal_size;
44 __u32 rsv_alloc_hit;
45 struct ext4_reserve_window rsv_window;
46};
47
48struct ext4_block_alloc_info {
49 /* information about reservation window */
50 struct ext4_reserve_window_node rsv_window_node;
51 /*
52 * was i_next_alloc_block in ext4_inode_info
53 * is the logical (file-relative) number of the
54 * most-recently-allocated block in this file.
55 * We use this for detecting linearly ascending allocation requests.
56 */
57 ext4_lblk_t last_alloc_logical_block;
58 /*
59 * Was i_next_alloc_goal in ext4_inode_info
60 * is the *physical* companion to i_next_alloc_block.
61 * it the physical block number of the block which was most-recentl
62 * allocated to this file. This give us the goal (target) for the next
63 * allocation when we detect linearly ascending requests.
64 */
65 ext4_fsblk_t last_alloc_physical_block;
66};
67
68#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
69#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
70 38
@@ -97,11 +65,8 @@ struct ext4_inode_info {
97 ext4_group_t i_block_group; 65 ext4_group_t i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */ 66 __u32 i_state; /* Dynamic state flags for ext4 */
99 67
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 ext4_lblk_t i_dir_start_lookup; 68 ext4_lblk_t i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR 69#ifdef CONFIG_EXT4_FS_XATTR
105 /* 70 /*
106 * Extended attributes can be read independently of the main file 71 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention 72 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
111 */ 76 */
112 struct rw_semaphore xattr_sem; 77 struct rw_semaphore xattr_sem;
113#endif 78#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 79#ifdef CONFIG_EXT4_FS_POSIX_ACL
115 struct posix_acl *i_acl; 80 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl; 81 struct posix_acl *i_default_acl;
117#endif 82#endif
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d5531..6a0b40d43264 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
40 unsigned long s_blocks_last; /* Last seen block count */ 40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */ 42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ 43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc; 44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt; 45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block; 46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid; 47 uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
59 struct percpu_counter s_freeblocks_counter; 60 struct percpu_counter s_freeblocks_counter;
60 struct percpu_counter s_freeinodes_counter; 61 struct percpu_counter s_freeinodes_counter;
61 struct percpu_counter s_dirs_counter; 62 struct percpu_counter s_dirs_counter;
63 struct percpu_counter s_dirtyblocks_counter;
62 struct blockgroup_lock s_blockgroup_lock; 64 struct blockgroup_lock s_blockgroup_lock;
65 struct proc_dir_entry *s_proc;
63 66
64 /* root of the per fs reservation window tree */ 67 /* root of the per fs reservation window tree */
65 spinlock_t s_rsv_window_lock; 68 spinlock_t s_rsv_window_lock;
66 struct rb_root s_rsv_window_root; 69 struct rb_root s_rsv_window_root;
67 struct ext4_reserve_window_node s_rsv_window_head;
68 70
69 /* Journaling */ 71 /* Journaling */
70 struct inode * s_journal_inode; 72 struct inode *s_journal_inode;
71 struct journal_s * s_journal; 73 struct journal_s *s_journal;
72 struct list_head s_orphan; 74 struct list_head s_orphan;
73 unsigned long s_commit_interval; 75 unsigned long s_commit_interval;
74 struct block_device *journal_bdev; 76 struct block_device *journal_bdev;
@@ -106,12 +108,12 @@ struct ext4_sb_info {
106 108
107 /* tunables */ 109 /* tunables */
108 unsigned long s_stripe; 110 unsigned long s_stripe;
109 unsigned long s_mb_stream_request; 111 unsigned int s_mb_stream_request;
110 unsigned long s_mb_max_to_scan; 112 unsigned int s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan; 113 unsigned int s_mb_min_to_scan;
112 unsigned long s_mb_stats; 114 unsigned int s_mb_stats;
113 unsigned long s_mb_order2_reqs; 115 unsigned int s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc; 116 unsigned int s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */ 117 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group; 118 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start; 119 unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@ struct ext4_sb_info {
121 int s_mb_history_cur; 123 int s_mb_history_cur;
122 int s_mb_history_max; 124 int s_mb_history_max;
123 int s_mb_history_num; 125 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock; 126 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter; 127 int s_mb_history_filter;
127 128
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b24d3c53f20c..ea2ce3c0ae66 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/falloc.h> 41#include <linux/falloc.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "ext4_extents.h" 45#include "ext4_extents.h"
45 46
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
383 ext_debug("\n"); 384 ext_debug("\n");
384} 385}
385#else 386#else
386#define ext4_ext_show_path(inode,path) 387#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode,path) 388#define ext4_ext_show_leaf(inode, path)
388#endif 389#endif
389 390
390void ext4_ext_drop_refs(struct ext4_ext_path *path) 391void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
440 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 441 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441 if (k != 0 && 442 if (k != 0 &&
442 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 443 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443 printk("k=%d, ix=0x%p, first=0x%p\n", k, 444 printk(KERN_DEBUG "k=%d, ix=0x%p, "
444 ix, EXT_FIRST_INDEX(eh)); 445 "first=0x%p\n", k,
445 printk("%u <= %u\n", 446 ix, EXT_FIRST_INDEX(eh));
447 printk(KERN_DEBUG "%u <= %u\n",
446 le32_to_cpu(ix->ei_block), 448 le32_to_cpu(ix->ei_block),
447 le32_to_cpu(ix[-1].ei_block)); 449 le32_to_cpu(ix[-1].ei_block));
448 } 450 }
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1475 struct ext4_ext_path *path, 1477 struct ext4_ext_path *path,
1476 struct ext4_extent *newext) 1478 struct ext4_extent *newext)
1477{ 1479{
1478 struct ext4_extent_header * eh; 1480 struct ext4_extent_header *eh;
1479 struct ext4_extent *ex, *fex; 1481 struct ext4_extent *ex, *fex;
1480 struct ext4_extent *nearex; /* nearest extent */ 1482 struct ext4_extent *nearex; /* nearest extent */
1481 struct ext4_ext_path *npath = NULL; 1483 struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
1625 return err; 1627 return err;
1626} 1628}
1627 1629
1630int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1631 ext4_lblk_t num, ext_prepare_callback func,
1632 void *cbdata)
1633{
1634 struct ext4_ext_path *path = NULL;
1635 struct ext4_ext_cache cbex;
1636 struct ext4_extent *ex;
1637 ext4_lblk_t next, start = 0, end = 0;
1638 ext4_lblk_t last = block + num;
1639 int depth, exists, err = 0;
1640
1641 BUG_ON(func == NULL);
1642 BUG_ON(inode == NULL);
1643
1644 while (block < last && block != EXT_MAX_BLOCK) {
1645 num = last - block;
1646 /* find extent for this block */
1647 path = ext4_ext_find_extent(inode, block, path);
1648 if (IS_ERR(path)) {
1649 err = PTR_ERR(path);
1650 path = NULL;
1651 break;
1652 }
1653
1654 depth = ext_depth(inode);
1655 BUG_ON(path[depth].p_hdr == NULL);
1656 ex = path[depth].p_ext;
1657 next = ext4_ext_next_allocated_block(path);
1658
1659 exists = 0;
1660 if (!ex) {
1661 /* there is no extent yet, so try to allocate
1662 * all requested space */
1663 start = block;
1664 end = block + num;
1665 } else if (le32_to_cpu(ex->ee_block) > block) {
1666 /* need to allocate space before found extent */
1667 start = block;
1668 end = le32_to_cpu(ex->ee_block);
1669 if (block + num < end)
1670 end = block + num;
1671 } else if (block >= le32_to_cpu(ex->ee_block)
1672 + ext4_ext_get_actual_len(ex)) {
1673 /* need to allocate space after found extent */
1674 start = block;
1675 end = block + num;
1676 if (end >= next)
1677 end = next;
1678 } else if (block >= le32_to_cpu(ex->ee_block)) {
1679 /*
1680 * some part of requested space is covered
1681 * by found extent
1682 */
1683 start = block;
1684 end = le32_to_cpu(ex->ee_block)
1685 + ext4_ext_get_actual_len(ex);
1686 if (block + num < end)
1687 end = block + num;
1688 exists = 1;
1689 } else {
1690 BUG();
1691 }
1692 BUG_ON(end <= start);
1693
1694 if (!exists) {
1695 cbex.ec_block = start;
1696 cbex.ec_len = end - start;
1697 cbex.ec_start = 0;
1698 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1699 } else {
1700 cbex.ec_block = le32_to_cpu(ex->ee_block);
1701 cbex.ec_len = ext4_ext_get_actual_len(ex);
1702 cbex.ec_start = ext_pblock(ex);
1703 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1704 }
1705
1706 BUG_ON(cbex.ec_len == 0);
1707 err = func(inode, path, &cbex, ex, cbdata);
1708 ext4_ext_drop_refs(path);
1709
1710 if (err < 0)
1711 break;
1712
1713 if (err == EXT_REPEAT)
1714 continue;
1715 else if (err == EXT_BREAK) {
1716 err = 0;
1717 break;
1718 }
1719
1720 if (ext_depth(inode) != depth) {
1721 /* depth was changed. we have to realloc path */
1722 kfree(path);
1723 path = NULL;
1724 }
1725
1726 block = cbex.ec_block + cbex.ec_len;
1727 }
1728
1729 if (path) {
1730 ext4_ext_drop_refs(path);
1731 kfree(path);
1732 }
1733
1734 return err;
1735}
1736
1628static void 1737static void
1629ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1738ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1630 __u32 len, ext4_fsblk_t start, int type) 1739 __u32 len, ext4_fsblk_t start, int type)
@@ -2142,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
2142 */ 2251 */
2143 2252
2144 if (test_opt(sb, EXTENTS)) { 2253 if (test_opt(sb, EXTENTS)) {
2145 printk("EXT4-fs: file extents enabled"); 2254 printk(KERN_INFO "EXT4-fs: file extents enabled");
2146#ifdef AGGRESSIVE_TEST 2255#ifdef AGGRESSIVE_TEST
2147 printk(", aggressive tests"); 2256 printk(", aggressive tests");
2148#endif 2257#endif
@@ -2696,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2696 goto out2; 2805 goto out2;
2697 } 2806 }
2698 /* 2807 /*
2699 * Okay, we need to do block allocation. Lazily initialize the block 2808 * Okay, we need to do block allocation.
2700 * allocation info here if necessary.
2701 */ 2809 */
2702 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2703 ext4_init_block_alloc_info(inode);
2704 2810
2705 /* find neighbour allocated blocks */ 2811 /* find neighbour allocated blocks */
2706 ar.lleft = iblock; 2812 ar.lleft = iblock;
@@ -2760,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2760 /* free data blocks we just allocated */ 2866 /* free data blocks we just allocated */
2761 /* not a good idea to call discard here directly, 2867 /* not a good idea to call discard here directly,
2762 * but otherwise we'd need to call it every free() */ 2868 * but otherwise we'd need to call it every free() */
2763 ext4_mb_discard_inode_preallocations(inode); 2869 ext4_discard_preallocations(inode);
2764 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2870 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2765 ext4_ext_get_actual_len(&newex), 0); 2871 ext4_ext_get_actual_len(&newex), 0);
2766 goto out2; 2872 goto out2;
@@ -2824,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
2824 down_write(&EXT4_I(inode)->i_data_sem); 2930 down_write(&EXT4_I(inode)->i_data_sem);
2825 ext4_ext_invalidate_cache(inode); 2931 ext4_ext_invalidate_cache(inode);
2826 2932
2827 ext4_discard_reservation(inode); 2933 ext4_discard_preallocations(inode);
2828 2934
2829 /* 2935 /*
2830 * TODO: optimization is possible here. 2936 * TODO: optimization is possible here.
@@ -2877,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
2877 * Update only when preallocation was requested beyond 2983 * Update only when preallocation was requested beyond
2878 * the file size. 2984 * the file size.
2879 */ 2985 */
2880 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2986 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2881 new_size > i_size_read(inode)) { 2987 if (new_size > i_size_read(inode))
2882 i_size_write(inode, new_size); 2988 i_size_write(inode, new_size);
2883 EXT4_I(inode)->i_disksize = new_size; 2989 if (new_size > EXT4_I(inode)->i_disksize)
2990 ext4_update_i_disksize(inode, new_size);
2884 } 2991 }
2885 2992
2886} 2993}
@@ -2972,3 +3079,143 @@ retry:
2972 mutex_unlock(&inode->i_mutex); 3079 mutex_unlock(&inode->i_mutex);
2973 return ret > 0 ? ret2 : ret; 3080 return ret > 0 ? ret2 : ret;
2974} 3081}
3082
3083/*
3084 * Callback function called for each extent to gather FIEMAP information.
3085 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data)
3089{
3090 struct fiemap_extent_info *fieinfo = data;
3091 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3092 __u64 logical;
3093 __u64 physical;
3094 __u64 length;
3095 __u32 flags = 0;
3096 int error;
3097
3098 logical = (__u64)newex->ec_block << blksize_bits;
3099
3100 if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3101 pgoff_t offset;
3102 struct page *page;
3103 struct buffer_head *bh = NULL;
3104
3105 offset = logical >> PAGE_SHIFT;
3106 page = find_get_page(inode->i_mapping, offset);
3107 if (!page || !page_has_buffers(page))
3108 return EXT_CONTINUE;
3109
3110 bh = page_buffers(page);
3111
3112 if (!bh)
3113 return EXT_CONTINUE;
3114
3115 if (buffer_delay(bh)) {
3116 flags |= FIEMAP_EXTENT_DELALLOC;
3117 page_cache_release(page);
3118 } else {
3119 page_cache_release(page);
3120 return EXT_CONTINUE;
3121 }
3122 }
3123
3124 physical = (__u64)newex->ec_start << blksize_bits;
3125 length = (__u64)newex->ec_len << blksize_bits;
3126
3127 if (ex && ext4_ext_is_uninitialized(ex))
3128 flags |= FIEMAP_EXTENT_UNWRITTEN;
3129
3130 /*
3131 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3132 *
3133 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3134 * this also indicates no more allocated blocks.
3135 *
3136 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3137 */
3138 if (logical + length - 1 == EXT_MAX_BLOCK ||
3139 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3140 flags |= FIEMAP_EXTENT_LAST;
3141
3142 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3143 length, flags);
3144 if (error < 0)
3145 return error;
3146 if (error == 1)
3147 return EXT_BREAK;
3148
3149 return EXT_CONTINUE;
3150}
3151
3152/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
3156{
3157 __u64 physical = 0;
3158 __u64 length;
3159 __u32 flags = FIEMAP_EXTENT_LAST;
3160 int blockbits = inode->i_sb->s_blocksize_bits;
3161 int error = 0;
3162
3163 /* in-inode? */
3164 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3165 struct ext4_iloc iloc;
3166 int offset; /* offset of xattr in inode */
3167
3168 error = ext4_get_inode_loc(inode, &iloc);
3169 if (error)
3170 return error;
3171 physical = iloc.bh->b_blocknr << blockbits;
3172 offset = EXT4_GOOD_OLD_INODE_SIZE +
3173 EXT4_I(inode)->i_extra_isize;
3174 physical += offset;
3175 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3176 flags |= FIEMAP_EXTENT_DATA_INLINE;
3177 } else { /* external block */
3178 physical = EXT4_I(inode)->i_file_acl << blockbits;
3179 length = inode->i_sb->s_blocksize;
3180 }
3181
3182 if (physical)
3183 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3184 length, flags);
3185 return (error < 0 ? error : 0);
3186}
3187
3188int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3189 __u64 start, __u64 len)
3190{
3191 ext4_lblk_t start_blk;
3192 ext4_lblk_t len_blks;
3193 int error = 0;
3194
3195 /* fallback to generic here if not in extents fmt */
3196 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3197 return generic_block_fiemap(inode, fieinfo, start, len,
3198 ext4_get_block);
3199
3200 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3201 return -EBADR;
3202
3203 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3204 error = ext4_xattr_fiemap(inode, fieinfo);
3205 } else {
3206 start_blk = start >> inode->i_sb->s_blocksize_bits;
3207 len_blks = len >> inode->i_sb->s_blocksize_bits;
3208
3209 /*
3210 * Walk the extent tree gathering extent information.
3211 * ext4_ext_fiemap_cb will push extents back to user.
3212 */
3213 down_write(&EXT4_I(inode)->i_data_sem);
3214 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3215 ext4_ext_fiemap_cb, fieinfo);
3216 up_write(&EXT4_I(inode)->i_data_sem);
3217 }
3218
3219 return error;
3220}
3221
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db4..6bd11fba71f7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
31 * from ext4_file_open: open gets called at every open, but release 31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed. 32 * gets called only when /all/ the files are closed.
33 */ 33 */
34static int ext4_release_file (struct inode * inode, struct file * filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 /* if we are the last writer on the inode, drop the block reservation */ 36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 down_write(&EXT4_I(inode)->i_data_sem); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_preallocations(inode);
42 up_write(&EXT4_I(inode)->i_data_sem); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
143const struct file_operations ext4_file_operations = { 146const struct file_operations ext4_file_operations = {
144 .llseek = generic_file_llseek, 147 .llseek = generic_file_llseek,
145 .read = do_sync_read, 148 .read = do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
162 .truncate = ext4_truncate, 165 .truncate = ext4_truncate,
163 .setattr = ext4_setattr, 166 .setattr = ext4_setattr,
164 .getattr = ext4_getattr, 167 .getattr = ext4_getattr,
165#ifdef CONFIG_EXT4DEV_FS_XATTR 168#ifdef CONFIG_EXT4_FS_XATTR
166 .setxattr = generic_setxattr, 169 .setxattr = generic_setxattr,
167 .getxattr = generic_getxattr, 170 .getxattr = generic_getxattr,
168 .listxattr = ext4_listxattr, 171 .listxattr = ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
170#endif 173#endif
171 .permission = ext4_permission, 174 .permission = ext4_permission,
172 .fallocate = ext4_fallocate, 175 .fallocate = ext4_fallocate,
176 .fiemap = ext4_fiemap,
173}; 177};
174 178
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad31..5afe4370840b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33 34
@@ -43,7 +44,7 @@
43 * inode to disk. 44 * inode to disk.
44 */ 45 */
45 46
46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 47int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
47{ 48{
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
51 52
52 J_ASSERT(ext4_journal_current_handle() == NULL); 53 J_ASSERT(ext4_journal_current_handle() == NULL);
53 54
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58
54 /* 59 /*
55 * data=writeback: 60 * data=writeback:
56 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe390..556ca8eba3db 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
27 sum += DELTA; 27 sum += DELTA;
28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
30 } while(--n); 30 } while (--n);
31 31
32 buf[0] += b0; 32 buf[0] += b0;
33 buf[1] += b1; 33 buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 while (len--) { 41 while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
59 val = pad; 59 val = pad;
60 if (len > num*4) 60 if (len > num*4)
61 len = num * 4; 61 len = num * 4;
62 for (i=0; i < len; i++) { 62 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 63 if ((i % 4) == 0)
64 val = pad; 64 val = pad;
65 val = msg[i] + (val << 8); 65 val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
104 104
105 /* Check to see if the seed is all zero's */ 105 /* Check to see if the seed is all zero's */
106 if (hinfo->seed) { 106 if (hinfo->seed) {
107 for (i=0; i < 4; i++) { 107 for (i = 0; i < 4; i++) {
108 if (hinfo->seed[i]) 108 if (hinfo->seed[i])
109 break; 109 break;
110 } 110 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f344834bbf58..fe34d74cfb19 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (bh_uptodate_or_lock(bh)) 118 if (buffer_uptodate(bh) &&
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
119 return bh; 120 return bh;
120 121
122 lock_buffer(bh);
121 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
122 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
123 ext4_init_inode_bitmap(sb, bh, block_group, desc); 125 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 * though), and then we'd have two inodes sharing the 156 * though), and then we'd have two inodes sharing the
155 * same inode number and space on the harddisk. 157 * same inode number and space on the harddisk.
156 */ 158 */
157void ext4_free_inode (handle_t *handle, struct inode * inode) 159void ext4_free_inode(handle_t *handle, struct inode *inode)
158{ 160{
159 struct super_block * sb = inode->i_sb; 161 struct super_block *sb = inode->i_sb;
160 int is_directory; 162 int is_directory;
161 unsigned long ino; 163 unsigned long ino;
162 struct buffer_head *bitmap_bh = NULL; 164 struct buffer_head *bitmap_bh = NULL;
163 struct buffer_head *bh2; 165 struct buffer_head *bh2;
164 ext4_group_t block_group; 166 ext4_group_t block_group;
165 unsigned long bit; 167 unsigned long bit;
166 struct ext4_group_desc * gdp; 168 struct ext4_group_desc *gdp;
167 struct ext4_super_block * es; 169 struct ext4_super_block *es;
168 struct ext4_sb_info *sbi; 170 struct ext4_sb_info *sbi;
169 int fatal = 0, err; 171 int fatal = 0, err;
170 ext4_group_t flex_group; 172 ext4_group_t flex_group;
171 173
172 if (atomic_read(&inode->i_count) > 1) { 174 if (atomic_read(&inode->i_count) > 1) {
173 printk ("ext4_free_inode: inode has count=%d\n", 175 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
174 atomic_read(&inode->i_count)); 176 atomic_read(&inode->i_count));
175 return; 177 return;
176 } 178 }
177 if (inode->i_nlink) { 179 if (inode->i_nlink) {
178 printk ("ext4_free_inode: inode has nlink=%d\n", 180 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
179 inode->i_nlink); 181 inode->i_nlink);
180 return; 182 return;
181 } 183 }
182 if (!sb) { 184 if (!sb) {
183 printk("ext4_free_inode: inode on nonexistent device\n"); 185 printk(KERN_ERR "ext4_free_inode: inode on "
186 "nonexistent device\n");
184 return; 187 return;
185 } 188 }
186 sbi = EXT4_SB(sb); 189 sbi = EXT4_SB(sb);
187 190
188 ino = inode->i_ino; 191 ino = inode->i_ino;
189 ext4_debug ("freeing inode %lu\n", ino); 192 ext4_debug("freeing inode %lu\n", ino);
190 193
191 /* 194 /*
192 * Note: we must free any quota before locking the superblock, 195 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
200 is_directory = S_ISDIR(inode->i_mode); 203 is_directory = S_ISDIR(inode->i_mode);
201 204
202 /* Do this BEFORE marking the inode not in use or returning an error */ 205 /* Do this BEFORE marking the inode not in use or returning an error */
203 clear_inode (inode); 206 clear_inode(inode);
204 207
205 es = EXT4_SB(sb)->s_es; 208 es = EXT4_SB(sb)->s_es;
206 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 209 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
207 ext4_error (sb, "ext4_free_inode", 210 ext4_error(sb, "ext4_free_inode",
208 "reserved or nonexistent inode %lu", ino); 211 "reserved or nonexistent inode %lu", ino);
209 goto error_return; 212 goto error_return;
210 } 213 }
211 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 214 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
222 /* Ok, now we can actually update the inode bitmaps.. */ 225 /* Ok, now we can actually update the inode bitmaps.. */
223 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 226 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
224 bit, bitmap_bh->b_data)) 227 bit, bitmap_bh->b_data))
225 ext4_error (sb, "ext4_free_inode", 228 ext4_error(sb, "ext4_free_inode",
226 "bit already cleared for inode %lu", ino); 229 "bit already cleared for inode %lu", ino);
227 else { 230 else {
228 gdp = ext4_get_group_desc (sb, block_group, &bh2); 231 gdp = ext4_get_group_desc(sb, block_group, &bh2);
229 232
230 BUFFER_TRACE(bh2, "get_write_access"); 233 BUFFER_TRACE(bh2, "get_write_access");
231 fatal = ext4_journal_get_write_access(handle, bh2); 234 fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
287 avefreei = freei / ngroups; 290 avefreei = freei / ngroups;
288 291
289 for (group = 0; group < ngroups; group++) { 292 for (group = 0; group < ngroups; group++) {
290 desc = ext4_get_group_desc (sb, group, NULL); 293 desc = ext4_get_group_desc(sb, group, NULL);
291 if (!desc || !desc->bg_free_inodes_count) 294 if (!desc || !desc->bg_free_inodes_count)
292 continue; 295 continue;
293 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
576 * For other inodes, search forward from the parent directory's block 579 * For other inodes, search forward from the parent directory's block
577 * group to find a free inode. 580 * group to find a free inode.
578 */ 581 */
579struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) 582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
580{ 583{
581 struct super_block *sb; 584 struct super_block *sb;
582 struct buffer_head *bitmap_bh = NULL; 585 struct buffer_head *bitmap_bh = NULL;
583 struct buffer_head *bh2; 586 struct buffer_head *bh2;
584 ext4_group_t group = 0; 587 ext4_group_t group = 0;
585 unsigned long ino = 0; 588 unsigned long ino = 0;
586 struct inode * inode; 589 struct inode *inode;
587 struct ext4_group_desc * gdp = NULL; 590 struct ext4_group_desc *gdp = NULL;
588 struct ext4_super_block * es; 591 struct ext4_super_block *es;
589 struct ext4_inode_info *ei; 592 struct ext4_inode_info *ei;
590 struct ext4_sb_info *sbi; 593 struct ext4_sb_info *sbi;
591 int ret2, err = 0; 594 int ret2, err = 0;
@@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
613 } 616 }
614 617
615 if (S_ISDIR(mode)) { 618 if (S_ISDIR(mode)) {
616 if (test_opt (sb, OLDALLOC)) 619 if (test_opt(sb, OLDALLOC))
617 ret2 = find_group_dir(sb, dir, &group); 620 ret2 = find_group_dir(sb, dir, &group);
618 else 621 else
619 ret2 = find_group_orlov(sb, dir, &group); 622 ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@ got:
783 } 786 }
784 787
785 inode->i_uid = current->fsuid; 788 inode->i_uid = current->fsuid;
786 if (test_opt (sb, GRPID)) 789 if (test_opt(sb, GRPID))
787 inode->i_gid = dir->i_gid; 790 inode->i_gid = dir->i_gid;
788 else if (dir->i_mode & S_ISGID) { 791 else if (dir->i_mode & S_ISGID) {
789 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@ got:
816 ei->i_flags &= ~EXT4_DIRSYNC_FL; 819 ei->i_flags &= ~EXT4_DIRSYNC_FL;
817 ei->i_file_acl = 0; 820 ei->i_file_acl = 0;
818 ei->i_dtime = 0; 821 ei->i_dtime = 0;
819 ei->i_block_alloc_info = NULL;
820 ei->i_block_group = group; 822 ei->i_block_group = group;
821 823
822 ext4_set_inode_flags(inode); 824 ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@ got:
832 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 834 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
833 835
834 ret = inode; 836 ret = inode;
835 if(DQUOT_ALLOC_INODE(inode)) { 837 if (DQUOT_ALLOC_INODE(inode)) {
836 err = -EDQUOT; 838 err = -EDQUOT;
837 goto fail_drop; 839 goto fail_drop;
838 } 840 }
@@ -841,7 +843,7 @@ got:
841 if (err) 843 if (err)
842 goto fail_free_drop; 844 goto fail_free_drop;
843 845
844 err = ext4_init_security(handle,inode, dir); 846 err = ext4_init_security(handle, inode, dir);
845 if (err) 847 if (err)
846 goto fail_free_drop; 848 goto fail_free_drop;
847 849
@@ -959,7 +961,7 @@ error:
959 return ERR_PTR(err); 961 return ERR_PTR(err);
960} 962}
961 963
962unsigned long ext4_count_free_inodes (struct super_block * sb) 964unsigned long ext4_count_free_inodes(struct super_block *sb)
963{ 965{
964 unsigned long desc_count; 966 unsigned long desc_count;
965 struct ext4_group_desc *gdp; 967 struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
974 bitmap_count = 0; 976 bitmap_count = 0;
975 gdp = NULL; 977 gdp = NULL;
976 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 978 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
977 gdp = ext4_get_group_desc (sb, i, NULL); 979 gdp = ext4_get_group_desc(sb, i, NULL);
978 if (!gdp) 980 if (!gdp)
979 continue; 981 continue;
980 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 982 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
989 bitmap_count += x; 991 bitmap_count += x;
990 } 992 }
991 brelse(bitmap_bh); 993 brelse(bitmap_bh);
992 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", 994 printk(KERN_DEBUG "ext4_count_free_inodes: "
993 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); 995 "stored = %u, computed = %lu, %lu\n",
996 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
994 return desc_count; 997 return desc_count;
995#else 998#else
996 desc_count = 0; 999 desc_count = 0;
997 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1000 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
998 gdp = ext4_get_group_desc (sb, i, NULL); 1001 gdp = ext4_get_group_desc(sb, i, NULL);
999 if (!gdp) 1002 if (!gdp)
1000 continue; 1003 continue;
1001 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1004 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
1006} 1009}
1007 1010
1008/* Called at mount-time, super-block is locked */ 1011/* Called at mount-time, super-block is locked */
1009unsigned long ext4_count_dirs (struct super_block * sb) 1012unsigned long ext4_count_dirs(struct super_block * sb)
1010{ 1013{
1011 unsigned long count = 0; 1014 unsigned long count = 0;
1012 ext4_group_t i; 1015 ext4_group_t i;
1013 1016
1014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1017 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
1015 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 1018 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1016 if (!gdp) 1019 if (!gdp)
1017 continue; 1020 continue;
1018 count += le16_to_cpu(gdp->bg_used_dirs_count); 1021 count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7e91913e325b..9b4ec9decfd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -190,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
190/* 190/*
191 * Called at the last iput() if i_nlink is zero. 191 * Called at the last iput() if i_nlink is zero.
192 */ 192 */
193void ext4_delete_inode (struct inode * inode) 193void ext4_delete_inode(struct inode *inode)
194{ 194{
195 handle_t *handle; 195 handle_t *handle;
196 int err; 196 int err;
@@ -330,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
330 int final = 0; 330 int final = 0;
331 331
332 if (i_block < 0) { 332 if (i_block < 0) {
333 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 333 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
334 } else if (i_block < direct_blocks) { 334 } else if (i_block < direct_blocks) {
335 offsets[n++] = i_block; 335 offsets[n++] = i_block;
336 final = direct_blocks; 336 final = direct_blocks;
337 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 337 } else if ((i_block -= direct_blocks) < indirect_blocks) {
338 offsets[n++] = EXT4_IND_BLOCK; 338 offsets[n++] = EXT4_IND_BLOCK;
339 offsets[n++] = i_block; 339 offsets[n++] = i_block;
340 final = ptrs; 340 final = ptrs;
@@ -400,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
400 400
401 *err = 0; 401 *err = 0;
402 /* i_data is not going away, no lock needed */ 402 /* i_data is not going away, no lock needed */
403 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 403 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
404 if (!p->key) 404 if (!p->key)
405 goto no_block; 405 goto no_block;
406 while (--depth) { 406 while (--depth) {
407 bh = sb_bread(sb, le32_to_cpu(p->key)); 407 bh = sb_bread(sb, le32_to_cpu(p->key));
408 if (!bh) 408 if (!bh)
409 goto failure; 409 goto failure;
410 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 410 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
411 /* Reader: end */ 411 /* Reader: end */
412 if (!p->key) 412 if (!p->key)
413 goto no_block; 413 goto no_block;
@@ -443,7 +443,7 @@ no_block:
443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
444{ 444{
445 struct ext4_inode_info *ei = EXT4_I(inode); 445 struct ext4_inode_info *ei = EXT4_I(inode);
446 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 446 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
447 __le32 *p; 447 __le32 *p;
448 ext4_fsblk_t bg_start; 448 ext4_fsblk_t bg_start;
449 ext4_fsblk_t last_block; 449 ext4_fsblk_t last_block;
@@ -486,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
487 Indirect *partial) 487 Indirect *partial)
488{ 488{
489 struct ext4_block_alloc_info *block_i;
490
491 block_i = EXT4_I(inode)->i_block_alloc_info;
492
493 /* 489 /*
494 * try the heuristic for sequential allocation, 490 * XXX need to get goal block from mballoc's data structures
495 * failing that at least try to get decent locality.
496 */ 491 */
497 if (block_i && (block == block_i->last_alloc_logical_block + 1)
498 && (block_i->last_alloc_physical_block != 0)) {
499 return block_i->last_alloc_physical_block + 1;
500 }
501 492
502 return ext4_find_near(inode, partial); 493 return ext4_find_near(inode, partial);
503} 494}
@@ -630,7 +621,7 @@ allocated:
630 *err = 0; 621 *err = 0;
631 return ret; 622 return ret;
632failed_out: 623failed_out:
633 for (i = 0; i <index; i++) 624 for (i = 0; i < index; i++)
634 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 625 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
635 return ret; 626 return ret;
636} 627}
@@ -703,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
703 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 694 branch[n].p = (__le32 *) bh->b_data + offsets[n];
704 branch[n].key = cpu_to_le32(new_blocks[n]); 695 branch[n].key = cpu_to_le32(new_blocks[n]);
705 *branch[n].p = branch[n].key; 696 *branch[n].p = branch[n].key;
706 if ( n == indirect_blks) { 697 if (n == indirect_blks) {
707 current_block = new_blocks[n]; 698 current_block = new_blocks[n];
708 /* 699 /*
709 * End of chain, update the last new metablock of 700 * End of chain, update the last new metablock of
@@ -730,7 +721,7 @@ failed:
730 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 721 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
731 ext4_journal_forget(handle, branch[i].bh); 722 ext4_journal_forget(handle, branch[i].bh);
732 } 723 }
733 for (i = 0; i <indirect_blks; i++) 724 for (i = 0; i < indirect_blks; i++)
734 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 725 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
735 726
736 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 727 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -757,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
757{ 748{
758 int i; 749 int i;
759 int err = 0; 750 int err = 0;
760 struct ext4_block_alloc_info *block_i;
761 ext4_fsblk_t current_block; 751 ext4_fsblk_t current_block;
762 752
763 block_i = EXT4_I(inode)->i_block_alloc_info;
764 /* 753 /*
765 * If we're splicing into a [td]indirect block (as opposed to the 754 * If we're splicing into a [td]indirect block (as opposed to the
766 * inode) then we need to get write access to the [td]indirect block 755 * inode) then we need to get write access to the [td]indirect block
@@ -783,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
783 if (num == 0 && blks > 1) { 772 if (num == 0 && blks > 1) {
784 current_block = le32_to_cpu(where->key) + 1; 773 current_block = le32_to_cpu(where->key) + 1;
785 for (i = 1; i < blks; i++) 774 for (i = 1; i < blks; i++)
786 *(where->p + i ) = cpu_to_le32(current_block++); 775 *(where->p + i) = cpu_to_le32(current_block++);
787 }
788
789 /*
790 * update the most recently allocated logical & physical block
791 * in i_block_alloc_info, to assist find the proper goal block for next
792 * allocation
793 */
794 if (block_i) {
795 block_i->last_alloc_logical_block = block + blks - 1;
796 block_i->last_alloc_physical_block =
797 le32_to_cpu(where[num].key) + blks - 1;
798 } 776 }
799 777
800 /* We are done with atomic stuff, now do the rest of housekeeping */ 778 /* We are done with atomic stuff, now do the rest of housekeeping */
@@ -914,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
914 goto cleanup; 892 goto cleanup;
915 893
916 /* 894 /*
917 * Okay, we need to do block allocation. Lazily initialize the block 895 * Okay, we need to do block allocation.
918 * allocation info here if necessary
919 */ 896 */
920 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
921 ext4_init_block_alloc_info(inode);
922
923 goal = ext4_find_goal(inode, iblock, partial); 897 goal = ext4_find_goal(inode, iblock, partial);
924 898
925 /* the number of blocks need to allocate for [d,t]indirect blocks */ 899 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1030,19 +1004,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1030 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1004 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1031 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1005 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1032 1006
1033 /* Account for allocated meta_blocks */ 1007 if (mdb_free) {
1034 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1008 /* Account for allocated meta_blocks */
1009 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1035 1010
1036 /* update fs free blocks counter for truncate case */ 1011 /* update fs dirty blocks counter */
1037 percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); 1012 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1013 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1014 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1015 }
1038 1016
1039 /* update per-inode reservations */ 1017 /* update per-inode reservations */
1040 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1018 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1041 EXT4_I(inode)->i_reserved_data_blocks -= used; 1019 EXT4_I(inode)->i_reserved_data_blocks -= used;
1042 1020
1043 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1044 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1045 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1046 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1021 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1047} 1022}
1048 1023
@@ -1160,8 +1135,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1160/* Maximum number of blocks we map for direct IO at once. */ 1135/* Maximum number of blocks we map for direct IO at once. */
1161#define DIO_MAX_BLOCKS 4096 1136#define DIO_MAX_BLOCKS 4096
1162 1137
1163static int ext4_get_block(struct inode *inode, sector_t iblock, 1138int ext4_get_block(struct inode *inode, sector_t iblock,
1164 struct buffer_head *bh_result, int create) 1139 struct buffer_head *bh_result, int create)
1165{ 1140{
1166 handle_t *handle = ext4_journal_current_handle(); 1141 handle_t *handle = ext4_journal_current_handle();
1167 int ret = 0, started = 0; 1142 int ret = 0, started = 0;
@@ -1241,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1241 BUFFER_TRACE(bh, "call get_create_access"); 1216 BUFFER_TRACE(bh, "call get_create_access");
1242 fatal = ext4_journal_get_create_access(handle, bh); 1217 fatal = ext4_journal_get_create_access(handle, bh);
1243 if (!fatal && !buffer_uptodate(bh)) { 1218 if (!fatal && !buffer_uptodate(bh)) {
1244 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1219 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1245 set_buffer_uptodate(bh); 1220 set_buffer_uptodate(bh);
1246 } 1221 }
1247 unlock_buffer(bh); 1222 unlock_buffer(bh);
@@ -1266,7 +1241,7 @@ err:
1266struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1241struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1267 ext4_lblk_t block, int create, int *err) 1242 ext4_lblk_t block, int create, int *err)
1268{ 1243{
1269 struct buffer_head * bh; 1244 struct buffer_head *bh;
1270 1245
1271 bh = ext4_getblk(handle, inode, block, create, err); 1246 bh = ext4_getblk(handle, inode, block, create, err);
1272 if (!bh) 1247 if (!bh)
@@ -1282,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1282 return NULL; 1257 return NULL;
1283} 1258}
1284 1259
1285static int walk_page_buffers( handle_t *handle, 1260static int walk_page_buffers(handle_t *handle,
1286 struct buffer_head *head, 1261 struct buffer_head *head,
1287 unsigned from, 1262 unsigned from,
1288 unsigned to, 1263 unsigned to,
1289 int *partial, 1264 int *partial,
1290 int (*fn)( handle_t *handle, 1265 int (*fn)(handle_t *handle,
1291 struct buffer_head *bh)) 1266 struct buffer_head *bh))
1292{ 1267{
1293 struct buffer_head *bh; 1268 struct buffer_head *bh;
1294 unsigned block_start, block_end; 1269 unsigned block_start, block_end;
@@ -1296,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
1296 int err, ret = 0; 1271 int err, ret = 0;
1297 struct buffer_head *next; 1272 struct buffer_head *next;
1298 1273
1299 for ( bh = head, block_start = 0; 1274 for (bh = head, block_start = 0;
1300 ret == 0 && (bh != head || !block_start); 1275 ret == 0 && (bh != head || !block_start);
1301 block_start = block_end, bh = next) 1276 block_start = block_end, bh = next)
1302 { 1277 {
1303 next = bh->b_this_page; 1278 next = bh->b_this_page;
1304 block_end = block_start + blocksize; 1279 block_end = block_start + blocksize;
@@ -1351,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1351 loff_t pos, unsigned len, unsigned flags, 1326 loff_t pos, unsigned len, unsigned flags,
1352 struct page **pagep, void **fsdata) 1327 struct page **pagep, void **fsdata)
1353{ 1328{
1354 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1355 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1330 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1356 handle_t *handle; 1331 handle_t *handle;
1357 int retries = 0; 1332 int retries = 0;
1358 struct page *page; 1333 struct page *page;
1359 pgoff_t index; 1334 pgoff_t index;
1360 unsigned from, to; 1335 unsigned from, to;
1361 1336
1362 index = pos >> PAGE_CACHE_SHIFT; 1337 index = pos >> PAGE_CACHE_SHIFT;
1363 from = pos & (PAGE_CACHE_SIZE - 1); 1338 from = pos & (PAGE_CACHE_SIZE - 1);
1364 to = from + len; 1339 to = from + len;
1365 1340
1366retry: 1341retry:
1367 handle = ext4_journal_start(inode, needed_blocks); 1342 handle = ext4_journal_start(inode, needed_blocks);
1368 if (IS_ERR(handle)) { 1343 if (IS_ERR(handle)) {
1369 ret = PTR_ERR(handle); 1344 ret = PTR_ERR(handle);
1370 goto out; 1345 goto out;
1371 } 1346 }
1372 1347
1373 page = __grab_cache_page(mapping, index); 1348 page = __grab_cache_page(mapping, index);
@@ -1387,9 +1362,16 @@ retry:
1387 } 1362 }
1388 1363
1389 if (ret) { 1364 if (ret) {
1390 unlock_page(page); 1365 unlock_page(page);
1391 ext4_journal_stop(handle); 1366 ext4_journal_stop(handle);
1392 page_cache_release(page); 1367 page_cache_release(page);
1368 /*
1369 * block_write_begin may have instantiated a few blocks
1370 * outside i_size. Trim these off again. Don't need
1371 * i_size_read because we hold i_mutex.
1372 */
1373 if (pos + len > inode->i_size)
1374 vmtruncate(inode, inode->i_size);
1393 } 1375 }
1394 1376
1395 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1426,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file,
1426 ret = ext4_jbd2_file_inode(handle, inode); 1408 ret = ext4_jbd2_file_inode(handle, inode);
1427 1409
1428 if (ret == 0) { 1410 if (ret == 0) {
1429 /*
1430 * generic_write_end() will run mark_inode_dirty() if i_size
1431 * changes. So let's piggyback the i_disksize mark_inode_dirty
1432 * into that.
1433 */
1434 loff_t new_i_size; 1411 loff_t new_i_size;
1435 1412
1436 new_i_size = pos + copied; 1413 new_i_size = pos + copied;
1437 if (new_i_size > EXT4_I(inode)->i_disksize) 1414 if (new_i_size > EXT4_I(inode)->i_disksize) {
1438 EXT4_I(inode)->i_disksize = new_i_size; 1415 ext4_update_i_disksize(inode, new_i_size);
1416 /* We need to mark inode dirty even if
1417 * new_i_size is less that inode->i_size
1418 * bu greater than i_disksize.(hint delalloc)
1419 */
1420 ext4_mark_inode_dirty(handle, inode);
1421 }
1422
1439 ret2 = generic_write_end(file, mapping, pos, len, copied, 1423 ret2 = generic_write_end(file, mapping, pos, len, copied,
1440 page, fsdata); 1424 page, fsdata);
1441 copied = ret2; 1425 copied = ret2;
@@ -1460,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
1460 loff_t new_i_size; 1444 loff_t new_i_size;
1461 1445
1462 new_i_size = pos + copied; 1446 new_i_size = pos + copied;
1463 if (new_i_size > EXT4_I(inode)->i_disksize) 1447 if (new_i_size > EXT4_I(inode)->i_disksize) {
1464 EXT4_I(inode)->i_disksize = new_i_size; 1448 ext4_update_i_disksize(inode, new_i_size);
1449 /* We need to mark inode dirty even if
1450 * new_i_size is less that inode->i_size
1451 * bu greater than i_disksize.(hint delalloc)
1452 */
1453 ext4_mark_inode_dirty(handle, inode);
1454 }
1465 1455
1466 ret2 = generic_write_end(file, mapping, pos, len, copied, 1456 ret2 = generic_write_end(file, mapping, pos, len, copied,
1467 page, fsdata); 1457 page, fsdata);
@@ -1486,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
1486 int ret = 0, ret2; 1476 int ret = 0, ret2;
1487 int partial = 0; 1477 int partial = 0;
1488 unsigned from, to; 1478 unsigned from, to;
1479 loff_t new_i_size;
1489 1480
1490 from = pos & (PAGE_CACHE_SIZE - 1); 1481 from = pos & (PAGE_CACHE_SIZE - 1);
1491 to = from + len; 1482 to = from + len;
@@ -1500,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
1500 to, &partial, write_end_fn); 1491 to, &partial, write_end_fn);
1501 if (!partial) 1492 if (!partial)
1502 SetPageUptodate(page); 1493 SetPageUptodate(page);
1503 if (pos+copied > inode->i_size) 1494 new_i_size = pos + copied;
1495 if (new_i_size > inode->i_size)
1504 i_size_write(inode, pos+copied); 1496 i_size_write(inode, pos+copied);
1505 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1497 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1506 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1498 if (new_i_size > EXT4_I(inode)->i_disksize) {
1507 EXT4_I(inode)->i_disksize = inode->i_size; 1499 ext4_update_i_disksize(inode, new_i_size);
1508 ret2 = ext4_mark_inode_dirty(handle, inode); 1500 ret2 = ext4_mark_inode_dirty(handle, inode);
1509 if (!ret) 1501 if (!ret)
1510 ret = ret2; 1502 ret = ret2;
@@ -1521,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file,
1521 1513
1522static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1514static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1523{ 1515{
1516 int retries = 0;
1524 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1525 unsigned long md_needed, mdblocks, total = 0; 1518 unsigned long md_needed, mdblocks, total = 0;
1526 1519
@@ -1529,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1529 * in order to allocate nrblocks 1522 * in order to allocate nrblocks
1530 * worse case is one extent per block 1523 * worse case is one extent per block
1531 */ 1524 */
1525repeat:
1532 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1526 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1533 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1527 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1534 mdblocks = ext4_calc_metadata_amount(inode, total); 1528 mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1537,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1537 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1531 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1538 total = md_needed + nrblocks; 1532 total = md_needed + nrblocks;
1539 1533
1540 if (ext4_has_free_blocks(sbi, total) < total) { 1534 if (ext4_claim_free_blocks(sbi, total)) {
1541 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1535 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1536 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1537 yield();
1538 goto repeat;
1539 }
1542 return -ENOSPC; 1540 return -ENOSPC;
1543 } 1541 }
1544 /* reduce fs free blocks counter */
1545 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1546
1547 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1542 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1548 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1543 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1549 1544
@@ -1585,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1585 1580
1586 release = to_free + mdb_free; 1581 release = to_free + mdb_free;
1587 1582
1588 /* update fs free blocks counter for truncate case */ 1583 /* update fs dirty blocks counter for truncate case */
1589 percpu_counter_add(&sbi->s_freeblocks_counter, release); 1584 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1590 1585
1591 /* update per-inode reservations */ 1586 /* update per-inode reservations */
1592 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1587 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1630,6 +1625,7 @@ struct mpage_da_data {
1630 struct writeback_control *wbc; 1625 struct writeback_control *wbc;
1631 int io_done; 1626 int io_done;
1632 long pages_written; 1627 long pages_written;
1628 int retval;
1633}; 1629};
1634 1630
1635/* 1631/*
@@ -1783,6 +1779,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1783 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1779 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1784} 1780}
1785 1781
1782static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1783 sector_t logical, long blk_cnt)
1784{
1785 int nr_pages, i;
1786 pgoff_t index, end;
1787 struct pagevec pvec;
1788 struct inode *inode = mpd->inode;
1789 struct address_space *mapping = inode->i_mapping;
1790
1791 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1792 end = (logical + blk_cnt - 1) >>
1793 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 while (index <= end) {
1795 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1796 if (nr_pages == 0)
1797 break;
1798 for (i = 0; i < nr_pages; i++) {
1799 struct page *page = pvec.pages[i];
1800 index = page->index;
1801 if (index > end)
1802 break;
1803 index++;
1804
1805 BUG_ON(!PageLocked(page));
1806 BUG_ON(PageWriteback(page));
1807 block_invalidatepage(page, 0);
1808 ClearPageUptodate(page);
1809 unlock_page(page);
1810 }
1811 }
1812 return;
1813}
1814
1815static void ext4_print_free_blocks(struct inode *inode)
1816{
1817 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1818 printk(KERN_EMERG "Total free blocks count %lld\n",
1819 ext4_count_free_blocks(inode->i_sb));
1820 printk(KERN_EMERG "Free/Dirty block details\n");
1821 printk(KERN_EMERG "free_blocks=%lld\n",
1822 percpu_counter_sum(&sbi->s_freeblocks_counter));
1823 printk(KERN_EMERG "dirty_blocks=%lld\n",
1824 percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1825 printk(KERN_EMERG "Block reservation details\n");
1826 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
1827 EXT4_I(inode)->i_reserved_data_blocks);
1828 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
1829 EXT4_I(inode)->i_reserved_meta_blocks);
1830 return;
1831}
1832
1786/* 1833/*
1787 * mpage_da_map_blocks - go through given space 1834 * mpage_da_map_blocks - go through given space
1788 * 1835 *
@@ -1792,32 +1839,69 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1792 * The function skips space we know is already mapped to disk blocks. 1839 * The function skips space we know is already mapped to disk blocks.
1793 * 1840 *
1794 */ 1841 */
1795static void mpage_da_map_blocks(struct mpage_da_data *mpd) 1842static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1796{ 1843{
1797 int err = 0; 1844 int err = 0;
1798 struct buffer_head *lbh = &mpd->lbh;
1799 sector_t next = lbh->b_blocknr;
1800 struct buffer_head new; 1845 struct buffer_head new;
1846 struct buffer_head *lbh = &mpd->lbh;
1847 sector_t next;
1801 1848
1802 /* 1849 /*
1803 * We consider only non-mapped and non-allocated blocks 1850 * We consider only non-mapped and non-allocated blocks
1804 */ 1851 */
1805 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1852 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1806 return; 1853 return 0;
1807
1808 new.b_state = lbh->b_state; 1854 new.b_state = lbh->b_state;
1809 new.b_blocknr = 0; 1855 new.b_blocknr = 0;
1810 new.b_size = lbh->b_size; 1856 new.b_size = lbh->b_size;
1811 1857 next = lbh->b_blocknr;
1812 /* 1858 /*
1813 * If we didn't accumulate anything 1859 * If we didn't accumulate anything
1814 * to write simply return 1860 * to write simply return
1815 */ 1861 */
1816 if (!new.b_size) 1862 if (!new.b_size)
1817 return; 1863 return 0;
1818 err = mpd->get_block(mpd->inode, next, &new, 1); 1864 err = mpd->get_block(mpd->inode, next, &new, 1);
1819 if (err) 1865 if (err) {
1820 return; 1866
1867 /* If get block returns with error
1868 * we simply return. Later writepage
1869 * will redirty the page and writepages
1870 * will find the dirty page again
1871 */
1872 if (err == -EAGAIN)
1873 return 0;
1874
1875 if (err == -ENOSPC &&
1876 ext4_count_free_blocks(mpd->inode->i_sb)) {
1877 mpd->retval = err;
1878 return 0;
1879 }
1880
1881 /*
1882 * get block failure will cause us
1883 * to loop in writepages. Because
1884 * a_ops->writepage won't be able to
1885 * make progress. The page will be redirtied
1886 * by writepage and writepages will again
1887 * try to write the same.
1888 */
1889 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1890 "at logical offset %llu with max blocks "
1891 "%zd with error %d\n",
1892 __func__, mpd->inode->i_ino,
1893 (unsigned long long)next,
1894 lbh->b_size >> mpd->inode->i_blkbits, err);
1895 printk(KERN_EMERG "This should not happen.!! "
1896 "Data will be lost\n");
1897 if (err == -ENOSPC) {
1898 ext4_print_free_blocks(mpd->inode);
1899 }
1900 /* invlaidate all the pages */
1901 ext4_da_block_invalidatepages(mpd, next,
1902 lbh->b_size >> mpd->inode->i_blkbits);
1903 return err;
1904 }
1821 BUG_ON(new.b_size == 0); 1905 BUG_ON(new.b_size == 0);
1822 1906
1823 if (buffer_new(&new)) 1907 if (buffer_new(&new))
@@ -1830,7 +1914,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1830 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1914 if (buffer_delay(lbh) || buffer_unwritten(lbh))
1831 mpage_put_bnr_to_bhs(mpd, next, &new); 1915 mpage_put_bnr_to_bhs(mpd, next, &new);
1832 1916
1833 return; 1917 return 0;
1834} 1918}
1835 1919
1836#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1920#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1899,8 +1983,8 @@ flush_it:
1899 * We couldn't merge the block to our extent, so we 1983 * We couldn't merge the block to our extent, so we
1900 * need to flush current extent and start new one 1984 * need to flush current extent and start new one
1901 */ 1985 */
1902 mpage_da_map_blocks(mpd); 1986 if (mpage_da_map_blocks(mpd) == 0)
1903 mpage_da_submit_io(mpd); 1987 mpage_da_submit_io(mpd);
1904 mpd->io_done = 1; 1988 mpd->io_done = 1;
1905 return; 1989 return;
1906} 1990}
@@ -1942,8 +2026,8 @@ static int __mpage_da_writepage(struct page *page,
1942 * and start IO on them using writepage() 2026 * and start IO on them using writepage()
1943 */ 2027 */
1944 if (mpd->next_page != mpd->first_page) { 2028 if (mpd->next_page != mpd->first_page) {
1945 mpage_da_map_blocks(mpd); 2029 if (mpage_da_map_blocks(mpd) == 0)
1946 mpage_da_submit_io(mpd); 2030 mpage_da_submit_io(mpd);
1947 /* 2031 /*
1948 * skip rest of the page in the page_vec 2032 * skip rest of the page in the page_vec
1949 */ 2033 */
@@ -2018,39 +2102,36 @@ static int __mpage_da_writepage(struct page *page,
2018 */ 2102 */
2019static int mpage_da_writepages(struct address_space *mapping, 2103static int mpage_da_writepages(struct address_space *mapping,
2020 struct writeback_control *wbc, 2104 struct writeback_control *wbc,
2021 get_block_t get_block) 2105 struct mpage_da_data *mpd)
2022{ 2106{
2023 struct mpage_da_data mpd;
2024 long to_write; 2107 long to_write;
2025 int ret; 2108 int ret;
2026 2109
2027 if (!get_block) 2110 if (!mpd->get_block)
2028 return generic_writepages(mapping, wbc); 2111 return generic_writepages(mapping, wbc);
2029 2112
2030 mpd.wbc = wbc; 2113 mpd->lbh.b_size = 0;
2031 mpd.inode = mapping->host; 2114 mpd->lbh.b_state = 0;
2032 mpd.lbh.b_size = 0; 2115 mpd->lbh.b_blocknr = 0;
2033 mpd.lbh.b_state = 0; 2116 mpd->first_page = 0;
2034 mpd.lbh.b_blocknr = 0; 2117 mpd->next_page = 0;
2035 mpd.first_page = 0; 2118 mpd->io_done = 0;
2036 mpd.next_page = 0; 2119 mpd->pages_written = 0;
2037 mpd.get_block = get_block; 2120 mpd->retval = 0;
2038 mpd.io_done = 0;
2039 mpd.pages_written = 0;
2040 2121
2041 to_write = wbc->nr_to_write; 2122 to_write = wbc->nr_to_write;
2042 2123
2043 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); 2124 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2044 2125
2045 /* 2126 /*
2046 * Handle last extent of pages 2127 * Handle last extent of pages
2047 */ 2128 */
2048 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2129 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2049 mpage_da_map_blocks(&mpd); 2130 if (mpage_da_map_blocks(mpd) == 0)
2050 mpage_da_submit_io(&mpd); 2131 mpage_da_submit_io(mpd);
2051 } 2132 }
2052 2133
2053 wbc->nr_to_write = to_write - mpd.pages_written; 2134 wbc->nr_to_write = to_write - mpd->pages_written;
2054 return ret; 2135 return ret;
2055} 2136}
2056 2137
@@ -2103,18 +2184,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2103 handle_t *handle = NULL; 2184 handle_t *handle = NULL;
2104 2185
2105 handle = ext4_journal_current_handle(); 2186 handle = ext4_journal_current_handle();
2106 if (!handle) { 2187 BUG_ON(!handle);
2107 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2188 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2108 bh_result, 0, 0, 0); 2189 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2109 BUG_ON(!ret);
2110 } else {
2111 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2112 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2113 }
2114
2115 if (ret > 0) { 2190 if (ret > 0) {
2191
2116 bh_result->b_size = (ret << inode->i_blkbits); 2192 bh_result->b_size = (ret << inode->i_blkbits);
2117 2193
2194 if (ext4_should_order_data(inode)) {
2195 int retval;
2196 retval = ext4_jbd2_file_inode(handle, inode);
2197 if (retval)
2198 /*
2199 * Failed to add inode for ordered
2200 * mode. Don't update file size
2201 */
2202 return retval;
2203 }
2204
2118 /* 2205 /*
2119 * Update on-disk size along with block allocation 2206 * Update on-disk size along with block allocation
2120 * we don't use 'extend_disksize' as size may change 2207 * we don't use 'extend_disksize' as size may change
@@ -2124,18 +2211,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2124 if (disksize > i_size_read(inode)) 2211 if (disksize > i_size_read(inode))
2125 disksize = i_size_read(inode); 2212 disksize = i_size_read(inode);
2126 if (disksize > EXT4_I(inode)->i_disksize) { 2213 if (disksize > EXT4_I(inode)->i_disksize) {
2127 /* 2214 ext4_update_i_disksize(inode, disksize);
2128 * XXX: replace with spinlock if seen contended -bzzz 2215 ret = ext4_mark_inode_dirty(handle, inode);
2129 */ 2216 return ret;
2130 down_write(&EXT4_I(inode)->i_data_sem);
2131 if (disksize > EXT4_I(inode)->i_disksize)
2132 EXT4_I(inode)->i_disksize = disksize;
2133 up_write(&EXT4_I(inode)->i_data_sem);
2134
2135 if (EXT4_I(inode)->i_disksize == disksize) {
2136 ret = ext4_mark_inode_dirty(handle, inode);
2137 return ret;
2138 }
2139 } 2217 }
2140 ret = 0; 2218 ret = 0;
2141 } 2219 }
@@ -2284,6 +2362,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2284{ 2362{
2285 handle_t *handle = NULL; 2363 handle_t *handle = NULL;
2286 loff_t range_start = 0; 2364 loff_t range_start = 0;
2365 struct mpage_da_data mpd;
2287 struct inode *inode = mapping->host; 2366 struct inode *inode = mapping->host;
2288 int needed_blocks, ret = 0, nr_to_writebump = 0; 2367 int needed_blocks, ret = 0, nr_to_writebump = 0;
2289 long to_write, pages_skipped = 0; 2368 long to_write, pages_skipped = 0;
@@ -2317,6 +2396,9 @@ static int ext4_da_writepages(struct address_space *mapping,
2317 range_start = wbc->range_start; 2396 range_start = wbc->range_start;
2318 pages_skipped = wbc->pages_skipped; 2397 pages_skipped = wbc->pages_skipped;
2319 2398
2399 mpd.wbc = wbc;
2400 mpd.inode = mapping->host;
2401
2320restart_loop: 2402restart_loop:
2321 to_write = wbc->nr_to_write; 2403 to_write = wbc->nr_to_write;
2322 while (!ret && to_write > 0) { 2404 while (!ret && to_write > 0) {
@@ -2340,23 +2422,17 @@ restart_loop:
2340 dump_stack(); 2422 dump_stack();
2341 goto out_writepages; 2423 goto out_writepages;
2342 } 2424 }
2343 if (ext4_should_order_data(inode)) {
2344 /*
2345 * With ordered mode we need to add
2346 * the inode to the journal handl
2347 * when we do block allocation.
2348 */
2349 ret = ext4_jbd2_file_inode(handle, inode);
2350 if (ret) {
2351 ext4_journal_stop(handle);
2352 goto out_writepages;
2353 }
2354 }
2355
2356 to_write -= wbc->nr_to_write; 2425 to_write -= wbc->nr_to_write;
2357 ret = mpage_da_writepages(mapping, wbc, 2426
2358 ext4_da_get_block_write); 2427 mpd.get_block = ext4_da_get_block_write;
2428 ret = mpage_da_writepages(mapping, wbc, &mpd);
2429
2359 ext4_journal_stop(handle); 2430 ext4_journal_stop(handle);
2431
2432 if (mpd.retval == -ENOSPC)
2433 jbd2_journal_force_commit_nested(sbi->s_journal);
2434
2435 /* reset the retry count */
2360 if (ret == MPAGE_DA_EXTENT_TAIL) { 2436 if (ret == MPAGE_DA_EXTENT_TAIL) {
2361 /* 2437 /*
2362 * got one extent now try with 2438 * got one extent now try with
@@ -2391,6 +2467,33 @@ out_writepages:
2391 return ret; 2467 return ret;
2392} 2468}
2393 2469
2470#define FALL_BACK_TO_NONDELALLOC 1
2471static int ext4_nonda_switch(struct super_block *sb)
2472{
2473 s64 free_blocks, dirty_blocks;
2474 struct ext4_sb_info *sbi = EXT4_SB(sb);
2475
2476 /*
2477 * switch to non delalloc mode if we are running low
2478 * on free block. The free block accounting via percpu
2479 * counters can get slightly wrong with FBC_BATCH getting
2480 * accumulated on each CPU without updating global counters
2481 * Delalloc need an accurate free block accounting. So switch
2482 * to non delalloc when we are near to error range.
2483 */
2484 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2485 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2486 if (2 * free_blocks < 3 * dirty_blocks ||
2487 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2488 /*
2489 * free block count is less that 150% of dirty blocks
2490 * or free blocks is less that watermark
2491 */
2492 return 1;
2493 }
2494 return 0;
2495}
2496
2394static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2497static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2395 loff_t pos, unsigned len, unsigned flags, 2498 loff_t pos, unsigned len, unsigned flags,
2396 struct page **pagep, void **fsdata) 2499 struct page **pagep, void **fsdata)
@@ -2406,6 +2509,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2406 from = pos & (PAGE_CACHE_SIZE - 1); 2509 from = pos & (PAGE_CACHE_SIZE - 1);
2407 to = from + len; 2510 to = from + len;
2408 2511
2512 if (ext4_nonda_switch(inode->i_sb)) {
2513 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2514 return ext4_write_begin(file, mapping, pos,
2515 len, flags, pagep, fsdata);
2516 }
2517 *fsdata = (void *)0;
2409retry: 2518retry:
2410 /* 2519 /*
2411 * With delayed allocation, we don't log the i_disksize update 2520 * With delayed allocation, we don't log the i_disksize update
@@ -2433,6 +2542,13 @@ retry:
2433 unlock_page(page); 2542 unlock_page(page);
2434 ext4_journal_stop(handle); 2543 ext4_journal_stop(handle);
2435 page_cache_release(page); 2544 page_cache_release(page);
2545 /*
2546 * block_write_begin may have instantiated a few blocks
2547 * outside i_size. Trim these off again. Don't need
2548 * i_size_read because we hold i_mutex.
2549 */
2550 if (pos + len > inode->i_size)
2551 vmtruncate(inode, inode->i_size);
2436 } 2552 }
2437 2553
2438 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2554 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2456,7 +2572,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2456 bh = page_buffers(page); 2572 bh = page_buffers(page);
2457 idx = offset >> inode->i_blkbits; 2573 idx = offset >> inode->i_blkbits;
2458 2574
2459 for (i=0; i < idx; i++) 2575 for (i = 0; i < idx; i++)
2460 bh = bh->b_this_page; 2576 bh = bh->b_this_page;
2461 2577
2462 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2578 if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2474,9 +2590,22 @@ static int ext4_da_write_end(struct file *file,
2474 handle_t *handle = ext4_journal_current_handle(); 2590 handle_t *handle = ext4_journal_current_handle();
2475 loff_t new_i_size; 2591 loff_t new_i_size;
2476 unsigned long start, end; 2592 unsigned long start, end;
2593 int write_mode = (int)(unsigned long)fsdata;
2594
2595 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2596 if (ext4_should_order_data(inode)) {
2597 return ext4_ordered_write_end(file, mapping, pos,
2598 len, copied, page, fsdata);
2599 } else if (ext4_should_writeback_data(inode)) {
2600 return ext4_writeback_write_end(file, mapping, pos,
2601 len, copied, page, fsdata);
2602 } else {
2603 BUG();
2604 }
2605 }
2477 2606
2478 start = pos & (PAGE_CACHE_SIZE - 1); 2607 start = pos & (PAGE_CACHE_SIZE - 1);
2479 end = start + copied -1; 2608 end = start + copied - 1;
2480 2609
2481 /* 2610 /*
2482 * generic_write_end() will run mark_inode_dirty() if i_size 2611 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2500,6 +2629,11 @@ static int ext4_da_write_end(struct file *file,
2500 EXT4_I(inode)->i_disksize = new_i_size; 2629 EXT4_I(inode)->i_disksize = new_i_size;
2501 } 2630 }
2502 up_write(&EXT4_I(inode)->i_data_sem); 2631 up_write(&EXT4_I(inode)->i_data_sem);
2632 /* We need to mark inode dirty even if
2633 * new_i_size is less that inode->i_size
2634 * bu greater than i_disksize.(hint delalloc)
2635 */
2636 ext4_mark_inode_dirty(handle, inode);
2503 } 2637 }
2504 } 2638 }
2505 ret2 = generic_write_end(file, mapping, pos, len, copied, 2639 ret2 = generic_write_end(file, mapping, pos, len, copied,
@@ -2591,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2591 return 0; 2725 return 0;
2592 } 2726 }
2593 2727
2594 return generic_block_bmap(mapping,block,ext4_get_block); 2728 return generic_block_bmap(mapping, block, ext4_get_block);
2595} 2729}
2596 2730
2597static int bget_one(handle_t *handle, struct buffer_head *bh) 2731static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3197,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3197 if (!partial->key && *partial->p) 3331 if (!partial->key && *partial->p)
3198 /* Writer: end */ 3332 /* Writer: end */
3199 goto no_top; 3333 goto no_top;
3200 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 3334 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3201 ; 3335 ;
3202 /* 3336 /*
3203 * OK, we've found the last block that must survive. The rest of our 3337 * OK, we've found the last block that must survive. The rest of our
@@ -3216,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3216 } 3350 }
3217 /* Writer: end */ 3351 /* Writer: end */
3218 3352
3219 while(partial > p) { 3353 while (partial > p) {
3220 brelse(partial->bh); 3354 brelse(partial->bh);
3221 partial--; 3355 partial--;
3222 } 3356 }
@@ -3408,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3408 /* This zaps the entire block. Bottom up. */ 3542 /* This zaps the entire block. Bottom up. */
3409 BUFFER_TRACE(bh, "free child branches"); 3543 BUFFER_TRACE(bh, "free child branches");
3410 ext4_free_branches(handle, inode, bh, 3544 ext4_free_branches(handle, inode, bh,
3411 (__le32*)bh->b_data, 3545 (__le32 *) bh->b_data,
3412 (__le32*)bh->b_data + addr_per_block, 3546 (__le32 *) bh->b_data + addr_per_block,
3413 depth); 3547 depth);
3414 3548
3415 /* 3549 /*
3416 * We've probably journalled the indirect block several 3550 * We've probably journalled the indirect block several
@@ -3578,7 +3712,7 @@ void ext4_truncate(struct inode *inode)
3578 */ 3712 */
3579 down_write(&ei->i_data_sem); 3713 down_write(&ei->i_data_sem);
3580 3714
3581 ext4_discard_reservation(inode); 3715 ext4_discard_preallocations(inode);
3582 3716
3583 /* 3717 /*
3584 * The orphan list entry will now protect us from any crash which 3718 * The orphan list entry will now protect us from any crash which
@@ -3673,41 +3807,6 @@ out_stop:
3673 ext4_journal_stop(handle); 3807 ext4_journal_stop(handle);
3674} 3808}
3675 3809
3676static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3677 unsigned long ino, struct ext4_iloc *iloc)
3678{
3679 ext4_group_t block_group;
3680 unsigned long offset;
3681 ext4_fsblk_t block;
3682 struct ext4_group_desc *gdp;
3683
3684 if (!ext4_valid_inum(sb, ino)) {
3685 /*
3686 * This error is already checked for in namei.c unless we are
3687 * looking at an NFS filehandle, in which case no error
3688 * report is needed
3689 */
3690 return 0;
3691 }
3692
3693 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3694 gdp = ext4_get_group_desc(sb, block_group, NULL);
3695 if (!gdp)
3696 return 0;
3697
3698 /*
3699 * Figure out the offset within the block group inode table
3700 */
3701 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3702 EXT4_INODE_SIZE(sb);
3703 block = ext4_inode_table(sb, gdp) +
3704 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3705
3706 iloc->block_group = block_group;
3707 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3708 return block;
3709}
3710
3711/* 3810/*
3712 * ext4_get_inode_loc returns with an extra refcount against the inode's 3811 * ext4_get_inode_loc returns with an extra refcount against the inode's
3713 * underlying buffer_head on success. If 'in_mem' is true, we have all 3812 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3717,19 +3816,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3717static int __ext4_get_inode_loc(struct inode *inode, 3816static int __ext4_get_inode_loc(struct inode *inode,
3718 struct ext4_iloc *iloc, int in_mem) 3817 struct ext4_iloc *iloc, int in_mem)
3719{ 3818{
3720 ext4_fsblk_t block; 3819 struct ext4_group_desc *gdp;
3721 struct buffer_head *bh; 3820 struct buffer_head *bh;
3821 struct super_block *sb = inode->i_sb;
3822 ext4_fsblk_t block;
3823 int inodes_per_block, inode_offset;
3824
3825 iloc->bh = 0;
3826 if (!ext4_valid_inum(sb, inode->i_ino))
3827 return -EIO;
3722 3828
3723 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3829 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3724 if (!block) 3830 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3831 if (!gdp)
3725 return -EIO; 3832 return -EIO;
3726 3833
3727 bh = sb_getblk(inode->i_sb, block); 3834 /*
3835 * Figure out the offset within the block group inode table
3836 */
3837 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3838 inode_offset = ((inode->i_ino - 1) %
3839 EXT4_INODES_PER_GROUP(sb));
3840 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3841 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3842
3843 bh = sb_getblk(sb, block);
3728 if (!bh) { 3844 if (!bh) {
3729 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3845 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3730 "unable to read inode block - " 3846 "inode block - inode=%lu, block=%llu",
3731 "inode=%lu, block=%llu", 3847 inode->i_ino, block);
3732 inode->i_ino, block);
3733 return -EIO; 3848 return -EIO;
3734 } 3849 }
3735 if (!buffer_uptodate(bh)) { 3850 if (!buffer_uptodate(bh)) {
@@ -3757,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3757 */ 3872 */
3758 if (in_mem) { 3873 if (in_mem) {
3759 struct buffer_head *bitmap_bh; 3874 struct buffer_head *bitmap_bh;
3760 struct ext4_group_desc *desc; 3875 int i, start;
3761 int inodes_per_buffer;
3762 int inode_offset, i;
3763 ext4_group_t block_group;
3764 int start;
3765
3766 block_group = (inode->i_ino - 1) /
3767 EXT4_INODES_PER_GROUP(inode->i_sb);
3768 inodes_per_buffer = bh->b_size /
3769 EXT4_INODE_SIZE(inode->i_sb);
3770 inode_offset = ((inode->i_ino - 1) %
3771 EXT4_INODES_PER_GROUP(inode->i_sb));
3772 start = inode_offset & ~(inodes_per_buffer - 1);
3773 3876
3774 /* Is the inode bitmap in cache? */ 3877 start = inode_offset & ~(inodes_per_block - 1);
3775 desc = ext4_get_group_desc(inode->i_sb,
3776 block_group, NULL);
3777 if (!desc)
3778 goto make_io;
3779 3878
3780 bitmap_bh = sb_getblk(inode->i_sb, 3879 /* Is the inode bitmap in cache? */
3781 ext4_inode_bitmap(inode->i_sb, desc)); 3880 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3782 if (!bitmap_bh) 3881 if (!bitmap_bh)
3783 goto make_io; 3882 goto make_io;
3784 3883
@@ -3791,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3791 brelse(bitmap_bh); 3890 brelse(bitmap_bh);
3792 goto make_io; 3891 goto make_io;
3793 } 3892 }
3794 for (i = start; i < start + inodes_per_buffer; i++) { 3893 for (i = start; i < start + inodes_per_block; i++) {
3795 if (i == inode_offset) 3894 if (i == inode_offset)
3796 continue; 3895 continue;
3797 if (ext4_test_bit(i, bitmap_bh->b_data)) 3896 if (ext4_test_bit(i, bitmap_bh->b_data))
3798 break; 3897 break;
3799 } 3898 }
3800 brelse(bitmap_bh); 3899 brelse(bitmap_bh);
3801 if (i == start + inodes_per_buffer) { 3900 if (i == start + inodes_per_block) {
3802 /* all other inodes are free, so skip I/O */ 3901 /* all other inodes are free, so skip I/O */
3803 memset(bh->b_data, 0, bh->b_size); 3902 memset(bh->b_data, 0, bh->b_size);
3804 set_buffer_uptodate(bh); 3903 set_buffer_uptodate(bh);
@@ -3809,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3809 3908
3810make_io: 3909make_io:
3811 /* 3910 /*
3911 * If we need to do any I/O, try to pre-readahead extra
3912 * blocks from the inode table.
3913 */
3914 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3915 ext4_fsblk_t b, end, table;
3916 unsigned num;
3917
3918 table = ext4_inode_table(sb, gdp);
3919 /* Make sure s_inode_readahead_blks is a power of 2 */
3920 while (EXT4_SB(sb)->s_inode_readahead_blks &
3921 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3922 EXT4_SB(sb)->s_inode_readahead_blks =
3923 (EXT4_SB(sb)->s_inode_readahead_blks &
3924 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3925 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3926 if (table > b)
3927 b = table;
3928 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3929 num = EXT4_INODES_PER_GROUP(sb);
3930 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3931 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3932 num -= le16_to_cpu(gdp->bg_itable_unused);
3933 table += num / inodes_per_block;
3934 if (end > table)
3935 end = table;
3936 while (b <= end)
3937 sb_breadahead(sb, b++);
3938 }
3939
3940 /*
3812 * There are other valid inodes in the buffer, this inode 3941 * There are other valid inodes in the buffer, this inode
3813 * has in-inode xattrs, or we don't have this inode in memory. 3942 * has in-inode xattrs, or we don't have this inode in memory.
3814 * Read the block from disk. 3943 * Read the block from disk.
@@ -3818,10 +3947,9 @@ make_io:
3818 submit_bh(READ_META, bh); 3947 submit_bh(READ_META, bh);
3819 wait_on_buffer(bh); 3948 wait_on_buffer(bh);
3820 if (!buffer_uptodate(bh)) { 3949 if (!buffer_uptodate(bh)) {
3821 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3950 ext4_error(sb, __func__,
3822 "unable to read inode block - " 3951 "unable to read inode block - inode=%lu, "
3823 "inode=%lu, block=%llu", 3952 "block=%llu", inode->i_ino, block);
3824 inode->i_ino, block);
3825 brelse(bh); 3953 brelse(bh);
3826 return -EIO; 3954 return -EIO;
3827 } 3955 }
@@ -3913,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3913 return inode; 4041 return inode;
3914 4042
3915 ei = EXT4_I(inode); 4043 ei = EXT4_I(inode);
3916#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 4044#ifdef CONFIG_EXT4_FS_POSIX_ACL
3917 ei->i_acl = EXT4_ACL_NOT_CACHED; 4045 ei->i_acl = EXT4_ACL_NOT_CACHED;
3918 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4046 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3919#endif 4047#endif
3920 ei->i_block_alloc_info = NULL;
3921 4048
3922 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4049 ret = __ext4_get_inode_loc(inode, &iloc, 0);
3923 if (ret < 0) 4050 if (ret < 0)
@@ -3927,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3927 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4054 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3928 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4055 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3929 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4056 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3930 if(!(test_opt (inode->i_sb, NO_UID32))) { 4057 if (!(test_opt(inode->i_sb, NO_UID32))) {
3931 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4058 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3932 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4059 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3933 } 4060 }
@@ -3945,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3945 if (inode->i_mode == 0 || 4072 if (inode->i_mode == 0 ||
3946 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4073 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3947 /* this inode is deleted */ 4074 /* this inode is deleted */
3948 brelse (bh); 4075 brelse(bh);
3949 ret = -ESTALE; 4076 ret = -ESTALE;
3950 goto bad_inode; 4077 goto bad_inode;
3951 } 4078 }
@@ -3978,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3978 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4105 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3979 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4106 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3980 EXT4_INODE_SIZE(inode->i_sb)) { 4107 EXT4_INODE_SIZE(inode->i_sb)) {
3981 brelse (bh); 4108 brelse(bh);
3982 ret = -EIO; 4109 ret = -EIO;
3983 goto bad_inode; 4110 goto bad_inode;
3984 } 4111 }
@@ -4031,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4031 init_special_inode(inode, inode->i_mode, 4158 init_special_inode(inode, inode->i_mode,
4032 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4159 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4033 } 4160 }
4034 brelse (iloc.bh); 4161 brelse(iloc.bh);
4035 ext4_set_inode_flags(inode); 4162 ext4_set_inode_flags(inode);
4036 unlock_new_inode(inode); 4163 unlock_new_inode(inode);
4037 return inode; 4164 return inode;
@@ -4113,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle,
4113 4240
4114 ext4_get_inode_flags(ei); 4241 ext4_get_inode_flags(ei);
4115 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4242 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4116 if(!(test_opt(inode->i_sb, NO_UID32))) { 4243 if (!(test_opt(inode->i_sb, NO_UID32))) {
4117 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4244 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4118 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4245 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4119/* 4246/*
4120 * Fix up interoperability with old kernels. Otherwise, old inodes get 4247 * Fix up interoperability with old kernels. Otherwise, old inodes get
4121 * re-used with the upper 16 bits of the uid/gid intact 4248 * re-used with the upper 16 bits of the uid/gid intact
4122 */ 4249 */
4123 if(!ei->i_dtime) { 4250 if (!ei->i_dtime) {
4124 raw_inode->i_uid_high = 4251 raw_inode->i_uid_high =
4125 cpu_to_le16(high_16_bits(inode->i_uid)); 4252 cpu_to_le16(high_16_bits(inode->i_uid));
4126 raw_inode->i_gid_high = 4253 raw_inode->i_gid_high =
@@ -4208,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle,
4208 ei->i_state &= ~EXT4_STATE_NEW; 4335 ei->i_state &= ~EXT4_STATE_NEW;
4209 4336
4210out_brelse: 4337out_brelse:
4211 brelse (bh); 4338 brelse(bh);
4212 ext4_std_error(inode->i_sb, err); 4339 ext4_std_error(inode->i_sb, err);
4213 return err; 4340 return err;
4214} 4341}
@@ -4811,6 +4938,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4811 loff_t size; 4938 loff_t size;
4812 unsigned long len; 4939 unsigned long len;
4813 int ret = -EINVAL; 4940 int ret = -EINVAL;
4941 void *fsdata;
4814 struct file *file = vma->vm_file; 4942 struct file *file = vma->vm_file;
4815 struct inode *inode = file->f_path.dentry->d_inode; 4943 struct inode *inode = file->f_path.dentry->d_inode;
4816 struct address_space *mapping = inode->i_mapping; 4944 struct address_space *mapping = inode->i_mapping;
@@ -4849,11 +4977,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4849 * on the same page though 4977 * on the same page though
4850 */ 4978 */
4851 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4979 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4852 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 4980 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
4853 if (ret < 0) 4981 if (ret < 0)
4854 goto out_unlock; 4982 goto out_unlock;
4855 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4983 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4856 len, len, page, NULL); 4984 len, len, page, fsdata);
4857 if (ret < 0) 4985 if (ret < 0)
4858 goto out_unlock; 4986 goto out_unlock;
4859 ret = 0; 4987 ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba6..dc99b4776d58 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext4_inode_info *ei = EXT4_I(inode); 24 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 25 unsigned int flags;
26 unsigned short rsv_window_size;
27 26
28 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); 27 ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
29 28
30 switch (cmd) { 29 switch (cmd) {
31 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
35 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
36 handle_t *handle = NULL; 35 handle_t *handle = NULL;
37 int err; 36 int err, migrate = 0;
38 struct ext4_iloc iloc; 37 struct ext4_iloc iloc;
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82 if (!capable(CAP_SYS_RESOURCE)) 81 if (!capable(CAP_SYS_RESOURCE))
83 goto flags_out; 82 goto flags_out;
84 } 83 }
84 if (oldflags & EXT4_EXTENTS_FL) {
85 /* We don't support clearning extent flags */
86 if (!(flags & EXT4_EXTENTS_FL)) {
87 err = -EOPNOTSUPP;
88 goto flags_out;
89 }
90 } else if (flags & EXT4_EXTENTS_FL) {
91 /* migrate the file */
92 migrate = 1;
93 flags &= ~EXT4_EXTENTS_FL;
94 }
85 95
86 handle = ext4_journal_start(inode, 1); 96 handle = ext4_journal_start(inode, 1);
87 if (IS_ERR(handle)) { 97 if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@ flags_err:
109 119
110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 120 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
111 err = ext4_change_inode_journal_flag(inode, jflag); 121 err = ext4_change_inode_journal_flag(inode, jflag);
122 if (err)
123 goto flags_out;
124 if (migrate)
125 err = ext4_ext_migrate(inode);
112flags_out: 126flags_out:
113 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt); 128 mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
175 return ret; 189 return ret;
176 } 190 }
177#endif 191#endif
178 case EXT4_IOC_GETRSVSZ:
179 if (test_opt(inode->i_sb, RESERVATION)
180 && S_ISREG(inode->i_mode)
181 && ei->i_block_alloc_info) {
182 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
183 return put_user(rsv_window_size, (int __user *)arg);
184 }
185 return -ENOTTY;
186 case EXT4_IOC_SETRSVSZ: {
187 int err;
188
189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
190 return -ENOTTY;
191
192 if (!is_owner_or_cap(inode))
193 return -EACCES;
194
195 if (get_user(rsv_window_size, (int __user *)arg))
196 return -EFAULT;
197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
204
205 /*
206 * need to allocate reservation structure for this inode
207 * before set the window size
208 */
209 down_write(&ei->i_data_sem);
210 if (!ei->i_block_alloc_info)
211 ext4_init_block_alloc_info(inode);
212
213 if (ei->i_block_alloc_info){
214 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
215 rsv->rsv_goal_size = rsv_window_size;
216 }
217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
219 return 0;
220 }
221 case EXT4_IOC_GROUP_EXTEND: { 192 case EXT4_IOC_GROUP_EXTEND: {
222 ext4_fsblk_t n_blocks_count; 193 ext4_fsblk_t n_blocks_count;
223 struct super_block *sb = inode->i_sb; 194 struct super_block *sb = inode->i_sb;
224 int err; 195 int err, err2;
225 196
226 if (!capable(CAP_SYS_RESOURCE)) 197 if (!capable(CAP_SYS_RESOURCE))
227 return -EPERM; 198 return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
235 206
236 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 207 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
237 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
238 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
239 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 if (err == 0)
212 err = err2;
240 mnt_drop_write(filp->f_path.mnt); 213 mnt_drop_write(filp->f_path.mnt);
241 214
242 return err; 215 return err;
@@ -244,7 +217,7 @@ setversion_out:
244 case EXT4_IOC_GROUP_ADD: { 217 case EXT4_IOC_GROUP_ADD: {
245 struct ext4_new_group_data input; 218 struct ext4_new_group_data input;
246 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
247 int err; 220 int err, err2;
248 221
249 if (!capable(CAP_SYS_RESOURCE)) 222 if (!capable(CAP_SYS_RESOURCE))
250 return -EPERM; 223 return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
259 232
260 err = ext4_group_add(sb, &input); 233 err = ext4_group_add(sb, &input);
261 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 234 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
262 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 235 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
263 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 236 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
237 if (err == 0)
238 err = err2;
264 mnt_drop_write(filp->f_path.mnt); 239 mnt_drop_write(filp->f_path.mnt);
265 240
266 return err; 241 return err;
267 } 242 }
268 243
269 case EXT4_IOC_MIGRATE: 244 case EXT4_IOC_MIGRATE:
270 return ext4_ext_migrate(inode, filp, cmd, arg); 245 {
246 int err;
247 if (!is_owner_or_cap(inode))
248 return -EACCES;
249
250 err = mnt_want_write(filp->f_path.mnt);
251 if (err)
252 return err;
253 /*
254 * inode_mutex prevent write and truncate on the file.
255 * Read still goes through. We take i_data_sem in
256 * ext4_ext_swap_inode_data before we switch the
257 * inode format to prevent read.
258 */
259 mutex_lock(&(inode->i_mutex));
260 err = ext4_ext_migrate(inode);
261 mutex_unlock(&(inode->i_mutex));
262 mnt_drop_write(filp->f_path.mnt);
263 return err;
264 }
271 265
272 default: 266 default:
273 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e0e3a5eb1ddb..b580714f0d85 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 477 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 479 if (b1[i] != b2[i]) {
480 printk("corruption in group %lu at byte %u(%u):" 480 printk(KERN_ERR "corruption in group %lu "
481 " %x in copy != %x on disk/prealloc\n", 481 "at byte %u(%u): %x in copy != %x "
482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 482 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]);
483 BUG(); 484 BUG();
484 } 485 }
485 } 486 }
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
533 void *buddy; 534 void *buddy;
534 void *buddy2; 535 void *buddy2;
535 536
536 if (!test_opt(sb, MBALLOC))
537 return 0;
538
539 { 537 {
540 static int mb_check_counter; 538 static int mb_check_counter;
541 if (mb_check_counter++ % 100 != 0) 539 if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
784 if (bh[i] == NULL) 782 if (bh[i] == NULL)
785 goto out; 783 goto out;
786 784
787 if (bh_uptodate_or_lock(bh[i])) 785 if (buffer_uptodate(bh[i]) &&
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
788 continue; 787 continue;
789 788
789 lock_buffer(bh[i]);
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 792 ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
2169{ 2169{
2170 struct ext4_sb_info *sbi = EXT4_SB(sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(sb);
2171 2171
2172 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2172 if (sbi->s_proc != NULL) {
2173 remove_proc_entry("mb_history", sbi->s_mb_proc); 2173 remove_proc_entry("mb_groups", sbi->s_proc);
2174 2174 remove_proc_entry("mb_history", sbi->s_proc);
2175 }
2175 kfree(sbi->s_mb_history); 2176 kfree(sbi->s_mb_history);
2176} 2177}
2177 2178
@@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
2180 struct ext4_sb_info *sbi = EXT4_SB(sb); 2181 struct ext4_sb_info *sbi = EXT4_SB(sb);
2181 int i; 2182 int i;
2182 2183
2183 if (sbi->s_mb_proc != NULL) { 2184 if (sbi->s_proc != NULL) {
2184 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2185 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2185 &ext4_mb_seq_history_fops, sb); 2186 &ext4_mb_seq_history_fops, sb);
2186 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2187 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2187 &ext4_mb_seq_groups_fops, sb); 2188 &ext4_mb_seq_groups_fops, sb);
2188 } 2189 }
2189 2190
@@ -2485,19 +2486,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2485 unsigned max; 2486 unsigned max;
2486 int ret; 2487 int ret;
2487 2488
2488 if (!test_opt(sb, MBALLOC))
2489 return 0;
2490
2491 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2489 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2492 2490
2493 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2491 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2494 if (sbi->s_mb_offsets == NULL) { 2492 if (sbi->s_mb_offsets == NULL) {
2495 clear_opt(sbi->s_mount_opt, MBALLOC);
2496 return -ENOMEM; 2493 return -ENOMEM;
2497 } 2494 }
2498 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2499 if (sbi->s_mb_maxs == NULL) { 2496 if (sbi->s_mb_maxs == NULL) {
2500 clear_opt(sbi->s_mount_opt, MBALLOC);
2501 kfree(sbi->s_mb_maxs); 2497 kfree(sbi->s_mb_maxs);
2502 return -ENOMEM; 2498 return -ENOMEM;
2503 } 2499 }
@@ -2520,7 +2516,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2520 /* init file for buddy data */ 2516 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2518 if (ret != 0) {
2523 clear_opt(sbi->s_mount_opt, MBALLOC);
2524 kfree(sbi->s_mb_offsets); 2519 kfree(sbi->s_mb_offsets);
2525 kfree(sbi->s_mb_maxs); 2520 kfree(sbi->s_mb_maxs);
2526 return ret; 2521 return ret;
@@ -2540,17 +2535,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2540 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2535 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2541 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2536 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2542 2537
2543 i = sizeof(struct ext4_locality_group) * nr_cpu_ids; 2538 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2544 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2545 if (sbi->s_locality_groups == NULL) { 2539 if (sbi->s_locality_groups == NULL) {
2546 clear_opt(sbi->s_mount_opt, MBALLOC);
2547 kfree(sbi->s_mb_offsets); 2540 kfree(sbi->s_mb_offsets);
2548 kfree(sbi->s_mb_maxs); 2541 kfree(sbi->s_mb_maxs);
2549 return -ENOMEM; 2542 return -ENOMEM;
2550 } 2543 }
2551 for (i = 0; i < nr_cpu_ids; i++) { 2544 for_each_possible_cpu(i) {
2552 struct ext4_locality_group *lg; 2545 struct ext4_locality_group *lg;
2553 lg = &sbi->s_locality_groups[i]; 2546 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2554 mutex_init(&lg->lg_mutex); 2547 mutex_init(&lg->lg_mutex);
2555 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2548 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2556 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2549 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
@@ -2560,7 +2553,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2560 ext4_mb_init_per_dev_proc(sb); 2553 ext4_mb_init_per_dev_proc(sb);
2561 ext4_mb_history_init(sb); 2554 ext4_mb_history_init(sb);
2562 2555
2563 printk("EXT4-fs: mballoc enabled\n"); 2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2564 return 0; 2557 return 0;
2565} 2558}
2566 2559
@@ -2589,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
2589 struct ext4_group_info *grinfo; 2582 struct ext4_group_info *grinfo;
2590 struct ext4_sb_info *sbi = EXT4_SB(sb); 2583 struct ext4_sb_info *sbi = EXT4_SB(sb);
2591 2584
2592 if (!test_opt(sb, MBALLOC))
2593 return 0;
2594
2595 /* release freed, non-committed blocks */ 2585 /* release freed, non-committed blocks */
2596 spin_lock(&sbi->s_md_lock); 2586 spin_lock(&sbi->s_md_lock);
2597 list_splice_init(&sbi->s_closed_transaction, 2587 list_splice_init(&sbi->s_closed_transaction,
@@ -2647,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb)
2647 atomic_read(&sbi->s_mb_discarded)); 2637 atomic_read(&sbi->s_mb_discarded));
2648 } 2638 }
2649 2639
2650 kfree(sbi->s_locality_groups); 2640 free_percpu(sbi->s_locality_groups);
2651
2652 ext4_mb_history_release(sb); 2641 ext4_mb_history_release(sb);
2653 ext4_mb_destroy_per_dev_proc(sb); 2642 ext4_mb_destroy_per_dev_proc(sb);
2654 2643
@@ -2721,118 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2721#define EXT4_MB_STREAM_REQ "stream_req" 2710#define EXT4_MB_STREAM_REQ "stream_req"
2722#define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2711#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2723 2712
2724
2725
2726#define MB_PROC_FOPS(name) \
2727static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2728{ \
2729 struct ext4_sb_info *sbi = m->private; \
2730 \
2731 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2732 return 0; \
2733} \
2734 \
2735static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2736{ \
2737 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2738} \
2739 \
2740static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2741 const char __user *buf, size_t cnt, loff_t *ppos) \
2742{ \
2743 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2744 char str[32]; \
2745 long value; \
2746 if (cnt >= sizeof(str)) \
2747 return -EINVAL; \
2748 if (copy_from_user(str, buf, cnt)) \
2749 return -EFAULT; \
2750 value = simple_strtol(str, NULL, 0); \
2751 if (value <= 0) \
2752 return -ERANGE; \
2753 sbi->s_mb_##name = value; \
2754 return cnt; \
2755} \
2756 \
2757static const struct file_operations ext4_mb_##name##_proc_fops = { \
2758 .owner = THIS_MODULE, \
2759 .open = ext4_mb_##name##_proc_open, \
2760 .read = seq_read, \
2761 .llseek = seq_lseek, \
2762 .release = single_release, \
2763 .write = ext4_mb_##name##_proc_write, \
2764};
2765
2766MB_PROC_FOPS(stats);
2767MB_PROC_FOPS(max_to_scan);
2768MB_PROC_FOPS(min_to_scan);
2769MB_PROC_FOPS(order2_reqs);
2770MB_PROC_FOPS(stream_request);
2771MB_PROC_FOPS(group_prealloc);
2772
2773#define MB_PROC_HANDLER(name, var) \
2774do { \
2775 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2776 &ext4_mb_##var##_proc_fops, sbi); \
2777 if (proc == NULL) { \
2778 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2779 goto err_out; \
2780 } \
2781} while (0)
2782
2783static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2713static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2784{ 2714{
2785 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2715 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2786 struct ext4_sb_info *sbi = EXT4_SB(sb); 2716 struct ext4_sb_info *sbi = EXT4_SB(sb);
2787 struct proc_dir_entry *proc; 2717 struct proc_dir_entry *proc;
2788 char devname[64];
2789 2718
2790 if (proc_root_ext4 == NULL) { 2719 if (sbi->s_proc == NULL)
2791 sbi->s_mb_proc = NULL;
2792 return -EINVAL; 2720 return -EINVAL;
2793 }
2794 bdevname(sb->s_bdev, devname);
2795 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2796
2797 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2798 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2799 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2800 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2801 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2802 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2803 2721
2722 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2723 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2724 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2725 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2726 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2727 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2804 return 0; 2728 return 0;
2805 2729
2806err_out: 2730err_out:
2807 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2731 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2808 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2732 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2809 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2733 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2810 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2734 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2811 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2735 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2812 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2736 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2813 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2814 remove_proc_entry(devname, proc_root_ext4);
2815 sbi->s_mb_proc = NULL;
2816
2817 return -ENOMEM; 2737 return -ENOMEM;
2818} 2738}
2819 2739
2820static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2740static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2821{ 2741{
2822 struct ext4_sb_info *sbi = EXT4_SB(sb); 2742 struct ext4_sb_info *sbi = EXT4_SB(sb);
2823 char devname[64];
2824 2743
2825 if (sbi->s_mb_proc == NULL) 2744 if (sbi->s_proc == NULL)
2826 return -EINVAL; 2745 return -EINVAL;
2827 2746
2828 bdevname(sb->s_bdev, devname); 2747 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2829 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2748 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2830 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2749 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2831 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2750 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2832 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2751 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2833 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2752 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2834 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2835 remove_proc_entry(devname, proc_root_ext4);
2836 2753
2837 return 0; 2754 return 0;
2838} 2755}
@@ -2854,11 +2771,6 @@ int __init init_ext4_mballoc(void)
2854 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2855 return -ENOMEM; 2772 return -ENOMEM;
2856 } 2773 }
2857#ifdef CONFIG_PROC_FS
2858 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
2859 if (proc_root_ext4 == NULL)
2860 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
2861#endif
2862 return 0; 2774 return 0;
2863} 2775}
2864 2776
@@ -2867,9 +2779,6 @@ void exit_ext4_mballoc(void)
2867 /* XXX: synchronize_rcu(); */ 2779 /* XXX: synchronize_rcu(); */
2868 kmem_cache_destroy(ext4_pspace_cachep); 2780 kmem_cache_destroy(ext4_pspace_cachep);
2869 kmem_cache_destroy(ext4_ac_cachep); 2781 kmem_cache_destroy(ext4_ac_cachep);
2870#ifdef CONFIG_PROC_FS
2871 remove_proc_entry("fs/ext4", NULL);
2872#endif
2873} 2782}
2874 2783
2875 2784
@@ -2879,7 +2788,7 @@ void exit_ext4_mballoc(void)
2879 */ 2788 */
2880static noinline_for_stack int 2789static noinline_for_stack int
2881ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2790ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2882 handle_t *handle) 2791 handle_t *handle, unsigned long reserv_blks)
2883{ 2792{
2884 struct buffer_head *bitmap_bh = NULL; 2793 struct buffer_head *bitmap_bh = NULL;
2885 struct ext4_super_block *es; 2794 struct ext4_super_block *es;
@@ -2968,15 +2877,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2968 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2877 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2969 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2878 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2970 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2879 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2971 2880 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2972 /* 2881 /*
2973 * free blocks account has already be reduced/reserved 2882 * Now reduce the dirty block count also. Should not go negative
2974 * at write_begin() time for delayed allocation
2975 * do not double accounting
2976 */ 2883 */
2977 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2884 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2978 percpu_counter_sub(&sbi->s_freeblocks_counter, 2885 /* release all the reserved blocks if non delalloc */
2979 ac->ac_b_ex.fe_len); 2886 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2887 else
2888 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2889 ac->ac_b_ex.fe_len);
2980 2890
2981 if (sbi->s_log_groups_per_flex) { 2891 if (sbi->s_log_groups_per_flex) {
2982 ext4_group_t flex_group = ext4_flex_group(sbi, 2892 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3884,7 +3794,7 @@ out:
3884 * 3794 *
3885 * FIXME!! Make sure it is valid at all the call sites 3795 * FIXME!! Make sure it is valid at all the call sites
3886 */ 3796 */
3887void ext4_mb_discard_inode_preallocations(struct inode *inode) 3797void ext4_discard_preallocations(struct inode *inode)
3888{ 3798{
3889 struct ext4_inode_info *ei = EXT4_I(inode); 3799 struct ext4_inode_info *ei = EXT4_I(inode);
3890 struct super_block *sb = inode->i_sb; 3800 struct super_block *sb = inode->i_sb;
@@ -3896,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
3896 struct ext4_buddy e4b; 3806 struct ext4_buddy e4b;
3897 int err; 3807 int err;
3898 3808
3899 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3809 if (!S_ISREG(inode->i_mode)) {
3900 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3810 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3901 return; 3811 return;
3902 } 3812 }
@@ -4094,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4094 * per cpu locality group is to reduce the contention between block 4004 * per cpu locality group is to reduce the contention between block
4095 * request from multiple CPUs. 4005 * request from multiple CPUs.
4096 */ 4006 */
4097 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4007 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
4098 put_cpu();
4099 4008
4100 /* we're going to use group allocation */ 4009 /* we're going to use group allocation */
4101 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4010 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4369,33 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4369ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4278ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4370 struct ext4_allocation_request *ar, int *errp) 4279 struct ext4_allocation_request *ar, int *errp)
4371{ 4280{
4281 int freed;
4372 struct ext4_allocation_context *ac = NULL; 4282 struct ext4_allocation_context *ac = NULL;
4373 struct ext4_sb_info *sbi; 4283 struct ext4_sb_info *sbi;
4374 struct super_block *sb; 4284 struct super_block *sb;
4375 ext4_fsblk_t block = 0; 4285 ext4_fsblk_t block = 0;
4376 int freed; 4286 unsigned long inquota;
4377 int inquota; 4287 unsigned long reserv_blks = 0;
4378 4288
4379 sb = ar->inode->i_sb; 4289 sb = ar->inode->i_sb;
4380 sbi = EXT4_SB(sb); 4290 sbi = EXT4_SB(sb);
4381 4291
4382 if (!test_opt(sb, MBALLOC)) {
4383 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4384 &(ar->len), errp);
4385 return block;
4386 }
4387 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4292 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4388 /* 4293 /*
4389 * With delalloc we already reserved the blocks 4294 * With delalloc we already reserved the blocks
4390 */ 4295 */
4391 ar->len = ext4_has_free_blocks(sbi, ar->len); 4296 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4392 } 4297 /* let others to free the space */
4393 4298 yield();
4394 if (ar->len == 0) { 4299 ar->len = ar->len >> 1;
4395 *errp = -ENOSPC; 4300 }
4396 return 0; 4301 if (!ar->len) {
4302 *errp = -ENOSPC;
4303 return 0;
4304 }
4305 reserv_blks = ar->len;
4397 } 4306 }
4398
4399 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4307 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4400 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4308 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4401 ar->len--; 4309 ar->len--;
@@ -4441,7 +4349,7 @@ repeat:
4441 } 4349 }
4442 4350
4443 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4351 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4444 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4352 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4445 if (*errp == -EAGAIN) { 4353 if (*errp == -EAGAIN) {
4446 ac->ac_b_ex.fe_group = 0; 4354 ac->ac_b_ex.fe_group = 0;
4447 ac->ac_b_ex.fe_start = 0; 4355 ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a75..b3b4828f8b89 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
257 257
258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
259 259
260static struct proc_dir_entry *proc_root_ext4;
261struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 260struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
262 261
263static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 262static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46fc0b5b12ba..f2a9cf498ecd 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -447,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
447 447
448} 448}
449 449
450int ext4_ext_migrate(struct inode *inode, struct file *filp, 450int ext4_ext_migrate(struct inode *inode)
451 unsigned int cmd, unsigned long arg)
452{ 451{
453 handle_t *handle; 452 handle_t *handle;
454 int retval = 0, i; 453 int retval = 0, i;
@@ -516,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
516 * when we add extents we extent the journal 515 * when we add extents we extent the journal
517 */ 516 */
518 /* 517 /*
519 * inode_mutex prevent write and truncate on the file. Read still goes
520 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
521 * switch the inode format to prevent read.
522 */
523 mutex_lock(&(inode->i_mutex));
524 /*
525 * Even though we take i_mutex we can still cause block allocation 518 * Even though we take i_mutex we can still cause block allocation
526 * via mmap write to holes. If we have allocated new blocks we fail 519 * via mmap write to holes. If we have allocated new blocks we fail
527 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 520 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -623,7 +616,6 @@ err_out:
623 tmp_inode->i_nlink = 0; 616 tmp_inode->i_nlink = 0;
624 617
625 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
626 mutex_unlock(&(inode->i_mutex));
627 619
628 if (tmp_inode) 620 if (tmp_inode)
629 iput(tmp_inode); 621 iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c3..92db9e945147 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
151 151
152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
154static inline unsigned dx_get_hash (struct dx_entry *entry); 154static inline unsigned dx_get_hash(struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value); 155static void dx_set_hash(struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries); 156static unsigned dx_get_count(struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries); 157static unsigned dx_get_limit(struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value); 158static void dx_set_count(struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value); 159static void dx_set_limit(struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize); 160static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir); 161static unsigned dx_node_limit(struct inode *dir);
162static struct dx_frame *dx_probe(struct dentry *dentry, 162static struct dx_frame *dx_probe(const struct qstr *d_name,
163 struct inode *dir, 163 struct inode *dir,
164 struct dx_hash_info *hinfo, 164 struct dx_hash_info *hinfo,
165 struct dx_frame *frame, 165 struct dx_frame *frame,
166 int *err); 166 int *err);
167static void dx_release (struct dx_frame *frames); 167static void dx_release(struct dx_frame *frames);
168static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 168static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
169 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 169 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count); 170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 171static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
172 struct dx_map_entry *offsets, int count); 172 struct dx_map_entry *offsets, int count);
173static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 173static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
174static void dx_insert_block(struct dx_frame *frame, 174static void dx_insert_block(struct dx_frame *frame,
175 u32 hash, ext4_lblk_t block); 175 u32 hash, ext4_lblk_t block);
176static int ext4_htree_next_block(struct inode *dir, __u32 hash, 176static int ext4_htree_next_block(struct inode *dir, __u32 hash,
177 struct dx_frame *frame, 177 struct dx_frame *frame,
178 struct dx_frame *frames, 178 struct dx_frame *frames,
179 __u32 *start_hash); 179 __u32 *start_hash);
180static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 180static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
181 struct ext4_dir_entry_2 **res_dir, int *err); 181 const struct qstr *d_name,
182 struct ext4_dir_entry_2 **res_dir,
183 int *err);
182static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 184static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 185 struct inode *inode);
184 186
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
207 entry->block = cpu_to_le32(value); 209 entry->block = cpu_to_le32(value);
208} 210}
209 211
210static inline unsigned dx_get_hash (struct dx_entry *entry) 212static inline unsigned dx_get_hash(struct dx_entry *entry)
211{ 213{
212 return le32_to_cpu(entry->hash); 214 return le32_to_cpu(entry->hash);
213} 215}
214 216
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value) 217static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
216{ 218{
217 entry->hash = cpu_to_le32(value); 219 entry->hash = cpu_to_le32(value);
218} 220}
219 221
220static inline unsigned dx_get_count (struct dx_entry *entries) 222static inline unsigned dx_get_count(struct dx_entry *entries)
221{ 223{
222 return le16_to_cpu(((struct dx_countlimit *) entries)->count); 224 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223} 225}
224 226
225static inline unsigned dx_get_limit (struct dx_entry *entries) 227static inline unsigned dx_get_limit(struct dx_entry *entries)
226{ 228{
227 return le16_to_cpu(((struct dx_countlimit *) entries)->limit); 229 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228} 230}
229 231
230static inline void dx_set_count (struct dx_entry *entries, unsigned value) 232static inline void dx_set_count(struct dx_entry *entries, unsigned value)
231{ 233{
232 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); 234 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233} 235}
234 236
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value) 237static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
236{ 238{
237 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); 239 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238} 240}
239 241
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 242static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
241{ 243{
242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 244 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
243 EXT4_DIR_REC_LEN(2) - infosize; 245 EXT4_DIR_REC_LEN(2) - infosize;
244 return entry_space / sizeof(struct dx_entry); 246 return entry_space / sizeof(struct dx_entry);
245} 247}
246 248
247static inline unsigned dx_node_limit (struct inode *dir) 249static inline unsigned dx_node_limit(struct inode *dir)
248{ 250{
249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 251 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
250 return entry_space / sizeof(struct dx_entry); 252 return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
254 * Debug 256 * Debug
255 */ 257 */
256#ifdef DX_DEBUG 258#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries) 259static void dx_show_index(char * label, struct dx_entry *entries)
258{ 260{
259 int i, n = dx_get_count (entries); 261 int i, n = dx_get_count (entries);
260 printk("%s index ", label); 262 printk(KERN_DEBUG "%s index ", label);
261 for (i = 0; i < n; i++) { 263 for (i = 0; i < n; i++) {
262 printk("%x->%lu ", i? dx_get_hash(entries + i) : 264 printk("%x->%lu ", i ? dx_get_hash(entries + i) :
263 0, (unsigned long)dx_get_block(entries + i)); 265 0, (unsigned long)dx_get_block(entries + i));
264 } 266 }
265 printk("\n"); 267 printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306 struct dx_entry *entries, int levels) 308 struct dx_entry *entries, int levels)
307{ 309{
308 unsigned blocksize = dir->i_sb->s_blocksize; 310 unsigned blocksize = dir->i_sb->s_blocksize;
309 unsigned count = dx_get_count (entries), names = 0, space = 0, i; 311 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
310 unsigned bcount = 0; 312 unsigned bcount = 0;
311 struct buffer_head *bh; 313 struct buffer_head *bh;
312 int err; 314 int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
325 names += stats.names; 327 names += stats.names;
326 space += stats.space; 328 space += stats.space;
327 bcount += stats.bcount; 329 bcount += stats.bcount;
328 brelse (bh); 330 brelse(bh);
329 } 331 }
330 if (bcount) 332 if (bcount)
331 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", 333 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
332 names, space/bcount,(space/bcount)*100/blocksize); 334 levels ? "" : " ", names, space/bcount,
335 (space/bcount)*100/blocksize);
333 return (struct stats) { names, space, bcount}; 336 return (struct stats) { names, space, bcount};
334} 337}
335#endif /* DX_DEBUG */ 338#endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
344 * back to userspace. 347 * back to userspace.
345 */ 348 */
346static struct dx_frame * 349static struct dx_frame *
347dx_probe(struct dentry *dentry, struct inode *dir, 350dx_probe(const struct qstr *d_name, struct inode *dir,
348 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 351 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
349{ 352{
350 unsigned count, indirect; 353 unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
355 u32 hash; 358 u32 hash;
356 359
357 frame->bh = NULL; 360 frame->bh = NULL;
358 if (dentry)
359 dir = dentry->d_parent->d_inode;
360 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 361 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
361 goto fail; 362 goto fail;
362 root = (struct dx_root *) bh->b_data; 363 root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
372 } 373 }
373 hinfo->hash_version = root->info.hash_version; 374 hinfo->hash_version = root->info.hash_version;
374 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
375 if (dentry) 376 if (d_name)
376 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); 377 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
377 hash = hinfo->hash; 378 hash = hinfo->hash;
378 379
379 if (root->info.unused_flags & 1) { 380 if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
406 goto fail; 407 goto fail;
407 } 408 }
408 409
409 dxtrace (printk("Look up %x", hash)); 410 dxtrace(printk("Look up %x", hash));
410 while (1) 411 while (1)
411 { 412 {
412 count = dx_get_count(entries); 413 count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
555 0, &err))) 556 0, &err)))
556 return err; /* Failure */ 557 return err; /* Failure */
557 p++; 558 p++;
558 brelse (p->bh); 559 brelse(p->bh);
559 p->bh = bh; 560 p->bh = bh;
560 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; 561 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
561 } 562 }
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
593 /* On error, skip the f_pos to the next block. */ 594 /* On error, skip the f_pos to the next block. */
594 dir_file->f_pos = (dir_file->f_pos | 595 dir_file->f_pos = (dir_file->f_pos |
595 (dir->i_sb->s_blocksize - 1)) + 1; 596 (dir->i_sb->s_blocksize - 1)) + 1;
596 brelse (bh); 597 brelse(bh);
597 return count; 598 return count;
598 } 599 }
599 ext4fs_dirhash(de->name, de->name_len, hinfo); 600 ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
635 int ret, err; 636 int ret, err;
636 __u32 hashval; 637 __u32 hashval;
637 638
638 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 639 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
639 start_minor_hash)); 640 start_hash, start_minor_hash));
640 dir = dir_file->f_path.dentry->d_inode; 641 dir = dir_file->f_path.dentry->d_inode;
641 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
642 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
648 } 649 }
649 hinfo.hash = start_hash; 650 hinfo.hash = start_hash;
650 hinfo.minor_hash = 0; 651 hinfo.minor_hash = 0;
651 frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); 652 frame = dx_probe(NULL, dir, &hinfo, frames, &err);
652 if (!frame) 653 if (!frame)
653 return err; 654 return err;
654 655
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
694 break; 695 break;
695 } 696 }
696 dx_release(frames); 697 dx_release(frames);
697 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 698 dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
698 count, *next_hash)); 699 "next hash: %x\n", count, *next_hash));
699 return count; 700 return count;
700errout: 701errout:
701 dx_release(frames); 702 dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
802/* 803/*
803 * Returns 0 if not found, -1 on failure, and 1 on success 804 * Returns 0 if not found, -1 on failure, and 1 on success
804 */ 805 */
805static inline int search_dirblock(struct buffer_head * bh, 806static inline int search_dirblock(struct buffer_head *bh,
806 struct inode *dir, 807 struct inode *dir,
807 struct dentry *dentry, 808 const struct qstr *d_name,
808 unsigned long offset, 809 unsigned long offset,
809 struct ext4_dir_entry_2 ** res_dir) 810 struct ext4_dir_entry_2 ** res_dir)
810{ 811{
811 struct ext4_dir_entry_2 * de; 812 struct ext4_dir_entry_2 * de;
812 char * dlimit; 813 char * dlimit;
813 int de_len; 814 int de_len;
814 const char *name = dentry->d_name.name; 815 const char *name = d_name->name;
815 int namelen = dentry->d_name.len; 816 int namelen = d_name->len;
816 817
817 de = (struct ext4_dir_entry_2 *) bh->b_data; 818 de = (struct ext4_dir_entry_2 *) bh->b_data;
818 dlimit = bh->b_data + dir->i_sb->s_blocksize; 819 dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
851 * The returned buffer_head has ->b_count elevated. The caller is expected 852 * The returned buffer_head has ->b_count elevated. The caller is expected
852 * to brelse() it when appropriate. 853 * to brelse() it when appropriate.
853 */ 854 */
854static struct buffer_head * ext4_find_entry (struct dentry *dentry, 855static struct buffer_head * ext4_find_entry (struct inode *dir,
856 const struct qstr *d_name,
855 struct ext4_dir_entry_2 ** res_dir) 857 struct ext4_dir_entry_2 ** res_dir)
856{ 858{
857 struct super_block * sb; 859 struct super_block *sb;
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 860 struct buffer_head *bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 861 struct buffer_head *bh, *ret = NULL;
860 ext4_lblk_t start, block, b; 862 ext4_lblk_t start, block, b;
861 int ra_max = 0; /* Number of bh's in the readahead 863 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 864 buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
865 int num = 0; 867 int num = 0;
866 ext4_lblk_t nblocks; 868 ext4_lblk_t nblocks;
867 int i, err; 869 int i, err;
868 struct inode *dir = dentry->d_parent->d_inode;
869 int namelen; 870 int namelen;
870 871
871 *res_dir = NULL; 872 *res_dir = NULL;
872 sb = dir->i_sb; 873 sb = dir->i_sb;
873 namelen = dentry->d_name.len; 874 namelen = d_name->len;
874 if (namelen > EXT4_NAME_LEN) 875 if (namelen > EXT4_NAME_LEN)
875 return NULL; 876 return NULL;
876 if (is_dx(dir)) { 877 if (is_dx(dir)) {
877 bh = ext4_dx_find_entry(dentry, res_dir, &err); 878 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
878 /* 879 /*
879 * On success, or if the error was file not found, 880 * On success, or if the error was file not found,
880 * return. Otherwise, fall back to doing a search the 881 * return. Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
882 */ 883 */
883 if (bh || (err != ERR_BAD_DX_DIR)) 884 if (bh || (err != ERR_BAD_DX_DIR))
884 return bh; 885 return bh;
885 dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); 886 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
887 "falling back\n"));
886 } 888 }
887 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 889 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
888 start = EXT4_I(dir)->i_dir_start_lookup; 890 start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
926 brelse(bh); 928 brelse(bh);
927 goto next; 929 goto next;
928 } 930 }
929 i = search_dirblock(bh, dir, dentry, 931 i = search_dirblock(bh, dir, d_name,
930 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 932 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
931 if (i == 1) { 933 if (i == 1) {
932 EXT4_I(dir)->i_dir_start_lookup = block; 934 EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
956cleanup_and_exit: 958cleanup_and_exit:
957 /* Clean up the read-ahead blocks */ 959 /* Clean up the read-ahead blocks */
958 for (; ra_ptr < ra_max; ra_ptr++) 960 for (; ra_ptr < ra_max; ra_ptr++)
959 brelse (bh_use[ra_ptr]); 961 brelse(bh_use[ra_ptr]);
960 return ret; 962 return ret;
961} 963}
962 964
963static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 965static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
964 struct ext4_dir_entry_2 **res_dir, int *err) 966 struct ext4_dir_entry_2 **res_dir, int *err)
965{ 967{
966 struct super_block * sb; 968 struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
971 struct buffer_head *bh; 973 struct buffer_head *bh;
972 ext4_lblk_t block; 974 ext4_lblk_t block;
973 int retval; 975 int retval;
974 int namelen = dentry->d_name.len; 976 int namelen = d_name->len;
975 const u8 *name = dentry->d_name.name; 977 const u8 *name = d_name->name;
976 struct inode *dir = dentry->d_parent->d_inode;
977 978
978 sb = dir->i_sb; 979 sb = dir->i_sb;
979 /* NFS may look up ".." - look at dx_root directory block */ 980 /* NFS may look up ".." - look at dx_root directory block */
980 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ 981 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
981 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
982 return NULL; 983 return NULL;
983 } else { 984 } else {
984 frame = frames; 985 frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1010 return bh; 1011 return bh;
1011 } 1012 }
1012 } 1013 }
1013 brelse (bh); 1014 brelse(bh);
1014 /* Check to see if we should continue to search */ 1015 /* Check to see if we should continue to search */
1015 retval = ext4_htree_next_block(dir, hash, frame, 1016 retval = ext4_htree_next_block(dir, hash, frame,
1016 frames, NULL); 1017 frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1025 1026
1026 *err = -ENOENT; 1027 *err = -ENOENT;
1027errout: 1028errout:
1028 dxtrace(printk("%s not found\n", name)); 1029 dxtrace(printk(KERN_DEBUG "%s not found\n", name));
1029 dx_release (frames); 1030 dx_release (frames);
1030 return NULL; 1031 return NULL;
1031} 1032}
1032 1033
1033static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) 1034static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034{ 1035{
1035 struct inode * inode; 1036 struct inode *inode;
1036 struct ext4_dir_entry_2 * de; 1037 struct ext4_dir_entry_2 *de;
1037 struct buffer_head * bh; 1038 struct buffer_head *bh;
1038 1039
1039 if (dentry->d_name.len > EXT4_NAME_LEN) 1040 if (dentry->d_name.len > EXT4_NAME_LEN)
1040 return ERR_PTR(-ENAMETOOLONG); 1041 return ERR_PTR(-ENAMETOOLONG);
1041 1042
1042 bh = ext4_find_entry(dentry, &de); 1043 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1043 inode = NULL; 1044 inode = NULL;
1044 if (bh) { 1045 if (bh) {
1045 unsigned long ino = le32_to_cpu(de->inode); 1046 unsigned long ino = le32_to_cpu(de->inode);
1046 brelse (bh); 1047 brelse(bh);
1047 if (!ext4_valid_inum(dir->i_sb, ino)) { 1048 if (!ext4_valid_inum(dir->i_sb, ino)) {
1048 ext4_error(dir->i_sb, "ext4_lookup", 1049 ext4_error(dir->i_sb, "ext4_lookup",
1049 "bad inode number: %lu", ino); 1050 "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1062 unsigned long ino; 1063 unsigned long ino;
1063 struct dentry *parent; 1064 struct dentry *parent;
1064 struct inode *inode; 1065 struct inode *inode;
1065 struct dentry dotdot; 1066 static const struct qstr dotdot = {
1067 .name = "..",
1068 .len = 2,
1069 };
1066 struct ext4_dir_entry_2 * de; 1070 struct ext4_dir_entry_2 * de;
1067 struct buffer_head *bh; 1071 struct buffer_head *bh;
1068 1072
1069 dotdot.d_name.name = ".."; 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1070 dotdot.d_name.len = 2;
1071 dotdot.d_parent = child; /* confusing, isn't it! */
1072
1073 bh = ext4_find_entry(&dotdot, &de);
1074 inode = NULL; 1074 inode = NULL;
1075 if (!bh) 1075 if (!bh)
1076 return ERR_PTR(-ENOENT); 1076 return ERR_PTR(-ENOENT);
@@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1201 1201
1202 /* create map in the end of data2 block */ 1202 /* create map in the end of data2 block */
1203 map = (struct dx_map_entry *) (data2 + blocksize); 1203 map = (struct dx_map_entry *) (data2 + blocksize);
1204 count = dx_make_map ((struct ext4_dir_entry_2 *) data1, 1204 count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1205 blocksize, hinfo, map); 1205 blocksize, hinfo, map);
1206 map -= count; 1206 map -= count;
1207 dx_sort_map (map, count); 1207 dx_sort_map(map, count);
1208 /* Split the existing block in the middle, size-wise */ 1208 /* Split the existing block in the middle, size-wise */
1209 size = 0; 1209 size = 0;
1210 move = 0; 1210 move = 0;
@@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1225 1225
1226 /* Fancy dance to stay within two buffers */ 1226 /* Fancy dance to stay within two buffers */
1227 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1227 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1228 de = dx_pack_dirents(data1,blocksize); 1228 de = dx_pack_dirents(data1, blocksize);
1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1237 swap(*bh, bh2); 1237 swap(*bh, bh2);
1238 de = de2; 1238 de = de2;
1239 } 1239 }
1240 dx_insert_block (frame, hash2 + continued, newblock); 1240 dx_insert_block(frame, hash2 + continued, newblock);
1241 err = ext4_journal_dirty_metadata (handle, bh2); 1241 err = ext4_journal_dirty_metadata(handle, bh2);
1242 if (err) 1242 if (err)
1243 goto journal_error; 1243 goto journal_error;
1244 err = ext4_journal_dirty_metadata (handle, frame->bh); 1244 err = ext4_journal_dirty_metadata(handle, frame->bh);
1245 if (err) 1245 if (err)
1246 goto journal_error; 1246 goto journal_error;
1247 brelse (bh2); 1247 brelse(bh2);
1248 dxtrace(dx_show_index ("frame", frame->entries)); 1248 dxtrace(dx_show_index("frame", frame->entries));
1249 return de; 1249 return de;
1250 1250
1251journal_error: 1251journal_error:
@@ -1271,7 +1271,7 @@ errout:
1271 */ 1271 */
1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1273 struct inode *inode, struct ext4_dir_entry_2 *de, 1273 struct inode *inode, struct ext4_dir_entry_2 *de,
1274 struct buffer_head * bh) 1274 struct buffer_head *bh)
1275{ 1275{
1276 struct inode *dir = dentry->d_parent->d_inode; 1276 struct inode *dir = dentry->d_parent->d_inode;
1277 const char *name = dentry->d_name.name; 1277 const char *name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1288 while ((char *) de <= top) { 1288 while ((char *) de <= top) {
1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1290 bh, offset)) { 1290 bh, offset)) {
1291 brelse (bh); 1291 brelse(bh);
1292 return -EIO; 1292 return -EIO;
1293 } 1293 }
1294 if (ext4_match (namelen, name, de)) { 1294 if (ext4_match(namelen, name, de)) {
1295 brelse (bh); 1295 brelse(bh);
1296 return -EEXIST; 1296 return -EEXIST;
1297 } 1297 }
1298 nlen = EXT4_DIR_REC_LEN(de->name_len); 1298 nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1329 } else 1329 } else
1330 de->inode = 0; 1330 de->inode = 0;
1331 de->name_len = namelen; 1331 de->name_len = namelen;
1332 memcpy (de->name, name, namelen); 1332 memcpy(de->name, name, namelen);
1333 /* 1333 /*
1334 * XXX shouldn't update any times until successful 1334 * XXX shouldn't update any times until successful
1335 * completion of syscall, but too many callers depend 1335 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 struct fake_dirent *fde; 1377 struct fake_dirent *fde;
1378 1378
1379 blocksize = dir->i_sb->s_blocksize; 1379 blocksize = dir->i_sb->s_blocksize;
1380 dxtrace(printk("Creating index\n")); 1380 dxtrace(printk(KERN_DEBUG "Creating index\n"));
1381 retval = ext4_journal_get_write_access(handle, bh); 1381 retval = ext4_journal_get_write_access(handle, bh);
1382 if (retval) { 1382 if (retval) {
1383 ext4_std_error(dir->i_sb, retval); 1383 ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1386 } 1386 }
1387 root = (struct dx_root *) bh->b_data; 1387 root = (struct dx_root *) bh->b_data;
1388 1388
1389 bh2 = ext4_append (handle, dir, &block, &retval); 1389 bh2 = ext4_append(handle, dir, &block, &retval);
1390 if (!(bh2)) { 1390 if (!(bh2)) {
1391 brelse(bh); 1391 brelse(bh);
1392 return retval; 1392 return retval;
@@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1412 root->info.info_length = sizeof(root->info); 1412 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1414 entries = root->entries; 1414 entries = root->entries;
1415 dx_set_block (entries, 1); 1415 dx_set_block(entries, 1);
1416 dx_set_count (entries, 1); 1416 dx_set_count(entries, 1);
1417 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); 1417 dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1418 1418
1419 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1420 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1443 * may not sleep between calling this and putting something into 1443 * may not sleep between calling this and putting something into
1444 * the entry, as someone else might have used it while you slept. 1444 * the entry, as someone else might have used it while you slept.
1445 */ 1445 */
1446static int ext4_add_entry (handle_t *handle, struct dentry *dentry, 1446static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset; 1450 unsigned long offset;
1451 struct buffer_head * bh; 1451 struct buffer_head *bh;
1452 struct ext4_dir_entry_2 *de; 1452 struct ext4_dir_entry_2 *de;
1453 struct super_block * sb; 1453 struct super_block *sb;
1454 int retval; 1454 int retval;
1455 int dx_fallback=0; 1455 int dx_fallback=0;
1456 unsigned blocksize; 1456 unsigned blocksize;
@@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1500 struct dx_frame frames[2], *frame; 1500 struct dx_frame frames[2], *frame;
1501 struct dx_entry *entries, *at; 1501 struct dx_entry *entries, *at;
1502 struct dx_hash_info hinfo; 1502 struct dx_hash_info hinfo;
1503 struct buffer_head * bh; 1503 struct buffer_head *bh;
1504 struct inode *dir = dentry->d_parent->d_inode; 1504 struct inode *dir = dentry->d_parent->d_inode;
1505 struct super_block * sb = dir->i_sb; 1505 struct super_block *sb = dir->i_sb;
1506 struct ext4_dir_entry_2 *de; 1506 struct ext4_dir_entry_2 *de;
1507 int err; 1507 int err;
1508 1508
1509 frame = dx_probe(dentry, NULL, &hinfo, frames, &err); 1509 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1510 if (!frame) 1510 if (!frame)
1511 return err; 1511 return err;
1512 entries = frame->entries; 1512 entries = frame->entries;
@@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1527 } 1527 }
1528 1528
1529 /* Block full, should compress but for now just split */ 1529 /* Block full, should compress but for now just split */
1530 dxtrace(printk("using %u of %u node entries\n", 1530 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1531 dx_get_count(entries), dx_get_limit(entries))); 1531 dx_get_count(entries), dx_get_limit(entries)));
1532 /* Need to split index? */ 1532 /* Need to split index? */
1533 if (dx_get_count(entries) == dx_get_limit(entries)) { 1533 if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1559 if (levels) { 1559 if (levels) {
1560 unsigned icount1 = icount/2, icount2 = icount - icount1; 1560 unsigned icount1 = icount/2, icount2 = icount - icount1;
1561 unsigned hash2 = dx_get_hash(entries + icount1); 1561 unsigned hash2 = dx_get_hash(entries + icount1);
1562 dxtrace(printk("Split index %i/%i\n", icount1, icount2)); 1562 dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1563 icount1, icount2));
1563 1564
1564 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ 1565 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1565 err = ext4_journal_get_write_access(handle, 1566 err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1567 if (err) 1568 if (err)
1568 goto journal_error; 1569 goto journal_error;
1569 1570
1570 memcpy ((char *) entries2, (char *) (entries + icount1), 1571 memcpy((char *) entries2, (char *) (entries + icount1),
1571 icount2 * sizeof(struct dx_entry)); 1572 icount2 * sizeof(struct dx_entry));
1572 dx_set_count (entries, icount1); 1573 dx_set_count(entries, icount1);
1573 dx_set_count (entries2, icount2); 1574 dx_set_count(entries2, icount2);
1574 dx_set_limit (entries2, dx_node_limit(dir)); 1575 dx_set_limit(entries2, dx_node_limit(dir));
1575 1576
1576 /* Which index block gets the new entry? */ 1577 /* Which index block gets the new entry? */
1577 if (at - entries >= icount1) { 1578 if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1579 frame->entries = entries = entries2; 1580 frame->entries = entries = entries2;
1580 swap(frame->bh, bh2); 1581 swap(frame->bh, bh2);
1581 } 1582 }
1582 dx_insert_block (frames + 0, hash2, newblock); 1583 dx_insert_block(frames + 0, hash2, newblock);
1583 dxtrace(dx_show_index ("node", frames[1].entries)); 1584 dxtrace(dx_show_index("node", frames[1].entries));
1584 dxtrace(dx_show_index ("node", 1585 dxtrace(dx_show_index("node",
1585 ((struct dx_node *) bh2->b_data)->entries)); 1586 ((struct dx_node *) bh2->b_data)->entries));
1586 err = ext4_journal_dirty_metadata(handle, bh2); 1587 err = ext4_journal_dirty_metadata(handle, bh2);
1587 if (err) 1588 if (err)
1588 goto journal_error; 1589 goto journal_error;
1589 brelse (bh2); 1590 brelse (bh2);
1590 } else { 1591 } else {
1591 dxtrace(printk("Creating second level index...\n")); 1592 dxtrace(printk(KERN_DEBUG
1593 "Creating second level index...\n"));
1592 memcpy((char *) entries2, (char *) entries, 1594 memcpy((char *) entries2, (char *) entries,
1593 icount * sizeof(struct dx_entry)); 1595 icount * sizeof(struct dx_entry));
1594 dx_set_limit(entries2, dx_node_limit(dir)); 1596 dx_set_limit(entries2, dx_node_limit(dir));
@@ -1630,12 +1632,12 @@ cleanup:
1630 * ext4_delete_entry deletes a directory entry by merging it with the 1632 * ext4_delete_entry deletes a directory entry by merging it with the
1631 * previous entry 1633 * previous entry
1632 */ 1634 */
1633static int ext4_delete_entry (handle_t *handle, 1635static int ext4_delete_entry(handle_t *handle,
1634 struct inode * dir, 1636 struct inode *dir,
1635 struct ext4_dir_entry_2 * de_del, 1637 struct ext4_dir_entry_2 *de_del,
1636 struct buffer_head * bh) 1638 struct buffer_head *bh)
1637{ 1639{
1638 struct ext4_dir_entry_2 * de, * pde; 1640 struct ext4_dir_entry_2 *de, *pde;
1639 int i; 1641 int i;
1640 1642
1641 i = 0; 1643 i = 0;
@@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
1716 * If the create succeeds, we fill in the inode information 1718 * If the create succeeds, we fill in the inode information
1717 * with d_instantiate(). 1719 * with d_instantiate().
1718 */ 1720 */
1719static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, 1721static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1720 struct nameidata *nd) 1722 struct nameidata *nd)
1721{ 1723{
1722 handle_t *handle; 1724 handle_t *handle;
1723 struct inode * inode; 1725 struct inode *inode;
1724 int err, retries = 0; 1726 int err, retries = 0;
1725 1727
1726retry: 1728retry:
@@ -1747,8 +1749,8 @@ retry:
1747 return err; 1749 return err;
1748} 1750}
1749 1751
1750static int ext4_mknod (struct inode * dir, struct dentry *dentry, 1752static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1751 int mode, dev_t rdev) 1753 int mode, dev_t rdev)
1752{ 1754{
1753 handle_t *handle; 1755 handle_t *handle;
1754 struct inode *inode; 1756 struct inode *inode;
@@ -1767,11 +1769,11 @@ retry:
1767 if (IS_DIRSYNC(dir)) 1769 if (IS_DIRSYNC(dir))
1768 handle->h_sync = 1; 1770 handle->h_sync = 1;
1769 1771
1770 inode = ext4_new_inode (handle, dir, mode); 1772 inode = ext4_new_inode(handle, dir, mode);
1771 err = PTR_ERR(inode); 1773 err = PTR_ERR(inode);
1772 if (!IS_ERR(inode)) { 1774 if (!IS_ERR(inode)) {
1773 init_special_inode(inode, inode->i_mode, rdev); 1775 init_special_inode(inode, inode->i_mode, rdev);
1774#ifdef CONFIG_EXT4DEV_FS_XATTR 1776#ifdef CONFIG_EXT4_FS_XATTR
1775 inode->i_op = &ext4_special_inode_operations; 1777 inode->i_op = &ext4_special_inode_operations;
1776#endif 1778#endif
1777 err = ext4_add_nondir(handle, dentry, inode); 1779 err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@ retry:
1782 return err; 1784 return err;
1783} 1785}
1784 1786
1785static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) 1787static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1786{ 1788{
1787 handle_t *handle; 1789 handle_t *handle;
1788 struct inode * inode; 1790 struct inode *inode;
1789 struct buffer_head * dir_block; 1791 struct buffer_head *dir_block;
1790 struct ext4_dir_entry_2 * de; 1792 struct ext4_dir_entry_2 *de;
1791 int err, retries = 0; 1793 int err, retries = 0;
1792 1794
1793 if (EXT4_DIR_LINK_MAX(dir)) 1795 if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@ retry:
1803 if (IS_DIRSYNC(dir)) 1805 if (IS_DIRSYNC(dir))
1804 handle->h_sync = 1; 1806 handle->h_sync = 1;
1805 1807
1806 inode = ext4_new_inode (handle, dir, S_IFDIR | mode); 1808 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1807 err = PTR_ERR(inode); 1809 err = PTR_ERR(inode);
1808 if (IS_ERR(inode)) 1810 if (IS_ERR(inode))
1809 goto out_stop; 1811 goto out_stop;
@@ -1811,7 +1813,7 @@ retry:
1811 inode->i_op = &ext4_dir_inode_operations; 1813 inode->i_op = &ext4_dir_inode_operations;
1812 inode->i_fop = &ext4_dir_operations; 1814 inode->i_fop = &ext4_dir_operations;
1813 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1815 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1814 dir_block = ext4_bread (handle, inode, 0, 1, &err); 1816 dir_block = ext4_bread(handle, inode, 0, 1, &err);
1815 if (!dir_block) 1817 if (!dir_block)
1816 goto out_clear_inode; 1818 goto out_clear_inode;
1817 BUFFER_TRACE(dir_block, "get_write_access"); 1819 BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@ retry:
1820 de->inode = cpu_to_le32(inode->i_ino); 1822 de->inode = cpu_to_le32(inode->i_ino);
1821 de->name_len = 1; 1823 de->name_len = 1;
1822 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1824 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
1823 strcpy (de->name, "."); 1825 strcpy(de->name, ".");
1824 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1826 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1825 de = ext4_next_entry(de); 1827 de = ext4_next_entry(de);
1826 de->inode = cpu_to_le32(dir->i_ino); 1828 de->inode = cpu_to_le32(dir->i_ino);
1827 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1829 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
1828 EXT4_DIR_REC_LEN(1)); 1830 EXT4_DIR_REC_LEN(1));
1829 de->name_len = 2; 1831 de->name_len = 2;
1830 strcpy (de->name, ".."); 1832 strcpy(de->name, "..");
1831 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1833 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1832 inode->i_nlink = 2; 1834 inode->i_nlink = 2;
1833 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1835 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1834 ext4_journal_dirty_metadata(handle, dir_block); 1836 ext4_journal_dirty_metadata(handle, dir_block);
1835 brelse (dir_block); 1837 brelse(dir_block);
1836 ext4_mark_inode_dirty(handle, inode); 1838 ext4_mark_inode_dirty(handle, inode);
1837 err = ext4_add_entry (handle, dentry, inode); 1839 err = ext4_add_entry(handle, dentry, inode);
1838 if (err) { 1840 if (err) {
1839out_clear_inode: 1841out_clear_inode:
1840 clear_nlink(inode); 1842 clear_nlink(inode);
1841 ext4_mark_inode_dirty(handle, inode); 1843 ext4_mark_inode_dirty(handle, inode);
1842 iput (inode); 1844 iput(inode);
1843 goto out_stop; 1845 goto out_stop;
1844 } 1846 }
1845 ext4_inc_count(handle, dir); 1847 ext4_inc_count(handle, dir);
@@ -1856,17 +1858,17 @@ out_stop:
1856/* 1858/*
1857 * routine to check that the specified directory is empty (for rmdir) 1859 * routine to check that the specified directory is empty (for rmdir)
1858 */ 1860 */
1859static int empty_dir (struct inode * inode) 1861static int empty_dir(struct inode *inode)
1860{ 1862{
1861 unsigned long offset; 1863 unsigned long offset;
1862 struct buffer_head * bh; 1864 struct buffer_head *bh;
1863 struct ext4_dir_entry_2 * de, * de1; 1865 struct ext4_dir_entry_2 *de, *de1;
1864 struct super_block * sb; 1866 struct super_block *sb;
1865 int err = 0; 1867 int err = 0;
1866 1868
1867 sb = inode->i_sb; 1869 sb = inode->i_sb;
1868 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1870 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1869 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { 1871 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1870 if (err) 1872 if (err)
1871 ext4_error(inode->i_sb, __func__, 1873 ext4_error(inode->i_sb, __func__,
1872 "error %d reading directory #%lu offset 0", 1874 "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode)
1881 de1 = ext4_next_entry(de); 1883 de1 = ext4_next_entry(de);
1882 if (le32_to_cpu(de->inode) != inode->i_ino || 1884 if (le32_to_cpu(de->inode) != inode->i_ino ||
1883 !le32_to_cpu(de1->inode) || 1885 !le32_to_cpu(de1->inode) ||
1884 strcmp (".", de->name) || 1886 strcmp(".", de->name) ||
1885 strcmp ("..", de1->name)) { 1887 strcmp("..", de1->name)) {
1886 ext4_warning (inode->i_sb, "empty_dir", 1888 ext4_warning(inode->i_sb, "empty_dir",
1887 "bad directory (dir #%lu) - no `.' or `..'", 1889 "bad directory (dir #%lu) - no `.' or `..'",
1888 inode->i_ino); 1890 inode->i_ino);
1889 brelse (bh); 1891 brelse(bh);
1890 return 1; 1892 return 1;
1891 } 1893 }
1892 offset = ext4_rec_len_from_disk(de->rec_len) + 1894 offset = ext4_rec_len_from_disk(de->rec_len) +
1893 ext4_rec_len_from_disk(de1->rec_len); 1895 ext4_rec_len_from_disk(de1->rec_len);
1894 de = ext4_next_entry(de1); 1896 de = ext4_next_entry(de1);
1895 while (offset < inode->i_size ) { 1897 while (offset < inode->i_size) {
1896 if (!bh || 1898 if (!bh ||
1897 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1899 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1898 err = 0; 1900 err = 0;
1899 brelse (bh); 1901 brelse(bh);
1900 bh = ext4_bread (NULL, inode, 1902 bh = ext4_bread(NULL, inode,
1901 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1903 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1902 if (!bh) { 1904 if (!bh) {
1903 if (err) 1905 if (err)
@@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode)
1917 continue; 1919 continue;
1918 } 1920 }
1919 if (le32_to_cpu(de->inode)) { 1921 if (le32_to_cpu(de->inode)) {
1920 brelse (bh); 1922 brelse(bh);
1921 return 0; 1923 return 0;
1922 } 1924 }
1923 offset += ext4_rec_len_from_disk(de->rec_len); 1925 offset += ext4_rec_len_from_disk(de->rec_len);
1924 de = ext4_next_entry(de); 1926 de = ext4_next_entry(de);
1925 } 1927 }
1926 brelse (bh); 1928 brelse(bh);
1927 return 1; 1929 return 1;
1928} 1930}
1929 1931
@@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1954 * ->i_nlink. For, say it, character device. Not a regular file, 1956 * ->i_nlink. For, say it, character device. Not a regular file,
1955 * not a directory, not a symlink and ->i_nlink > 0. 1957 * not a directory, not a symlink and ->i_nlink > 0.
1956 */ 1958 */
1957 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1959 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1958 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1960 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1959 1961
1960 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); 1962 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1961 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 1963 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@ out_brelse:
2069 goto out_err; 2071 goto out_err;
2070} 2072}
2071 2073
2072static int ext4_rmdir (struct inode * dir, struct dentry *dentry) 2074static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2073{ 2075{
2074 int retval; 2076 int retval;
2075 struct inode * inode; 2077 struct inode *inode;
2076 struct buffer_head * bh; 2078 struct buffer_head *bh;
2077 struct ext4_dir_entry_2 * de; 2079 struct ext4_dir_entry_2 *de;
2078 handle_t *handle; 2080 handle_t *handle;
2079 2081
2080 /* Initialize quotas before so that eventual writes go in 2082 /* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2085 return PTR_ERR(handle); 2087 return PTR_ERR(handle);
2086 2088
2087 retval = -ENOENT; 2089 retval = -ENOENT;
2088 bh = ext4_find_entry (dentry, &de); 2090 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2089 if (!bh) 2091 if (!bh)
2090 goto end_rmdir; 2092 goto end_rmdir;
2091 2093
@@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2099 goto end_rmdir; 2101 goto end_rmdir;
2100 2102
2101 retval = -ENOTEMPTY; 2103 retval = -ENOTEMPTY;
2102 if (!empty_dir (inode)) 2104 if (!empty_dir(inode))
2103 goto end_rmdir; 2105 goto end_rmdir;
2104 2106
2105 retval = ext4_delete_entry(handle, dir, de, bh); 2107 retval = ext4_delete_entry(handle, dir, de, bh);
2106 if (retval) 2108 if (retval)
2107 goto end_rmdir; 2109 goto end_rmdir;
2108 if (!EXT4_DIR_LINK_EMPTY(inode)) 2110 if (!EXT4_DIR_LINK_EMPTY(inode))
2109 ext4_warning (inode->i_sb, "ext4_rmdir", 2111 ext4_warning(inode->i_sb, "ext4_rmdir",
2110 "empty directory has too many links (%d)", 2112 "empty directory has too many links (%d)",
2111 inode->i_nlink); 2113 inode->i_nlink);
2112 inode->i_version++; 2114 inode->i_version++;
2113 clear_nlink(inode); 2115 clear_nlink(inode);
2114 /* There's no need to set i_disksize: the fact that i_nlink is 2116 /* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2124 2126
2125end_rmdir: 2127end_rmdir:
2126 ext4_journal_stop(handle); 2128 ext4_journal_stop(handle);
2127 brelse (bh); 2129 brelse(bh);
2128 return retval; 2130 return retval;
2129} 2131}
2130 2132
2131static int ext4_unlink(struct inode * dir, struct dentry *dentry) 2133static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2132{ 2134{
2133 int retval; 2135 int retval;
2134 struct inode * inode; 2136 struct inode *inode;
2135 struct buffer_head * bh; 2137 struct buffer_head *bh;
2136 struct ext4_dir_entry_2 * de; 2138 struct ext4_dir_entry_2 *de;
2137 handle_t *handle; 2139 handle_t *handle;
2138 2140
2139 /* Initialize quotas before so that eventual writes go 2141 /* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2147 handle->h_sync = 1; 2149 handle->h_sync = 1;
2148 2150
2149 retval = -ENOENT; 2151 retval = -ENOENT;
2150 bh = ext4_find_entry (dentry, &de); 2152 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2151 if (!bh) 2153 if (!bh)
2152 goto end_unlink; 2154 goto end_unlink;
2153 2155
@@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2158 goto end_unlink; 2160 goto end_unlink;
2159 2161
2160 if (!inode->i_nlink) { 2162 if (!inode->i_nlink) {
2161 ext4_warning (inode->i_sb, "ext4_unlink", 2163 ext4_warning(inode->i_sb, "ext4_unlink",
2162 "Deleting nonexistent file (%lu), %d", 2164 "Deleting nonexistent file (%lu), %d",
2163 inode->i_ino, inode->i_nlink); 2165 inode->i_ino, inode->i_nlink);
2164 inode->i_nlink = 1; 2166 inode->i_nlink = 1;
2165 } 2167 }
2166 retval = ext4_delete_entry(handle, dir, de, bh); 2168 retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2178 2180
2179end_unlink: 2181end_unlink:
2180 ext4_journal_stop(handle); 2182 ext4_journal_stop(handle);
2181 brelse (bh); 2183 brelse(bh);
2182 return retval; 2184 return retval;
2183} 2185}
2184 2186
2185static int ext4_symlink (struct inode * dir, 2187static int ext4_symlink(struct inode *dir,
2186 struct dentry *dentry, const char * symname) 2188 struct dentry *dentry, const char *symname)
2187{ 2189{
2188 handle_t *handle; 2190 handle_t *handle;
2189 struct inode * inode; 2191 struct inode *inode;
2190 int l, err, retries = 0; 2192 int l, err, retries = 0;
2191 2193
2192 l = strlen(symname)+1; 2194 l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@ retry:
2203 if (IS_DIRSYNC(dir)) 2205 if (IS_DIRSYNC(dir))
2204 handle->h_sync = 1; 2206 handle->h_sync = 1;
2205 2207
2206 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2208 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2207 err = PTR_ERR(inode); 2209 err = PTR_ERR(inode);
2208 if (IS_ERR(inode)) 2210 if (IS_ERR(inode))
2209 goto out_stop; 2211 goto out_stop;
2210 2212
2211 if (l > sizeof (EXT4_I(inode)->i_data)) { 2213 if (l > sizeof(EXT4_I(inode)->i_data)) {
2212 inode->i_op = &ext4_symlink_inode_operations; 2214 inode->i_op = &ext4_symlink_inode_operations;
2213 ext4_set_aops(inode); 2215 ext4_set_aops(inode);
2214 /* 2216 /*
@@ -2221,14 +2223,14 @@ retry:
2221 if (err) { 2223 if (err) {
2222 clear_nlink(inode); 2224 clear_nlink(inode);
2223 ext4_mark_inode_dirty(handle, inode); 2225 ext4_mark_inode_dirty(handle, inode);
2224 iput (inode); 2226 iput(inode);
2225 goto out_stop; 2227 goto out_stop;
2226 } 2228 }
2227 } else { 2229 } else {
2228 /* clear the extent format for fast symlink */ 2230 /* clear the extent format for fast symlink */
2229 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2231 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
2230 inode->i_op = &ext4_fast_symlink_inode_operations; 2232 inode->i_op = &ext4_fast_symlink_inode_operations;
2231 memcpy((char*)&EXT4_I(inode)->i_data,symname,l); 2233 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2232 inode->i_size = l-1; 2234 inode->i_size = l-1;
2233 } 2235 }
2234 EXT4_I(inode)->i_disksize = inode->i_size; 2236 EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2240,8 +2242,8 @@ out_stop:
2240 return err; 2242 return err;
2241} 2243}
2242 2244
2243static int ext4_link (struct dentry * old_dentry, 2245static int ext4_link(struct dentry *old_dentry,
2244 struct inode * dir, struct dentry *dentry) 2246 struct inode *dir, struct dentry *dentry)
2245{ 2247{
2246 handle_t *handle; 2248 handle_t *handle;
2247 struct inode *inode = old_dentry->d_inode; 2249 struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@ retry:
2284 * Anybody can rename anything with this: the permission checks are left to the 2286 * Anybody can rename anything with this: the permission checks are left to the
2285 * higher-level routines. 2287 * higher-level routines.
2286 */ 2288 */
2287static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, 2289static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2288 struct inode * new_dir,struct dentry *new_dentry) 2290 struct inode *new_dir, struct dentry *new_dentry)
2289{ 2291{
2290 handle_t *handle; 2292 handle_t *handle;
2291 struct inode * old_inode, * new_inode; 2293 struct inode *old_inode, *new_inode;
2292 struct buffer_head * old_bh, * new_bh, * dir_bh; 2294 struct buffer_head *old_bh, *new_bh, *dir_bh;
2293 struct ext4_dir_entry_2 * old_de, * new_de; 2295 struct ext4_dir_entry_2 *old_de, *new_de;
2294 int retval; 2296 int retval;
2295 2297
2296 old_bh = new_bh = dir_bh = NULL; 2298 old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2308 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2310 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2309 handle->h_sync = 1; 2311 handle->h_sync = 1;
2310 2312
2311 old_bh = ext4_find_entry (old_dentry, &old_de); 2313 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2312 /* 2314 /*
2313 * Check for inode number is _not_ due to possible IO errors. 2315 * Check for inode number is _not_ due to possible IO errors.
2314 * We might rmdir the source, keep it as pwd of some process 2316 * We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2321 goto end_rename; 2323 goto end_rename;
2322 2324
2323 new_inode = new_dentry->d_inode; 2325 new_inode = new_dentry->d_inode;
2324 new_bh = ext4_find_entry (new_dentry, &new_de); 2326 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2325 if (new_bh) { 2327 if (new_bh) {
2326 if (!new_inode) { 2328 if (!new_inode) {
2327 brelse (new_bh); 2329 brelse(new_bh);
2328 new_bh = NULL; 2330 new_bh = NULL;
2329 } 2331 }
2330 } 2332 }
2331 if (S_ISDIR(old_inode->i_mode)) { 2333 if (S_ISDIR(old_inode->i_mode)) {
2332 if (new_inode) { 2334 if (new_inode) {
2333 retval = -ENOTEMPTY; 2335 retval = -ENOTEMPTY;
2334 if (!empty_dir (new_inode)) 2336 if (!empty_dir(new_inode))
2335 goto end_rename; 2337 goto end_rename;
2336 } 2338 }
2337 retval = -EIO; 2339 retval = -EIO;
2338 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); 2340 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2339 if (!dir_bh) 2341 if (!dir_bh)
2340 goto end_rename; 2342 goto end_rename;
2341 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2343 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2342 goto end_rename; 2344 goto end_rename;
2343 retval = -EMLINK; 2345 retval = -EMLINK;
2344 if (!new_inode && new_dir!=old_dir && 2346 if (!new_inode && new_dir != old_dir &&
2345 new_dir->i_nlink >= EXT4_LINK_MAX) 2347 new_dir->i_nlink >= EXT4_LINK_MAX)
2346 goto end_rename; 2348 goto end_rename;
2347 } 2349 }
2348 if (!new_bh) { 2350 if (!new_bh) {
2349 retval = ext4_add_entry (handle, new_dentry, old_inode); 2351 retval = ext4_add_entry(handle, new_dentry, old_inode);
2350 if (retval) 2352 if (retval)
2351 goto end_rename; 2353 goto end_rename;
2352 } else { 2354 } else {
@@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2388 struct buffer_head *old_bh2; 2390 struct buffer_head *old_bh2;
2389 struct ext4_dir_entry_2 *old_de2; 2391 struct ext4_dir_entry_2 *old_de2;
2390 2392
2391 old_bh2 = ext4_find_entry(old_dentry, &old_de2); 2393 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2392 if (old_bh2) { 2394 if (old_bh2) {
2393 retval = ext4_delete_entry(handle, old_dir, 2395 retval = ext4_delete_entry(handle, old_dir,
2394 old_de2, old_bh2); 2396 old_de2, old_bh2);
@@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2433 retval = 0; 2435 retval = 0;
2434 2436
2435end_rename: 2437end_rename:
2436 brelse (dir_bh); 2438 brelse(dir_bh);
2437 brelse (old_bh); 2439 brelse(old_bh);
2438 brelse (new_bh); 2440 brelse(new_bh);
2439 ext4_journal_stop(handle); 2441 ext4_journal_stop(handle);
2440 return retval; 2442 return retval;
2441} 2443}
@@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2454 .mknod = ext4_mknod, 2456 .mknod = ext4_mknod,
2455 .rename = ext4_rename, 2457 .rename = ext4_rename,
2456 .setattr = ext4_setattr, 2458 .setattr = ext4_setattr,
2457#ifdef CONFIG_EXT4DEV_FS_XATTR 2459#ifdef CONFIG_EXT4_FS_XATTR
2458 .setxattr = generic_setxattr, 2460 .setxattr = generic_setxattr,
2459 .getxattr = generic_getxattr, 2461 .getxattr = generic_getxattr,
2460 .listxattr = ext4_listxattr, 2462 .listxattr = ext4_listxattr,
@@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2465 2467
2466const struct inode_operations ext4_special_inode_operations = { 2468const struct inode_operations ext4_special_inode_operations = {
2467 .setattr = ext4_setattr, 2469 .setattr = ext4_setattr,
2468#ifdef CONFIG_EXT4DEV_FS_XATTR 2470#ifdef CONFIG_EXT4_FS_XATTR
2469 .setxattr = generic_setxattr, 2471 .setxattr = generic_setxattr,
2470 .getxattr = generic_getxattr, 2472 .getxattr = generic_getxattr,
2471 .listxattr = ext4_listxattr, 2473 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b3d35604ea18..b6ec1843a015 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", 416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
417 gdb_num); 417 gdb_num);
418 418
419 /* 419 /*
420 * If we are not using the primary superblock/GDT copy don't resize, 420 * If we are not using the primary superblock/GDT copy don't resize,
421 * because the user tools have no way of handling this. Probably a 421 * because the user tools have no way of handling this. Probably a
422 * bad time to do it anyways. 422 * bad time to do it anyways.
423 */ 423 */
@@ -870,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
870 * We can allocate memory for mb_alloc based on the new group 870 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 871 * descriptor
872 */ 872 */
873 if (test_opt(sb, MBALLOC)) { 873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
874 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 874 if (err)
875 if (err) 875 goto exit_journal;
876 goto exit_journal; 876
877 }
878 /* 877 /*
879 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
880 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
@@ -929,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
929 percpu_counter_add(&sbi->s_freeinodes_counter, 928 percpu_counter_add(&sbi->s_freeinodes_counter,
930 EXT4_INODES_PER_GROUP(sb)); 929 EXT4_INODES_PER_GROUP(sb));
931 930
931 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
932 ext4_group_t flex_group;
933 flex_group = ext4_flex_group(sbi, input->group);
934 sbi->s_flex_groups[flex_group].free_blocks +=
935 input->free_blocks_count;
936 sbi->s_flex_groups[flex_group].free_inodes +=
937 EXT4_INODES_PER_GROUP(sb);
938 }
939
932 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 940 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
933 sb->s_dirt = 1; 941 sb->s_dirt = 1;
934 942
@@ -964,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
964 ext4_group_t o_groups_count; 972 ext4_group_t o_groups_count;
965 ext4_grpblk_t last; 973 ext4_grpblk_t last;
966 ext4_grpblk_t add; 974 ext4_grpblk_t add;
967 struct buffer_head * bh; 975 struct buffer_head *bh;
968 handle_t *handle; 976 handle_t *handle;
969 int err; 977 int err;
970 unsigned long freed_blocks; 978 unsigned long freed_blocks;
@@ -1077,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1077 /* 1085 /*
1078 * Mark mballoc pages as not up to date so that they will be updated 1086 * Mark mballoc pages as not up to date so that they will be updated
1079 * next time they are loaded by ext4_mb_load_buddy. 1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on thte bitmap bh, as way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * overloaded. A user could take a LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1080 */ 1095 */
1081 if (test_opt(sb, MBALLOC)) { 1096 {
1082 struct ext4_sb_info *sbi = EXT4_SB(sb); 1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1083 struct inode *inode = sbi->s_buddy_cache; 1098 struct inode *inode = sbi->s_buddy_cache;
1084 int blocks_per_page; 1099 int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 566344b926b7..fb940c22ab0d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h>
38#include <linux/marker.h>
37#include <linux/log2.h> 39#include <linux/log2.h>
38#include <linux/crc16.h> 40#include <linux/crc16.h>
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -45,6 +47,8 @@
45#include "namei.h" 47#include "namei.h"
46#include "group.h" 48#include "group.h"
47 49
50struct proc_dir_entry *ext4_proc_root;
51
48static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
49 unsigned long journal_devnum); 53 unsigned long journal_devnum);
50static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -503,15 +507,18 @@ static void ext4_put_super(struct super_block *sb)
503 ext4_mb_release(sb); 507 ext4_mb_release(sb);
504 ext4_ext_release(sb); 508 ext4_ext_release(sb);
505 ext4_xattr_put_super(sb); 509 ext4_xattr_put_super(sb);
506 jbd2_journal_destroy(sbi->s_journal); 510 if (jbd2_journal_destroy(sbi->s_journal) < 0)
511 ext4_abort(sb, __func__, "Couldn't clean up the journal");
507 sbi->s_journal = NULL; 512 sbi->s_journal = NULL;
508 if (!(sb->s_flags & MS_RDONLY)) { 513 if (!(sb->s_flags & MS_RDONLY)) {
509 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 514 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
510 es->s_state = cpu_to_le16(sbi->s_mount_state); 515 es->s_state = cpu_to_le16(sbi->s_mount_state);
511 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
512 mark_buffer_dirty(sbi->s_sbh);
513 ext4_commit_super(sb, es, 1); 516 ext4_commit_super(sb, es, 1);
514 } 517 }
518 if (sbi->s_proc) {
519 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
520 remove_proc_entry(sb->s_id, ext4_proc_root);
521 }
515 522
516 for (i = 0; i < sbi->s_gdb_count; i++) 523 for (i = 0; i < sbi->s_gdb_count; i++)
517 brelse(sbi->s_group_desc[i]); 524 brelse(sbi->s_group_desc[i]);
@@ -520,6 +527,7 @@ static void ext4_put_super(struct super_block *sb)
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 527 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 528 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 529 percpu_counter_destroy(&sbi->s_dirs_counter);
530 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
523 brelse(sbi->s_sbh); 531 brelse(sbi->s_sbh);
524#ifdef CONFIG_QUOTA 532#ifdef CONFIG_QUOTA
525 for (i = 0; i < MAXQUOTAS; i++) 533 for (i = 0; i < MAXQUOTAS; i++)
@@ -562,11 +570,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
562 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); 570 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
563 if (!ei) 571 if (!ei)
564 return NULL; 572 return NULL;
565#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 573#ifdef CONFIG_EXT4_FS_POSIX_ACL
566 ei->i_acl = EXT4_ACL_NOT_CACHED; 574 ei->i_acl = EXT4_ACL_NOT_CACHED;
567 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 575 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
568#endif 576#endif
569 ei->i_block_alloc_info = NULL;
570 ei->vfs_inode.i_version = 1; 577 ei->vfs_inode.i_version = 1;
571 ei->vfs_inode.i_data.writeback_index = 0; 578 ei->vfs_inode.i_data.writeback_index = 0;
572 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 579 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -599,7 +606,7 @@ static void init_once(void *foo)
599 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 606 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
600 607
601 INIT_LIST_HEAD(&ei->i_orphan); 608 INIT_LIST_HEAD(&ei->i_orphan);
602#ifdef CONFIG_EXT4DEV_FS_XATTR 609#ifdef CONFIG_EXT4_FS_XATTR
603 init_rwsem(&ei->xattr_sem); 610 init_rwsem(&ei->xattr_sem);
604#endif 611#endif
605 init_rwsem(&ei->i_data_sem); 612 init_rwsem(&ei->i_data_sem);
@@ -625,8 +632,7 @@ static void destroy_inodecache(void)
625 632
626static void ext4_clear_inode(struct inode *inode) 633static void ext4_clear_inode(struct inode *inode)
627{ 634{
628 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; 635#ifdef CONFIG_EXT4_FS_POSIX_ACL
629#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
630 if (EXT4_I(inode)->i_acl && 636 if (EXT4_I(inode)->i_acl &&
631 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { 637 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
632 posix_acl_release(EXT4_I(inode)->i_acl); 638 posix_acl_release(EXT4_I(inode)->i_acl);
@@ -638,10 +644,7 @@ static void ext4_clear_inode(struct inode *inode)
638 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; 644 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
639 } 645 }
640#endif 646#endif
641 ext4_discard_reservation(inode); 647 ext4_discard_preallocations(inode);
642 EXT4_I(inode)->i_block_alloc_info = NULL;
643 if (unlikely(rsv))
644 kfree(rsv);
645 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 648 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
646 &EXT4_I(inode)->jinode); 649 &EXT4_I(inode)->jinode);
647} 650}
@@ -654,7 +657,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
654 657
655 if (sbi->s_jquota_fmt) 658 if (sbi->s_jquota_fmt)
656 seq_printf(seq, ",jqfmt=%s", 659 seq_printf(seq, ",jqfmt=%s",
657 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); 660 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
658 661
659 if (sbi->s_qf_names[USRQUOTA]) 662 if (sbi->s_qf_names[USRQUOTA])
660 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 663 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -718,7 +721,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
718 seq_puts(seq, ",debug"); 721 seq_puts(seq, ",debug");
719 if (test_opt(sb, OLDALLOC)) 722 if (test_opt(sb, OLDALLOC))
720 seq_puts(seq, ",oldalloc"); 723 seq_puts(seq, ",oldalloc");
721#ifdef CONFIG_EXT4DEV_FS_XATTR 724#ifdef CONFIG_EXT4_FS_XATTR
722 if (test_opt(sb, XATTR_USER) && 725 if (test_opt(sb, XATTR_USER) &&
723 !(def_mount_opts & EXT4_DEFM_XATTR_USER)) 726 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
724 seq_puts(seq, ",user_xattr"); 727 seq_puts(seq, ",user_xattr");
@@ -727,7 +730,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
727 seq_puts(seq, ",nouser_xattr"); 730 seq_puts(seq, ",nouser_xattr");
728 } 731 }
729#endif 732#endif
730#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 733#ifdef CONFIG_EXT4_FS_POSIX_ACL
731 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 734 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
732 seq_puts(seq, ",acl"); 735 seq_puts(seq, ",acl");
733 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 736 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -752,8 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
752 seq_puts(seq, ",nobh"); 755 seq_puts(seq, ",nobh");
753 if (!test_opt(sb, EXTENTS)) 756 if (!test_opt(sb, EXTENTS))
754 seq_puts(seq, ",noextents"); 757 seq_puts(seq, ",noextents");
755 if (!test_opt(sb, MBALLOC))
756 seq_puts(seq, ",nomballoc");
757 if (test_opt(sb, I_VERSION)) 758 if (test_opt(sb, I_VERSION))
758 seq_puts(seq, ",i_version"); 759 seq_puts(seq, ",i_version");
759 if (!test_opt(sb, DELALLOC)) 760 if (!test_opt(sb, DELALLOC))
@@ -773,6 +774,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
773 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 774 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
774 seq_puts(seq, ",data=writeback"); 775 seq_puts(seq, ",data=writeback");
775 776
777 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
778 seq_printf(seq, ",inode_readahead_blks=%u",
779 sbi->s_inode_readahead_blks);
780
781 if (test_opt(sb, DATA_ERR_ABORT))
782 seq_puts(seq, ",data_err=abort");
783
776 ext4_show_quota_options(seq, sb); 784 ext4_show_quota_options(seq, sb);
777 return 0; 785 return 0;
778} 786}
@@ -822,7 +830,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
822} 830}
823 831
824#ifdef CONFIG_QUOTA 832#ifdef CONFIG_QUOTA
825#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") 833#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
826#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 834#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
827 835
828static int ext4_dquot_initialize(struct inode *inode, int type); 836static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -902,11 +910,13 @@ enum {
902 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 910 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
903 Opt_journal_checksum, Opt_journal_async_commit, 911 Opt_journal_checksum, Opt_journal_async_commit,
904 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 912 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
913 Opt_data_err_abort, Opt_data_err_ignore,
905 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 914 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
906 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 915 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
907 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 916 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
908 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 917 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
909 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, 918 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
919 Opt_inode_readahead_blks
910}; 920};
911 921
912static match_table_t tokens = { 922static match_table_t tokens = {
@@ -947,6 +957,8 @@ static match_table_t tokens = {
947 {Opt_data_journal, "data=journal"}, 957 {Opt_data_journal, "data=journal"},
948 {Opt_data_ordered, "data=ordered"}, 958 {Opt_data_ordered, "data=ordered"},
949 {Opt_data_writeback, "data=writeback"}, 959 {Opt_data_writeback, "data=writeback"},
960 {Opt_data_err_abort, "data_err=abort"},
961 {Opt_data_err_ignore, "data_err=ignore"},
950 {Opt_offusrjquota, "usrjquota="}, 962 {Opt_offusrjquota, "usrjquota="},
951 {Opt_usrjquota, "usrjquota=%s"}, 963 {Opt_usrjquota, "usrjquota=%s"},
952 {Opt_offgrpjquota, "grpjquota="}, 964 {Opt_offgrpjquota, "grpjquota="},
@@ -967,6 +979,7 @@ static match_table_t tokens = {
967 {Opt_resize, "resize"}, 979 {Opt_resize, "resize"},
968 {Opt_delalloc, "delalloc"}, 980 {Opt_delalloc, "delalloc"},
969 {Opt_nodelalloc, "nodelalloc"}, 981 {Opt_nodelalloc, "nodelalloc"},
982 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
970 {Opt_err, NULL}, 983 {Opt_err, NULL},
971}; 984};
972 985
@@ -981,7 +994,7 @@ static ext4_fsblk_t get_sb_block(void **data)
981 /*todo: use simple_strtoll with >32bit ext4 */ 994 /*todo: use simple_strtoll with >32bit ext4 */
982 sb_block = simple_strtoul(options, &options, 0); 995 sb_block = simple_strtoul(options, &options, 0);
983 if (*options && *options != ',') { 996 if (*options && *options != ',') {
984 printk("EXT4-fs: Invalid sb specification: %s\n", 997 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
985 (char *) *data); 998 (char *) *data);
986 return 1; 999 return 1;
987 } 1000 }
@@ -1072,7 +1085,7 @@ static int parse_options(char *options, struct super_block *sb,
1072 case Opt_orlov: 1085 case Opt_orlov:
1073 clear_opt(sbi->s_mount_opt, OLDALLOC); 1086 clear_opt(sbi->s_mount_opt, OLDALLOC);
1074 break; 1087 break;
1075#ifdef CONFIG_EXT4DEV_FS_XATTR 1088#ifdef CONFIG_EXT4_FS_XATTR
1076 case Opt_user_xattr: 1089 case Opt_user_xattr:
1077 set_opt(sbi->s_mount_opt, XATTR_USER); 1090 set_opt(sbi->s_mount_opt, XATTR_USER);
1078 break; 1091 break;
@@ -1082,10 +1095,11 @@ static int parse_options(char *options, struct super_block *sb,
1082#else 1095#else
1083 case Opt_user_xattr: 1096 case Opt_user_xattr:
1084 case Opt_nouser_xattr: 1097 case Opt_nouser_xattr:
1085 printk("EXT4 (no)user_xattr options not supported\n"); 1098 printk(KERN_ERR "EXT4 (no)user_xattr options "
1099 "not supported\n");
1086 break; 1100 break;
1087#endif 1101#endif
1088#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 1102#ifdef CONFIG_EXT4_FS_POSIX_ACL
1089 case Opt_acl: 1103 case Opt_acl:
1090 set_opt(sbi->s_mount_opt, POSIX_ACL); 1104 set_opt(sbi->s_mount_opt, POSIX_ACL);
1091 break; 1105 break;
@@ -1095,7 +1109,8 @@ static int parse_options(char *options, struct super_block *sb,
1095#else 1109#else
1096 case Opt_acl: 1110 case Opt_acl:
1097 case Opt_noacl: 1111 case Opt_noacl:
1098 printk("EXT4 (no)acl options not supported\n"); 1112 printk(KERN_ERR "EXT4 (no)acl options "
1113 "not supported\n");
1099 break; 1114 break;
1100#endif 1115#endif
1101 case Opt_reservation: 1116 case Opt_reservation:
@@ -1178,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
1178 sbi->s_mount_opt |= data_opt; 1193 sbi->s_mount_opt |= data_opt;
1179 } 1194 }
1180 break; 1195 break;
1196 case Opt_data_err_abort:
1197 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1198 break;
1199 case Opt_data_err_ignore:
1200 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1201 break;
1181#ifdef CONFIG_QUOTA 1202#ifdef CONFIG_QUOTA
1182 case Opt_usrjquota: 1203 case Opt_usrjquota:
1183 qtype = USRQUOTA; 1204 qtype = USRQUOTA;
@@ -1189,8 +1210,8 @@ set_qf_name:
1189 sb_any_quota_suspended(sb)) && 1210 sb_any_quota_suspended(sb)) &&
1190 !sbi->s_qf_names[qtype]) { 1211 !sbi->s_qf_names[qtype]) {
1191 printk(KERN_ERR 1212 printk(KERN_ERR
1192 "EXT4-fs: Cannot change journaled " 1213 "EXT4-fs: Cannot change journaled "
1193 "quota options when quota turned on.\n"); 1214 "quota options when quota turned on.\n");
1194 return 0; 1215 return 0;
1195 } 1216 }
1196 qname = match_strdup(&args[0]); 1217 qname = match_strdup(&args[0]);
@@ -1357,12 +1378,6 @@ set_qf_format:
1357 case Opt_nodelalloc: 1378 case Opt_nodelalloc:
1358 clear_opt(sbi->s_mount_opt, DELALLOC); 1379 clear_opt(sbi->s_mount_opt, DELALLOC);
1359 break; 1380 break;
1360 case Opt_mballoc:
1361 set_opt(sbi->s_mount_opt, MBALLOC);
1362 break;
1363 case Opt_nomballoc:
1364 clear_opt(sbi->s_mount_opt, MBALLOC);
1365 break;
1366 case Opt_stripe: 1381 case Opt_stripe:
1367 if (match_int(&args[0], &option)) 1382 if (match_int(&args[0], &option))
1368 return 0; 1383 return 0;
@@ -1373,6 +1388,13 @@ set_qf_format:
1373 case Opt_delalloc: 1388 case Opt_delalloc:
1374 set_opt(sbi->s_mount_opt, DELALLOC); 1389 set_opt(sbi->s_mount_opt, DELALLOC);
1375 break; 1390 break;
1391 case Opt_inode_readahead_blks:
1392 if (match_int(&args[0], &option))
1393 return 0;
1394 if (option < 0 || option > (1 << 30))
1395 return 0;
1396 sbi->s_inode_readahead_blks = option;
1397 break;
1376 default: 1398 default:
1377 printk(KERN_ERR 1399 printk(KERN_ERR
1378 "EXT4-fs: Unrecognized mount option \"%s\" " 1400 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1473,15 +1495,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1473 EXT4_INODES_PER_GROUP(sb), 1495 EXT4_INODES_PER_GROUP(sb),
1474 sbi->s_mount_opt); 1496 sbi->s_mount_opt);
1475 1497
1476 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); 1498 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1477 if (EXT4_SB(sb)->s_journal->j_inode == NULL) { 1499 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1478 char b[BDEVNAME_SIZE]; 1500 "external", EXT4_SB(sb)->s_journal->j_devname);
1479
1480 printk("external journal on %s\n",
1481 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1482 } else {
1483 printk("internal journal\n");
1484 }
1485 return res; 1501 return res;
1486} 1502}
1487 1503
@@ -1504,8 +1520,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1504 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1520 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1505 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1521 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1506 1522
1507 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / 1523 /* We allocate both existing and potentially added groups */
1508 groups_per_flex; 1524 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1525 ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
1526 EXT4_DESC_PER_BLOCK_BITS(sb))) /
1527 groups_per_flex;
1509 sbi->s_flex_groups = kzalloc(flex_group_count * 1528 sbi->s_flex_groups = kzalloc(flex_group_count *
1510 sizeof(struct flex_groups), GFP_KERNEL); 1529 sizeof(struct flex_groups), GFP_KERNEL);
1511 if (sbi->s_flex_groups == NULL) { 1530 if (sbi->s_flex_groups == NULL) {
@@ -1584,7 +1603,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1584 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1603 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1585 flexbg_flag = 1; 1604 flexbg_flag = 1;
1586 1605
1587 ext4_debug ("Checking group descriptors"); 1606 ext4_debug("Checking group descriptors");
1588 1607
1589 for (i = 0; i < sbi->s_groups_count; i++) { 1608 for (i = 0; i < sbi->s_groups_count; i++) {
1590 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1609 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1623,8 +1642,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1623 "Checksum for group %lu failed (%u!=%u)\n", 1642 "Checksum for group %lu failed (%u!=%u)\n",
1624 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1643 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1625 gdp)), le16_to_cpu(gdp->bg_checksum)); 1644 gdp)), le16_to_cpu(gdp->bg_checksum));
1626 if (!(sb->s_flags & MS_RDONLY)) 1645 if (!(sb->s_flags & MS_RDONLY)) {
1646 spin_unlock(sb_bgl_lock(sbi, i));
1627 return 0; 1647 return 0;
1648 }
1628 } 1649 }
1629 spin_unlock(sb_bgl_lock(sbi, i)); 1650 spin_unlock(sb_bgl_lock(sbi, i));
1630 if (!flexbg_flag) 1651 if (!flexbg_flag)
@@ -1714,9 +1735,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1714 DQUOT_INIT(inode); 1735 DQUOT_INIT(inode);
1715 if (inode->i_nlink) { 1736 if (inode->i_nlink) {
1716 printk(KERN_DEBUG 1737 printk(KERN_DEBUG
1717 "%s: truncating inode %lu to %Ld bytes\n", 1738 "%s: truncating inode %lu to %lld bytes\n",
1718 __func__, inode->i_ino, inode->i_size); 1739 __func__, inode->i_ino, inode->i_size);
1719 jbd_debug(2, "truncating inode %lu to %Ld bytes\n", 1740 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
1720 inode->i_ino, inode->i_size); 1741 inode->i_ino, inode->i_size);
1721 ext4_truncate(inode); 1742 ext4_truncate(inode);
1722 nr_truncates++; 1743 nr_truncates++;
@@ -1914,6 +1935,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1914 unsigned long journal_devnum = 0; 1935 unsigned long journal_devnum = 0;
1915 unsigned long def_mount_opts; 1936 unsigned long def_mount_opts;
1916 struct inode *root; 1937 struct inode *root;
1938 char *cp;
1917 int ret = -EINVAL; 1939 int ret = -EINVAL;
1918 int blocksize; 1940 int blocksize;
1919 int db_count; 1941 int db_count;
@@ -1930,10 +1952,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1930 sbi->s_mount_opt = 0; 1952 sbi->s_mount_opt = 0;
1931 sbi->s_resuid = EXT4_DEF_RESUID; 1953 sbi->s_resuid = EXT4_DEF_RESUID;
1932 sbi->s_resgid = EXT4_DEF_RESGID; 1954 sbi->s_resgid = EXT4_DEF_RESGID;
1955 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
1933 sbi->s_sb_block = sb_block; 1956 sbi->s_sb_block = sb_block;
1934 1957
1935 unlock_kernel(); 1958 unlock_kernel();
1936 1959
1960 /* Cleanup superblock name */
1961 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
1962 *cp = '!';
1963
1937 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 1964 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1938 if (!blocksize) { 1965 if (!blocksize) {
1939 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); 1966 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -1973,11 +2000,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1973 set_opt(sbi->s_mount_opt, GRPID); 2000 set_opt(sbi->s_mount_opt, GRPID);
1974 if (def_mount_opts & EXT4_DEFM_UID16) 2001 if (def_mount_opts & EXT4_DEFM_UID16)
1975 set_opt(sbi->s_mount_opt, NO_UID32); 2002 set_opt(sbi->s_mount_opt, NO_UID32);
1976#ifdef CONFIG_EXT4DEV_FS_XATTR 2003#ifdef CONFIG_EXT4_FS_XATTR
1977 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 2004 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1978 set_opt(sbi->s_mount_opt, XATTR_USER); 2005 set_opt(sbi->s_mount_opt, XATTR_USER);
1979#endif 2006#endif
1980#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 2007#ifdef CONFIG_EXT4_FS_POSIX_ACL
1981 if (def_mount_opts & EXT4_DEFM_ACL) 2008 if (def_mount_opts & EXT4_DEFM_ACL)
1982 set_opt(sbi->s_mount_opt, POSIX_ACL); 2009 set_opt(sbi->s_mount_opt, POSIX_ACL);
1983#endif 2010#endif
@@ -2012,11 +2039,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2012 ext4_warning(sb, __func__, 2039 ext4_warning(sb, __func__,
2013 "extents feature not enabled on this filesystem, " 2040 "extents feature not enabled on this filesystem, "
2014 "use tune2fs.\n"); 2041 "use tune2fs.\n");
2015 /*
2016 * turn on mballoc code by default in ext4 filesystem
2017 * Use -o nomballoc to turn it off
2018 */
2019 set_opt(sbi->s_mount_opt, MBALLOC);
2020 2042
2021 /* 2043 /*
2022 * enable delayed allocation by default 2044 * enable delayed allocation by default
@@ -2041,16 +2063,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2041 "running e2fsck is recommended\n"); 2063 "running e2fsck is recommended\n");
2042 2064
2043 /* 2065 /*
2044 * Since ext4 is still considered development code, we require
2045 * that the TEST_FILESYS flag in s->flags be set.
2046 */
2047 if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
2048 printk(KERN_WARNING "EXT4-fs: %s: not marked "
2049 "OK to use with test code.\n", sb->s_id);
2050 goto failed_mount;
2051 }
2052
2053 /*
2054 * Check feature flags regardless of the revision level, since we 2066 * Check feature flags regardless of the revision level, since we
2055 * previously didn't change the revision level when setting the flags, 2067 * previously didn't change the revision level when setting the flags,
2056 * so there is a chance incompat flags are set on a rev 0 filesystem. 2068 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2219,6 +2231,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2219 goto failed_mount; 2231 goto failed_mount;
2220 } 2232 }
2221 2233
2234#ifdef CONFIG_PROC_FS
2235 if (ext4_proc_root)
2236 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2237
2238 if (sbi->s_proc)
2239 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2240 &ext4_ui_proc_fops,
2241 &sbi->s_inode_readahead_blks);
2242#endif
2243
2222 bgl_lock_init(&sbi->s_blockgroup_lock); 2244 bgl_lock_init(&sbi->s_blockgroup_lock);
2223 2245
2224 for (i = 0; i < db_count; i++) { 2246 for (i = 0; i < db_count; i++) {
@@ -2257,24 +2279,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 err = percpu_counter_init(&sbi->s_dirs_counter, 2279 err = percpu_counter_init(&sbi->s_dirs_counter,
2258 ext4_count_dirs(sb)); 2280 ext4_count_dirs(sb));
2259 } 2281 }
2282 if (!err) {
2283 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2284 }
2260 if (err) { 2285 if (err) {
2261 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2286 printk(KERN_ERR "EXT4-fs: insufficient memory\n");
2262 goto failed_mount3; 2287 goto failed_mount3;
2263 } 2288 }
2264 2289
2265 /* per fileystem reservation list head & lock */
2266 spin_lock_init(&sbi->s_rsv_window_lock);
2267 sbi->s_rsv_window_root = RB_ROOT;
2268 /* Add a single, static dummy reservation to the start of the
2269 * reservation window list --- it gives us a placeholder for
2270 * append-at-start-of-list which makes the allocation logic
2271 * _much_ simpler. */
2272 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2273 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2274 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
2275 sbi->s_rsv_window_head.rsv_goal_size = 0;
2276 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
2277
2278 sbi->s_stripe = ext4_get_stripe_size(sbi); 2290 sbi->s_stripe = ext4_get_stripe_size(sbi);
2279 2291
2280 /* 2292 /*
@@ -2471,7 +2483,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2471 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2483 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2472 2484
2473 ext4_ext_init(sb); 2485 ext4_ext_init(sb);
2474 ext4_mb_init(sb, needs_recovery); 2486 err = ext4_mb_init(sb, needs_recovery);
2487 if (err) {
2488 printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
2489 err);
2490 goto failed_mount4;
2491 }
2475 2492
2476 lock_kernel(); 2493 lock_kernel();
2477 return 0; 2494 return 0;
@@ -2489,11 +2506,16 @@ failed_mount3:
2489 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2506 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2490 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2507 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2491 percpu_counter_destroy(&sbi->s_dirs_counter); 2508 percpu_counter_destroy(&sbi->s_dirs_counter);
2509 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2492failed_mount2: 2510failed_mount2:
2493 for (i = 0; i < db_count; i++) 2511 for (i = 0; i < db_count; i++)
2494 brelse(sbi->s_group_desc[i]); 2512 brelse(sbi->s_group_desc[i]);
2495 kfree(sbi->s_group_desc); 2513 kfree(sbi->s_group_desc);
2496failed_mount: 2514failed_mount:
2515 if (sbi->s_proc) {
2516 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2517 remove_proc_entry(sb->s_id, ext4_proc_root);
2518 }
2497#ifdef CONFIG_QUOTA 2519#ifdef CONFIG_QUOTA
2498 for (i = 0; i < MAXQUOTAS; i++) 2520 for (i = 0; i < MAXQUOTAS; i++)
2499 kfree(sbi->s_qf_names[i]); 2521 kfree(sbi->s_qf_names[i]);
@@ -2527,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2527 journal->j_flags |= JBD2_BARRIER; 2549 journal->j_flags |= JBD2_BARRIER;
2528 else 2550 else
2529 journal->j_flags &= ~JBD2_BARRIER; 2551 journal->j_flags &= ~JBD2_BARRIER;
2552 if (test_opt(sb, DATA_ERR_ABORT))
2553 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
2554 else
2555 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
2530 spin_unlock(&journal->j_state_lock); 2556 spin_unlock(&journal->j_state_lock);
2531} 2557}
2532 2558
@@ -2552,7 +2578,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2552 return NULL; 2578 return NULL;
2553 } 2579 }
2554 2580
2555 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2581 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2556 journal_inode, journal_inode->i_size); 2582 journal_inode, journal_inode->i_size);
2557 if (!S_ISREG(journal_inode->i_mode)) { 2583 if (!S_ISREG(journal_inode->i_mode)) {
2558 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2584 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2715,6 +2741,11 @@ static int ext4_load_journal(struct super_block *sb,
2715 return -EINVAL; 2741 return -EINVAL;
2716 } 2742 }
2717 2743
2744 if (journal->j_flags & JBD2_BARRIER)
2745 printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2746 else
2747 printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2748
2718 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2749 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2719 err = jbd2_journal_update_format(journal); 2750 err = jbd2_journal_update_format(journal);
2720 if (err) { 2751 if (err) {
@@ -2799,13 +2830,34 @@ static void ext4_commit_super(struct super_block *sb,
2799 2830
2800 if (!sbh) 2831 if (!sbh)
2801 return; 2832 return;
2833 if (buffer_write_io_error(sbh)) {
2834 /*
2835 * Oh, dear. A previous attempt to write the
2836 * superblock failed. This could happen because the
2837 * USB device was yanked out. Or it could happen to
2838 * be a transient write error and maybe the block will
2839 * be remapped. Nothing we can do but to retry the
2840 * write and hope for the best.
2841 */
2842 printk(KERN_ERR "ext4: previous I/O error to "
2843 "superblock detected for %s.\n", sb->s_id);
2844 clear_buffer_write_io_error(sbh);
2845 set_buffer_uptodate(sbh);
2846 }
2802 es->s_wtime = cpu_to_le32(get_seconds()); 2847 es->s_wtime = cpu_to_le32(get_seconds());
2803 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2848 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2804 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2849 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2805 BUFFER_TRACE(sbh, "marking dirty"); 2850 BUFFER_TRACE(sbh, "marking dirty");
2806 mark_buffer_dirty(sbh); 2851 mark_buffer_dirty(sbh);
2807 if (sync) 2852 if (sync) {
2808 sync_dirty_buffer(sbh); 2853 sync_dirty_buffer(sbh);
2854 if (buffer_write_io_error(sbh)) {
2855 printk(KERN_ERR "ext4: I/O error while writing "
2856 "superblock for %s.\n", sb->s_id);
2857 clear_buffer_write_io_error(sbh);
2858 set_buffer_uptodate(sbh);
2859 }
2860 }
2809} 2861}
2810 2862
2811 2863
@@ -2820,7 +2872,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2820 journal_t *journal = EXT4_SB(sb)->s_journal; 2872 journal_t *journal = EXT4_SB(sb)->s_journal;
2821 2873
2822 jbd2_journal_lock_updates(journal); 2874 jbd2_journal_lock_updates(journal);
2823 jbd2_journal_flush(journal); 2875 if (jbd2_journal_flush(journal) < 0)
2876 goto out;
2877
2824 lock_super(sb); 2878 lock_super(sb);
2825 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2879 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2826 sb->s_flags & MS_RDONLY) { 2880 sb->s_flags & MS_RDONLY) {
@@ -2829,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2829 ext4_commit_super(sb, es, 1); 2883 ext4_commit_super(sb, es, 1);
2830 } 2884 }
2831 unlock_super(sb); 2885 unlock_super(sb);
2886
2887out:
2832 jbd2_journal_unlock_updates(journal); 2888 jbd2_journal_unlock_updates(journal);
2833} 2889}
2834 2890
@@ -2907,6 +2963,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2907{ 2963{
2908 tid_t target; 2964 tid_t target;
2909 2965
2966 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2910 sb->s_dirt = 0; 2967 sb->s_dirt = 0;
2911 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 2968 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2912 if (wait) 2969 if (wait)
@@ -2928,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2928 2985
2929 /* Now we set up the journal barrier. */ 2986 /* Now we set up the journal barrier. */
2930 jbd2_journal_lock_updates(journal); 2987 jbd2_journal_lock_updates(journal);
2931 jbd2_journal_flush(journal); 2988
2989 /*
2990 * We don't want to clear needs_recovery flag when we failed
2991 * to flush the journal.
2992 */
2993 if (jbd2_journal_flush(journal) < 0)
2994 return;
2932 2995
2933 /* Journal blocked and flushed, clear needs_recovery flag. */ 2996 /* Journal blocked and flushed, clear needs_recovery flag. */
2934 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2997 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -3162,7 +3225,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3162 buf->f_type = EXT4_SUPER_MAGIC; 3225 buf->f_type = EXT4_SUPER_MAGIC;
3163 buf->f_bsize = sb->s_blocksize; 3226 buf->f_bsize = sb->s_blocksize;
3164 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3227 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3165 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); 3228 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3229 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3166 ext4_free_blocks_count_set(es, buf->f_bfree); 3230 ext4_free_blocks_count_set(es, buf->f_bfree);
3167 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3231 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3168 if (buf->f_bfree < ext4_r_blocks_count(es)) 3232 if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3367,8 +3431,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3367 * otherwise be livelocked... 3431 * otherwise be livelocked...
3368 */ 3432 */
3369 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 3433 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
3370 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 3434 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
3371 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3435 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3436 if (err) {
3437 path_put(&nd.path);
3438 return err;
3439 }
3372 } 3440 }
3373 3441
3374 err = vfs_quota_on_path(sb, type, format_id, &nd.path); 3442 err = vfs_quota_on_path(sb, type, format_id, &nd.path);
@@ -3432,7 +3500,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3432 handle_t *handle = journal_current_handle(); 3500 handle_t *handle = journal_current_handle();
3433 3501
3434 if (!handle) { 3502 if (!handle) {
3435 printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" 3503 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3436 " cancelled because transaction is not started.\n", 3504 " cancelled because transaction is not started.\n",
3437 (unsigned long long)off, (unsigned long long)len); 3505 (unsigned long long)off, (unsigned long long)len);
3438 return -EIO; 3506 return -EIO;
@@ -3493,18 +3561,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3493 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3561 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3494} 3562}
3495 3563
3564#ifdef CONFIG_PROC_FS
3565static int ext4_ui_proc_show(struct seq_file *m, void *v)
3566{
3567 unsigned int *p = m->private;
3568
3569 seq_printf(m, "%u\n", *p);
3570 return 0;
3571}
3572
3573static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3574{
3575 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3576}
3577
3578static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3579 size_t cnt, loff_t *ppos)
3580{
3581 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3582 char str[32];
3583 unsigned long value;
3584
3585 if (cnt >= sizeof(str))
3586 return -EINVAL;
3587 if (copy_from_user(str, buf, cnt))
3588 return -EFAULT;
3589 value = simple_strtol(str, NULL, 0);
3590 if (value < 0)
3591 return -ERANGE;
3592 *p = value;
3593 return cnt;
3594}
3595
3596const struct file_operations ext4_ui_proc_fops = {
3597 .owner = THIS_MODULE,
3598 .open = ext4_ui_proc_open,
3599 .read = seq_read,
3600 .llseek = seq_lseek,
3601 .release = single_release,
3602 .write = ext4_ui_proc_write,
3603};
3604#endif
3605
3606static struct file_system_type ext4_fs_type = {
3607 .owner = THIS_MODULE,
3608 .name = "ext4",
3609 .get_sb = ext4_get_sb,
3610 .kill_sb = kill_block_super,
3611 .fs_flags = FS_REQUIRES_DEV,
3612};
3613
3614#ifdef CONFIG_EXT4DEV_COMPAT
3615static int ext4dev_get_sb(struct file_system_type *fs_type,
3616 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3617{
3618 printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3619 "to mount using ext4\n");
3620 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3621 "will go away by 2.6.31\n");
3622 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3623}
3624
3496static struct file_system_type ext4dev_fs_type = { 3625static struct file_system_type ext4dev_fs_type = {
3497 .owner = THIS_MODULE, 3626 .owner = THIS_MODULE,
3498 .name = "ext4dev", 3627 .name = "ext4dev",
3499 .get_sb = ext4_get_sb, 3628 .get_sb = ext4dev_get_sb,
3500 .kill_sb = kill_block_super, 3629 .kill_sb = kill_block_super,
3501 .fs_flags = FS_REQUIRES_DEV, 3630 .fs_flags = FS_REQUIRES_DEV,
3502}; 3631};
3632MODULE_ALIAS("ext4dev");
3633#endif
3503 3634
3504static int __init init_ext4_fs(void) 3635static int __init init_ext4_fs(void)
3505{ 3636{
3506 int err; 3637 int err;
3507 3638
3639 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3508 err = init_ext4_mballoc(); 3640 err = init_ext4_mballoc();
3509 if (err) 3641 if (err)
3510 return err; 3642 return err;
@@ -3515,9 +3647,16 @@ static int __init init_ext4_fs(void)
3515 err = init_inodecache(); 3647 err = init_inodecache();
3516 if (err) 3648 if (err)
3517 goto out1; 3649 goto out1;
3518 err = register_filesystem(&ext4dev_fs_type); 3650 err = register_filesystem(&ext4_fs_type);
3519 if (err) 3651 if (err)
3520 goto out; 3652 goto out;
3653#ifdef CONFIG_EXT4DEV_COMPAT
3654 err = register_filesystem(&ext4dev_fs_type);
3655 if (err) {
3656 unregister_filesystem(&ext4_fs_type);
3657 goto out;
3658 }
3659#endif
3521 return 0; 3660 return 0;
3522out: 3661out:
3523 destroy_inodecache(); 3662 destroy_inodecache();
@@ -3530,10 +3669,14 @@ out2:
3530 3669
3531static void __exit exit_ext4_fs(void) 3670static void __exit exit_ext4_fs(void)
3532{ 3671{
3672 unregister_filesystem(&ext4_fs_type);
3673#ifdef CONFIG_EXT4DEV_COMPAT
3533 unregister_filesystem(&ext4dev_fs_type); 3674 unregister_filesystem(&ext4dev_fs_type);
3675#endif
3534 destroy_inodecache(); 3676 destroy_inodecache();
3535 exit_ext4_xattr(); 3677 exit_ext4_xattr();
3536 exit_ext4_mballoc(); 3678 exit_ext4_mballoc();
3679 remove_proc_entry("fs/ext4", NULL);
3537} 3680}
3538 3681
3539MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3682MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc01..00740cb32be3 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
23#include "ext4.h" 23#include "ext4.h"
24#include "xattr.h" 24#include "xattr.h"
25 25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data); 29 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 30 return NULL;
31} 31}
32 32
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR 37#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 45const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR 48#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr, 51 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b4893..80626d516fee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, 104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
105#endif 105#endif
106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, 106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
107#ifdef CONFIG_EXT4DEV_FS_SECURITY 107#ifdef CONFIG_EXT4_FS_SECURITY
108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, 108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
109#endif 109#endif
110}; 110};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
112struct xattr_handler *ext4_xattr_handlers[] = { 112struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
116 &ext4_xattr_acl_access_handler, 116 &ext4_xattr_acl_access_handler,
117 &ext4_xattr_acl_default_handler, 117 &ext4_xattr_acl_default_handler,
118#endif 118#endif
119#ifdef CONFIG_EXT4DEV_FS_SECURITY 119#ifdef CONFIG_EXT4_FS_SECURITY
120 &ext4_xattr_security_handler, 120 &ext4_xattr_security_handler,
121#endif 121#endif
122 NULL 122 NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
959 struct ext4_xattr_block_find bs = { 959 struct ext4_xattr_block_find bs = {
960 .s = { .not_found = -ENODATA, }, 960 .s = { .not_found = -ENODATA, },
961 }; 961 };
962 unsigned long no_expand;
962 int error; 963 int error;
963 964
964 if (!name) 965 if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
966 if (strlen(name) > 255) 967 if (strlen(name) > 255)
967 return -ERANGE; 968 return -ERANGE;
968 down_write(&EXT4_I(inode)->xattr_sem); 969 down_write(&EXT4_I(inode)->xattr_sem);
970 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
971 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
972
969 error = ext4_get_inode_loc(inode, &is.iloc); 973 error = ext4_get_inode_loc(inode, &is.iloc);
970 if (error) 974 if (error)
971 goto cleanup; 975 goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1042cleanup: 1046cleanup:
1043 brelse(is.iloc.bh); 1047 brelse(is.iloc.bh);
1044 brelse(bs.bh); 1048 brelse(bs.bh);
1049 if (no_expand == 0)
1050 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1045 up_write(&EXT4_I(inode)->xattr_sem); 1051 up_write(&EXT4_I(inode)->xattr_sem);
1046 return error; 1052 return error;
1047} 1053}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb9..8ede88b18c29 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
51 (((name_len) + EXT4_XATTR_ROUND + \ 51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) 52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \ 53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \ 54 ((struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) 55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
63 EXT4_I(inode)->i_extra_isize)) 63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65 65
66# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4DEV_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
93static inline int 93static inline int
94ext4_xattr_get(struct inode *inode, int name_index, const char *name, 94ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
141 141
142#define ext4_xattr_handlers NULL 142#define ext4_xattr_handlers NULL
143 143
144# endif /* CONFIG_EXT4DEV_FS_XATTR */ 144# endif /* CONFIG_EXT4_FS_XATTR */
145 145
146#ifdef CONFIG_EXT4DEV_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir);
149#else 149#else
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/msdos_fs.h> 8#include <linux/msdos_fs.h>
9#include <linux/blkdev.h>
9 10
10struct fatent_operations { 11struct fatent_operations {
11 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); 12 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
535 struct fat_entry fatent; 536 struct fat_entry fatent;
536 struct buffer_head *bhs[MAX_BUF_PER_PAGE]; 537 struct buffer_head *bhs[MAX_BUF_PER_PAGE];
537 int i, err, nr_bhs; 538 int i, err, nr_bhs;
539 int first_cl = cluster;
538 540
539 nr_bhs = 0; 541 nr_bhs = 0;
540 fatent_init(&fatent); 542 fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
551 goto error; 553 goto error;
552 } 554 }
553 555
556 /*
557 * Issue discard for the sectors we no longer care about,
558 * batching contiguous clusters into one request
559 */
560 if (cluster != fatent.entry + 1) {
561 int nr_clus = fatent.entry - first_cl + 1;
562
563 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
564 nr_clus * sbi->sec_per_clus);
565 first_cl = cluster;
566 }
567
554 ops->ent_put(&fatent, FAT_ENT_FREE); 568 ops->ent_put(&fatent, FAT_ENT_FREE);
555 if (sbi->free_clusters != -1) { 569 if (sbi->free_clusters != -1) {
556 sbi->free_clusters++; 570 sbi->free_clusters++;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1266 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1267 delay = holdtime - now; 1267 delay = holdtime - now;
1268 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1269 delay = gl->gl_ops->go_min_hold_time;
1268 1270
1269 spin_lock(&gl->gl_spin); 1271 spin_lock(&gl->gl_spin);
1270 handle_callback(gl, state, 1, delay); 1272 handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1578 *p++ = 'a'; 1580 *p++ = 'a';
1579 if (flags & GL_EXACT) 1581 if (flags & GL_EXACT)
1580 *p++ = 'E'; 1582 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE) 1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c'; 1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags)) 1585 if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
1816 if (gl) { 1816 if (gl) {
1817 gi->gl = hlist_entry(gl->gl_list.next, 1817 gi->gl = hlist_entry(gl->gl_list.next,
1818 struct gfs2_glock, gl_list); 1818 struct gfs2_glock, gl_list);
1819 if (gi->gl) 1819 } else {
1820 gfs2_glock_hold(gi->gl); 1820 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1821 struct gfs2_glock, gl_list);
1821 } 1822 }
1823 if (gi->gl)
1824 gfs2_glock_hold(gi->gl);
1822 read_unlock(gl_lock_addr(gi->hash)); 1825 read_unlock(gl_lock_addr(gi->hash));
1823 if (gl) 1826 if (gl)
1824 gfs2_glock_put(gl); 1827 gfs2_glock_put(gl);
1825 if (gl && gi->gl == NULL)
1826 gi->hash++;
1827 while (gi->gl == NULL) { 1828 while (gi->gl == NULL) {
1829 gi->hash++;
1828 if (gi->hash >= GFS2_GL_HASH_SIZE) 1830 if (gi->hash >= GFS2_GL_HASH_SIZE)
1829 return 1; 1831 return 1;
1830 read_lock(gl_lock_addr(gi->hash)); 1832 read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
1833 if (gi->gl) 1835 if (gi->gl)
1834 gfs2_glock_hold(gi->gl); 1836 gfs2_glock_hold(gi->gl);
1835 read_unlock(gl_lock_addr(gi->hash)); 1837 read_unlock(gl_lock_addr(gi->hash));
1836 gi->hash++;
1837 } 1838 }
1838 1839
1839 if (gi->sdp != gi->gl->gl_sbd) 1840 if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
24#define GL_ASYNC 0x00000040 24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 27#define GL_NOCACHE 0x00000400
29 28
30#define GLR_TRYFAILED 13 29#define GLR_TRYFAILED 13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
386#define GFS2_DATA_ORDERED 2 386#define GFS2_DATA_ORDERED 2
387 387
388struct gfs2_args { 388struct gfs2_args {
389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
392 int ar_spectator; /* Don't get a journal because we're always RO */ 392 unsigned int ar_spectator:1; /* Don't get a journal */
393 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ 393 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
394 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ 394 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
395 int ar_localcaching; /* Local-style caching (dangerous on multihost) */ 395 unsigned int ar_localcaching:1; /* Local caching */
396 int ar_debug; /* Oops on errors instead of trying to be graceful */ 396 unsigned int ar_debug:1; /* Oops on errors */
397 int ar_upgrade; /* Upgrade ondisk/multihost format */ 397 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
398 unsigned int ar_num_glockd; /* Number of glockd threads */ 398 unsigned int ar_posix_acl:1; /* Enable posix acls */
399 int ar_posix_acl; /* Enable posix acls */ 399 unsigned int ar_quota:2; /* off/account/on */
400 int ar_quota; /* off/account/on */ 400 unsigned int ar_suiddir:1; /* suiddir support */
401 int ar_suiddir; /* suiddir support */ 401 unsigned int ar_data:2; /* ordered/writeback */
402 int ar_data; /* ordered/writeback */ 402 unsigned int ar_meta:1; /* mount metafs */
403 unsigned int ar_num_glockd; /* Number of glockd threads */
403}; 404};
404 405
405struct gfs2_tune { 406struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
419 unsigned int gt_quota_scale_den; /* Denominator */ 420 unsigned int gt_quota_scale_den; /* Denominator */
420 unsigned int gt_quota_cache_secs; 421 unsigned int gt_quota_cache_secs;
421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 422 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
423 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
425 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
432 SDF_JOURNAL_CHECKED = 0, 432 SDF_JOURNAL_CHECKED = 0,
433 SDF_JOURNAL_LIVE = 1, 433 SDF_JOURNAL_LIVE = 1,
434 SDF_SHUTDOWN = 2, 434 SDF_SHUTDOWN = 2,
435 SDF_NOATIME = 3, 435 SDF_NOBARRIERS = 3,
436}; 436};
437 437
438#define GFS2_FSNAME_LEN 256 438#define GFS2_FSNAME_LEN 256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
461 461
462struct gfs2_sbd { 462struct gfs2_sbd {
463 struct super_block *sd_vfs; 463 struct super_block *sd_vfs;
464 struct super_block *sd_vfs_meta;
465 struct kobject sd_kobj; 464 struct kobject sd_kobj;
466 unsigned long sd_flags; /* SDF_... */ 465 unsigned long sd_flags; /* SDF_... */
467 struct gfs2_sb_host sd_sb; 466 struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
499 498
500 /* Inode Stuff */ 499 /* Inode Stuff */
501 500
502 struct inode *sd_master_dir; 501 struct dentry *sd_master_dir;
502 struct dentry *sd_root_dir;
503
503 struct inode *sd_jindex; 504 struct inode *sd_jindex;
504 struct inode *sd_inum_inode; 505 struct inode *sd_inum_inode;
505 struct inode *sd_statfs_inode; 506 struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
634 /* Debugging crud */ 635 /* Debugging crud */
635 636
636 unsigned long sd_last_warning; 637 unsigned long sd_last_warning;
637 struct vfsmount *sd_gfs2mnt;
638 struct dentry *debugfs_dir; /* debugfs directory */ 638 struct dentry *debugfs_dir; /* debugfs directory */
639 struct dentry *debugfs_dentry_glocks; /* for debugfs */ 639 struct dentry *debugfs_dentry_glocks; /* for debugfs */
640}; 640};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a32948..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h> 19#include <linux/lm_interface.h>
20#include <linux/security.h> 20#include <linux/security.h>
21#include <linux/time.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
249{ 250{
250 struct gfs2_dinode_host *di = &ip->i_di; 251 struct gfs2_dinode_host *di = &ip->i_di;
251 const struct gfs2_dinode *str = buf; 252 const struct gfs2_dinode *str = buf;
253 struct timespec atime;
252 u16 height, depth; 254 u16 height, depth;
253 255
254 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) 256 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
275 di->di_size = be64_to_cpu(str->di_size); 277 di->di_size = be64_to_cpu(str->di_size);
276 i_size_write(&ip->i_inode, di->di_size); 278 i_size_write(&ip->i_inode, di->di_size);
277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
278 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 280 atime.tv_sec = be64_to_cpu(str->di_atime);
279 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
283 ip->i_inode.i_atime = atime;
280 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 284 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
281 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); 285 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
282 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 286 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1033 1037
1034 if (bh) 1038 if (bh)
1035 brelse(bh); 1039 brelse(bh);
1036 if (!inode)
1037 return ERR_PTR(-ENOMEM);
1038 return inode; 1040 return inode;
1039 1041
1040fail_gunlock2: 1042fail_gunlock2:
1041 gfs2_glock_dq_uninit(ghs + 1); 1043 gfs2_glock_dq_uninit(ghs + 1);
1042 if (inode) 1044 if (inode && !IS_ERR(inode))
1043 iput(inode); 1045 iput(inode);
1044fail_gunlock: 1046fail_gunlock:
1045 gfs2_glock_dq(ghs); 1047 gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1140 return 0; 1142 return 0;
1141} 1143}
1142 1144
1143/*
1144 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1145 * @this: move this
1146 * @to: to here
1147 *
1148 * Follow @to back to the root and make sure we don't encounter @this
1149 * Assumes we already hold the rename lock.
1150 *
1151 * Returns: errno
1152 */
1153
1154int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1155{
1156 struct inode *dir = &to->i_inode;
1157 struct super_block *sb = dir->i_sb;
1158 struct inode *tmp;
1159 struct qstr dotdot;
1160 int error = 0;
1161
1162 gfs2_str2qstr(&dotdot, "..");
1163
1164 igrab(dir);
1165
1166 for (;;) {
1167 if (dir == &this->i_inode) {
1168 error = -EINVAL;
1169 break;
1170 }
1171 if (dir == sb->s_root->d_inode) {
1172 error = 0;
1173 break;
1174 }
1175
1176 tmp = gfs2_lookupi(dir, &dotdot, 1);
1177 if (IS_ERR(tmp)) {
1178 error = PTR_ERR(tmp);
1179 break;
1180 }
1181
1182 iput(dir);
1183 dir = tmp;
1184 }
1185
1186 iput(dir);
1187
1188 return error;
1189}
1190
1191/** 1145/**
1192 * gfs2_readlinki - return the contents of a symlink 1146 * gfs2_readlinki - return the contents of a symlink
1193 * @ip: the symlink's inode 1147 * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1207 unsigned int x; 1161 unsigned int x;
1208 int error; 1162 int error;
1209 1163
1210 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 1164 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1211 error = gfs2_glock_nq_atime(&i_gh); 1165 error = gfs2_glock_nq(&i_gh);
1212 if (error) { 1166 if (error) {
1213 gfs2_holder_uninit(&i_gh); 1167 gfs2_holder_uninit(&i_gh);
1214 return error; 1168 return error;
@@ -1243,101 +1197,6 @@ out:
1243 return error; 1197 return error;
1244} 1198}
1245 1199
1246/**
1247 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1248 * conditionally update the inode's atime
1249 * @gh: the holder to acquire
1250 *
1251 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1252 * Update if the difference between the current time and the inode's current
1253 * atime is greater than an interval specified at mount.
1254 *
1255 * Returns: errno
1256 */
1257
1258int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1259{
1260 struct gfs2_glock *gl = gh->gh_gl;
1261 struct gfs2_sbd *sdp = gl->gl_sbd;
1262 struct gfs2_inode *ip = gl->gl_object;
1263 s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1264 unsigned int state;
1265 int flags;
1266 int error;
1267 struct timespec tv = CURRENT_TIME;
1268
1269 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1270 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1271 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1272 return -EINVAL;
1273
1274 state = gh->gh_state;
1275 flags = gh->gh_flags;
1276
1277 error = gfs2_glock_nq(gh);
1278 if (error)
1279 return error;
1280
1281 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1282 (sdp->sd_vfs->s_flags & MS_RDONLY))
1283 return 0;
1284
1285 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1286 gfs2_glock_dq(gh);
1287 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1288 gh);
1289 error = gfs2_glock_nq(gh);
1290 if (error)
1291 return error;
1292
1293 /* Verify that atime hasn't been updated while we were
1294 trying to get exclusive lock. */
1295
1296 tv = CURRENT_TIME;
1297 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1298 struct buffer_head *dibh;
1299 struct gfs2_dinode *di;
1300
1301 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1302 if (error == -EROFS)
1303 return 0;
1304 if (error)
1305 goto fail;
1306
1307 error = gfs2_meta_inode_buffer(ip, &dibh);
1308 if (error)
1309 goto fail_end_trans;
1310
1311 ip->i_inode.i_atime = tv;
1312
1313 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1314 di = (struct gfs2_dinode *)dibh->b_data;
1315 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1316 di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1317 brelse(dibh);
1318
1319 gfs2_trans_end(sdp);
1320 }
1321
1322 /* If someone else has asked for the glock,
1323 unlock and let them have it. Then reacquire
1324 in the original state. */
1325 if (gfs2_glock_is_blocking(gl)) {
1326 gfs2_glock_dq(gh);
1327 gfs2_holder_reinit(state, flags, gh);
1328 return gfs2_glock_nq(gh);
1329 }
1330 }
1331
1332 return 0;
1333
1334fail_end_trans:
1335 gfs2_trans_end(sdp);
1336fail:
1337 gfs2_glock_dq(gh);
1338 return error;
1339}
1340
1341static int 1200static int
1342__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1201__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1343{ 1202{
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a86..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
92 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask); 93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 94int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
97int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 95int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
98struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 96struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
99void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f48..0c4cbe6c8285 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
144 144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), 145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace, 146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), 147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
148 GDLM_LVB_SIZE); 149 GDLM_LVB_SIZE);
149 if (error) { 150 if (error) {
150 log_error("dlm_new_lockspace error %d", error); 151 log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/bio.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
584 memset(bh->b_data, 0, bh->b_size); 585 memset(bh->b_data, 0, bh->b_size);
585 set_buffer_uptodate(bh); 586 set_buffer_uptodate(bh);
586 clear_buffer_dirty(bh); 587 clear_buffer_dirty(bh);
587 unlock_buffer(bh);
588 588
589 gfs2_ail1_empty(sdp, 0); 589 gfs2_ail1_empty(sdp, 0);
590 tail = current_tail(sdp); 590 tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); 601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
602 lh->lh_hash = cpu_to_be32(hash); 602 lh->lh_hash = cpu_to_be32(hash);
603 603
604 set_buffer_dirty(bh); 604 bh->b_end_io = end_buffer_write_sync;
605 if (sync_dirty_buffer(bh)) 605 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
606 goto skip_barrier;
607 get_bh(bh);
608 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
609 wait_on_buffer(bh);
610 if (buffer_eopnotsupp(bh)) {
611 clear_buffer_eopnotsupp(bh);
612 set_buffer_uptodate(bh);
613 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
614 lock_buffer(bh);
615skip_barrier:
616 get_bh(bh);
617 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
618 wait_on_buffer(bh);
619 }
620 if (!buffer_uptodate(bh))
606 gfs2_io_error_bh(sdp, bh); 621 gfs2_io_error_bh(sdp, bh);
607 brelse(bh); 622 brelse(bh);
608 623
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..df48333e6f01 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_meta,
45 Opt_err, 46 Opt_err,
46}; 47};
47 48
@@ -66,6 +67,7 @@ static match_table_t tokens = {
66 {Opt_nosuiddir, "nosuiddir"}, 67 {Opt_nosuiddir, "nosuiddir"},
67 {Opt_data_writeback, "data=writeback"}, 68 {Opt_data_writeback, "data=writeback"},
68 {Opt_data_ordered, "data=ordered"}, 69 {Opt_data_ordered, "data=ordered"},
70 {Opt_meta, "meta"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
70}; 72};
71 73
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
239 case Opt_data_ordered: 241 case Opt_data_ordered:
240 args->ar_data = GFS2_DATA_ORDERED; 242 args->ar_data = GFS2_DATA_ORDERED;
241 break; 243 break;
244 case Opt_meta:
245 if (remount && args->ar_meta != 1)
246 goto cant_remount;
247 args->ar_meta = 1;
248 break;
242 case Opt_err: 249 case Opt_err:
243 default: 250 default:
244 fs_info(sdp, "unknown option: %s\n", o); 251 fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
512 int error; 512 int error;
513 513
514 unlock_page(page); 514 unlock_page(page);
515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
516 error = gfs2_glock_nq_atime(&gh); 516 error = gfs2_glock_nq(&gh);
517 if (unlikely(error)) 517 if (unlikely(error))
518 goto out; 518 goto out;
519 error = AOP_TRUNCATED_PAGE; 519 error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
594 struct gfs2_holder gh; 594 struct gfs2_holder gh;
595 int ret; 595 int ret;
596 596
597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 ret = gfs2_glock_nq_atime(&gh); 598 ret = gfs2_glock_nq(&gh);
599 if (unlikely(ret)) 599 if (unlikely(ret))
600 goto out_uninit; 600 goto out_uninit;
601 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
636 unsigned to = from + len; 636 unsigned to = from + len;
637 struct page *page; 637 struct page *page;
638 638
639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); 639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
640 error = gfs2_glock_nq_atime(&ip->i_gh); 640 error = gfs2_glock_nq(&ip->i_gh);
641 if (unlikely(error)) 641 if (unlikely(error))
642 goto out_uninit; 642 goto out_uninit;
643 643
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
975 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
976 return 0; 976 return 0;
977 977
978 if (offset > i_size_read(&ip->i_inode)) 978 if (offset >= i_size_read(&ip->i_inode))
979 return 0; 979 return 0;
980 return 1; 980 return 1;
981} 981}
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1000 * unfortunately have the option of only flushing a range like 1000 * unfortunately have the option of only flushing a range like
1001 * the VFS does. 1001 * the VFS does.
1002 */ 1002 */
1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); 1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
1004 rv = gfs2_glock_nq_atime(&gh); 1004 rv = gfs2_glock_nq(&gh);
1005 if (rv) 1005 if (rv)
1006 return rv; 1006 return rv;
1007 rv = gfs2_ok_for_dio(ip, rw, offset); 1007 rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
89 u64 offset = file->f_pos; 89 u64 offset = file->f_pos;
90 int error; 90 int error;
91 91
92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); 92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
93 error = gfs2_glock_nq_atime(&d_gh); 93 error = gfs2_glock_nq(&d_gh);
94 if (error) { 94 if (error) {
95 gfs2_holder_uninit(&d_gh); 95 gfs2_holder_uninit(&d_gh);
96 return error; 96 return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
153 int error; 153 int error;
154 u32 fsflags; 154 u32 fsflags;
155 155
156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
157 error = gfs2_glock_nq_atime(&gh); 157 error = gfs2_glock_nq(&gh);
158 if (error) 158 if (error)
159 return error; 159 return error;
160 160
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
351 struct gfs2_alloc *al; 351 struct gfs2_alloc *al;
352 int ret; 352 int ret;
353 353
354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); 354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
355 ret = gfs2_glock_nq_atime(&gh); 355 ret = gfs2_glock_nq(&gh);
356 if (ret) 356 if (ret)
357 goto out; 357 goto out;
358 358
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
434 struct gfs2_holder i_gh; 434 struct gfs2_holder i_gh;
435 int error; 435 int error;
436 436
437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
438 error = gfs2_glock_nq_atime(&i_gh); 438 error = gfs2_glock_nq(&i_gh);
439 if (error) { 439 if (error) {
440 gfs2_holder_uninit(&i_gh); 440 gfs2_holder_uninit(&i_gh);
441 return error; 441 return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
40#define DO 0 40#define DO 0
41#define UNDO 1 41#define UNDO 1
42 42
43static const u32 gfs2_old_fs_formats[] = {
44 0
45};
46
47static const u32 gfs2_old_multihost_formats[] = {
48 0
49};
50
51/**
52 * gfs2_tune_init - Fill a gfs2_tune structure with default values
53 * @gt: tune
54 *
55 */
56
57static void gfs2_tune_init(struct gfs2_tune *gt)
58{
59 spin_lock_init(&gt->gt_spin);
60
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1;
70 gt->gt_quota_scale_den = 1;
71 gt->gt_quota_cache_secs = 300;
72 gt->gt_quota_quantum = 60;
73 gt->gt_new_files_jdata = 0;
74 gt->gt_max_readahead = 1 << 18;
75 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10;
77 gt->gt_statfs_quantum = 30;
78 gt->gt_statfs_slow = 0;
79}
80
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 81static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 82{
45 struct gfs2_sbd *sdp; 83 struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
96 return sdp; 134 return sdp;
97} 135}
98 136
99static void init_vfs(struct super_block *sb, unsigned noatime) 137
138/**
139 * gfs2_check_sb - Check superblock
140 * @sdp: the filesystem
141 * @sb: The superblock
142 * @silent: Don't print a message if the check fails
143 *
144 * Checks the version code of the FS is one that we understand how to
145 * read and that the sizes of the various on-disk structures have not
146 * changed.
147 */
148
149static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
100{ 150{
101 struct gfs2_sbd *sdp = sb->s_fs_info; 151 unsigned int x;
102 152
103 sb->s_magic = GFS2_MAGIC; 153 if (sb->sb_magic != GFS2_MAGIC ||
104 sb->s_op = &gfs2_super_ops; 154 sb->sb_type != GFS2_METATYPE_SB) {
105 sb->s_export_op = &gfs2_export_ops; 155 if (!silent)
106 sb->s_time_gran = 1; 156 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
107 sb->s_maxbytes = MAX_LFS_FILESIZE; 157 return -EINVAL;
158 }
159
160 /* If format numbers match exactly, we're done. */
161
162 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
163 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
164 return 0;
165
166 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
167 for (x = 0; gfs2_old_fs_formats[x]; x++)
168 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
169 break;
170
171 if (!gfs2_old_fs_formats[x]) {
172 printk(KERN_WARNING
173 "GFS2: code version (%u, %u) is incompatible "
174 "with ondisk format (%u, %u)\n",
175 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
176 sb->sb_fs_format, sb->sb_multihost_format);
177 printk(KERN_WARNING
178 "GFS2: I don't know how to upgrade this FS\n");
179 return -EINVAL;
180 }
181 }
182
183 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
184 for (x = 0; gfs2_old_multihost_formats[x]; x++)
185 if (gfs2_old_multihost_formats[x] ==
186 sb->sb_multihost_format)
187 break;
188
189 if (!gfs2_old_multihost_formats[x]) {
190 printk(KERN_WARNING
191 "GFS2: code version (%u, %u) is incompatible "
192 "with ondisk format (%u, %u)\n",
193 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
194 sb->sb_fs_format, sb->sb_multihost_format);
195 printk(KERN_WARNING
196 "GFS2: I don't know how to upgrade this FS\n");
197 return -EINVAL;
198 }
199 }
200
201 if (!sdp->sd_args.ar_upgrade) {
202 printk(KERN_WARNING
203 "GFS2: code version (%u, %u) is incompatible "
204 "with ondisk format (%u, %u)\n",
205 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
206 sb->sb_fs_format, sb->sb_multihost_format);
207 printk(KERN_INFO
208 "GFS2: Use the \"upgrade\" mount option to upgrade "
209 "the FS\n");
210 printk(KERN_INFO "GFS2: See the manual for more details\n");
211 return -EINVAL;
212 }
213
214 return 0;
215}
216
217static void end_bio_io_page(struct bio *bio, int error)
218{
219 struct page *page = bio->bi_private;
108 220
109 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) 221 if (!error)
110 set_bit(noatime, &sdp->sd_flags); 222 SetPageUptodate(page);
223 else
224 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
225 unlock_page(page);
226}
227
228static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
229{
230 const struct gfs2_sb *str = buf;
231
232 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
233 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
234 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
235 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
236 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
237 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
238 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
239 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
240 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
241 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
242 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
243
244 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
245 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
246}
247
248/**
249 * gfs2_read_super - Read the gfs2 super block from disk
250 * @sdp: The GFS2 super block
251 * @sector: The location of the super block
252 * @error: The error code to return
253 *
254 * This uses the bio functions to read the super block from disk
255 * because we want to be 100% sure that we never read cached data.
256 * A super block is read twice only during each GFS2 mount and is
257 * never written to by the filesystem. The first time its read no
258 * locks are held, and the only details which are looked at are those
259 * relating to the locking protocol. Once locking is up and working,
260 * the sb is read again under the lock to establish the location of
261 * the master directory (contains pointers to journals etc) and the
262 * root directory.
263 *
264 * Returns: 0 on success or error
265 */
266
267static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
268{
269 struct super_block *sb = sdp->sd_vfs;
270 struct gfs2_sb *p;
271 struct page *page;
272 struct bio *bio;
273
274 page = alloc_page(GFP_NOFS);
275 if (unlikely(!page))
276 return -ENOBUFS;
277
278 ClearPageUptodate(page);
279 ClearPageDirty(page);
280 lock_page(page);
281
282 bio = bio_alloc(GFP_NOFS, 1);
283 if (unlikely(!bio)) {
284 __free_page(page);
285 return -ENOBUFS;
286 }
111 287
112 /* Don't let the VFS update atimes. GFS2 handles this itself. */ 288 bio->bi_sector = sector * (sb->s_blocksize >> 9);
113 sb->s_flags |= MS_NOATIME | MS_NODIRATIME; 289 bio->bi_bdev = sb->s_bdev;
290 bio_add_page(bio, page, PAGE_SIZE, 0);
291
292 bio->bi_end_io = end_bio_io_page;
293 bio->bi_private = page;
294 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
295 wait_on_page_locked(page);
296 bio_put(bio);
297 if (!PageUptodate(page)) {
298 __free_page(page);
299 return -EIO;
300 }
301 p = kmap(page);
302 gfs2_sb_in(&sdp->sd_sb, p);
303 kunmap(page);
304 __free_page(page);
305 return 0;
306}
307/**
308 * gfs2_read_sb - Read super block
309 * @sdp: The GFS2 superblock
310 * @gl: the glock for the superblock (assumed to be held)
311 * @silent: Don't print message if mount fails
312 *
313 */
314
315static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
316{
317 u32 hash_blocks, ind_blocks, leaf_blocks;
318 u32 tmp_blocks;
319 unsigned int x;
320 int error;
321
322 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
323 if (error) {
324 if (!silent)
325 fs_err(sdp, "can't read superblock\n");
326 return error;
327 }
328
329 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
330 if (error)
331 return error;
332
333 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
334 GFS2_BASIC_BLOCK_SHIFT;
335 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
336 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
337 sizeof(struct gfs2_dinode)) / sizeof(u64);
338 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
339 sizeof(struct gfs2_meta_header)) / sizeof(u64);
340 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
341 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
342 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
343 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
344 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
345 sizeof(struct gfs2_meta_header)) /
346 sizeof(struct gfs2_quota_change);
347
348 /* Compute maximum reservation required to add a entry to a directory */
349
350 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
351 sdp->sd_jbsize);
352
353 ind_blocks = 0;
354 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
355 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
356 ind_blocks += tmp_blocks;
357 }
358
359 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
360
361 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
362
363 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
364 sizeof(struct gfs2_dinode);
365 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
366 for (x = 2;; x++) {
367 u64 space, d;
368 u32 m;
369
370 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
371 d = space;
372 m = do_div(d, sdp->sd_inptrs);
373
374 if (d != sdp->sd_heightsize[x - 1] || m)
375 break;
376 sdp->sd_heightsize[x] = space;
377 }
378 sdp->sd_max_height = x;
379 sdp->sd_heightsize[x] = ~0;
380 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
381
382 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
383 sizeof(struct gfs2_dinode);
384 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
385 for (x = 2;; x++) {
386 u64 space, d;
387 u32 m;
388
389 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
390 d = space;
391 m = do_div(d, sdp->sd_inptrs);
392
393 if (d != sdp->sd_jheightsize[x - 1] || m)
394 break;
395 sdp->sd_jheightsize[x] = space;
396 }
397 sdp->sd_max_jheight = x;
398 sdp->sd_jheightsize[x] = ~0;
399 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
400
401 return 0;
114} 402}
115 403
116static int init_names(struct gfs2_sbd *sdp, int silent) 404static int init_names(struct gfs2_sbd *sdp, int silent)
@@ -224,51 +512,59 @@ fail:
224 return error; 512 return error;
225} 513}
226 514
227static inline struct inode *gfs2_lookup_root(struct super_block *sb, 515static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
228 u64 no_addr) 516 u64 no_addr, const char *name)
229{ 517{
230 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 518 struct gfs2_sbd *sdp = sb->s_fs_info;
519 struct dentry *dentry;
520 struct inode *inode;
521
522 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
523 if (IS_ERR(inode)) {
524 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
525 return PTR_ERR(inode);
526 }
527 dentry = d_alloc_root(inode);
528 if (!dentry) {
529 fs_err(sdp, "can't alloc %s dentry\n", name);
530 iput(inode);
531 return -ENOMEM;
532 }
533 dentry->d_op = &gfs2_dops;
534 *dptr = dentry;
535 return 0;
231} 536}
232 537
233static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 538static int init_sb(struct gfs2_sbd *sdp, int silent)
234{ 539{
235 struct super_block *sb = sdp->sd_vfs; 540 struct super_block *sb = sdp->sd_vfs;
236 struct gfs2_holder sb_gh; 541 struct gfs2_holder sb_gh;
237 u64 no_addr; 542 u64 no_addr;
238 struct inode *inode; 543 int ret;
239 int error = 0;
240 544
241 if (undo) { 545 ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
242 if (sb->s_root) { 546 LM_ST_SHARED, 0, &sb_gh);
243 dput(sb->s_root); 547 if (ret) {
244 sb->s_root = NULL; 548 fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
245 } 549 return ret;
246 return 0;
247 } 550 }
248 551
249 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, 552 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
250 LM_ST_SHARED, 0, &sb_gh); 553 if (ret) {
251 if (error) { 554 fs_err(sdp, "can't read superblock: %d\n", ret);
252 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
253 return error;
254 }
255
256 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
257 if (error) {
258 fs_err(sdp, "can't read superblock: %d\n", error);
259 goto out; 555 goto out;
260 } 556 }
261 557
262 /* Set up the buffer cache and SB for real */ 558 /* Set up the buffer cache and SB for real */
263 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 559 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
264 error = -EINVAL; 560 ret = -EINVAL;
265 fs_err(sdp, "FS block size (%u) is too small for device " 561 fs_err(sdp, "FS block size (%u) is too small for device "
266 "block size (%u)\n", 562 "block size (%u)\n",
267 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 563 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
268 goto out; 564 goto out;
269 } 565 }
270 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 566 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
271 error = -EINVAL; 567 ret = -EINVAL;
272 fs_err(sdp, "FS block size (%u) is too big for machine " 568 fs_err(sdp, "FS block size (%u) is too big for machine "
273 "page size (%u)\n", 569 "page size (%u)\n",
274 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); 570 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
278 574
279 /* Get the root inode */ 575 /* Get the root inode */
280 no_addr = sdp->sd_sb.sb_root_dir.no_addr; 576 no_addr = sdp->sd_sb.sb_root_dir.no_addr;
281 if (sb->s_type == &gfs2meta_fs_type) 577 ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
282 no_addr = sdp->sd_sb.sb_master_dir.no_addr; 578 if (ret)
283 inode = gfs2_lookup_root(sb, no_addr);
284 if (IS_ERR(inode)) {
285 error = PTR_ERR(inode);
286 fs_err(sdp, "can't read in root inode: %d\n", error);
287 goto out; 579 goto out;
288 }
289 580
290 sb->s_root = d_alloc_root(inode); 581 /* Get the master inode */
291 if (!sb->s_root) { 582 no_addr = sdp->sd_sb.sb_master_dir.no_addr;
292 fs_err(sdp, "can't get root dentry\n"); 583 ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
293 error = -ENOMEM; 584 if (ret) {
294 iput(inode); 585 dput(sdp->sd_root_dir);
295 } else 586 goto out;
296 sb->s_root->d_op = &gfs2_dops; 587 }
297 588 sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
298out: 589out:
299 gfs2_glock_dq_uninit(&sb_gh); 590 gfs2_glock_dq_uninit(&sb_gh);
300 return error; 591 return ret;
301} 592}
302 593
303/** 594/**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
372 663
373static int init_journal(struct gfs2_sbd *sdp, int undo) 664static int init_journal(struct gfs2_sbd *sdp, int undo)
374{ 665{
666 struct inode *master = sdp->sd_master_dir->d_inode;
375 struct gfs2_holder ji_gh; 667 struct gfs2_holder ji_gh;
376 struct task_struct *p; 668 struct task_struct *p;
377 struct gfs2_inode *ip; 669 struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
383 goto fail_recoverd; 675 goto fail_recoverd;
384 } 676 }
385 677
386 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); 678 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
387 if (IS_ERR(sdp->sd_jindex)) { 679 if (IS_ERR(sdp->sd_jindex)) {
388 fs_err(sdp, "can't lookup journal index: %d\n", error); 680 fs_err(sdp, "can't lookup journal index: %d\n", error);
389 return PTR_ERR(sdp->sd_jindex); 681 return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
506{ 798{
507 int error = 0; 799 int error = 0;
508 struct gfs2_inode *ip; 800 struct gfs2_inode *ip;
509 struct inode *inode; 801 struct inode *master = sdp->sd_master_dir->d_inode;
510 802
511 if (undo) 803 if (undo)
512 goto fail_qinode; 804 goto fail_qinode;
513 805
514 inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
515 if (IS_ERR(inode)) {
516 error = PTR_ERR(inode);
517 fs_err(sdp, "can't read in master directory: %d\n", error);
518 goto fail;
519 }
520 sdp->sd_master_dir = inode;
521
522 error = init_journal(sdp, undo); 806 error = init_journal(sdp, undo);
523 if (error) 807 if (error)
524 goto fail_master; 808 goto fail;
525 809
526 /* Read in the master inode number inode */ 810 /* Read in the master inode number inode */
527 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); 811 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
528 if (IS_ERR(sdp->sd_inum_inode)) { 812 if (IS_ERR(sdp->sd_inum_inode)) {
529 error = PTR_ERR(sdp->sd_inum_inode); 813 error = PTR_ERR(sdp->sd_inum_inode);
530 fs_err(sdp, "can't read in inum inode: %d\n", error); 814 fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
533 817
534 818
535 /* Read in the master statfs inode */ 819 /* Read in the master statfs inode */
536 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); 820 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
537 if (IS_ERR(sdp->sd_statfs_inode)) { 821 if (IS_ERR(sdp->sd_statfs_inode)) {
538 error = PTR_ERR(sdp->sd_statfs_inode); 822 error = PTR_ERR(sdp->sd_statfs_inode);
539 fs_err(sdp, "can't read in statfs inode: %d\n", error); 823 fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
541 } 825 }
542 826
543 /* Read in the resource index inode */ 827 /* Read in the resource index inode */
544 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); 828 sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
545 if (IS_ERR(sdp->sd_rindex)) { 829 if (IS_ERR(sdp->sd_rindex)) {
546 error = PTR_ERR(sdp->sd_rindex); 830 error = PTR_ERR(sdp->sd_rindex);
547 fs_err(sdp, "can't get resource index inode: %d\n", error); 831 fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
552 sdp->sd_rindex_uptodate = 0; 836 sdp->sd_rindex_uptodate = 0;
553 837
554 /* Read in the quota inode */ 838 /* Read in the quota inode */
555 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); 839 sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
556 if (IS_ERR(sdp->sd_quota_inode)) { 840 if (IS_ERR(sdp->sd_quota_inode)) {
557 error = PTR_ERR(sdp->sd_quota_inode); 841 error = PTR_ERR(sdp->sd_quota_inode);
558 fs_err(sdp, "can't get quota file inode: %d\n", error); 842 fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
571 iput(sdp->sd_inum_inode); 855 iput(sdp->sd_inum_inode);
572fail_journal: 856fail_journal:
573 init_journal(sdp, UNDO); 857 init_journal(sdp, UNDO);
574fail_master:
575 iput(sdp->sd_master_dir);
576fail: 858fail:
577 return error; 859 return error;
578} 860}
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
583 char buf[30]; 865 char buf[30];
584 int error = 0; 866 int error = 0;
585 struct gfs2_inode *ip; 867 struct gfs2_inode *ip;
868 struct inode *master = sdp->sd_master_dir->d_inode;
586 869
587 if (sdp->sd_args.ar_spectator) 870 if (sdp->sd_args.ar_spectator)
588 return 0; 871 return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
590 if (undo) 873 if (undo)
591 goto fail_qc_gh; 874 goto fail_qc_gh;
592 875
593 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); 876 pn = gfs2_lookup_simple(master, "per_node");
594 if (IS_ERR(pn)) { 877 if (IS_ERR(pn)) {
595 error = PTR_ERR(pn); 878 error = PTR_ERR(pn);
596 fs_err(sdp, "can't find per_node directory: %d\n", error); 879 fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
800 goto fail; 1083 goto fail;
801 } 1084 }
802 1085
803 init_vfs(sb, SDF_NOATIME); 1086 sb->s_magic = GFS2_MAGIC;
1087 sb->s_op = &gfs2_super_ops;
1088 sb->s_export_op = &gfs2_export_ops;
1089 sb->s_time_gran = 1;
1090 sb->s_maxbytes = MAX_LFS_FILESIZE;
804 1091
805 /* Set up the buffer cache and fill in some fake block size values 1092 /* Set up the buffer cache and fill in some fake block size values
806 to allow us to read-in the on-disk superblock. */ 1093 to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
828 if (error) 1115 if (error)
829 goto fail_lm; 1116 goto fail_lm;
830 1117
831 error = init_sb(sdp, silent, DO); 1118 error = init_sb(sdp, silent);
832 if (error) 1119 if (error)
833 goto fail_locking; 1120 goto fail_locking;
834 1121
@@ -869,7 +1156,11 @@ fail_per_node:
869fail_inodes: 1156fail_inodes:
870 init_inodes(sdp, UNDO); 1157 init_inodes(sdp, UNDO);
871fail_sb: 1158fail_sb:
872 init_sb(sdp, 0, UNDO); 1159 if (sdp->sd_root_dir)
1160 dput(sdp->sd_root_dir);
1161 if (sdp->sd_master_dir)
1162 dput(sdp->sd_master_dir);
1163 sb->s_root = NULL;
873fail_locking: 1164fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 1165 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 1166fail_lm:
@@ -887,151 +1178,63 @@ fail:
887} 1178}
888 1179
889static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1180static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
890 const char *dev_name, void *data, struct vfsmount *mnt) 1181 const char *dev_name, void *data, struct vfsmount *mnt)
891{ 1182{
892 struct super_block *sb; 1183 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
893 struct gfs2_sbd *sdp;
894 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
895 if (error)
896 goto out;
897 sb = mnt->mnt_sb;
898 sdp = sb->s_fs_info;
899 sdp->sd_gfs2mnt = mnt;
900out:
901 return error;
902} 1184}
903 1185
904static int fill_super_meta(struct super_block *sb, struct super_block *new, 1186static struct super_block *get_gfs2_sb(const char *dev_name)
905 void *data, int silent)
906{ 1187{
907 struct gfs2_sbd *sdp = sb->s_fs_info; 1188 struct super_block *sb;
908 struct inode *inode;
909 int error = 0;
910
911 new->s_fs_info = sdp;
912 sdp->sd_vfs_meta = sb;
913
914 init_vfs(new, SDF_NOATIME);
915
916 /* Get the master inode */
917 inode = igrab(sdp->sd_master_dir);
918
919 new->s_root = d_alloc_root(inode);
920 if (!new->s_root) {
921 fs_err(sdp, "can't get root dentry\n");
922 error = -ENOMEM;
923 iput(inode);
924 } else
925 new->s_root->d_op = &gfs2_dops;
926
927 return error;
928}
929
930static int set_bdev_super(struct super_block *s, void *data)
931{
932 s->s_bdev = data;
933 s->s_dev = s->s_bdev->bd_dev;
934 return 0;
935}
936
937static int test_bdev_super(struct super_block *s, void *data)
938{
939 return s->s_bdev == data;
940}
941
942static struct super_block* get_gfs2_sb(const char *dev_name)
943{
944 struct kstat stat;
945 struct nameidata nd; 1189 struct nameidata nd;
946 struct super_block *sb = NULL, *s;
947 int error; 1190 int error;
948 1191
949 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 1192 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
950 if (error) { 1193 if (error) {
951 printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", 1194 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
952 dev_name); 1195 dev_name, error);
953 goto out; 1196 return NULL;
954 }
955 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
956
957 list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
958 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
959 (S_ISDIR(stat.mode) &&
960 s == nd.path.dentry->d_inode->i_sb)) {
961 sb = s;
962 goto free_nd;
963 }
964 } 1197 }
965 1198 sb = nd.path.dentry->d_inode->i_sb;
966 printk(KERN_WARNING "GFS2: Unrecognized block device or " 1199 if (sb && (sb->s_type == &gfs2_fs_type))
967 "mount point %s\n", dev_name); 1200 atomic_inc(&sb->s_active);
968 1201 else
969free_nd: 1202 sb = NULL;
970 path_put(&nd.path); 1203 path_put(&nd.path);
971out:
972 return sb; 1204 return sb;
973} 1205}
974 1206
975static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1207static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
976 const char *dev_name, void *data, struct vfsmount *mnt) 1208 const char *dev_name, void *data, struct vfsmount *mnt)
977{ 1209{
978 int error = 0; 1210 struct super_block *sb = NULL;
979 struct super_block *sb = NULL, *new;
980 struct gfs2_sbd *sdp; 1211 struct gfs2_sbd *sdp;
981 1212
982 sb = get_gfs2_sb(dev_name); 1213 sb = get_gfs2_sb(dev_name);
983 if (!sb) { 1214 if (!sb) {
984 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1215 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
985 error = -ENOENT; 1216 return -ENOENT;
986 goto error;
987 } 1217 }
988 sdp = sb->s_fs_info; 1218 sdp = sb->s_fs_info;
989 if (sdp->sd_vfs_meta) { 1219 mnt->mnt_sb = sb;
990 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 1220 mnt->mnt_root = dget(sdp->sd_master_dir);
991 error = -EBUSY; 1221 return 0;
992 goto error;
993 }
994 down(&sb->s_bdev->bd_mount_sem);
995 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
996 up(&sb->s_bdev->bd_mount_sem);
997 if (IS_ERR(new)) {
998 error = PTR_ERR(new);
999 goto error;
1000 }
1001 new->s_flags = flags;
1002 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
1003 sb_set_blocksize(new, sb->s_blocksize);
1004 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
1005 if (error) {
1006 up_write(&new->s_umount);
1007 deactivate_super(new);
1008 goto error;
1009 }
1010
1011 new->s_flags |= MS_ACTIVE;
1012
1013 /* Grab a reference to the gfs2 mount point */
1014 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
1015 return simple_set_mnt(mnt, new);
1016error:
1017 return error;
1018} 1222}
1019 1223
1020static void gfs2_kill_sb(struct super_block *sb) 1224static void gfs2_kill_sb(struct super_block *sb)
1021{ 1225{
1022 if (sb->s_fs_info) { 1226 struct gfs2_sbd *sdp = sb->s_fs_info;
1023 gfs2_delete_debugfs_file(sb->s_fs_info); 1227 if (sdp) {
1024 gfs2_meta_syncfs(sb->s_fs_info); 1228 gfs2_meta_syncfs(sdp);
1229 dput(sdp->sd_root_dir);
1230 dput(sdp->sd_master_dir);
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1025 } 1233 }
1234 shrink_dcache_sb(sb);
1026 kill_block_super(sb); 1235 kill_block_super(sb);
1027} 1236 if (sdp)
1028 1237 gfs2_delete_debugfs_file(sdp);
1029static void gfs2_kill_sb_meta(struct super_block *sb)
1030{
1031 struct gfs2_sbd *sdp = sb->s_fs_info;
1032 generic_shutdown_super(sb);
1033 sdp->sd_vfs_meta = NULL;
1034 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
1035} 1238}
1036 1239
1037struct file_system_type gfs2_fs_type = { 1240struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
1046 .name = "gfs2meta", 1249 .name = "gfs2meta",
1047 .fs_flags = FS_REQUIRES_DEV, 1250 .fs_flags = FS_REQUIRES_DEV,
1048 .get_sb = gfs2_get_sb_meta, 1251 .get_sb = gfs2_get_sb_meta,
1049 .kill_sb = gfs2_kill_sb_meta,
1050 .owner = THIS_MODULE, 1252 .owner = THIS_MODULE,
1051}; 1253};
1052 1254
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a778..534e1e2c65ca 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
161 161
162 error = gfs2_glock_nq_m(2, ghs); 162 error = gfs2_glock_nq(ghs); /* parent */
163 if (error) 163 if (error)
164 goto out; 164 goto out_parent;
165
166 error = gfs2_glock_nq(ghs + 1); /* child */
167 if (error)
168 goto out_child;
165 169
166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 170 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 171 if (error)
@@ -245,8 +249,10 @@ out_alloc:
245 if (alloc_required) 249 if (alloc_required)
246 gfs2_alloc_put(dip); 250 gfs2_alloc_put(dip);
247out_gunlock: 251out_gunlock:
248 gfs2_glock_dq_m(2, ghs); 252 gfs2_glock_dq(ghs + 1);
249out: 253out_child:
254 gfs2_glock_dq(ghs);
255out_parent:
250 gfs2_holder_uninit(ghs); 256 gfs2_holder_uninit(ghs);
251 gfs2_holder_uninit(ghs + 1); 257 gfs2_holder_uninit(ghs + 1);
252 if (!error) { 258 if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
302 308
303 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 309 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
304 if (error) 310 if (error)
305 goto out_rgrp; 311 goto out_gunlock;
306 312
307 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 313 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
308 if (error) 314 if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
316 322
317out_end_trans: 323out_end_trans:
318 gfs2_trans_end(sdp); 324 gfs2_trans_end(sdp);
325out_gunlock:
319 gfs2_glock_dq(ghs + 2); 326 gfs2_glock_dq(ghs + 2);
320out_rgrp: 327out_rgrp:
321 gfs2_holder_uninit(ghs + 2); 328 gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
485 struct gfs2_holder ri_gh; 492 struct gfs2_holder ri_gh;
486 int error; 493 int error;
487 494
488
489 error = gfs2_rindex_hold(sdp, &ri_gh); 495 error = gfs2_rindex_hold(sdp, &ri_gh);
490 if (error) 496 if (error)
491 return error; 497 return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
495 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 501 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
496 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 502 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
497 503
498 error = gfs2_glock_nq_m(3, ghs); 504 error = gfs2_glock_nq(ghs); /* parent */
499 if (error) 505 if (error)
500 goto out; 506 goto out_parent;
507
508 error = gfs2_glock_nq(ghs + 1); /* child */
509 if (error)
510 goto out_child;
511
512 error = gfs2_glock_nq(ghs + 2); /* rgrp */
513 if (error)
514 goto out_rgrp;
501 515
502 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 516 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
503 if (error) 517 if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
523 gfs2_trans_end(sdp); 537 gfs2_trans_end(sdp);
524 538
525out_gunlock: 539out_gunlock:
526 gfs2_glock_dq_m(3, ghs); 540 gfs2_glock_dq(ghs + 2);
527out: 541out_rgrp:
528 gfs2_holder_uninit(ghs);
529 gfs2_holder_uninit(ghs + 1);
530 gfs2_holder_uninit(ghs + 2); 542 gfs2_holder_uninit(ghs + 2);
543 gfs2_glock_dq(ghs + 1);
544out_child:
545 gfs2_holder_uninit(ghs + 1);
546 gfs2_glock_dq(ghs);
547out_parent:
548 gfs2_holder_uninit(ghs);
531 gfs2_glock_dq_uninit(&ri_gh); 549 gfs2_glock_dq_uninit(&ri_gh);
532 return error; 550 return error;
533} 551}
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
571 return 0; 589 return 0;
572} 590}
573 591
592/*
593 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
594 * @this: move this
595 * @to: to here
596 *
597 * Follow @to back to the root and make sure we don't encounter @this
598 * Assumes we already hold the rename lock.
599 *
600 * Returns: errno
601 */
602
603static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
604{
605 struct inode *dir = &to->i_inode;
606 struct super_block *sb = dir->i_sb;
607 struct inode *tmp;
608 struct qstr dotdot;
609 int error = 0;
610
611 gfs2_str2qstr(&dotdot, "..");
612
613 igrab(dir);
614
615 for (;;) {
616 if (dir == &this->i_inode) {
617 error = -EINVAL;
618 break;
619 }
620 if (dir == sb->s_root->d_inode) {
621 error = 0;
622 break;
623 }
624
625 tmp = gfs2_lookupi(dir, &dotdot, 1);
626 if (IS_ERR(tmp)) {
627 error = PTR_ERR(tmp);
628 break;
629 }
630
631 iput(dir);
632 dir = tmp;
633 }
634
635 iput(dir);
636
637 return error;
638}
639
574/** 640/**
575 * gfs2_rename - Rename a file 641 * gfs2_rename - Rename a file
576 * @odir: Parent directory of old file name 642 * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
589 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 655 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
590 struct gfs2_inode *nip = NULL; 656 struct gfs2_inode *nip = NULL;
591 struct gfs2_sbd *sdp = GFS2_SB(odir); 657 struct gfs2_sbd *sdp = GFS2_SB(odir);
592 struct gfs2_holder ghs[5], r_gh; 658 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
593 struct gfs2_rgrpd *nrgd; 659 struct gfs2_rgrpd *nrgd;
594 unsigned int num_gh; 660 unsigned int num_gh;
595 int dir_rename = 0; 661 int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
603 return 0; 669 return 0;
604 } 670 }
605 671
606 /* Make sure we aren't trying to move a dirctory into it's subdir */
607
608 if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
609 dir_rename = 1;
610 672
611 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, 673 if (odip != ndip) {
612 &r_gh); 674 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
675 0, &r_gh);
613 if (error) 676 if (error)
614 goto out; 677 goto out;
615 678
616 error = gfs2_ok_to_move(ip, ndip); 679 if (S_ISDIR(ip->i_inode.i_mode)) {
617 if (error) 680 dir_rename = 1;
618 goto out_gunlock_r; 681 /* don't move a dirctory into it's subdir */
682 error = gfs2_ok_to_move(ip, ndip);
683 if (error)
684 goto out_gunlock_r;
685 }
619 } 686 }
620 687
621 num_gh = 1; 688 num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
639 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 706 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
640 } 707 }
641 708
642 error = gfs2_glock_nq_m(num_gh, ghs); 709 for (x = 0; x < num_gh; x++) {
643 if (error) 710 error = gfs2_glock_nq(ghs + x);
644 goto out_uninit; 711 if (error)
712 goto out_gunlock;
713 }
645 714
646 /* Check out the old directory */ 715 /* Check out the old directory */
647 716
@@ -804,12 +873,12 @@ out_alloc:
804 if (alloc_required) 873 if (alloc_required)
805 gfs2_alloc_put(ndip); 874 gfs2_alloc_put(ndip);
806out_gunlock: 875out_gunlock:
807 gfs2_glock_dq_m(num_gh, ghs); 876 while (x--) {
808out_uninit: 877 gfs2_glock_dq(ghs + x);
809 for (x = 0; x < num_gh; x++)
810 gfs2_holder_uninit(ghs + x); 878 gfs2_holder_uninit(ghs + x);
879 }
811out_gunlock_r: 880out_gunlock_r:
812 if (dir_rename) 881 if (r_gh.gh_gl)
813 gfs2_glock_dq_uninit(&r_gh); 882 gfs2_glock_dq_uninit(&r_gh);
814out: 883out:
815 return error; 884 return error;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/time.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
25#include "incore.h" 26#include "incore.h"
@@ -38,6 +39,7 @@
38#include "dir.h" 39#include "dir.h"
39#include "eattr.h" 40#include "eattr.h"
40#include "bmap.h" 41#include "bmap.h"
42#include "meta_io.h"
41 43
42/** 44/**
43 * gfs2_write_inode - Make sure the inode is stable on the disk 45 * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
50static int gfs2_write_inode(struct inode *inode, int sync) 52static int gfs2_write_inode(struct inode *inode, int sync)
51{ 53{
52 struct gfs2_inode *ip = GFS2_I(inode); 54 struct gfs2_inode *ip = GFS2_I(inode);
53 55 struct gfs2_sbd *sdp = GFS2_SB(inode);
54 /* Check this is a "normal" inode */ 56 struct gfs2_holder gh;
55 if (test_bit(GIF_USER, &ip->i_flags)) { 57 struct buffer_head *bh;
56 if (current->flags & PF_MEMALLOC) 58 struct timespec atime;
57 return 0; 59 struct gfs2_dinode *di;
58 if (sync) 60 int ret = 0;
59 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 61
62 /* Check this is a "normal" inode, etc */
63 if (!test_bit(GIF_USER, &ip->i_flags) ||
64 (current->flags & PF_MEMALLOC))
65 return 0;
66 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
67 if (ret)
68 goto do_flush;
69 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
70 if (ret)
71 goto do_unlock;
72 ret = gfs2_meta_inode_buffer(ip, &bh);
73 if (ret == 0) {
74 di = (struct gfs2_dinode *)bh->b_data;
75 atime.tv_sec = be64_to_cpu(di->di_atime);
76 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
77 if (timespec_compare(&inode->i_atime, &atime) > 0) {
78 gfs2_trans_add_bh(ip->i_gl, bh, 1);
79 gfs2_dinode_out(ip, bh->b_data);
80 }
81 brelse(bh);
60 } 82 }
83 gfs2_trans_end(sdp);
84do_unlock:
85 gfs2_glock_dq_uninit(&gh);
86do_flush:
87 if (sync != 0)
88 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
89 return ret;
90}
61 91
62 return 0; 92/**
93 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
94 * @sdp: the filesystem
95 *
96 * Returns: errno
97 */
98
99static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
100{
101 struct gfs2_holder t_gh;
102 int error;
103
104 gfs2_quota_sync(sdp);
105 gfs2_statfs_sync(sdp);
106
107 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
108 &t_gh);
109 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
110 return error;
111
112 gfs2_meta_syncfs(sdp);
113 gfs2_log_shutdown(sdp);
114
115 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
116
117 if (t_gh.gh_gl)
118 gfs2_glock_dq_uninit(&t_gh);
119
120 gfs2_quota_cleanup(sdp);
121
122 return error;
63} 123}
64 124
65/** 125/**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
73 struct gfs2_sbd *sdp = sb->s_fs_info; 133 struct gfs2_sbd *sdp = sb->s_fs_info;
74 int error; 134 int error;
75 135
76 if (!sdp)
77 return;
78
79 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
80 return; /* Nothing to do */
81
82 /* Unfreeze the filesystem, if we need to */ 136 /* Unfreeze the filesystem, if we need to */
83 137
84 mutex_lock(&sdp->sd_freeze_lock); 138 mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
101 155
102 /* Release stuff */ 156 /* Release stuff */
103 157
104 iput(sdp->sd_master_dir);
105 iput(sdp->sd_jindex); 158 iput(sdp->sd_jindex);
106 iput(sdp->sd_inum_inode); 159 iput(sdp->sd_inum_inode);
107 iput(sdp->sd_statfs_inode); 160 iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
152 * 205 *
153 * Flushes the log to disk. 206 * Flushes the log to disk.
154 */ 207 */
208
155static int gfs2_sync_fs(struct super_block *sb, int wait) 209static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 210{
157 sb->s_dirt = 0; 211 sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
270 } 324 }
271 } 325 }
272 326
273 if (*flags & (MS_NOATIME | MS_NODIRATIME))
274 set_bit(SDF_NOATIME, &sdp->sd_flags);
275 else
276 clear_bit(SDF_NOATIME, &sdp->sd_flags);
277
278 /* Don't let the VFS update atimes. GFS2 handles this itself. */
279 *flags |= MS_NOATIME | MS_NODIRATIME;
280
281 return error; 327 return error;
282} 328}
283 329
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
295 * inode's blocks, or alternatively pass the baton on to another 341 * inode's blocks, or alternatively pass the baton on to another
296 * node for later deallocation. 342 * node for later deallocation.
297 */ 343 */
344
298static void gfs2_drop_inode(struct inode *inode) 345static void gfs2_drop_inode(struct inode *inode)
299{ 346{
300 struct gfs2_inode *ip = GFS2_I(inode); 347 struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
333 } 380 }
334} 381}
335 382
383static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
384{
385 do {
386 if (d1 == d2)
387 return 1;
388 d1 = d1->d_parent;
389 } while (!IS_ROOT(d1));
390 return 0;
391}
392
336/** 393/**
337 * gfs2_show_options - Show mount options for /proc/mounts 394 * gfs2_show_options - Show mount options for /proc/mounts
338 * @s: seq_file structure 395 * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
346 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 403 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
347 struct gfs2_args *args = &sdp->sd_args; 404 struct gfs2_args *args = &sdp->sd_args;
348 405
406 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
407 seq_printf(s, ",meta");
349 if (args->ar_lockproto[0]) 408 if (args->ar_lockproto[0])
350 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 409 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
351 if (args->ar_locktable[0]) 410 if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
414 * conversion on the iopen lock, but we can change that later. This 473 * conversion on the iopen lock, but we can change that later. This
415 * is safe, just less efficient. 474 * is safe, just less efficient.
416 */ 475 */
476
417static void gfs2_delete_inode(struct inode *inode) 477static void gfs2_delete_inode(struct inode *inode)
418{ 478{
419 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 479 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
478 clear_inode(inode); 538 clear_inode(inode);
479} 539}
480 540
481
482
483static struct inode *gfs2_alloc_inode(struct super_block *sb) 541static struct inode *gfs2_alloc_inode(struct super_block *sb)
484{ 542{
485 struct gfs2_inode *ip; 543 struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc2..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
33#include "trans.h" 33#include "trans.h"
34#include "util.h" 34#include "util.h"
35 35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_demote_secs = 300;
55 gt->gt_incore_log_blocks = 1024;
56 gt->gt_log_flush_secs = 60;
57 gt->gt_recoverd_secs = 60;
58 gt->gt_logd_secs = 1;
59 gt->gt_quotad_secs = 5;
60 gt->gt_quota_simul_sync = 64;
61 gt->gt_quota_warn_period = 10;
62 gt->gt_quota_scale_num = 1;
63 gt->gt_quota_scale_den = 1;
64 gt->gt_quota_cache_secs = 300;
65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_stall_secs = 600;
70 gt->gt_complain_secs = 10;
71 gt->gt_statfs_quantum = 30;
72 gt->gt_statfs_slow = 0;
73}
74
75/**
76 * gfs2_check_sb - Check superblock
77 * @sdp: the filesystem
78 * @sb: The superblock
79 * @silent: Don't print a message if the check fails
80 *
81 * Checks the version code of the FS is one that we understand how to
82 * read and that the sizes of the various on-disk structures have not
83 * changed.
84 */
85
86int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
87{
88 unsigned int x;
89
90 if (sb->sb_magic != GFS2_MAGIC ||
91 sb->sb_type != GFS2_METATYPE_SB) {
92 if (!silent)
93 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
94 return -EINVAL;
95 }
96
97 /* If format numbers match exactly, we're done. */
98
99 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
100 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
101 return 0;
102
103 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
104 for (x = 0; gfs2_old_fs_formats[x]; x++)
105 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
106 break;
107
108 if (!gfs2_old_fs_formats[x]) {
109 printk(KERN_WARNING
110 "GFS2: code version (%u, %u) is incompatible "
111 "with ondisk format (%u, %u)\n",
112 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
113 sb->sb_fs_format, sb->sb_multihost_format);
114 printk(KERN_WARNING
115 "GFS2: I don't know how to upgrade this FS\n");
116 return -EINVAL;
117 }
118 }
119
120 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
121 for (x = 0; gfs2_old_multihost_formats[x]; x++)
122 if (gfs2_old_multihost_formats[x] ==
123 sb->sb_multihost_format)
124 break;
125
126 if (!gfs2_old_multihost_formats[x]) {
127 printk(KERN_WARNING
128 "GFS2: code version (%u, %u) is incompatible "
129 "with ondisk format (%u, %u)\n",
130 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
131 sb->sb_fs_format, sb->sb_multihost_format);
132 printk(KERN_WARNING
133 "GFS2: I don't know how to upgrade this FS\n");
134 return -EINVAL;
135 }
136 }
137
138 if (!sdp->sd_args.ar_upgrade) {
139 printk(KERN_WARNING
140 "GFS2: code version (%u, %u) is incompatible "
141 "with ondisk format (%u, %u)\n",
142 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
143 sb->sb_fs_format, sb->sb_multihost_format);
144 printk(KERN_INFO
145 "GFS2: Use the \"upgrade\" mount option to upgrade "
146 "the FS\n");
147 printk(KERN_INFO "GFS2: See the manual for more details\n");
148 return -EINVAL;
149 }
150
151 return 0;
152}
153
154
155static void end_bio_io_page(struct bio *bio, int error)
156{
157 struct page *page = bio->bi_private;
158
159 if (!error)
160 SetPageUptodate(page);
161 else
162 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
163 unlock_page(page);
164}
165
166static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
167{
168 const struct gfs2_sb *str = buf;
169
170 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
171 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
172 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
173 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
174 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
175 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
176 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
177 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
178 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
179 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
180 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
181
182 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
183 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
184}
185
186/**
187 * gfs2_read_super - Read the gfs2 super block from disk
188 * @sdp: The GFS2 super block
189 * @sector: The location of the super block
190 * @error: The error code to return
191 *
192 * This uses the bio functions to read the super block from disk
193 * because we want to be 100% sure that we never read cached data.
194 * A super block is read twice only during each GFS2 mount and is
195 * never written to by the filesystem. The first time its read no
196 * locks are held, and the only details which are looked at are those
197 * relating to the locking protocol. Once locking is up and working,
198 * the sb is read again under the lock to establish the location of
199 * the master directory (contains pointers to journals etc) and the
200 * root directory.
201 *
202 * Returns: 0 on success or error
203 */
204
205int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
206{
207 struct super_block *sb = sdp->sd_vfs;
208 struct gfs2_sb *p;
209 struct page *page;
210 struct bio *bio;
211
212 page = alloc_page(GFP_NOFS);
213 if (unlikely(!page))
214 return -ENOBUFS;
215
216 ClearPageUptodate(page);
217 ClearPageDirty(page);
218 lock_page(page);
219
220 bio = bio_alloc(GFP_NOFS, 1);
221 if (unlikely(!bio)) {
222 __free_page(page);
223 return -ENOBUFS;
224 }
225
226 bio->bi_sector = sector * (sb->s_blocksize >> 9);
227 bio->bi_bdev = sb->s_bdev;
228 bio_add_page(bio, page, PAGE_SIZE, 0);
229
230 bio->bi_end_io = end_bio_io_page;
231 bio->bi_private = page;
232 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
233 wait_on_page_locked(page);
234 bio_put(bio);
235 if (!PageUptodate(page)) {
236 __free_page(page);
237 return -EIO;
238 }
239 p = kmap(page);
240 gfs2_sb_in(&sdp->sd_sb, p);
241 kunmap(page);
242 __free_page(page);
243 return 0;
244}
245
246/**
247 * gfs2_read_sb - Read super block
248 * @sdp: The GFS2 superblock
249 * @gl: the glock for the superblock (assumed to be held)
250 * @silent: Don't print message if mount fails
251 *
252 */
253
254int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
255{
256 u32 hash_blocks, ind_blocks, leaf_blocks;
257 u32 tmp_blocks;
258 unsigned int x;
259 int error;
260
261 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
262 if (error) {
263 if (!silent)
264 fs_err(sdp, "can't read superblock\n");
265 return error;
266 }
267
268 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
269 if (error)
270 return error;
271
272 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
273 GFS2_BASIC_BLOCK_SHIFT;
274 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
275 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
276 sizeof(struct gfs2_dinode)) / sizeof(u64);
277 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
278 sizeof(struct gfs2_meta_header)) / sizeof(u64);
279 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
280 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
281 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
282 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
283 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
284 sizeof(struct gfs2_meta_header)) /
285 sizeof(struct gfs2_quota_change);
286
287 /* Compute maximum reservation required to add a entry to a directory */
288
289 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
290 sdp->sd_jbsize);
291
292 ind_blocks = 0;
293 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
294 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
295 ind_blocks += tmp_blocks;
296 }
297
298 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
299
300 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
301
302 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
303 sizeof(struct gfs2_dinode);
304 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
305 for (x = 2;; x++) {
306 u64 space, d;
307 u32 m;
308
309 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
310 d = space;
311 m = do_div(d, sdp->sd_inptrs);
312
313 if (d != sdp->sd_heightsize[x - 1] || m)
314 break;
315 sdp->sd_heightsize[x] = space;
316 }
317 sdp->sd_max_height = x;
318 sdp->sd_heightsize[x] = ~0;
319 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
320
321 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
322 sizeof(struct gfs2_dinode);
323 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
324 for (x = 2;; x++) {
325 u64 space, d;
326 u32 m;
327
328 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
329 d = space;
330 m = do_div(d, sdp->sd_inptrs);
331
332 if (d != sdp->sd_jheightsize[x - 1] || m)
333 break;
334 sdp->sd_jheightsize[x] = space;
335 }
336 sdp->sd_max_jheight = x;
337 sdp->sd_jheightsize[x] = ~0;
338 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
339
340 return 0;
341}
342
343/** 36/**
344 * gfs2_jindex_hold - Grab a lock on the jindex 37 * gfs2_jindex_hold - Grab a lock on the jindex
345 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
581 return error; 274 return error;
582} 275}
583 276
584/**
585 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
586 * @sdp: the filesystem
587 *
588 * Returns: errno
589 */
590
591int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
592{
593 struct gfs2_holder t_gh;
594 int error;
595
596 gfs2_quota_sync(sdp);
597 gfs2_statfs_sync(sdp);
598
599 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
600 &t_gh);
601 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
602 return error;
603
604 gfs2_meta_syncfs(sdp);
605 gfs2_log_shutdown(sdp);
606
607 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
608
609 if (t_gh.gh_gl)
610 gfs2_glock_dq_uninit(&t_gh);
611
612 gfs2_quota_cleanup(sdp);
613
614 return error;
615}
616
617static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 277static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
618{ 278{
619 const struct gfs2_statfs_change *str = buf; 279 const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20void gfs2_lm_unmount(struct gfs2_sbd *sdp); 15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
21 16
22static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 17static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
40 struct gfs2_inode **ipp); 35 struct gfs2_inode **ipp);
41 36
42int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 37int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
43int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
44 38
45int gfs2_statfs_init(struct gfs2_sbd *sdp); 39int gfs2_statfs_init(struct gfs2_sbd *sdp);
46void gfs2_statfs_change(struct gfs2_sbd *sdp, 40void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 269ARGS_ATTR(suiddir, "%d\n");
270ARGS_ATTR(data, "%d\n"); 270ARGS_ATTR(data, "%d\n");
271 271
272/* one oddball doesn't fit the macro mold */
273static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
274{
275 return snprintf(buf, PAGE_SIZE, "%d\n",
276 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
277}
278static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
279
280static struct attribute *args_attrs[] = { 272static struct attribute *args_attrs[] = {
281 &args_attr_lockproto.attr, 273 &args_attr_lockproto.attr,
282 &args_attr_locktable.attr, 274 &args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
292 &args_attr_quota.attr, 284 &args_attr_quota.attr,
293 &args_attr_suiddir.attr, 285 &args_attr_suiddir.attr,
294 &args_attr_data.attr, 286 &args_attr_data.attr,
295 &args_attr_noatime.attr,
296 NULL, 287 NULL,
297}; 288};
298 289
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
407TUNE_ATTR(log_flush_secs, 0); 398TUNE_ATTR(log_flush_secs, 0);
408TUNE_ATTR(quota_warn_period, 0); 399TUNE_ATTR(quota_warn_period, 0);
409TUNE_ATTR(quota_quantum, 0); 400TUNE_ATTR(quota_quantum, 0);
410TUNE_ATTR(atime_quantum, 0);
411TUNE_ATTR(max_readahead, 0); 401TUNE_ATTR(max_readahead, 0);
412TUNE_ATTR(complain_secs, 0); 402TUNE_ATTR(complain_secs, 0);
413TUNE_ATTR(statfs_slow, 0); 403TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
427 &tune_attr_log_flush_secs.attr, 417 &tune_attr_log_flush_secs.attr,
428 &tune_attr_quota_warn_period.attr, 418 &tune_attr_quota_warn_period.attr,
429 &tune_attr_quota_quantum.attr, 419 &tune_attr_quota_quantum.attr,
430 &tune_attr_atime_quantum.attr,
431 &tune_attr_max_readahead.attr, 420 &tune_attr_max_readahead.attr,
432 &tune_attr_complain_secs.attr, 421 &tune_attr_complain_secs.attr,
433 &tune_attr_statfs_slow.attr, 422 &tune_attr_statfs_slow.attr,
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a253..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
323} 323}
324 324
325/* 325/*
326 * remove_kevent - cleans up and ultimately frees the given kevent 326 * remove_kevent - cleans up the given kevent
327 * 327 *
328 * Caller must hold dev->ev_mutex. 328 * Caller must hold dev->ev_mutex.
329 */ 329 */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
334 334
335 dev->event_count--; 335 dev->event_count--;
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
337}
337 338
339/*
340 * free_kevent - frees the given kevent.
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
338 kfree(kevent->name); 344 kfree(kevent->name);
339 kmem_cache_free(event_cachep, kevent); 345 kmem_cache_free(event_cachep, kevent);
340} 346}
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
350 struct inotify_kernel_event *kevent; 356 struct inotify_kernel_event *kevent;
351 kevent = inotify_dev_get_event(dev); 357 kevent = inotify_dev_get_event(dev);
352 remove_kevent(dev, kevent); 358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
353 } 360 }
354} 361}
355 362
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
433 dev = file->private_data; 440 dev = file->private_data;
434 441
435 while (1) { 442 while (1) {
436 int events;
437 443
438 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
439 445
440 mutex_lock(&dev->ev_mutex); 446 mutex_lock(&dev->ev_mutex);
441 events = !list_empty(&dev->events); 447 if (!list_empty(&dev->events)) {
442 mutex_unlock(&dev->ev_mutex);
443 if (events) {
444 ret = 0; 448 ret = 0;
445 break; 449 break;
446 } 450 }
451 mutex_unlock(&dev->ev_mutex);
447 452
448 if (file->f_flags & O_NONBLOCK) { 453 if (file->f_flags & O_NONBLOCK) {
449 ret = -EAGAIN; 454 ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
462 if (ret) 467 if (ret)
463 return ret; 468 return ret;
464 469
465 mutex_lock(&dev->ev_mutex);
466 while (1) { 470 while (1) {
467 struct inotify_kernel_event *kevent; 471 struct inotify_kernel_event *kevent;
468 472
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
481 } 485 }
482 break; 486 break;
483 } 487 }
488 remove_kevent(dev, kevent);
489
490 /*
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
484 495
485 if (copy_to_user(buf, &kevent->event, event_size)) { 496 if (copy_to_user(buf, &kevent->event, event_size)) {
486 ret = -EFAULT; 497 ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
498 count -= kevent->event.len; 509 count -= kevent->event.len;
499 } 510 }
500 511
501 remove_kevent(dev, kevent); 512 free_kevent(kevent);
513
514 mutex_lock(&dev->ev_mutex);
502 } 515 }
503 mutex_unlock(&dev->ev_mutex); 516 mutex_unlock(&dev->ev_mutex);
504 517
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d3..d152856c371b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h>
17#include <linux/buffer_head.h>
16 18
17#include <asm/ioctls.h> 19#include <asm/ioctls.h>
18 20
21/* So that the fiemap access checks can't overflow on 32 bit machines. */
22#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
23
19/** 24/**
20 * vfs_ioctl - call filesystem specific ioctl methods 25 * vfs_ioctl - call filesystem specific ioctl methods
21 * @filp: open file to invoke ioctl method on 26 * @filp: open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
71 return put_user(res, p); 76 return put_user(res, p);
72} 77}
73 78
79/**
80 * fiemap_fill_next_extent - Fiemap helper function
81 * @fieinfo: Fiemap context passed into ->fiemap
82 * @logical: Extent logical start offset, in bytes
83 * @phys: Extent physical start offset, in bytes
84 * @len: Extent length, in bytes
85 * @flags: FIEMAP_EXTENT flags that describe this extent
86 *
87 * Called from file system ->fiemap callback. Will populate extent
88 * info as passed in via arguments and copy to user memory. On
89 * success, extent count on fieinfo is incremented.
90 *
91 * Returns 0 on success, -errno on error, 1 if this was the last
92 * extent that will fit in user array.
93 */
94#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
95#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
96#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
97int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
98 u64 phys, u64 len, u32 flags)
99{
100 struct fiemap_extent extent;
101 struct fiemap_extent *dest = fieinfo->fi_extents_start;
102
103 /* only count the extents */
104 if (fieinfo->fi_extents_max == 0) {
105 fieinfo->fi_extents_mapped++;
106 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
107 }
108
109 if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
110 return 1;
111
112 if (flags & SET_UNKNOWN_FLAGS)
113 flags |= FIEMAP_EXTENT_UNKNOWN;
114 if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
115 flags |= FIEMAP_EXTENT_ENCODED;
116 if (flags & SET_NOT_ALIGNED_FLAGS)
117 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
118
119 memset(&extent, 0, sizeof(extent));
120 extent.fe_logical = logical;
121 extent.fe_physical = phys;
122 extent.fe_length = len;
123 extent.fe_flags = flags;
124
125 dest += fieinfo->fi_extents_mapped;
126 if (copy_to_user(dest, &extent, sizeof(extent)))
127 return -EFAULT;
128
129 fieinfo->fi_extents_mapped++;
130 if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
131 return 1;
132 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
133}
134EXPORT_SYMBOL(fiemap_fill_next_extent);
135
136/**
137 * fiemap_check_flags - check validity of requested flags for fiemap
138 * @fieinfo: Fiemap context passed into ->fiemap
139 * @fs_flags: Set of fiemap flags that the file system understands
140 *
141 * Called from file system ->fiemap callback. This will compute the
142 * intersection of valid fiemap flags and those that the fs supports. That
143 * value is then compared against the user supplied flags. In case of bad user
144 * flags, the invalid values will be written into the fieinfo structure, and
145 * -EBADR is returned, which tells ioctl_fiemap() to return those values to
146 * userspace. For this reason, a return code of -EBADR should be preserved.
147 *
148 * Returns 0 on success, -EBADR on bad flags.
149 */
150int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
151{
152 u32 incompat_flags;
153
154 incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
155 if (incompat_flags) {
156 fieinfo->fi_flags = incompat_flags;
157 return -EBADR;
158 }
159 return 0;
160}
161EXPORT_SYMBOL(fiemap_check_flags);
162
163static int fiemap_check_ranges(struct super_block *sb,
164 u64 start, u64 len, u64 *new_len)
165{
166 *new_len = len;
167
168 if (len == 0)
169 return -EINVAL;
170
171 if (start > sb->s_maxbytes)
172 return -EFBIG;
173
174 /*
175 * Shrink request scope to what the fs can actually handle.
176 */
177 if ((len > sb->s_maxbytes) ||
178 (sb->s_maxbytes - len) < start)
179 *new_len = sb->s_maxbytes - start;
180
181 return 0;
182}
183
184static int ioctl_fiemap(struct file *filp, unsigned long arg)
185{
186 struct fiemap fiemap;
187 struct fiemap_extent_info fieinfo = { 0, };
188 struct inode *inode = filp->f_path.dentry->d_inode;
189 struct super_block *sb = inode->i_sb;
190 u64 len;
191 int error;
192
193 if (!inode->i_op->fiemap)
194 return -EOPNOTSUPP;
195
196 if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
197 sizeof(struct fiemap)))
198 return -EFAULT;
199
200 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
201 return -EINVAL;
202
203 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
204 &len);
205 if (error)
206 return error;
207
208 fieinfo.fi_flags = fiemap.fm_flags;
209 fieinfo.fi_extents_max = fiemap.fm_extent_count;
210 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
211
212 if (fiemap.fm_extent_count != 0 &&
213 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
214 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
215 return -EFAULT;
216
217 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
218 filemap_write_and_wait(inode->i_mapping);
219
220 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
221 fiemap.fm_flags = fieinfo.fi_flags;
222 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
223 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
224 error = -EFAULT;
225
226 return error;
227}
228
229#ifdef CONFIG_BLOCK
230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233
234/*
235 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function
238 *
239 * This does FIEMAP for block based inodes. Basically it will just loop
240 * through get_block until we hit the number of extents we want to map, or we
241 * go past the end of the file and hit a hole.
242 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size
246 */
247int generic_block_fiemap(struct inode *inode,
248 struct fiemap_extent_info *fieinfo, u64 start,
249 u64 len, get_block_t *get_block)
250{
251 struct buffer_head tmp;
252 unsigned int start_blk;
253 long long length = 0, map_len = 0;
254 u64 logical = 0, phys = 0, size = 0;
255 u32 flags = FIEMAP_EXTENT_MERGED;
256 int ret = 0;
257
258 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
259 return ret;
260
261 start_blk = logical_to_blk(inode, start);
262
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length;
268
269 do {
270 /*
271 * we set b_size to the total size we want so it will map as
272 * many contiguous blocks as possible at once
273 */
274 memset(&tmp, 0, sizeof(struct buffer_head));
275 tmp.b_size = map_len;
276
277 ret = get_block(inode, start_blk, &tmp, 0);
278 if (ret)
279 break;
280
281 /* HOLE */
282 if (!buffer_mapped(&tmp)) {
283 /*
284 * first hole after going past the EOF, this is our
285 * last extent
286 */
287 if (length <= 0) {
288 flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
289 ret = fiemap_fill_next_extent(fieinfo, logical,
290 phys, size,
291 flags);
292 break;
293 }
294
295 length -= blk_to_logical(inode, 1);
296
297 /* if we have holes up to/past EOF then we're done */
298 if (length <= 0)
299 break;
300
301 start_blk++;
302 } else {
303 if (length <= 0 && size) {
304 ret = fiemap_fill_next_extent(fieinfo, logical,
305 phys, size,
306 flags);
307 if (ret)
308 break;
309 }
310
311 logical = blk_to_logical(inode, start_blk);
312 phys = blk_to_logical(inode, tmp.b_blocknr);
313 size = tmp.b_size;
314 flags = FIEMAP_EXTENT_MERGED;
315
316 length -= tmp.b_size;
317 start_blk += logical_to_blk(inode, size);
318
319 /*
320 * if we are past the EOF we need to loop again to see
321 * if there is a hole so we can mark this extent as the
322 * last one, and if not keep mapping things until we
323 * find a hole, or we run out of slots in the extent
324 * array
325 */
326 if (length <= 0)
327 continue;
328
329 ret = fiemap_fill_next_extent(fieinfo, logical, phys,
330 size, flags);
331 if (ret)
332 break;
333 }
334 cond_resched();
335 } while (1);
336
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1)
341 ret = 0;
342
343 return ret;
344}
345EXPORT_SYMBOL(generic_block_fiemap);
346
347#endif /* CONFIG_BLOCK */
348
74static int file_ioctl(struct file *filp, unsigned int cmd, 349static int file_ioctl(struct file *filp, unsigned int cmd,
75 unsigned long arg) 350 unsigned long arg)
76{ 351{
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
80 switch (cmd) { 355 switch (cmd) {
81 case FIBMAP: 356 case FIBMAP:
82 return ioctl_fibmap(filp, p); 357 return ioctl_fibmap(filp, p);
358 case FS_IOC_FIEMAP:
359 return ioctl_fiemap(filp, arg);
83 case FIGETBSZ: 360 case FIGETBSZ:
84 return put_user(inode->i_sb->s_blocksize, p); 361 return put_user(inode->i_sb->s_blocksize, p);
85 case FIONREAD: 362 case FIONREAD:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8a..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25 26
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 94 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
95 96
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
126 128
127 /* 129 /*
128 * Test again, another process may have checkpointed while we 130 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 131 * were waiting for the checkpoint lock. If there are no
132 * outstanding transactions there is nothing to checkpoint and
133 * we can't make progress. Abort the journal in this case.
130 */ 134 */
131 spin_lock(&journal->j_state_lock); 135 spin_lock(&journal->j_state_lock);
136 spin_lock(&journal->j_list_lock);
132 nblocks = jbd_space_needed(journal); 137 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 138 if (__jbd2_log_space_left(journal) < nblocks) {
139 int chkpt = journal->j_checkpoint_transactions != NULL;
140
141 spin_unlock(&journal->j_list_lock);
134 spin_unlock(&journal->j_state_lock); 142 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 143 if (chkpt) {
144 jbd2_log_do_checkpoint(journal);
145 } else {
146 printk(KERN_ERR "%s: no transactions\n",
147 __func__);
148 jbd2_journal_abort(journal, 0);
149 }
150
136 spin_lock(&journal->j_state_lock); 151 spin_lock(&journal->j_state_lock);
152 } else {
153 spin_unlock(&journal->j_list_lock);
137 } 154 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 155 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 156 }
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
162 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
163 * Called with j_list_lock held. 183 * Called with j_list_lock held.
164 */ 184 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 186{
167 struct journal_head *jh; 187 struct journal_head *jh;
168 struct buffer_head *bh; 188 struct buffer_head *bh;
169 tid_t this_tid; 189 tid_t this_tid;
170 int released = 0; 190 int released = 0;
191 int ret = 0;
171 192
172 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
173restart: 194restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
177 return; 198 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
195 goto restart; 216 goto restart;
196 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
197 /* 221 /*
198 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
203 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh); 228 __brelse(bh);
205 } 229 }
230
231 return ret;
206} 232}
207 233
208#define NR_BATCH 64 234#define NR_BATCH 64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
227 * 253 *
228 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
230 * 257 *
231 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
258 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
259 ret = 1; 286 ret = 1;
260 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
262 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
263 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
265 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
266 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
267 __brelse(bh); 297 __brelse(bh);
268 ret = 1;
269 } else { 298 } else {
270 /* 299 /*
271 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
298 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
299 * 328 *
300 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
301 */ 331 */
302int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
303{ 333{
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
313 * journal straight away. 343 * journal straight away.
314 */ 344 */
315 result = jbd2_cleanup_journal_tail(journal); 345 result = jbd2_cleanup_journal_tail(journal);
346 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
347 journal->j_devname, result);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 348 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 349 if (result <= 0)
318 return result; 350 return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
321 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
322 * and write it. 354 * and write it.
323 */ 355 */
356 result = 0;
324 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
325 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
326 goto out; 359 goto out;
@@ -339,7 +372,7 @@ restart:
339 int batch_count = 0; 372 int batch_count = 0;
340 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
341 struct journal_head *jh; 374 struct journal_head *jh;
342 int retry = 0; 375 int retry = 0, err;
343 376
344 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
345 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
353 } 386 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
356 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
358 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
377 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
378 * list. Let's clean up the second one 413 * list. Let's clean up the second one
379 */ 414 */
380 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
381 } 418 }
382out: 419out:
383 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
384 result = jbd2_cleanup_journal_tail(journal);
385 if (result < 0) 421 if (result < 0)
386 return result; 422 jbd2_journal_abort(journal, result);
387 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
388} 427}
389 428
390/* 429/*
@@ -400,8 +439,9 @@ out:
400 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
401 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
402 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
403 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
404 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written-back to the filesystem.
405 */ 445 */
406 446
407int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
410 tid_t first_tid; 450 tid_t first_tid;
411 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
412 452
453 if (is_journal_aborted(journal))
454 return 1;
455
413 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
414 * the log block it starts at. 457 * the log block it starts at.
415 * 458 *
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95ec..0abe02c4242a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
126 127
127 JBUFFER_TRACE(descriptor, "submit commit block"); 128 JBUFFER_TRACE(descriptor, "submit commit block");
128 lock_buffer(bh); 129 lock_buffer(bh);
129 get_bh(bh); 130 clear_buffer_dirty(bh);
130 set_buffer_dirty(bh);
131 set_buffer_uptodate(bh); 131 set_buffer_uptodate(bh);
132 bh->b_end_io = journal_end_buffer_io_sync; 132 bh->b_end_io = journal_end_buffer_io_sync;
133 133
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
147 * to remember if we sent a barrier request 147 * to remember if we sent a barrier request
148 */ 148 */
149 if (ret == -EOPNOTSUPP && barrier_done) { 149 if (ret == -EOPNOTSUPP && barrier_done) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING 150 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 151 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", 152 "disabling barriers\n", journal->j_devname);
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock); 153 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JBD2_BARRIER; 154 journal->j_flags &= ~JBD2_BARRIER;
158 spin_unlock(&journal->j_state_lock); 155 spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
160 /* And try again, without the barrier */ 157 /* And try again, without the barrier */
161 lock_buffer(bh); 158 lock_buffer(bh);
162 set_buffer_uptodate(bh); 159 set_buffer_uptodate(bh);
163 set_buffer_dirty(bh); 160 clear_buffer_dirty(bh);
164 ret = submit_bh(WRITE, bh); 161 ret = submit_bh(WRITE, bh);
165 } 162 }
166 *cbh = bh; 163 *cbh = bh;
@@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
371 commit_transaction = journal->j_running_transaction; 368 commit_transaction = journal->j_running_transaction;
372 J_ASSERT(commit_transaction->t_state == T_RUNNING); 369 J_ASSERT(commit_transaction->t_state == T_RUNNING);
373 370
371 trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 journal->j_devname, commit_transaction->t_tid);
374 jbd_debug(1, "JBD: starting commit of transaction %d\n", 373 jbd_debug(1, "JBD: starting commit of transaction %d\n",
375 commit_transaction->t_tid); 374 commit_transaction->t_tid);
376 375
@@ -505,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
505 jh = commit_transaction->t_buffers; 504 jh = commit_transaction->t_buffers;
506 505
507 /* If we're in abort mode, we just un-journal the buffer and 506 /* If we're in abort mode, we just un-journal the buffer and
508 release it for background writing. */ 507 release it. */
509 508
510 if (is_journal_aborted(journal)) { 509 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 511 JBUFFER_TRACE(jh, "journal is aborting: refile");
512 jbd2_journal_refile_buffer(journal, jh); 512 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 513 /* If that was the last one, we need to clean up
@@ -681,11 +681,11 @@ start_journal_io:
681 */ 681 */
682 err = journal_finish_inode_data_buffers(journal, commit_transaction); 682 err = journal_finish_inode_data_buffers(journal, commit_transaction);
683 if (err) { 683 if (err) {
684 char b[BDEVNAME_SIZE];
685
686 printk(KERN_WARNING 684 printk(KERN_WARNING
687 "JBD2: Detected IO errors while flushing file data " 685 "JBD2: Detected IO errors while flushing file data "
688 "on %s\n", bdevname(journal->j_fs_dev, b)); 686 "on %s\n", journal->j_devname);
687 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
688 jbd2_journal_abort(journal, err);
689 err = 0; 689 err = 0;
690 } 690 }
691 691
@@ -786,6 +786,9 @@ wait_for_iobuf:
786 /* AKPM: bforget here */ 786 /* AKPM: bforget here */
787 } 787 }
788 788
789 if (err)
790 jbd2_journal_abort(journal, err);
791
789 jbd_debug(3, "JBD: commit phase 5\n"); 792 jbd_debug(3, "JBD: commit phase 5\n");
790 793
791 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 794 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -884,6 +887,8 @@ restart_loop:
884 if (buffer_jbddirty(bh)) { 887 if (buffer_jbddirty(bh)) {
885 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 888 JBUFFER_TRACE(jh, "add to new checkpointing trans");
886 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 889 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
890 if (is_journal_aborted(journal))
891 clear_buffer_jbddirty(bh);
887 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 892 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
888 __jbd2_journal_refile_buffer(jh); 893 __jbd2_journal_refile_buffer(jh);
889 jbd_unlock_bh_state(bh); 894 jbd_unlock_bh_state(bh);
@@ -990,6 +995,9 @@ restart_loop:
990 } 995 }
991 spin_unlock(&journal->j_list_lock); 996 spin_unlock(&journal->j_list_lock);
992 997
998 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
999 journal->j_devname, commit_transaction->t_tid,
1000 journal->j_tail_sequence);
993 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1001 jbd_debug(1, "JBD: commit %d complete, head %d\n",
994 journal->j_commit_sequence, journal->j_tail_sequence); 1002 journal->j_commit_sequence, journal->j_tail_sequence);
995 1003
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4edb..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
597 if (ret) 597 if (ret)
598 *retp = ret; 598 *retp = ret;
599 else { 599 else {
600 char b[BDEVNAME_SIZE];
601
602 printk(KERN_ALERT "%s: journal block not found " 600 printk(KERN_ALERT "%s: journal block not found "
603 "at offset %lu on %s\n", 601 "at offset %lu on %s\n",
604 __func__, 602 __func__, blocknr, journal->j_devname);
605 blocknr,
606 bdevname(journal->j_dev, b));
607 err = -EIO; 603 err = -EIO;
608 __journal_abort_soft(journal, err); 604 __journal_abort_soft(journal, err);
609 } 605 }
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
901 897
902static void jbd2_stats_proc_init(journal_t *journal) 898static void jbd2_stats_proc_init(journal_t *journal)
903{ 899{
904 char name[BDEVNAME_SIZE]; 900 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
905
906 bdevname(journal->j_dev, name);
907 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
908 if (journal->j_proc_entry) { 901 if (journal->j_proc_entry) {
909 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 902 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
910 &jbd2_seq_history_fops, journal); 903 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
915 908
916static void jbd2_stats_proc_exit(journal_t *journal) 909static void jbd2_stats_proc_exit(journal_t *journal)
917{ 910{
918 char name[BDEVNAME_SIZE];
919
920 bdevname(journal->j_dev, name);
921 remove_proc_entry("info", journal->j_proc_entry); 911 remove_proc_entry("info", journal->j_proc_entry);
922 remove_proc_entry("history", journal->j_proc_entry); 912 remove_proc_entry("history", journal->j_proc_entry);
923 remove_proc_entry(name, proc_jbd2_stats); 913 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
924} 914}
925 915
926static void journal_init_stats(journal_t *journal) 916static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1018{ 1008{
1019 journal_t *journal = journal_init_common(); 1009 journal_t *journal = journal_init_common();
1020 struct buffer_head *bh; 1010 struct buffer_head *bh;
1011 char *p;
1021 int n; 1012 int n;
1022 1013
1023 if (!journal) 1014 if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1039 journal->j_fs_dev = fs_dev; 1030 journal->j_fs_dev = fs_dev;
1040 journal->j_blk_offset = start; 1031 journal->j_blk_offset = start;
1041 journal->j_maxlen = len; 1032 journal->j_maxlen = len;
1033 bdevname(journal->j_dev, journal->j_devname);
1034 p = journal->j_devname;
1035 while ((p = strchr(p, '/')))
1036 *p = '!';
1042 jbd2_stats_proc_init(journal); 1037 jbd2_stats_proc_init(journal);
1043 1038
1044 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1061{ 1056{
1062 struct buffer_head *bh; 1057 struct buffer_head *bh;
1063 journal_t *journal = journal_init_common(); 1058 journal_t *journal = journal_init_common();
1059 char *p;
1064 int err; 1060 int err;
1065 int n; 1061 int n;
1066 unsigned long long blocknr; 1062 unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1070 1066
1071 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1067 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
1072 journal->j_inode = inode; 1068 journal->j_inode = inode;
1069 bdevname(journal->j_dev, journal->j_devname);
1070 p = journal->j_devname;
1071 while ((p = strchr(p, '/')))
1072 *p = '!';
1073 p = journal->j_devname + strlen(journal->j_devname);
1074 sprintf(p, ":%lu", journal->j_inode->i_ino);
1073 jbd_debug(1, 1075 jbd_debug(1,
1074 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1076 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1075 journal, inode->i_sb->s_id, inode->i_ino, 1077 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1253 goto out; 1255 goto out;
1254 } 1256 }
1255 1257
1258 if (buffer_write_io_error(bh)) {
1259 /*
1260 * Oh, dear. A previous attempt to write the journal
1261 * superblock failed. This could happen because the
1262 * USB device was yanked out. Or it could happen to
1263 * be a transient write error and maybe the block will
1264 * be remapped. Nothing we can do but to retry the
1265 * write and hope for the best.
1266 */
1267 printk(KERN_ERR "JBD2: previous I/O error detected "
1268 "for journal superblock update for %s.\n",
1269 journal->j_devname);
1270 clear_buffer_write_io_error(bh);
1271 set_buffer_uptodate(bh);
1272 }
1273
1256 spin_lock(&journal->j_state_lock); 1274 spin_lock(&journal->j_state_lock);
1257 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1275 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1258 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1276 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1264 1282
1265 BUFFER_TRACE(bh, "marking dirty"); 1283 BUFFER_TRACE(bh, "marking dirty");
1266 mark_buffer_dirty(bh); 1284 mark_buffer_dirty(bh);
1267 if (wait) 1285 if (wait) {
1268 sync_dirty_buffer(bh); 1286 sync_dirty_buffer(bh);
1269 else 1287 if (buffer_write_io_error(bh)) {
1288 printk(KERN_ERR "JBD2: I/O error detected "
1289 "when updating journal superblock for %s.\n",
1290 journal->j_devname);
1291 clear_buffer_write_io_error(bh);
1292 set_buffer_uptodate(bh);
1293 }
1294 } else
1270 ll_rw_block(SWRITE, 1, &bh); 1295 ll_rw_block(SWRITE, 1, &bh);
1271 1296
1272out: 1297out:
@@ -1426,9 +1451,12 @@ recovery_error:
1426 * 1451 *
1427 * Release a journal_t structure once it is no longer in use by the 1452 * Release a journal_t structure once it is no longer in use by the
1428 * journaled object. 1453 * journaled object.
1454 * Return <0 if we couldn't clean up the journal.
1429 */ 1455 */
1430void jbd2_journal_destroy(journal_t *journal) 1456int jbd2_journal_destroy(journal_t *journal)
1431{ 1457{
1458 int err = 0;
1459
1432 /* Wait for the commit thread to wake up and die. */ 1460 /* Wait for the commit thread to wake up and die. */
1433 journal_kill_thread(journal); 1461 journal_kill_thread(journal);
1434 1462
@@ -1451,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
1451 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1479 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1452 spin_unlock(&journal->j_list_lock); 1480 spin_unlock(&journal->j_list_lock);
1453 1481
1454 /* We can now mark the journal as empty. */
1455 journal->j_tail = 0;
1456 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1457 if (journal->j_sb_buffer) { 1482 if (journal->j_sb_buffer) {
1458 jbd2_journal_update_superblock(journal, 1); 1483 if (!is_journal_aborted(journal)) {
1484 /* We can now mark the journal as empty. */
1485 journal->j_tail = 0;
1486 journal->j_tail_sequence =
1487 ++journal->j_transaction_sequence;
1488 jbd2_journal_update_superblock(journal, 1);
1489 } else {
1490 err = -EIO;
1491 }
1459 brelse(journal->j_sb_buffer); 1492 brelse(journal->j_sb_buffer);
1460 } 1493 }
1461 1494
@@ -1467,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
1467 jbd2_journal_destroy_revoke(journal); 1500 jbd2_journal_destroy_revoke(journal);
1468 kfree(journal->j_wbuf); 1501 kfree(journal->j_wbuf);
1469 kfree(journal); 1502 kfree(journal);
1503
1504 return err;
1470} 1505}
1471 1506
1472 1507
@@ -1692,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
1692 spin_lock(&journal->j_list_lock); 1727 spin_lock(&journal->j_list_lock);
1693 while (!err && journal->j_checkpoint_transactions != NULL) { 1728 while (!err && journal->j_checkpoint_transactions != NULL) {
1694 spin_unlock(&journal->j_list_lock); 1729 spin_unlock(&journal->j_list_lock);
1730 mutex_lock(&journal->j_checkpoint_mutex);
1695 err = jbd2_log_do_checkpoint(journal); 1731 err = jbd2_log_do_checkpoint(journal);
1732 mutex_unlock(&journal->j_checkpoint_mutex);
1696 spin_lock(&journal->j_list_lock); 1733 spin_lock(&journal->j_list_lock);
1697 } 1734 }
1698 spin_unlock(&journal->j_list_lock); 1735 spin_unlock(&journal->j_list_lock);
1736
1737 if (is_journal_aborted(journal))
1738 return -EIO;
1739
1699 jbd2_cleanup_journal_tail(journal); 1740 jbd2_cleanup_journal_tail(journal);
1700 1741
1701 /* Finally, mark the journal as really needing no recovery. 1742 /* Finally, mark the journal as really needing no recovery.
@@ -1717,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
1717 J_ASSERT(journal->j_head == journal->j_tail); 1758 J_ASSERT(journal->j_head == journal->j_tail);
1718 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1759 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1719 spin_unlock(&journal->j_state_lock); 1760 spin_unlock(&journal->j_state_lock);
1720 return err; 1761 return 0;
1721} 1762}
1722 1763
1723/** 1764/**
@@ -1761,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1761} 1802}
1762 1803
1763/* 1804/*
1764 * journal_dev_name: format a character string to describe on what
1765 * device this journal is present.
1766 */
1767
1768static const char *journal_dev_name(journal_t *journal, char *buffer)
1769{
1770 struct block_device *bdev;
1771
1772 if (journal->j_inode)
1773 bdev = journal->j_inode->i_sb->s_bdev;
1774 else
1775 bdev = journal->j_dev;
1776
1777 return bdevname(bdev, buffer);
1778}
1779
1780/*
1781 * Journal abort has very specific semantics, which we describe 1805 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1806 * for journal abort.
1783 * 1807 *
@@ -1793,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1793void __jbd2_journal_abort_hard(journal_t *journal) 1817void __jbd2_journal_abort_hard(journal_t *journal)
1794{ 1818{
1795 transaction_t *transaction; 1819 transaction_t *transaction;
1796 char b[BDEVNAME_SIZE];
1797 1820
1798 if (journal->j_flags & JBD2_ABORT) 1821 if (journal->j_flags & JBD2_ABORT)
1799 return; 1822 return;
1800 1823
1801 printk(KERN_ERR "Aborting journal on device %s.\n", 1824 printk(KERN_ERR "Aborting journal on device %s.\n",
1802 journal_dev_name(journal, b)); 1825 journal->j_devname);
1803 1826
1804 spin_lock(&journal->j_state_lock); 1827 spin_lock(&journal->j_state_lock);
1805 journal->j_flags |= JBD2_ABORT; 1828 journal->j_flags |= JBD2_ABORT;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
225 */ 225 */
226int jbd2_journal_recover(journal_t *journal) 226int jbd2_journal_recover(journal_t *journal)
227{ 227{
228 int err; 228 int err, err2;
229 journal_superblock_t * sb; 229 journal_superblock_t * sb;
230 230
231 struct recovery_info info; 231 struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
263 journal->j_transaction_sequence = ++info.end_transaction; 263 journal->j_transaction_sequence = ++info.end_transaction;
264 264
265 jbd2_journal_clear_revoke(journal); 265 jbd2_journal_clear_revoke(journal);
266 sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
267 if (!err)
268 err = err2;
269
267 return err; 270 return err;
268} 271}
269 272
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f7..e9b20173fef3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw,
1279 } 1279 }
1280 } 1280 }
1281 1281
1282 if (errors > 0) {
1283 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
1284 errors, (errors == 1 ? "" : "s"));
1285 if (!sloppy)
1286 return 0;
1287 }
1282 return 1; 1288 return 1;
1283 1289
1284out_nomem: 1290out_nomem:
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the 113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the
114 * file since it was last opened. I think the names speak for themselves but 114 * file since it was last opened. I think the names speak for themselves but
115 * if you disagree check out the descriptions in the Linux NTFS project NTFS 115 * if you disagree check out the descriptions in the Linux NTFS project NTFS
116 * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
145 * Source info flags (32-bit). Information about the source of the change(s) 145 * Source info flags (32-bit). Information about the source of the change(s)
146 * to the file. For detailed descriptions of what these mean, see the Linux 146 * to the file. For detailed descriptions of what these mean, see the Linux
147 * NTFS project NTFS documentation: 147 * NTFS project NTFS documentation:
148 * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..29ff57ec5d1f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -990,15 +990,6 @@ out:
990} 990}
991 991
992/* 992/*
993 * This is only valid for leaf nodes, which are the only ones that can
994 * have empty extents anyway.
995 */
996static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
997{
998 return !rec->e_leaf_clusters;
999}
1000
1001/*
1002 * This function will discard the rightmost extent record. 993 * This function will discard the rightmost extent record.
1003 */ 994 */
1004static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 995static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..60cd3d59230c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
146 return le16_to_cpu(rec->e_leaf_clusters); 146 return le16_to_cpu(rec->e_leaf_clusters);
147} 147}
148 148
149/*
150 * This is only valid for leaf nodes, which are the only ones that can
151 * have empty extents anyway.
152 */
153static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
154{
155 return !rec->e_leaf_clusters;
156}
157
149#endif /* OCFS2_ALLOC_H */ 158#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb5078..a53da1466277 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
594 goto bail; 594 goto bail;
595 } 595 }
596 596
597 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 597 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
598 ocfs2_error(inode->i_sb, 598 ocfs2_error(inode->i_sb,
599 "Inode %llu has a hole at block %llu\n", 599 "Inode %llu has a hole at block %llu\n",
600 (unsigned long long)OCFS2_I(inode)->ip_blkno, 600 (unsigned long long)OCFS2_I(inode)->ip_blkno,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..aed268e80b49 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fiemap.h>
28 29
29#define MLOG_MASK_PREFIX ML_EXTENT_MAP 30#define MLOG_MASK_PREFIX ML_EXTENT_MAP
30#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32#include "ocfs2.h" 33#include "ocfs2.h"
33 34
34#include "alloc.h" 35#include "alloc.h"
36#include "dlmglue.h"
35#include "extent_map.h" 37#include "extent_map.h"
36#include "inode.h" 38#include "inode.h"
37#include "super.h" 39#include "super.h"
@@ -282,6 +284,51 @@ out:
282 kfree(new_emi); 284 kfree(new_emi);
283} 285}
284 286
287static int ocfs2_last_eb_is_empty(struct inode *inode,
288 struct ocfs2_dinode *di)
289{
290 int ret, next_free;
291 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
292 struct buffer_head *eb_bh = NULL;
293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el;
295
296 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
297 &eb_bh, OCFS2_BH_CACHED, inode);
298 if (ret) {
299 mlog_errno(ret);
300 goto out;
301 }
302
303 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
304 el = &eb->h_list;
305
306 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
307 ret = -EROFS;
308 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
309 goto out;
310 }
311
312 if (el->l_tree_depth) {
313 ocfs2_error(inode->i_sb,
314 "Inode %lu has non zero tree depth in "
315 "leaf block %llu\n", inode->i_ino,
316 (unsigned long long)eb_bh->b_blocknr);
317 ret = -EROFS;
318 goto out;
319 }
320
321 next_free = le16_to_cpu(el->l_next_free_rec);
322
323 if (next_free == 0 ||
324 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
325 ret = 1;
326
327out:
328 brelse(eb_bh);
329 return ret;
330}
331
285/* 332/*
286 * Return the 1st index within el which contains an extent start 333 * Return the 1st index within el which contains an extent start
287 * larger than v_cluster. 334 * larger than v_cluster.
@@ -373,42 +420,28 @@ out:
373 return ret; 420 return ret;
374} 421}
375 422
376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 423static int ocfs2_get_clusters_nocache(struct inode *inode,
377 u32 *p_cluster, u32 *num_clusters, 424 struct buffer_head *di_bh,
378 unsigned int *extent_flags) 425 u32 v_cluster, unsigned int *hole_len,
426 struct ocfs2_extent_rec *ret_rec,
427 unsigned int *is_last)
379{ 428{
380 int ret, i; 429 int i, ret, tree_height, len;
381 unsigned int flags = 0;
382 struct buffer_head *di_bh = NULL;
383 struct buffer_head *eb_bh = NULL;
384 struct ocfs2_dinode *di; 430 struct ocfs2_dinode *di;
385 struct ocfs2_extent_block *eb; 431 struct ocfs2_extent_block *uninitialized_var(eb);
386 struct ocfs2_extent_list *el; 432 struct ocfs2_extent_list *el;
387 struct ocfs2_extent_rec *rec; 433 struct ocfs2_extent_rec *rec;
388 u32 coff; 434 struct buffer_head *eb_bh = NULL;
389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
397 num_clusters, extent_flags);
398 if (ret == 0)
399 goto out;
400 435
401 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 436 memset(ret_rec, 0, sizeof(*ret_rec));
402 &di_bh, OCFS2_BH_CACHED, inode); 437 if (is_last)
403 if (ret) { 438 *is_last = 0;
404 mlog_errno(ret);
405 goto out;
406 }
407 439
408 di = (struct ocfs2_dinode *) di_bh->b_data; 440 di = (struct ocfs2_dinode *) di_bh->b_data;
409 el = &di->id2.i_list; 441 el = &di->id2.i_list;
442 tree_height = le16_to_cpu(el->l_tree_depth);
410 443
411 if (el->l_tree_depth) { 444 if (tree_height > 0) {
412 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 445 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
413 if (ret) { 446 if (ret) {
414 mlog_errno(ret); 447 mlog_errno(ret);
@@ -431,46 +464,143 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
431 i = ocfs2_search_extent_list(el, v_cluster); 464 i = ocfs2_search_extent_list(el, v_cluster);
432 if (i == -1) { 465 if (i == -1) {
433 /* 466 /*
434 * A hole was found. Return some canned values that 467 * Holes can be larger than the maximum size of an
435 * callers can key on. If asked for, num_clusters will 468 * extent, so we return their lengths in a seperate
436 * be populated with the size of the hole. 469 * field.
437 */ 470 */
438 *p_cluster = 0; 471 if (hole_len) {
439 if (num_clusters) {
440 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 472 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
441 v_cluster, 473 v_cluster, &len);
442 num_clusters);
443 if (ret) { 474 if (ret) {
444 mlog_errno(ret); 475 mlog_errno(ret);
445 goto out; 476 goto out;
446 } 477 }
478
479 *hole_len = len;
447 } 480 }
448 } else { 481 goto out_hole;
449 rec = &el->l_recs[i]; 482 }
450 483
451 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 484 rec = &el->l_recs[i];
452 485
453 if (!rec->e_blkno) { 486 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
454 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 487
455 "record (%u, %u, 0)", inode->i_ino, 488 if (!rec->e_blkno) {
456 le32_to_cpu(rec->e_cpos), 489 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
457 ocfs2_rec_clusters(el, rec)); 490 "record (%u, %u, 0)", inode->i_ino,
458 ret = -EROFS; 491 le32_to_cpu(rec->e_cpos),
459 goto out; 492 ocfs2_rec_clusters(el, rec));
493 ret = -EROFS;
494 goto out;
495 }
496
497 *ret_rec = *rec;
498
499 /*
500 * Checking for last extent is potentially expensive - we
501 * might have to look at the next leaf over to see if it's
502 * empty.
503 *
504 * The first two checks are to see whether the caller even
505 * cares for this information, and if the extent is at least
506 * the last in it's list.
507 *
508 * If those hold true, then the extent is last if any of the
509 * additional conditions hold true:
510 * - Extent list is in-inode
511 * - Extent list is right-most
512 * - Extent list is 2nd to rightmost, with empty right-most
513 */
514 if (is_last) {
515 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
516 if (tree_height == 0)
517 *is_last = 1;
518 else if (eb->h_blkno == di->i_last_eb_blk)
519 *is_last = 1;
520 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
521 ret = ocfs2_last_eb_is_empty(inode, di);
522 if (ret < 0) {
523 mlog_errno(ret);
524 goto out;
525 }
526 if (ret == 1)
527 *is_last = 1;
528 }
460 } 529 }
530 }
531
532out_hole:
533 ret = 0;
534out:
535 brelse(eb_bh);
536 return ret;
537}
538
539static void ocfs2_relative_extent_offsets(struct super_block *sb,
540 u32 v_cluster,
541 struct ocfs2_extent_rec *rec,
542 u32 *p_cluster, u32 *num_clusters)
543
544{
545 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
546
547 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
548 *p_cluster = *p_cluster + coff;
549
550 if (num_clusters)
551 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
552}
553
554int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
555 u32 *p_cluster, u32 *num_clusters,
556 unsigned int *extent_flags)
557{
558 int ret;
559 unsigned int uninitialized_var(hole_len), flags = 0;
560 struct buffer_head *di_bh = NULL;
561 struct ocfs2_extent_rec rec;
461 562
462 coff = v_cluster - le32_to_cpu(rec->e_cpos); 563 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
564 ret = -ERANGE;
565 mlog_errno(ret);
566 goto out;
567 }
463 568
464 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 569 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
465 le64_to_cpu(rec->e_blkno)); 570 num_clusters, extent_flags);
466 *p_cluster = *p_cluster + coff; 571 if (ret == 0)
572 goto out;
467 573
468 if (num_clusters) 574 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
469 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 575 &di_bh, OCFS2_BH_CACHED, inode);
576 if (ret) {
577 mlog_errno(ret);
578 goto out;
579 }
470 580
471 flags = rec->e_flags; 581 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
582 &rec, NULL);
583 if (ret) {
584 mlog_errno(ret);
585 goto out;
586 }
472 587
473 ocfs2_extent_map_insert_rec(inode, rec); 588 if (rec.e_blkno == 0ULL) {
589 /*
590 * A hole was found. Return some canned values that
591 * callers can key on. If asked for, num_clusters will
592 * be populated with the size of the hole.
593 */
594 *p_cluster = 0;
595 if (num_clusters) {
596 *num_clusters = hole_len;
597 }
598 } else {
599 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
600 p_cluster, num_clusters);
601 flags = rec.e_flags;
602
603 ocfs2_extent_map_insert_rec(inode, &rec);
474 } 604 }
475 605
476 if (extent_flags) 606 if (extent_flags)
@@ -478,7 +608,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
478 608
479out: 609out:
480 brelse(di_bh); 610 brelse(di_bh);
481 brelse(eb_bh);
482 return ret; 611 return ret;
483} 612}
484 613
@@ -521,3 +650,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
521out: 650out:
522 return ret; 651 return ret;
523} 652}
653
654static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
655 struct fiemap_extent_info *fieinfo,
656 u64 map_start)
657{
658 int ret;
659 unsigned int id_count;
660 struct ocfs2_dinode *di;
661 u64 phys;
662 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
663 struct ocfs2_inode_info *oi = OCFS2_I(inode);
664
665 di = (struct ocfs2_dinode *)di_bh->b_data;
666 id_count = le16_to_cpu(di->id2.i_data.id_count);
667
668 if (map_start < id_count) {
669 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
670 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
671
672 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
673 flags);
674 if (ret < 0)
675 return ret;
676 }
677
678 return 0;
679}
680
681#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
682
683int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
684 u64 map_start, u64 map_len)
685{
686 int ret, is_last;
687 u32 mapping_end, cpos;
688 unsigned int hole_size;
689 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
690 u64 len_bytes, phys_bytes, virt_bytes;
691 struct buffer_head *di_bh = NULL;
692 struct ocfs2_extent_rec rec;
693
694 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
695 if (ret)
696 return ret;
697
698 ret = ocfs2_inode_lock(inode, &di_bh, 0);
699 if (ret) {
700 mlog_errno(ret);
701 goto out;
702 }
703
704 down_read(&OCFS2_I(inode)->ip_alloc_sem);
705
706 /*
707 * Handle inline-data separately.
708 */
709 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
710 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
711 goto out_unlock;
712 }
713
714 cpos = map_start >> osb->s_clustersize_bits;
715 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
716 map_start + map_len);
717 mapping_end -= cpos;
718 is_last = 0;
719 while (cpos < mapping_end && !is_last) {
720 u32 fe_flags;
721
722 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
723 &hole_size, &rec, &is_last);
724 if (ret) {
725 mlog_errno(ret);
726 goto out;
727 }
728
729 if (rec.e_blkno == 0ULL) {
730 cpos += hole_size;
731 continue;
732 }
733
734 fe_flags = 0;
735 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
736 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
737 if (is_last)
738 fe_flags |= FIEMAP_EXTENT_LAST;
739 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
740 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
741 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
742
743 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
744 len_bytes, fe_flags);
745 if (ret)
746 break;
747
748 cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
749 }
750
751 if (ret > 0)
752 ret = 0;
753
754out_unlock:
755 brelse(di_bh);
756
757 up_read(&OCFS2_I(inode)->ip_alloc_sem);
758
759 ocfs2_inode_unlock(inode, 0);
760out:
761
762 return ret;
763}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1b97490e1ea8 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags); 51 u64 *ret_count, unsigned int *extent_flags);
52 52
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len);
55
53#endif /* _EXTENT_MAP_H */ 56#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3daa..ed38796052d2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@ const struct inode_operations ocfs2_file_iops = {
2228 .getattr = ocfs2_getattr, 2228 .getattr = ocfs2_getattr,
2229 .permission = ocfs2_permission, 2229 .permission = ocfs2_permission,
2230 .fallocate = ocfs2_fallocate, 2230 .fallocate = ocfs2_fallocate,
2231 .fiemap = ocfs2_fiemap,
2231}; 2232};
2232 2233
2233const struct inode_operations ocfs2_special_file_iops = { 2234const struct inode_operations ocfs2_special_file_iops = {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201db..7408227c49c9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
120 * a pointer to that same buffer (for convenience). 120 * a pointer to that same buffer (for convenience).
121 */ 121 */
122 122
123char *disk_name(struct gendisk *hd, int part, char *buf) 123char *disk_name(struct gendisk *hd, int partno, char *buf)
124{ 124{
125 if (!part) 125 if (!partno)
126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); 126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) 127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); 128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
129 else 129 else
130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); 130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
131 131
132 return buf; 132 return buf;
133} 133}
134 134
135const char *bdevname(struct block_device *bdev, char *buf) 135const char *bdevname(struct block_device *bdev, char *buf)
136{ 136{
137 int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; 137 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
138 return disk_name(bdev->bd_disk, part, buf);
139} 138}
140 139
141EXPORT_SYMBOL(bdevname); 140EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
169 if (isdigit(state->name[strlen(state->name)-1])) 168 if (isdigit(state->name[strlen(state->name)-1]))
170 sprintf(state->name, "p"); 169 sprintf(state->name, "p");
171 170
172 state->limit = hd->minors; 171 state->limit = disk_max_parts(hd);
173 i = res = err = 0; 172 i = res = err = 0;
174 while (!res && check_part[i]) { 173 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 174 memset(&state->parts, 0, sizeof(state->parts));
@@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev,
204 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); 203 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
205} 204}
206 205
207static ssize_t part_size_show(struct device *dev, 206ssize_t part_size_show(struct device *dev,
208 struct device_attribute *attr, char *buf) 207 struct device_attribute *attr, char *buf)
209{ 208{
210 struct hd_struct *p = dev_to_part(dev); 209 struct hd_struct *p = dev_to_part(dev);
211 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 210 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
212} 211}
213 212
214static ssize_t part_stat_show(struct device *dev, 213ssize_t part_stat_show(struct device *dev,
215 struct device_attribute *attr, char *buf) 214 struct device_attribute *attr, char *buf)
216{ 215{
217 struct hd_struct *p = dev_to_part(dev); 216 struct hd_struct *p = dev_to_part(dev);
217 int cpu;
218 218
219 preempt_disable(); 219 cpu = part_stat_lock();
220 part_round_stats(p); 220 part_round_stats(cpu, p);
221 preempt_enable(); 221 part_stat_unlock();
222 return sprintf(buf, 222 return sprintf(buf,
223 "%8lu %8lu %8llu %8u " 223 "%8lu %8lu %8llu %8u "
224 "%8lu %8lu %8llu %8u " 224 "%8lu %8lu %8llu %8u "
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
238} 238}
239 239
240#ifdef CONFIG_FAIL_MAKE_REQUEST 240#ifdef CONFIG_FAIL_MAKE_REQUEST
241static ssize_t part_fail_show(struct device *dev, 241ssize_t part_fail_show(struct device *dev,
242 struct device_attribute *attr, char *buf) 242 struct device_attribute *attr, char *buf)
243{ 243{
244 struct hd_struct *p = dev_to_part(dev); 244 struct hd_struct *p = dev_to_part(dev);
245 245
246 return sprintf(buf, "%d\n", p->make_it_fail); 246 return sprintf(buf, "%d\n", p->make_it_fail);
247} 247}
248 248
249static ssize_t part_fail_store(struct device *dev, 249ssize_t part_fail_store(struct device *dev,
250 struct device_attribute *attr, 250 struct device_attribute *attr,
251 const char *buf, size_t count) 251 const char *buf, size_t count)
252{ 252{
253 struct hd_struct *p = dev_to_part(dev); 253 struct hd_struct *p = dev_to_part(dev);
254 int i; 254 int i;
@@ -300,40 +300,34 @@ struct device_type part_type = {
300 .release = part_release, 300 .release = part_release,
301}; 301};
302 302
303static inline void partition_sysfs_add_subdir(struct hd_struct *p) 303static void delete_partition_rcu_cb(struct rcu_head *head)
304{
305 struct kobject *k;
306
307 k = kobject_get(&p->dev.kobj);
308 p->holder_dir = kobject_create_and_add("holders", k);
309 kobject_put(k);
310}
311
312static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
313{ 304{
314 struct kobject *k; 305 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
315 306
316 k = kobject_get(&disk->dev.kobj); 307 part->start_sect = 0;
317 disk->holder_dir = kobject_create_and_add("holders", k); 308 part->nr_sects = 0;
318 disk->slave_dir = kobject_create_and_add("slaves", k); 309 part_stat_set_all(part, 0);
319 kobject_put(k); 310 put_device(part_to_dev(part));
320} 311}
321 312
322void delete_partition(struct gendisk *disk, int part) 313void delete_partition(struct gendisk *disk, int partno)
323{ 314{
324 struct hd_struct *p = disk->part[part-1]; 315 struct disk_part_tbl *ptbl = disk->part_tbl;
316 struct hd_struct *part;
325 317
326 if (!p) 318 if (partno >= ptbl->len)
327 return; 319 return;
328 if (!p->nr_sects) 320
321 part = ptbl->part[partno];
322 if (!part)
329 return; 323 return;
330 disk->part[part-1] = NULL; 324
331 p->start_sect = 0; 325 blk_free_devt(part_devt(part));
332 p->nr_sects = 0; 326 rcu_assign_pointer(ptbl->part[partno], NULL);
333 part_stat_set_all(p, 0); 327 kobject_put(part->holder_dir);
334 kobject_put(p->holder_dir); 328 device_del(part_to_dev(part));
335 device_del(&p->dev); 329
336 put_device(&p->dev); 330 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
337} 331}
338 332
339static ssize_t whole_disk_show(struct device *dev, 333static ssize_t whole_disk_show(struct device *dev,
@@ -344,102 +338,132 @@ static ssize_t whole_disk_show(struct device *dev,
344static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, 338static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
345 whole_disk_show, NULL); 339 whole_disk_show, NULL);
346 340
347int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) 341int add_partition(struct gendisk *disk, int partno,
342 sector_t start, sector_t len, int flags)
348{ 343{
349 struct hd_struct *p; 344 struct hd_struct *p;
345 dev_t devt = MKDEV(0, 0);
346 struct device *ddev = disk_to_dev(disk);
347 struct device *pdev;
348 struct disk_part_tbl *ptbl;
349 const char *dname;
350 int err; 350 int err;
351 351
352 err = disk_expand_part_tbl(disk, partno);
353 if (err)
354 return err;
355 ptbl = disk->part_tbl;
356
357 if (ptbl->part[partno])
358 return -EBUSY;
359
352 p = kzalloc(sizeof(*p), GFP_KERNEL); 360 p = kzalloc(sizeof(*p), GFP_KERNEL);
353 if (!p) 361 if (!p)
354 return -ENOMEM; 362 return -ENOMEM;
355 363
356 if (!init_part_stats(p)) { 364 if (!init_part_stats(p)) {
357 err = -ENOMEM; 365 err = -ENOMEM;
358 goto out0; 366 goto out_free;
359 } 367 }
368 pdev = part_to_dev(p);
369
360 p->start_sect = start; 370 p->start_sect = start;
361 p->nr_sects = len; 371 p->nr_sects = len;
362 p->partno = part; 372 p->partno = partno;
363 p->policy = disk->policy; 373 p->policy = get_disk_ro(disk);
364 374
365 if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) 375 dname = dev_name(ddev);
366 snprintf(p->dev.bus_id, BUS_ID_SIZE, 376 if (isdigit(dname[strlen(dname) - 1]))
367 "%sp%d", disk->dev.bus_id, part); 377 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
368 else 378 else
369 snprintf(p->dev.bus_id, BUS_ID_SIZE, 379 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
370 "%s%d", disk->dev.bus_id, part); 380
381 device_initialize(pdev);
382 pdev->class = &block_class;
383 pdev->type = &part_type;
384 pdev->parent = ddev;
371 385
372 device_initialize(&p->dev); 386 err = blk_alloc_devt(p, &devt);
373 p->dev.devt = MKDEV(disk->major, disk->first_minor + part); 387 if (err)
374 p->dev.class = &block_class; 388 goto out_free;
375 p->dev.type = &part_type; 389 pdev->devt = devt;
376 p->dev.parent = &disk->dev;
377 disk->part[part-1] = p;
378 390
379 /* delay uevent until 'holders' subdir is created */ 391 /* delay uevent until 'holders' subdir is created */
380 p->dev.uevent_suppress = 1; 392 pdev->uevent_suppress = 1;
381 err = device_add(&p->dev); 393 err = device_add(pdev);
382 if (err) 394 if (err)
383 goto out1; 395 goto out_put;
384 partition_sysfs_add_subdir(p); 396
385 p->dev.uevent_suppress = 0; 397 err = -ENOMEM;
398 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
399 if (!p->holder_dir)
400 goto out_del;
401
402 pdev->uevent_suppress = 0;
386 if (flags & ADDPART_FLAG_WHOLEDISK) { 403 if (flags & ADDPART_FLAG_WHOLEDISK) {
387 err = device_create_file(&p->dev, &dev_attr_whole_disk); 404 err = device_create_file(pdev, &dev_attr_whole_disk);
388 if (err) 405 if (err)
389 goto out2; 406 goto out_del;
390 } 407 }
391 408
409 /* everything is up and running, commence */
410 INIT_RCU_HEAD(&p->rcu_head);
411 rcu_assign_pointer(ptbl->part[partno], p);
412
392 /* suppress uevent if the disk supresses it */ 413 /* suppress uevent if the disk supresses it */
393 if (!disk->dev.uevent_suppress) 414 if (!ddev->uevent_suppress)
394 kobject_uevent(&p->dev.kobj, KOBJ_ADD); 415 kobject_uevent(&pdev->kobj, KOBJ_ADD);
395 416
396 return 0; 417 return 0;
397 418
398out2: 419out_free:
399 device_del(&p->dev);
400out1:
401 put_device(&p->dev);
402 free_part_stats(p);
403out0:
404 kfree(p); 420 kfree(p);
405 return err; 421 return err;
422out_del:
423 kobject_put(p->holder_dir);
424 device_del(pdev);
425out_put:
426 put_device(pdev);
427 blk_free_devt(devt);
428 return err;
406} 429}
407 430
408/* Not exported, helper to add_disk(). */ 431/* Not exported, helper to add_disk(). */
409void register_disk(struct gendisk *disk) 432void register_disk(struct gendisk *disk)
410{ 433{
434 struct device *ddev = disk_to_dev(disk);
411 struct block_device *bdev; 435 struct block_device *bdev;
436 struct disk_part_iter piter;
437 struct hd_struct *part;
412 char *s; 438 char *s;
413 int i;
414 struct hd_struct *p;
415 int err; 439 int err;
416 440
417 disk->dev.parent = disk->driverfs_dev; 441 ddev->parent = disk->driverfs_dev;
418 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
419 442
420 strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); 443 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
421 /* ewww... some of these buggers have / in the name... */ 444 /* ewww... some of these buggers have / in the name... */
422 s = strchr(disk->dev.bus_id, '/'); 445 s = strchr(ddev->bus_id, '/');
423 if (s) 446 if (s)
424 *s = '!'; 447 *s = '!';
425 448
426 /* delay uevents, until we scanned partition table */ 449 /* delay uevents, until we scanned partition table */
427 disk->dev.uevent_suppress = 1; 450 ddev->uevent_suppress = 1;
428 451
429 if (device_add(&disk->dev)) 452 if (device_add(ddev))
430 return; 453 return;
431#ifndef CONFIG_SYSFS_DEPRECATED 454#ifndef CONFIG_SYSFS_DEPRECATED
432 err = sysfs_create_link(block_depr, &disk->dev.kobj, 455 err = sysfs_create_link(block_depr, &ddev->kobj,
433 kobject_name(&disk->dev.kobj)); 456 kobject_name(&ddev->kobj));
434 if (err) { 457 if (err) {
435 device_del(&disk->dev); 458 device_del(ddev);
436 return; 459 return;
437 } 460 }
438#endif 461#endif
439 disk_sysfs_add_subdirs(disk); 462 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
463 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
440 464
441 /* No minors to use for partitions */ 465 /* No minors to use for partitions */
442 if (disk->minors == 1) 466 if (!disk_partitionable(disk))
443 goto exit; 467 goto exit;
444 468
445 /* No such device (e.g., media were just removed) */ 469 /* No such device (e.g., media were just removed) */
@@ -458,50 +482,66 @@ void register_disk(struct gendisk *disk)
458 482
459exit: 483exit:
460 /* announce disk after possible partitions are created */ 484 /* announce disk after possible partitions are created */
461 disk->dev.uevent_suppress = 0; 485 ddev->uevent_suppress = 0;
462 kobject_uevent(&disk->dev.kobj, KOBJ_ADD); 486 kobject_uevent(&ddev->kobj, KOBJ_ADD);
463 487
464 /* announce possible partitions */ 488 /* announce possible partitions */
465 for (i = 1; i < disk->minors; i++) { 489 disk_part_iter_init(&piter, disk, 0);
466 p = disk->part[i-1]; 490 while ((part = disk_part_iter_next(&piter)))
467 if (!p || !p->nr_sects) 491 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
468 continue; 492 disk_part_iter_exit(&piter);
469 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
470 }
471} 493}
472 494
473int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 495int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
474{ 496{
497 struct disk_part_iter piter;
498 struct hd_struct *part;
475 struct parsed_partitions *state; 499 struct parsed_partitions *state;
476 int p, res; 500 int p, highest, res;
477 501
478 if (bdev->bd_part_count) 502 if (bdev->bd_part_count)
479 return -EBUSY; 503 return -EBUSY;
480 res = invalidate_partition(disk, 0); 504 res = invalidate_partition(disk, 0);
481 if (res) 505 if (res)
482 return res; 506 return res;
483 bdev->bd_invalidated = 0; 507
484 for (p = 1; p < disk->minors; p++) 508 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
485 delete_partition(disk, p); 509 while ((part = disk_part_iter_next(&piter)))
510 delete_partition(disk, part->partno);
511 disk_part_iter_exit(&piter);
512
486 if (disk->fops->revalidate_disk) 513 if (disk->fops->revalidate_disk)
487 disk->fops->revalidate_disk(disk); 514 disk->fops->revalidate_disk(disk);
515 check_disk_size_change(disk, bdev);
516 bdev->bd_invalidated = 0;
488 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 517 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
489 return 0; 518 return 0;
490 if (IS_ERR(state)) /* I/O error reading the partition table */ 519 if (IS_ERR(state)) /* I/O error reading the partition table */
491 return -EIO; 520 return -EIO;
492 521
493 /* tell userspace that the media / partition table may have changed */ 522 /* tell userspace that the media / partition table may have changed */
494 kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); 523 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
495 524
525 /* Detect the highest partition number and preallocate
526 * disk->part_tbl. This is an optimization and not strictly
527 * necessary.
528 */
529 for (p = 1, highest = 0; p < state->limit; p++)
530 if (state->parts[p].size)
531 highest = p;
532
533 disk_expand_part_tbl(disk, highest);
534
535 /* add partitions */
496 for (p = 1; p < state->limit; p++) { 536 for (p = 1; p < state->limit; p++) {
497 sector_t size = state->parts[p].size; 537 sector_t size = state->parts[p].size;
498 sector_t from = state->parts[p].from; 538 sector_t from = state->parts[p].from;
499 if (!size) 539 if (!size)
500 continue; 540 continue;
501 if (from + size > get_capacity(disk)) { 541 if (from + size > get_capacity(disk)) {
502 printk(KERN_ERR " %s: p%d exceeds device capacity\n", 542 printk(KERN_WARNING
543 "%s: p%d exceeds device capacity\n",
503 disk->disk_name, p); 544 disk->disk_name, p);
504 continue;
505 } 545 }
506 res = add_partition(disk, p, from, size, state->parts[p].flags); 546 res = add_partition(disk, p, from, size, state->parts[p].flags);
507 if (res) { 547 if (res) {
@@ -541,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector);
541 581
542void del_gendisk(struct gendisk *disk) 582void del_gendisk(struct gendisk *disk)
543{ 583{
544 int p; 584 struct disk_part_iter piter;
585 struct hd_struct *part;
545 586
546 /* invalidate stuff */ 587 /* invalidate stuff */
547 for (p = disk->minors - 1; p > 0; p--) { 588 disk_part_iter_init(&piter, disk,
548 invalidate_partition(disk, p); 589 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
549 delete_partition(disk, p); 590 while ((part = disk_part_iter_next(&piter))) {
591 invalidate_partition(disk, part->partno);
592 delete_partition(disk, part->partno);
550 } 593 }
594 disk_part_iter_exit(&piter);
595
551 invalidate_partition(disk, 0); 596 invalidate_partition(disk, 0);
552 disk->capacity = 0; 597 blk_free_devt(disk_to_dev(disk)->devt);
598 set_capacity(disk, 0);
553 disk->flags &= ~GENHD_FL_UP; 599 disk->flags &= ~GENHD_FL_UP;
554 unlink_gendisk(disk); 600 unlink_gendisk(disk);
555 disk_stat_set_all(disk, 0); 601 part_stat_set_all(&disk->part0, 0);
556 disk->stamp = 0; 602 disk->part0.stamp = 0;
557 603
558 kobject_put(disk->holder_dir); 604 kobject_put(disk->part0.holder_dir);
559 kobject_put(disk->slave_dir); 605 kobject_put(disk->slave_dir);
560 disk->driverfs_dev = NULL; 606 disk->driverfs_dev = NULL;
561#ifndef CONFIG_SYSFS_DEPRECATED 607#ifndef CONFIG_SYSFS_DEPRECATED
562 sysfs_remove_link(block_depr, disk->dev.bus_id); 608 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
563#endif 609#endif
564 device_del(&disk->dev); 610 device_del(disk_to_dev(disk));
565} 611}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
5 * add_gd_partition adds a partitions details to the devices partition 5 * add_gd_partition adds a partitions details to the devices partition
6 * description. 6 * description.
7 */ 7 */
8enum { MAX_PART = 256 };
9
10struct parsed_partitions { 8struct parsed_partitions {
11 char name[BDEVNAME_SIZE]; 9 char name[BDEVNAME_SIZE];
12 struct { 10 struct {
13 sector_t from; 11 sector_t from;
14 sector_t size; 12 sector_t size;
15 int flags; 13 int flags;
16 } parts[MAX_PART]; 14 } parts[DISK_MAX_PARTS];
17 int next; 15 int next;
18 int limit; 16 int limit;
19}; 17};
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..71c9be59c9c2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
337 return 0; 337 return 0;
338} 338}
339 339
340/*
341 * Use precise platform statistics if available:
342 */
343#ifdef CONFIG_VIRT_CPU_ACCOUNTING
344static cputime_t task_utime(struct task_struct *p)
345{
346 return p->utime;
347}
348
349static cputime_t task_stime(struct task_struct *p)
350{
351 return p->stime;
352}
353#else
354static cputime_t task_utime(struct task_struct *p)
355{
356 clock_t utime = cputime_to_clock_t(p->utime),
357 total = utime + cputime_to_clock_t(p->stime);
358 u64 temp;
359
360 /*
361 * Use CFS's precise accounting:
362 */
363 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
364
365 if (total) {
366 temp *= utime;
367 do_div(temp, total);
368 }
369 utime = (clock_t)temp;
370
371 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
372 return p->prev_utime;
373}
374
375static cputime_t task_stime(struct task_struct *p)
376{
377 clock_t stime;
378
379 /*
380 * Use CFS's precise accounting. (we subtract utime from
381 * the total, to make sure the total observed by userspace
382 * grows monotonically - apps rely on that):
383 */
384 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
385 cputime_to_clock_t(task_utime(p));
386
387 if (stime >= 0)
388 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
389
390 return p->prev_stime;
391}
392#endif
393
394static cputime_t task_gtime(struct task_struct *p)
395{
396 return p->gtime;
397}
398
399static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 340static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
400 struct pid *pid, struct task_struct *task, int whole) 341 struct pid *pid, struct task_struct *task, int whole)
401{ 342{
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bca0f81eb687..7821589a17d5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -547,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
547 547
548 for (tmp = dir->subdir; tmp; tmp = tmp->next) 548 for (tmp = dir->subdir; tmp; tmp = tmp->next)
549 if (strcmp(tmp->name, dp->name) == 0) { 549 if (strcmp(tmp->name, dp->name) == 0) {
550 printk(KERN_WARNING "proc_dir_entry '%s' already " 550 printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
551 "registered\n", dp->name); 551 dir->name, dp->name);
552 dump_stack(); 552 dump_stack();
553 break; 553 break;
554 } 554 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded969862960..29e20c6b1f7f 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
24#include <linux/tty.h> 24#include <linux/tty.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/quicklist.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/ioport.h> 29#include <linux/ioport.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
@@ -182,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
182 "SReclaimable: %8lu kB\n" 183 "SReclaimable: %8lu kB\n"
183 "SUnreclaim: %8lu kB\n" 184 "SUnreclaim: %8lu kB\n"
184 "PageTables: %8lu kB\n" 185 "PageTables: %8lu kB\n"
186#ifdef CONFIG_QUICKLIST
187 "Quicklists: %8lu kB\n"
188#endif
185 "NFS_Unstable: %8lu kB\n" 189 "NFS_Unstable: %8lu kB\n"
186 "Bounce: %8lu kB\n" 190 "Bounce: %8lu kB\n"
187 "WritebackTmp: %8lu kB\n" 191 "WritebackTmp: %8lu kB\n"
@@ -214,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
214 K(global_page_state(NR_SLAB_RECLAIMABLE)), 218 K(global_page_state(NR_SLAB_RECLAIMABLE)),
215 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 219 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
216 K(global_page_state(NR_PAGETABLE)), 220 K(global_page_state(NR_PAGETABLE)),
221#ifdef CONFIG_QUICKLIST
222 K(quicklist_total_size()),
223#endif
217 K(global_page_state(NR_UNSTABLE_NFS)), 224 K(global_page_state(NR_UNSTABLE_NFS)),
218 K(global_page_state(NR_BOUNCE)), 225 K(global_page_state(NR_BOUNCE)),
219 K(global_page_state(NR_WRITEBACK_TEMP)), 226 K(global_page_state(NR_WRITEBACK_TEMP)),
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..5145cb9125af 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
58 * size 0 on the assumption that it's going to be used for an mmap of shared 58 * size 0 on the assumption that it's going to be used for an mmap of shared
59 * memory 59 * memory
60 */ 60 */
61static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 62{
63 struct pagevec lru_pvec; 63 struct pagevec lru_pvec;
64 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop, limit;
diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09c..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
898 if (unlikely(!(out->f_mode & FMODE_WRITE))) 898 if (unlikely(!(out->f_mode & FMODE_WRITE)))
899 return -EBADF; 899 return -EBADF;
900 900
901 if (unlikely(out->f_flags & O_APPEND))
902 return -EINVAL;
903
901 ret = rw_verify_area(WRITE, out, ppos, len); 904 ret = rw_verify_area(WRITE, out, ppos, len);
902 if (unlikely(ret < 0)) 905 if (unlikely(ret < 0))
903 return ret; 906 return ret;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 154098157473..73db464cd08b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
302 int subtract_lebs; 302 int subtract_lebs;
303 long long available; 303 long long available;
304 304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used; 305 available = c->main_bytes - c->lst.total_used;
318 306
319 /* 307 /*
@@ -714,34 +702,106 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
714} 702}
715 703
716/** 704/**
717 * ubifs_budg_get_free_space - return amount of free space. 705 * ubifs_reported_space - calculate reported free space.
706 * @c: the UBIFS file-system description object
707 * @free: amount of free space
708 *
709 * This function calculates amount of free space which will be reported to
710 * user-space. User-space application tend to expect that if the file-system
711 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
712 * are able to write a file of size N. UBIFS attaches node headers to each data
713 * node and it has to write indexind nodes as well. This introduces additional
714 * overhead, and UBIFS it has to report sligtly less free space to meet the
715 * above expectetion.
716 *
717 * This function assumes free space is made up of uncompressed data nodes and
718 * full index nodes (one per data node, tripled because we always allow enough
719 * space to write the index thrice).
720 *
721 * Note, the calculation is pessimistic, which means that most of the time
722 * UBIFS reports less space than it actually has.
723 */
724long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
725{
726 int divisor, factor, f;
727
728 /*
729 * Reported space size is @free * X, where X is UBIFS block size
730 * divided by UBIFS block size + all overhead one data block
731 * introduces. The overhead is the node header + indexing overhead.
732 *
733 * Indexing overhead calculations are based on the following formula:
734 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
735 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
736 * as less than maximum fanout, we assume that each data node
737 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
738 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
739 * for the index.
740 */
741 f = c->fanout > 3 ? c->fanout >> 1 : 2;
742 factor = UBIFS_BLOCK_SIZE;
743 divisor = UBIFS_MAX_DATA_NODE_SZ;
744 divisor += (c->max_idx_node_sz * 3) / (f - 1);
745 free *= factor;
746 do_div(free, divisor);
747 return free;
748}
749
750/**
751 * ubifs_get_free_space - return amount of free space.
718 * @c: UBIFS file-system description object 752 * @c: UBIFS file-system description object
719 * 753 *
720 * This function returns amount of free space on the file-system. 754 * This function calculates amount of free space to report to user-space.
755 *
756 * Because UBIFS may introduce substantial overhead (the index, node headers,
757 * alighment, wastage at the end of eraseblocks, etc), it cannot report real
758 * amount of free flash space it has (well, because not all dirty space is
759 * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
760 * it would bread user expectetion about what free space is. Users seem to
761 * accustomed to assume that if the file-system reports N bytes of free space,
762 * they would be able to fit a file of N bytes to the FS. This almost works for
763 * traditional file-systems, because they have way less overhead than UBIFS.
764 * So, to keep users happy, UBIFS tries to take the overhead into account.
721 */ 765 */
722long long ubifs_budg_get_free_space(struct ubifs_info *c) 766long long ubifs_get_free_space(struct ubifs_info *c)
723{ 767{
724 int min_idx_lebs, rsvd_idx_lebs; 768 int min_idx_lebs, rsvd_idx_lebs, lebs;
725 long long available, outstanding, free; 769 long long available, outstanding, free;
726 770
727 /* Do exactly the same calculations as in 'do_budget_space()' */
728 spin_lock(&c->space_lock); 771 spin_lock(&c->space_lock);
729 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 772 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
773 outstanding = c->budg_data_growth + c->budg_dd_growth;
730 774
731 if (min_idx_lebs > c->lst.idx_lebs) 775 /*
732 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 776 * Force the amount available to the total size reported if the used
733 else 777 * space is zero.
734 rsvd_idx_lebs = 0; 778 */
735 779 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
736 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
737 - c->lst.taken_empty_lebs) {
738 spin_unlock(&c->space_lock); 780 spin_unlock(&c->space_lock);
739 return 0; 781 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
740 } 782 }
741 783
742 available = ubifs_calc_available(c, min_idx_lebs); 784 available = ubifs_calc_available(c, min_idx_lebs);
743 outstanding = c->budg_data_growth + c->budg_dd_growth; 785
744 c->min_idx_lebs = min_idx_lebs; 786 /*
787 * When reporting free space to user-space, UBIFS guarantees that it is
788 * possible to write a file of free space size. This means that for
789 * empty LEBs we may use more precise calculations than
790 * 'ubifs_calc_available()' is using. Namely, we know that in empty
791 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
792 * Thus, amend the available space.
793 *
794 * Note, the calculations below are similar to what we have in
795 * 'do_budget_space()', so refer there for comments.
796 */
797 if (min_idx_lebs > c->lst.idx_lebs)
798 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
799 else
800 rsvd_idx_lebs = 0;
801 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
802 c->lst.taken_empty_lebs;
803 lebs -= rsvd_idx_lebs;
804 available += lebs * (c->dark_wm - c->leb_overhead);
745 spin_unlock(&c->space_lock); 805 spin_unlock(&c->space_lock);
746 806
747 if (available > outstanding) 807 if (available > outstanding)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b9cb77473758..d7f7645779f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++) 539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n", 540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i])); 541 (unsigned long long)le64_to_cpu(orph->inos[i]));
542 break; 542 break;
543 } 543 }
544 default: 544 default:
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c96f1fb7016..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
426 426
427 while (1) { 427 while (1) {
428 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 428 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
429 dent->name, le64_to_cpu(dent->inum), 429 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
430 key_hash_flash(c, &dent->key)); 430 key_hash_flash(c, &dent->key));
431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); 431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
432 432
@@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
587 if (err) { 587 if (err) {
588 if (err != -ENOSPC) 588 if (err != -ENOSPC)
589 return err; 589 return err;
590 err = 0;
591 budgeted = 0; 590 budgeted = 0;
592 } 591 }
593 592
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4071d1cae29f..3d698e2022b1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
793 int err; 793 int err;
794 struct ubifs_budget_req req; 794 struct ubifs_budget_req req;
795 loff_t old_size = inode->i_size, new_size = attr->ia_size; 795 loff_t old_size = inode->i_size, new_size = attr->ia_size;
796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1); 796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
797 struct ubifs_inode *ui = ubifs_inode(inode); 797 struct ubifs_inode *ui = ubifs_inode(inode);
798 798
799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); 799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
811 /* A funny way to budget for truncation node */ 811 /* A funny way to budget for truncation node */
812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; 812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
813 err = ubifs_budget_space(c, &req); 813 err = ubifs_budget_space(c, &req);
814 if (err) 814 if (err) {
815 return err; 815 /*
816 * Treat truncations to zero as deletion and always allow them,
817 * just like we do for '->unlink()'.
818 */
819 if (new_size || err != -ENOSPC)
820 return err;
821 budgeted = 0;
822 }
816 823
817 err = vmtruncate(inode, new_size); 824 err = vmtruncate(inode, new_size);
818 if (err) 825 if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
869 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 876 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
870 mutex_unlock(&ui->ui_mutex); 877 mutex_unlock(&ui->ui_mutex);
871out_budg: 878out_budg:
872 ubifs_release_budget(c, &req); 879 if (budgeted)
880 ubifs_release_budget(c, &req);
881 else {
882 c->nospace = c->nospace_rp = 0;
883 smp_wmb();
884 }
873 return err; 885 return err;
874} 886}
875 887
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index adee7b5ddeab..47814cde2407 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty 211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria. 212 * or do not have an LEB which satisfies the @min_space criteria.
213 * 213 *
214 * Note: 214 * Note, LEBs which have less than dead watermark of free + dirty space are
215 * o LEBs which have less than dead watermark of dirty space are never picked 215 * never picked by this function.
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 * 216 *
223 * The additional @pick_free argument controls if this function has to return a 217 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must to set it to %1, 218 * free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
231 * 225 *
232 * In addition @pick_free is set to %2 by the recovery process in order to 226 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned. 227 * recover gc_lnum in which case an index LEB must not be returned.
228 *
229 * This function returns zero and the LEB properties of found dirty LEB in case
230 * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
231 * case of other failures. The returned LEB is marked as "taken".
234 */ 232 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 233int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free) 234 int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
245 int lebs, rsvd_idx_lebs = 0; 243 int lebs, rsvd_idx_lebs = 0;
246 244
247 spin_lock(&c->space_lock); 245 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs; 246 lebs = c->lst.empty_lebs + c->idx_gc_cnt;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs; 247 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250 248
251 /* 249 /*
@@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
317 lp = idx_lp; 315 lp = idx_lp;
318 316
319 if (lp) { 317 if (lp) {
320 ubifs_assert(lp->dirty >= c->dead_wm); 318 ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
321 goto found; 319 goto found;
322 } 320 }
323 321
@@ -509,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
509 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
510 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
511 c->lst.taken_empty_lebs; 509 c->lst.taken_empty_lebs;
512 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
513 if (rsvd_idx_lebs < lebs) 510 if (rsvd_idx_lebs < lebs)
514 /* 511 /*
515 * OK to allocate an empty LEB, but we still don't want to go 512 * OK to allocate an empty LEB, but we still don't want to go
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..02aba36fe3d4 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
334 334
335 err = move_nodes(c, sleb); 335 err = move_nodes(c, sleb);
336 if (err) 336 if (err)
337 goto out; 337 goto out_inc_seq;
338 338
339 err = gc_sync_wbufs(c); 339 err = gc_sync_wbufs(c);
340 if (err) 340 if (err)
341 goto out; 341 goto out_inc_seq;
342 342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); 343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err) 344 if (err)
345 goto out; 345 goto out_inc_seq;
346
347 /* Allow for races with TNC */
348 c->gced_lnum = lnum;
349 smp_wmb();
350 c->gc_seq += 1;
351 smp_wmb();
346 352
347 if (c->gc_lnum == -1) { 353 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum; 354 c->gc_lnum = lnum;
@@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
363out: 369out:
364 ubifs_scan_destroy(sleb); 370 ubifs_scan_destroy(sleb);
365 return err; 371 return err;
372
373out_inc_seq:
374 /* We may have moved at least some nodes so allow for races with TNC */
375 c->gced_lnum = lnum;
376 smp_wmb();
377 c->gc_seq += 1;
378 smp_wmb();
379 goto out;
366} 380}
367 381
368/** 382/**
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87dabf9fe742..4c12a9215d7f 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -284,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
284} 284}
285 285
286/** 286/**
287 * ubifs_reported_space - calculate reported free space.
288 * @c: the UBIFS file-system description object
289 * @free: amount of free space
290 *
291 * This function calculates amount of free space which will be reported to
292 * user-space. User-space application tend to expect that if the file-system
293 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
294 * are able to write a file of size N. UBIFS attaches node headers to each data
295 * node and it has to write indexind nodes as well. This introduces additional
296 * overhead, and UBIFS it has to report sligtly less free space to meet the
297 * above expectetion.
298 *
299 * This function assumes free space is made up of uncompressed data nodes and
300 * full index nodes (one per data node, doubled because we always allow enough
301 * space to write the index twice).
302 *
303 * Note, the calculation is pessimistic, which means that most of the time
304 * UBIFS reports less space than it actually has.
305 */
306static inline long long ubifs_reported_space(const struct ubifs_info *c,
307 uint64_t free)
308{
309 int divisor, factor;
310
311 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
312 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
313 do_div(free, divisor);
314
315 return free * factor;
316}
317
318/**
319 * ubifs_current_time - round current time to time granularity. 287 * ubifs_current_time - round current time to time granularity.
320 * @inode: inode 288 * @inode: inode
321 */ 289 */
@@ -325,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
325 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 293 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
326} 294}
327 295
296/**
297 * ubifs_tnc_lookup - look up a file-system node.
298 * @c: UBIFS file-system description object
299 * @key: node key to lookup
300 * @node: the node is returned here
301 *
302 * This function look up and reads node with key @key. The caller has to make
303 * sure the @node buffer is large enough to fit the node. Returns zero in case
304 * of success, %-ENOENT if the node was not found, and a negative error code in
305 * case of failure.
306 */
307static inline int ubifs_tnc_lookup(struct ubifs_info *c,
308 const union ubifs_key *key, void *node)
309{
310 return ubifs_tnc_locate(c, key, node, NULL, NULL);
311}
312
328#endif /* __UBIFS_MISC_H__ */ 313#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f71e6b8822c4..3f4902060c7a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -370,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
370{ 370{
371 struct ubifs_info *c = dentry->d_sb->s_fs_info; 371 struct ubifs_info *c = dentry->d_sb->s_fs_info;
372 unsigned long long free; 372 unsigned long long free;
373 __le32 *uuid = (__le32 *)c->uuid;
373 374
374 free = ubifs_budg_get_free_space(c); 375 free = ubifs_get_free_space(c);
375 dbg_gen("free space %lld bytes (%lld blocks)", 376 dbg_gen("free space %lld bytes (%lld blocks)",
376 free, free >> UBIFS_BLOCK_SHIFT); 377 free, free >> UBIFS_BLOCK_SHIFT);
377 378
@@ -386,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
386 buf->f_files = 0; 387 buf->f_files = 0;
387 buf->f_ffree = 0; 388 buf->f_ffree = 0;
388 buf->f_namelen = UBIFS_MAX_NLEN; 389 buf->f_namelen = UBIFS_MAX_NLEN;
389 390 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
391 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
390 return 0; 392 return 0;
391} 393}
392 394
@@ -530,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)
530 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 532 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
531 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 533 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
532 534
535 /*
536 * Calculate how many bytes would be wasted at the end of LEB if it was
537 * fully filled with data nodes of maximum size. This is used in
538 * calculations when reporting free space.
539 */
540 c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
533 return 0; 541 return 0;
534} 542}
535 543
@@ -647,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)
647 * internally because it does not make much sense for UBIFS, but it is 655 * internally because it does not make much sense for UBIFS, but it is
648 * necessary to report something for the 'statfs()' call. 656 * necessary to report something for the 'statfs()' call.
649 * 657 *
650 * Subtract the LEB reserved for GC and the LEB which is reserved for 658 * Subtract the LEB reserved for GC, the LEB which is reserved for
651 * deletions. 659 * deletions, and assume only one journal head is available.
652 *
653 * Review 'ubifs_calc_available()' if changing this calculation.
654 */ 660 */
655 tmp64 = c->main_lebs - 2; 661 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
656 tmp64 *= (uint64_t)c->leb_size - c->dark_wm; 662 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
657 tmp64 = ubifs_reported_space(c, tmp64); 663 tmp64 = ubifs_reported_space(c, tmp64);
658 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 664 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
659 665
@@ -1018,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
1018 goto out_dereg; 1024 goto out_dereg;
1019 } 1025 }
1020 1026
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1021 if (!mounted_read_only) { 1028 if (!mounted_read_only) {
1022 err = alloc_wbufs(c); 1029 err = alloc_wbufs(c);
1023 if (err) 1030 if (err)
1024 goto out_cbuf; 1031 goto out_cbuf;
1025 1032
1026 /* Create background thread */ 1033 /* Create background thread */
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
1028 c->vi.vol_id);
1029 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1034 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1030 if (!c->bgt) 1035 if (!c->bgt)
1031 c->bgt = ERR_PTR(-EINVAL); 1036 c->bgt = ERR_PTR(-EINVAL);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..7634c5970887 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506 if (keys_cmp(c, key, &node_key) != 0) 506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0; 507 ret = 0;
508 } 508 }
509 if (ret == 0) 509 if (ret == 0 && c->replaying)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret; 512 return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1382} 1382}
1383 1383
1384/** 1384/**
1385 * ubifs_tnc_lookup - look up a file-system node. 1385 * maybe_leb_gced - determine if a LEB may have been garbage collected.
1386 * @c: UBIFS file-system description object 1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup 1387 * @lnum: LEB number
1388 * @node: the node is returned here 1388 * @gc_seq1: garbage collection sequence number
1389 * 1389 *
1390 * This function look up and reads node with key @key. The caller has to make 1390 * This function determines if @lnum may have been garbage collected since
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case 1391 * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
1392 * of success, %-ENOENT if the node was not found, and a negative error code in 1392 * %0 is returned.
1393 * case of failure.
1394 */ 1393 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, 1394static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1396 void *node)
1397{ 1395{
1398 int found, n, err; 1396 int gc_seq2, gced_lnum;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401 1397
1402 mutex_lock(&c->tnc_mutex); 1398 gced_lnum = c->gced_lnum;
1403 found = ubifs_lookup_level0(c, key, &znode, &n); 1399 smp_rmb();
1404 if (!found) { 1400 gc_seq2 = c->gc_seq;
1405 err = -ENOENT; 1401 /* Same seq means no GC */
1406 goto out; 1402 if (gc_seq1 == gc_seq2)
1407 } else if (found < 0) { 1403 return 0;
1408 err = found; 1404 /* Different by more than 1 means we don't know */
1409 goto out; 1405 if (gc_seq1 + 1 != gc_seq2)
1410 } 1406 return 1;
1411 zt = &znode->zbranch[n]; 1407 /*
1412 if (is_hash_key(c, key)) { 1408 * We have seen the sequence number has increased by 1. Now we need to
1413 /* 1409 * be sure we read the right LEB number, so read it again.
1414 * In this case the leaf node cache gets used, so we pass the 1410 */
1415 * address of the zbranch and keep the mutex locked 1411 smp_rmb();
1416 */ 1412 if (gced_lnum != c->gced_lnum)
1417 err = tnc_read_node_nm(c, zt, node); 1413 return 1;
1418 goto out; 1414 /* Finally we can check lnum */
1419 } 1415 if (gced_lnum == lnum)
1420 zbr = znode->zbranch[n]; 1416 return 1;
1421 mutex_unlock(&c->tnc_mutex); 1417 return 0;
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429} 1418}
1430 1419
1431/** 1420/**
@@ -1436,16 +1425,19 @@ out:
1436 * @lnum: LEB number is returned here 1425 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here 1426 * @offs: offset is returned here
1438 * 1427 *
1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node 1428 * This function look up and reads node with key @key. The caller has to make
1440 * location also. See 'ubifs_tnc_lookup()'. 1429 * sure the @node buffer is large enough to fit the node. Returns zero in case
1430 * of success, %-ENOENT if the node was not found, and a negative error code in
1431 * case of failure. The node location can be returned in @lnum and @offs.
1441 */ 1432 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1433int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs) 1434 void *node, int *lnum, int *offs)
1444{ 1435{
1445 int found, n, err; 1436 int found, n, err, safely = 0, gc_seq1;
1446 struct ubifs_znode *znode; 1437 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt; 1438 struct ubifs_zbranch zbr, *zt;
1448 1439
1440again:
1449 mutex_lock(&c->tnc_mutex); 1441 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n); 1442 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) { 1443 if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1456 goto out; 1448 goto out;
1457 } 1449 }
1458 zt = &znode->zbranch[n]; 1450 zt = &znode->zbranch[n];
1451 if (lnum) {
1452 *lnum = zt->lnum;
1453 *offs = zt->offs;
1454 }
1459 if (is_hash_key(c, key)) { 1455 if (is_hash_key(c, key)) {
1460 /* 1456 /*
1461 * In this case the leaf node cache gets used, so we pass the 1457 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked 1458 * address of the zbranch and keep the mutex locked
1463 */ 1459 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node); 1460 err = tnc_read_node_nm(c, zt, node);
1467 goto out; 1461 goto out;
1468 } 1462 }
1463 if (safely) {
1464 err = ubifs_tnc_read_node(c, zt, node);
1465 goto out;
1466 }
1467 /* Drop the TNC mutex prematurely and race with garbage collection */
1469 zbr = znode->zbranch[n]; 1468 zbr = znode->zbranch[n];
1469 gc_seq1 = c->gc_seq;
1470 mutex_unlock(&c->tnc_mutex); 1470 mutex_unlock(&c->tnc_mutex);
1471 1471
1472 *lnum = zbr.lnum; 1472 if (ubifs_get_wbuf(c, zbr.lnum)) {
1473 *offs = zbr.offs; 1473 /* We do not GC journal heads */
1474 err = ubifs_tnc_read_node(c, &zbr, node);
1475 return err;
1476 }
1474 1477
1475 err = ubifs_tnc_read_node(c, &zbr, node); 1478 err = fallible_read_node(c, key, &zbr, node);
1476 return err; 1479 if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
1480 /*
1481 * The node may have been GC'ed out from under us so try again
1482 * while keeping the TNC mutex locked.
1483 */
1484 safely = 1;
1485 goto again;
1486 }
1487 return 0;
1477 1488
1478out: 1489out:
1479 mutex_unlock(&c->tnc_mutex); 1490 mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1498{ 1509{
1499 int found, n, err; 1510 int found, n, err;
1500 struct ubifs_znode *znode; 1511 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502 1512
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1513 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex); 1514 mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1522 goto out_unlock; 1532 goto out_unlock;
1523 } 1533 }
1524 1534
1525 zbr = znode->zbranch[n]; 1535 err = tnc_read_node_nm(c, &znode->zbranch[n], node);
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530 1536
1531out_unlock: 1537out_unlock:
1532 mutex_unlock(&c->tnc_mutex); 1538 mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index bd2121f3426e..a9ecbd9af20d 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
87#define UBIFS_SK_LEN 8 87#define UBIFS_SK_LEN 8
88 88
89/* Minimum index tree fanout */ 89/* Minimum index tree fanout */
90#define UBIFS_MIN_FANOUT 2 90#define UBIFS_MIN_FANOUT 3
91 91
92/* Maximum number of levels in UBIFS indexing B-tree */ 92/* Maximum number of levels in UBIFS indexing B-tree */
93#define UBIFS_MAX_LEVELS 512 93#define UBIFS_MAX_LEVELS 512
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d7f706f7a302..17c620b93eec 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -995,6 +995,9 @@ struct ubifs_mount_opts {
995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
996 * @max_inode_sz: maximum possible inode size in bytes 996 * @max_inode_sz: maximum possible inode size in bytes
997 * @max_znode_sz: size of znode in bytes 997 * @max_znode_sz: size of znode in bytes
998 *
999 * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
1000 * data nodes of maximum size - used in free space reporting
998 * @dead_wm: LEB dead space watermark 1001 * @dead_wm: LEB dead space watermark
999 * @dark_wm: LEB dark space watermark 1002 * @dark_wm: LEB dark space watermark
1000 * @block_cnt: count of 4KiB blocks on the FS 1003 * @block_cnt: count of 4KiB blocks on the FS
@@ -1028,6 +1031,8 @@ struct ubifs_mount_opts {
1028 * @sbuf: a buffer of LEB size used by GC and replay for scanning 1031 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1029 * @idx_gc: list of index LEBs that have been garbage collected 1032 * @idx_gc: list of index LEBs that have been garbage collected
1030 * @idx_gc_cnt: number of elements on the idx_gc list 1033 * @idx_gc_cnt: number of elements on the idx_gc list
1034 * @gc_seq: incremented for every non-index LEB garbage collected
1035 * @gced_lnum: last non-index LEB that was garbage collected
1031 * 1036 *
1032 * @infos_list: links all 'ubifs_info' objects 1037 * @infos_list: links all 'ubifs_info' objects
1033 * @umount_mutex: serializes shrinker and un-mount 1038 * @umount_mutex: serializes shrinker and un-mount
@@ -1224,6 +1229,8 @@ struct ubifs_info {
1224 int max_idx_node_sz; 1229 int max_idx_node_sz;
1225 long long max_inode_sz; 1230 long long max_inode_sz;
1226 int max_znode_sz; 1231 int max_znode_sz;
1232
1233 int leb_overhead;
1227 int dead_wm; 1234 int dead_wm;
1228 int dark_wm; 1235 int dark_wm;
1229 int block_cnt; 1236 int block_cnt;
@@ -1257,6 +1264,8 @@ struct ubifs_info {
1257 void *sbuf; 1264 void *sbuf;
1258 struct list_head idx_gc; 1265 struct list_head idx_gc;
1259 int idx_gc_cnt; 1266 int idx_gc_cnt;
1267 volatile int gc_seq;
1268 volatile int gced_lnum;
1260 1269
1261 struct list_head infos_list; 1270 struct list_head infos_list;
1262 struct mutex umount_mutex; 1271 struct mutex umount_mutex;
@@ -1434,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1434 struct ubifs_budget_req *req); 1443 struct ubifs_budget_req *req);
1435void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1444void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1436 struct ubifs_budget_req *req); 1445 struct ubifs_budget_req *req);
1437long long ubifs_budg_get_free_space(struct ubifs_info *c); 1446long long ubifs_get_free_space(struct ubifs_info *c);
1438int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1447int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1439void ubifs_convert_page_budget(struct ubifs_info *c); 1448void ubifs_convert_page_budget(struct ubifs_info *c);
1449long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
1440long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1450long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1441 1451
1442/* find.c */ 1452/* find.c */
@@ -1451,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1451/* tnc.c */ 1461/* tnc.c */
1452int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, 1462int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1453 struct ubifs_znode **zn, int *n); 1463 struct ubifs_znode **zn, int *n);
1454int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1455 void *node);
1456int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, 1464int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1457 void *node, const struct qstr *nm); 1465 void *node, const struct qstr *nm);
1458int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1466int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = udf_fsync_file,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek,
214}; 215};
215 216
216const struct inode_operations udf_file_inode_operations = { 217const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
76 *err = -ENOSPC; 76 *err = -ENOSPC;
77 77
78 iinfo = UDF_I(inode); 78 iinfo = UDF_I(inode);
79 iinfo->i_unique = 0; 79 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
80 iinfo->i_lenExtents = 0; 80 iinfo->i_efe = 1;
81 iinfo->i_next_alloc_block = 0; 81 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
82 iinfo->i_next_alloc_goal = 0; 82 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
83 iinfo->i_strat4096 = 0; 83 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
84 sizeof(struct extendedFileEntry),
85 GFP_KERNEL);
86 } else {
87 iinfo->i_efe = 0;
88 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
89 sizeof(struct fileEntry),
90 GFP_KERNEL);
91 }
92 if (!iinfo->i_ext.i_data) {
93 iput(inode);
94 *err = -ENOMEM;
95 return NULL;
96 }
84 97
85 block = udf_new_block(dir->i_sb, NULL, 98 block = udf_new_block(dir->i_sb, NULL,
86 dinfo->i_location.partitionReferenceNum, 99 dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
111 lvhd->uniqueID = cpu_to_le64(uniqueID); 124 lvhd->uniqueID = cpu_to_le64(uniqueID);
112 mark_buffer_dirty(sbi->s_lvid_bh); 125 mark_buffer_dirty(sbi->s_lvid_bh);
113 } 126 }
127 mutex_unlock(&sbi->s_alloc_mutex);
114 inode->i_mode = mode; 128 inode->i_mode = mode;
115 inode->i_uid = current->fsuid; 129 inode->i_uid = current->fsuid;
116 if (dir->i_mode & S_ISGID) { 130 if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
129 iinfo->i_lenEAttr = 0; 143 iinfo->i_lenEAttr = 0;
130 iinfo->i_lenAlloc = 0; 144 iinfo->i_lenAlloc = 0;
131 iinfo->i_use = 0; 145 iinfo->i_use = 0;
132 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
133 iinfo->i_efe = 1;
134 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
135 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
136 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
137 sizeof(struct extendedFileEntry),
138 GFP_KERNEL);
139 } else {
140 iinfo->i_efe = 0;
141 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
142 sizeof(struct fileEntry),
143 GFP_KERNEL);
144 }
145 if (!iinfo->i_ext.i_data) {
146 iput(inode);
147 *err = -ENOMEM;
148 mutex_unlock(&sbi->s_alloc_mutex);
149 return NULL;
150 }
151 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
152 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 147 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
153 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
158 iinfo->i_crtime = current_fs_time(inode->i_sb); 153 iinfo->i_crtime = current_fs_time(inode->i_sb);
159 insert_inode_hash(inode); 154 insert_inode_hash(inode);
160 mark_inode_dirty(inode); 155 mark_inode_dirty(inode);
161 mutex_unlock(&sbi->s_alloc_mutex);
162 156
163 if (DQUOT_ALLOC_INODE(inode)) { 157 if (DQUOT_ALLOC_INODE(inode)) {
164 DQUOT_DROP(inode); 158 DQUOT_DROP(inode);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index f42f80a3b1fa..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1338,6 +1338,10 @@ __xfs_get_blocks(
1338 offset = (xfs_off_t)iblock << inode->i_blkbits; 1338 offset = (xfs_off_t)iblock << inode->i_blkbits;
1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1340 size = bh_result->b_size; 1340 size = bh_result->b_size;
1341
1342 if (!create && direct && offset >= i_size_read(inode))
1343 return 0;
1344
1341 error = xfs_iomap(XFS_I(inode), offset, size, 1345 error = xfs_iomap(XFS_I(inode), offset, size,
1342 create ? flags : BMAPI_READ, &iomap, &niomap); 1346 create ? flags : BMAPI_READ, &iomap, &niomap);
1343 if (error) 1347 if (error)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 986061ae1b9b..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1001,12 +1001,13 @@ xfs_buf_iodone_work(
1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1002 * ordered flag and reissue them. Because we can't tell the higher 1002 * ordered flag and reissue them. Because we can't tell the higher
1003 * layers directly that they should not issue ordered I/O anymore, they 1003 * layers directly that they should not issue ordered I/O anymore, they
1004 * need to check if the ordered flag was cleared during I/O completion. 1004 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1005 */ 1005 */
1006 if ((bp->b_error == EOPNOTSUPP) && 1006 if ((bp->b_error == EOPNOTSUPP) &&
1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1009 bp->b_flags &= ~XBF_ORDERED; 1009 bp->b_flags &= ~XBF_ORDERED;
1010 bp->b_flags |= _XFS_BARRIER_FAILED;
1010 xfs_buf_iorequest(bp); 1011 xfs_buf_iorequest(bp);
1011 } else if (bp->b_iodone) 1012 } else if (bp->b_iodone)
1012 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fe0109956656..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88
89 /*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95 _XFS_BARRIER_FAILED = (1 << 23),
88} xfs_buf_flags_t; 96} xfs_buf_flags_t;
89 97
90typedef enum { 98typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 73c65f19e549..18d3c8487835 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1302,9 +1302,29 @@ xfs_fs_remount(
1302 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1303 break; 1303 break;
1304 default: 1304 default:
1305 /*
1306 * Logically we would return an error here to prevent
1307 * users from believing they might have changed
1308 * mount options using remount which can't be changed.
1309 *
1310 * But unfortunately mount(8) adds all options from
1311 * mtab and fstab to the mount arguments in some cases
1312 * so we can't blindly reject options, but have to
1313 * check for each specified option if it actually
1314 * differs from the currently set option and only
1315 * reject it if that's the case.
1316 *
1317 * Until that is implemented we return success for
1318 * every remount request, and silently ignore all
1319 * options that we can't actually change.
1320 */
1321#if 0
1305 printk(KERN_INFO 1322 printk(KERN_INFO
1306 "XFS: mount option \"%s\" not supported for remount\n", p); 1323 "XFS: mount option \"%s\" not supported for remount\n", p);
1307 return -EINVAL; 1324 return -EINVAL;
1325#else
1326 return 0;
1327#endif
1308 } 1328 }
1309 } 1329 }
1310 1330
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 608c30c3f76b..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,6 +732,7 @@ xfs_buf_item_init(
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_buf = bp; 734 bip->bli_buf = bp;
735 xfs_buf_hold(bp);
735 bip->bli_format.blf_type = XFS_LI_BUF; 736 bip->bli_format.blf_type = XFS_LI_BUF;
736 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 737 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
737 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 738 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
867 return (bip->bli_flags & XFS_BLI_DIRTY); 868 return (bip->bli_flags & XFS_BLI_DIRTY);
868} 869}
869 870
871STATIC void
872xfs_buf_item_free(
873 xfs_buf_log_item_t *bip)
874{
875#ifdef XFS_TRANS_DEBUG
876 kmem_free(bip->bli_orig);
877 kmem_free(bip->bli_logged);
878#endif /* XFS_TRANS_DEBUG */
879
880#ifdef XFS_BLI_TRACE
881 ktrace_free(bip->bli_trace);
882#endif
883 kmem_zone_free(xfs_buf_item_zone, bip);
884}
885
870/* 886/*
871 * This is called when the buf log item is no longer needed. It should 887 * This is called when the buf log item is no longer needed. It should
872 * free the buf log item associated with the given buffer and clear 888 * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
887 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
888 XFS_BUF_CLR_IODONE_FUNC(bp); 904 XFS_BUF_CLR_IODONE_FUNC(bp);
889 } 905 }
890 906 xfs_buf_rele(bp);
891#ifdef XFS_TRANS_DEBUG 907 xfs_buf_item_free(bip);
892 kmem_free(bip->bli_orig);
893 bip->bli_orig = NULL;
894 kmem_free(bip->bli_logged);
895 bip->bli_logged = NULL;
896#endif /* XFS_TRANS_DEBUG */
897
898#ifdef XFS_BLI_TRACE
899 ktrace_free(bip->bli_trace);
900#endif
901 kmem_zone_free(xfs_buf_item_zone, bip);
902} 908}
903 909
904 910
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
1120 1126
1121 ASSERT(bip->bli_buf == bp); 1127 ASSERT(bip->bli_buf == bp);
1122 1128
1129 xfs_buf_rele(bp);
1123 mp = bip->bli_item.li_mountp; 1130 mp = bip->bli_item.li_mountp;
1124 1131
1125 /* 1132 /*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
1136 * xfs_trans_delete_ail() drops the AIL lock. 1143 * xfs_trans_delete_ail() drops the AIL lock.
1137 */ 1144 */
1138 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1139 1146 xfs_buf_item_free(bip);
1140#ifdef XFS_TRANS_DEBUG
1141 kmem_free(bip->bli_orig);
1142 bip->bli_orig = NULL;
1143 kmem_free(bip->bli_logged);
1144 bip->bli_logged = NULL;
1145#endif /* XFS_TRANS_DEBUG */
1146
1147#ifdef XFS_BLI_TRACE
1148 ktrace_free(bip->bli_trace);
1149#endif
1150 kmem_zone_free(xfs_buf_item_zone, bip);
1151} 1147}
1152 1148
1153#if defined(XFS_BLI_TRACE) 1149#if defined(XFS_BLI_TRACE)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 760f4c5b5160..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -149,7 +149,14 @@ xfs_swap_extents(
149 149
150 sbp = &sxp->sx_stat; 150 sbp = &sxp->sx_stat;
151 151
152 xfs_lock_two_inodes(ip, tip, lock_flags); 152 /*
153 * we have to do two separate lock calls here to keep lockdep
154 * happy. If we try to get all the locks in one call, lock will
155 * report false positives when we drop the ILOCK and regain them
156 * below.
157 */
158 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
159 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
153 locked = 1; 160 locked = 1;
154 161
155 /* Verify that both files have the same format */ 162 /* Verify that both files have the same format */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 00e80df9dd9d..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct(
4118 ASSERT(nextents <= XFS_LINEAR_EXTS); 4118 ASSERT(nextents <= XFS_LINEAR_EXTS);
4119 size = nextents * sizeof(xfs_bmbt_rec_t); 4119 size = nextents * sizeof(xfs_bmbt_rec_t);
4120 4120
4121 xfs_iext_irec_compact_full(ifp); 4121 xfs_iext_irec_compact_pages(ifp);
4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
4123 4123
4124 ep = ifp->if_u1.if_ext_irec->er_extbuf; 4124 ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4449,8 +4449,7 @@ xfs_iext_irec_remove(
4449 * compaction policy is as follows: 4449 * compaction policy is as follows:
4450 * 4450 *
4451 * Full Compaction: Extents fit into a single page (or inline buffer) 4451 * Full Compaction: Extents fit into a single page (or inline buffer)
4452 * Full Compaction: Extents occupy less than 10% of allocated space 4452 * Partial Compaction: Extents occupy less than 50% of allocated space
4453 * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
4454 * No Compaction: Extents occupy at least 50% of allocated space 4453 * No Compaction: Extents occupy at least 50% of allocated space
4455 */ 4454 */
4456void 4455void
@@ -4471,8 +4470,6 @@ xfs_iext_irec_compact(
4471 xfs_iext_direct_to_inline(ifp, nextents); 4470 xfs_iext_direct_to_inline(ifp, nextents);
4472 } else if (nextents <= XFS_LINEAR_EXTS) { 4471 } else if (nextents <= XFS_LINEAR_EXTS) {
4473 xfs_iext_indirect_to_direct(ifp); 4472 xfs_iext_indirect_to_direct(ifp);
4474 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
4475 xfs_iext_irec_compact_full(ifp);
4476 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4473 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4477 xfs_iext_irec_compact_pages(ifp); 4474 xfs_iext_irec_compact_pages(ifp);
4478 } 4475 }
@@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages(
4496 erp_next = erp + 1; 4493 erp_next = erp + 1;
4497 if (erp_next->er_extcount <= 4494 if (erp_next->er_extcount <=
4498 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4495 (XFS_LINEAR_EXTS - erp->er_extcount)) {
4499 memmove(&erp->er_extbuf[erp->er_extcount], 4496 memcpy(&erp->er_extbuf[erp->er_extcount],
4500 erp_next->er_extbuf, erp_next->er_extcount * 4497 erp_next->er_extbuf, erp_next->er_extcount *
4501 sizeof(xfs_bmbt_rec_t)); 4498 sizeof(xfs_bmbt_rec_t));
4502 erp->er_extcount += erp_next->er_extcount; 4499 erp->er_extcount += erp_next->er_extcount;
@@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages(
4516} 4513}
4517 4514
4518/* 4515/*
4519 * Fully compact the extent records managed by the indirection array.
4520 */
4521void
4522xfs_iext_irec_compact_full(
4523 xfs_ifork_t *ifp) /* inode fork pointer */
4524{
4525 xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */
4526 xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */
4527 int erp_idx = 0; /* extent irec index */
4528 int ext_avail; /* empty entries in ex list */
4529 int ext_diff; /* number of exts to add */
4530 int nlists; /* number of irec's (ex lists) */
4531
4532 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4533
4534 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4535 erp = ifp->if_u1.if_ext_irec;
4536 ep = &erp->er_extbuf[erp->er_extcount];
4537 erp_next = erp + 1;
4538 ep_next = erp_next->er_extbuf;
4539
4540 while (erp_idx < nlists - 1) {
4541 /*
4542 * Check how many extent records are available in this irec.
4543 * If there is none skip the whole exercise.
4544 */
4545 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
4546 if (ext_avail) {
4547
4548 /*
4549 * Copy over as many as possible extent records into
4550 * the previous page.
4551 */
4552 ext_diff = MIN(ext_avail, erp_next->er_extcount);
4553 memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
4554 erp->er_extcount += ext_diff;
4555 erp_next->er_extcount -= ext_diff;
4556
4557 /*
4558 * If the next irec is empty now we can simply
4559 * remove it.
4560 */
4561 if (erp_next->er_extcount == 0) {
4562 /*
4563 * Free page before removing extent record
4564 * so er_extoffs don't get modified in
4565 * xfs_iext_irec_remove.
4566 */
4567 kmem_free(erp_next->er_extbuf);
4568 erp_next->er_extbuf = NULL;
4569 xfs_iext_irec_remove(ifp, erp_idx + 1);
4570 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4571 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4572
4573 /*
4574 * If the next irec is not empty move up the content
4575 * that has not been copied to the previous page to
4576 * the beggining of this one.
4577 */
4578 } else {
4579 memmove(erp_next->er_extbuf, &ep_next[ext_diff],
4580 erp_next->er_extcount *
4581 sizeof(xfs_bmbt_rec_t));
4582 ep_next = erp_next->er_extbuf;
4583 memset(&ep_next[erp_next->er_extcount], 0,
4584 (XFS_LINEAR_EXTS -
4585 erp_next->er_extcount) *
4586 sizeof(xfs_bmbt_rec_t));
4587 }
4588 }
4589
4590 if (erp->er_extcount == XFS_LINEAR_EXTS) {
4591 erp_idx++;
4592 if (erp_idx < nlists)
4593 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4594 else
4595 break;
4596 }
4597 ep = &erp->er_extbuf[erp->er_extcount];
4598 erp_next = erp + 1;
4599 ep_next = erp_next->er_extbuf;
4600 }
4601}
4602
4603/*
4604 * This is called to update the er_extoff field in the indirection 4516 * This is called to update the er_extoff field in the indirection
4605 * array when extents have been added or removed from one of the 4517 * array when extents have been added or removed from one of the
4606 * extent lists. erp_idx contains the irec index to begin updating 4518 * extent lists. erp_idx contains the irec index to begin updating
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ccba14eb9dbe..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
124STATIC int xlog_iclogs_empty(xlog_t *log); 124STATIC int xlog_iclogs_empty(xlog_t *log);
125 125
126#if defined(XFS_LOG_TRACE) 126#if defined(XFS_LOG_TRACE)
127
128#define XLOG_TRACE_LOGGRANT_SIZE 2048
129#define XLOG_TRACE_ICLOG_SIZE 256
130
131void
132xlog_trace_loggrant_alloc(xlog_t *log)
133{
134 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
135}
136
137void
138xlog_trace_loggrant_dealloc(xlog_t *log)
139{
140 ktrace_free(log->l_grant_trace);
141}
142
127void 143void
128xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 144xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
129{ 145{
130 unsigned long cnts; 146 unsigned long cnts;
131 147
132 if (!log->l_grant_trace) {
133 log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
134 if (!log->l_grant_trace)
135 return;
136 }
137 /* ticket counts are 1 byte each */ 148 /* ticket counts are 1 byte each */
138 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 149 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
139 150
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
157} 168}
158 169
159void 170void
171xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
172{
173 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
174}
175
176void
177xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
178{
179 ktrace_free(iclog->ic_trace);
180}
181
182void
160xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 183xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
161{ 184{
162 if (!iclog->ic_trace)
163 iclog->ic_trace = ktrace_alloc(256, KM_NOFS);
164 ktrace_enter(iclog->ic_trace, 185 ktrace_enter(iclog->ic_trace,
165 (void *)((unsigned long)state), 186 (void *)((unsigned long)state),
166 (void *)((unsigned long)current_pid()), 187 (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
170 (void *)NULL, (void *)NULL); 191 (void *)NULL, (void *)NULL);
171} 192}
172#else 193#else
194
195#define xlog_trace_loggrant_alloc(log)
196#define xlog_trace_loggrant_dealloc(log)
173#define xlog_trace_loggrant(log,tic,string) 197#define xlog_trace_loggrant(log,tic,string)
198
199#define xlog_trace_iclog_alloc(iclog)
200#define xlog_trace_iclog_dealloc(iclog)
174#define xlog_trace_iclog(iclog,state) 201#define xlog_trace_iclog(iclog,state)
202
175#endif /* XFS_LOG_TRACE */ 203#endif /* XFS_LOG_TRACE */
176 204
177 205
@@ -1005,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
1005 l = iclog->ic_log; 1033 l = iclog->ic_log;
1006 1034
1007 /* 1035 /*
1008 * If the ordered flag has been removed by a lower 1036 * If the _XFS_BARRIER_FAILED flag was set by a lower
1009 * layer, it means the underlyin device no longer supports 1037 * layer, it means the underlying device no longer supports
1010 * barrier I/O. Warn loudly and turn off barriers. 1038 * barrier I/O. Warn loudly and turn off barriers.
1011 */ 1039 */
1012 if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { 1040 if (bp->b_flags & _XFS_BARRIER_FAILED) {
1041 bp->b_flags &= ~_XFS_BARRIER_FAILED;
1013 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1042 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
1014 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1043 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1015 "xlog_iodone: Barriers are no longer supported" 1044 "xlog_iodone: Barriers are no longer supported"
@@ -1231,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1231 spin_lock_init(&log->l_grant_lock); 1260 spin_lock_init(&log->l_grant_lock);
1232 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1261 sv_init(&log->l_flush_wait, 0, "flush_wait");
1233 1262
1263 xlog_trace_loggrant_alloc(log);
1234 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1264 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1235 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1265 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1236 1266
@@ -1285,6 +1315,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1285 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1315 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1286 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1316 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1287 1317
1318 xlog_trace_iclog_alloc(iclog);
1319
1288 iclogp = &iclog->ic_next; 1320 iclogp = &iclog->ic_next;
1289 } 1321 }
1290 *iclogp = log->l_iclog; /* complete ring */ 1322 *iclogp = log->l_iclog; /* complete ring */
@@ -1565,11 +1597,7 @@ xlog_dealloc_log(xlog_t *log)
1565 sv_destroy(&iclog->ic_force_wait); 1597 sv_destroy(&iclog->ic_force_wait);
1566 sv_destroy(&iclog->ic_write_wait); 1598 sv_destroy(&iclog->ic_write_wait);
1567 xfs_buf_free(iclog->ic_bp); 1599 xfs_buf_free(iclog->ic_bp);
1568#ifdef XFS_LOG_TRACE 1600 xlog_trace_iclog_dealloc(iclog);
1569 if (iclog->ic_trace != NULL) {
1570 ktrace_free(iclog->ic_trace);
1571 }
1572#endif
1573 next_iclog = iclog->ic_next; 1601 next_iclog = iclog->ic_next;
1574 kmem_free(iclog); 1602 kmem_free(iclog);
1575 iclog = next_iclog; 1603 iclog = next_iclog;
@@ -1578,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log)
1578 spinlock_destroy(&log->l_grant_lock); 1606 spinlock_destroy(&log->l_grant_lock);
1579 1607
1580 xfs_buf_free(log->l_xbuf); 1608 xfs_buf_free(log->l_xbuf);
1581#ifdef XFS_LOG_TRACE 1609 xlog_trace_loggrant_dealloc(log);
1582 if (log->l_trace != NULL) {
1583 ktrace_free(log->l_trace);
1584 }
1585 if (log->l_grant_trace != NULL) {
1586 ktrace_free(log->l_grant_trace);
1587 }
1588#endif
1589 log->l_mp->m_log = NULL; 1610 log->l_mp->m_log = NULL;
1590 kmem_free(log); 1611 kmem_free(log);
1591} /* xlog_dealloc_log */ 1612} /* xlog_dealloc_log */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c8a5b22ee3e3..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -448,7 +448,6 @@ typedef struct log {
448 int l_grant_write_bytes; 448 int l_grant_write_bytes;
449 449
450#ifdef XFS_LOG_TRACE 450#ifdef XFS_LOG_TRACE
451 struct ktrace *l_trace;
452 struct ktrace *l_grant_trace; 451 struct ktrace *l_grant_trace;
453#endif 452#endif
454 453
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index aa238c8fbd7a..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1838,6 +1838,12 @@ again:
1838#endif 1838#endif
1839} 1839}
1840 1840
1841/*
1842 * xfs_lock_two_inodes() can only be used to lock one type of lock
1843 * at a time - the iolock or the ilock, but not both at once. If
1844 * we lock both at once, lockdep will report false positives saying
1845 * we have violated locking orders.
1846 */
1841void 1847void
1842xfs_lock_two_inodes( 1848xfs_lock_two_inodes(
1843 xfs_inode_t *ip0, 1849 xfs_inode_t *ip0,
@@ -1848,6 +1854,8 @@ xfs_lock_two_inodes(
1848 int attempts = 0; 1854 int attempts = 0;
1849 xfs_log_item_t *lp; 1855 xfs_log_item_t *lp;
1850 1856
1857 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1858 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1851 ASSERT(ip0->i_ino != ip1->i_ino); 1859 ASSERT(ip0->i_ino != ip1->i_ino);
1852 1860
1853 if (ip0->i_ino > ip1->i_ino) { 1861 if (ip0->i_ino > ip1->i_ino) {
@@ -3152,6 +3160,13 @@ error1: /* Just cancel transaction */
3152/* 3160/*
3153 * Zero file bytes between startoff and endoff inclusive. 3161 * Zero file bytes between startoff and endoff inclusive.
3154 * The iolock is held exclusive and no blocks are buffered. 3162 * The iolock is held exclusive and no blocks are buffered.
3163 *
3164 * This function is used by xfs_free_file_space() to zero
3165 * partial blocks when the range to free is not block aligned.
3166 * When unreserving space with boundaries that are not block
3167 * aligned we round up the start and round down the end
3168 * boundaries and then use this function to zero the parts of
3169 * the blocks that got dropped during the rounding.
3155 */ 3170 */
3156STATIC int 3171STATIC int
3157xfs_zero_remaining_bytes( 3172xfs_zero_remaining_bytes(
@@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes(
3168 int nimap; 3183 int nimap;
3169 int error = 0; 3184 int error = 0;
3170 3185
3186 /*
3187 * Avoid doing I/O beyond eof - it's not necessary
3188 * since nothing can read beyond eof. The space will
3189 * be zeroed when the file is extended anyway.
3190 */
3191 if (startoff >= ip->i_size)
3192 return 0;
3193
3194 if (endoff > ip->i_size)
3195 endoff = ip->i_size;
3196
3171 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3172 XFS_IS_REALTIME_INODE(ip) ? 3198 XFS_IS_REALTIME_INODE(ip) ?
3173 mp->m_rtdev_targp : mp->m_ddev_targp); 3199 mp->m_rtdev_targp : mp->m_ddev_targp);