aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-11 16:23:48 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-11 16:23:48 -0400
commitfd048088306656824958e7783ffcee27e241b361 (patch)
treebe11bebe3bbd2cac88ff27bd3c7450339d21bdc7
parent5c3c4d9b5810c9aabd8c05219c62ca088aa83eb0 (diff)
parent03010a3350301baac2154fa66de925ae2981b7e3 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (43 commits) ext4: Rename ext4dev to ext4 ext4: Avoid double dirtying of super block in ext4_put_super() Update ext4 MAINTAINERS file Hook ext4 to the vfs fiemap interface. generic block based fiemap implementation ocfs2: fiemap support vfs: vfs-level fiemap interface ext4: fix xattr deadlock jbd2: Fix buffer head leak when writing the commit block ext4: Add debugging markers that can be used by systemtap jbd2: abort instead of waiting for nonexistent transaction ext4: fix initialization of UNINIT bitmap blocks ext4: Remove old legacy block allocator ext4: Use readahead when reading an inode from the inode table ext4: Improve the documentation for ext4's /proc tunables ext4: Combine proc file handling into a single set of functions ext4: move /proc setup and teardown out of mballoc.c ext4: Don't use 'struct dentry' for internal lookups ext4/jbd2: Avoid WARN() messages when failing to write to the superblock ext4: use percpu data structures for lg_prealloc_list ...
-rw-r--r--Documentation/filesystems/ext4.txt14
-rw-r--r--Documentation/filesystems/fiemap.txt228
-rw-r--r--Documentation/filesystems/proc.txt73
-rw-r--r--MAINTAINERS5
-rw-r--r--fs/Kconfig88
-rw-r--r--fs/Makefile2
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext3/file.c1
-rw-r--r--fs/ext3/inode.c8
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/acl.h12
-rw-r--r--fs/ext4/balloc.c1457
-rw-r--r--fs/ext4/bitmap.c6
-rw-r--r--fs/ext4/dir.c64
-rw-r--r--fs/ext4/ext4.h131
-rw-r--r--fs/ext4/ext4_extents.h15
-rw-r--r--fs/ext4/ext4_i.h39
-rw-r--r--fs/ext4/ext4_sb.h25
-rw-r--r--fs/ext4/extents.c281
-rw-r--r--fs/ext4/file.c10
-rw-r--r--fs/ext4/fsync.c7
-rw-r--r--fs/ext4/hash.c8
-rw-r--r--fs/ext4/ialloc.c71
-rw-r--r--fs/ext4/inode.c620
-rw-r--r--fs/ext4/ioctl.c84
-rw-r--r--fs/ext4/mballoc.c220
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/migrate.c10
-rw-r--r--fs/ext4/namei.c402
-rw-r--r--fs/ext4/resize.c33
-rw-r--r--fs/ext4/super.c274
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/ext4/xattr.c14
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ioctl.c273
-rw-r--r--fs/jbd2/checkpoint.c22
-rw-r--r--fs/jbd2/commit.c22
-rw-r--r--fs/jbd2/journal.c75
-rw-r--r--fs/ocfs2/alloc.c9
-rw-r--r--fs/ocfs2/alloc.h9
-rw-r--r--fs/ocfs2/extent_map.c346
-rw-r--r--fs/ocfs2/extent_map.h3
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--include/linux/ext3_fs.h2
-rw-r--r--include/linux/fiemap.h64
-rw-r--r--include/linux/fs.h21
-rw-r--r--include/linux/jbd2.h3
-rw-r--r--include/linux/percpu_counter.h12
-rw-r--r--lib/percpu_counter.c8
51 files changed, 2581 insertions, 2533 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0d5394920a3..74484e69640 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -32,9 +32,9 @@ Mailing list: linux-ext4@vger.kernel.org
32 you will need to merge your changes with the version from e2fsprogs 32 you will need to merge your changes with the version from e2fsprogs
33 1.41.x. 33 1.41.x.
34 34
35 - Create a new filesystem using the ext4dev filesystem type: 35 - Create a new filesystem using the ext4 filesystem type:
36 36
37 # mke2fs -t ext4dev /dev/hda1 37 # mke2fs -t ext4 /dev/hda1
38 38
39 Or configure an existing ext3 filesystem to support extents and set 39 Or configure an existing ext3 filesystem to support extents and set
40 the test_fs flag to indicate that it's ok for an in-development 40 the test_fs flag to indicate that it's ok for an in-development
@@ -47,13 +47,13 @@ Mailing list: linux-ext4@vger.kernel.org
47 47
48 # tune2fs -I 256 /dev/hda1 48 # tune2fs -I 256 /dev/hda1
49 49
50 (Note: we currently do not have tools to convert an ext4dev 50 (Note: we currently do not have tools to convert an ext4
51 filesystem back to ext3; so please do not try this on production 51 filesystem back to ext3; so please do not try this on production
52 filesystems.) 52 filesystems.)
53 53
54 - Mounting: 54 - Mounting:
55 55
56 # mount -t ext4dev /dev/hda1 /wherever 56 # mount -t ext4 /dev/hda1 /wherever
57 57
58 - When comparing performance with other filesystems, remember that 58 - When comparing performance with other filesystems, remember that
59 ext3/4 by default offers higher data integrity guarantees than most. 59 ext3/4 by default offers higher data integrity guarantees than most.
@@ -177,6 +177,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
177 your disks are battery-backed in one way or another, 177 your disks are battery-backed in one way or another,
178 disabling barriers may safely improve performance. 178 disabling barriers may safely improve performance.
179 179
180inode_readahead=n This tuning parameter controls the maximum
181 number of inode table blocks that ext4's inode
182 table readahead algorithm will pre-read into
183 the buffer cache. The default value is 32 blocks.
184
180orlov (*) This enables the new Orlov block allocator. It is 185orlov (*) This enables the new Orlov block allocator. It is
181 enabled by default. 186 enabled by default.
182 187
@@ -252,6 +257,7 @@ stripe=n Number of filesystem blocks that mballoc will try
252delalloc (*) Deferring block allocation until write-out time. 257delalloc (*) Deferring block allocation until write-out time.
253nodelalloc Disable delayed allocation. Blocks are allocation 258nodelalloc Disable delayed allocation. Blocks are allocation
254 when data is copied from user to page cache. 259 when data is copied from user to page cache.
260
255Data Mode 261Data Mode
256========= 262=========
257There are 3 different data modes: 263There are 3 different data modes:
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
new file mode 100644
index 00000000000..1e3defcfe50
--- /dev/null
+++ b/Documentation/filesystems/fiemap.txt
@@ -0,0 +1,228 @@
1============
2Fiemap Ioctl
3============
4
5The fiemap ioctl is an efficient method for userspace to get file
6extent mappings. Instead of block-by-block mapping (such as bmap), fiemap
7returns a list of extents.
8
9
10Request Basics
11--------------
12
13A fiemap request is encoded within struct fiemap:
14
15struct fiemap {
16 __u64 fm_start; /* logical offset (inclusive) at
17 * which to start mapping (in) */
18 __u64 fm_length; /* logical length of mapping which
19 * userspace cares about (in) */
20 __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
21 __u32 fm_mapped_extents; /* number of extents that were
22 * mapped (out) */
23 __u32 fm_extent_count; /* size of fm_extents array (in) */
24 __u32 fm_reserved;
25 struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
26};
27
28
29fm_start, and fm_length specify the logical range within the file
30which the process would like mappings for. Extents returned mirror
31those on disk - that is, the logical offset of the 1st returned extent
32may start before fm_start, and the range covered by the last returned
33extent may end after fm_length. All offsets and lengths are in bytes.
34
35Certain flags to modify the way in which mappings are looked up can be
36set in fm_flags. If the kernel doesn't understand some particular
37flags, it will return EBADR and the contents of fm_flags will contain
38the set of flags which caused the error. If the kernel is compatible
39with all flags passed, the contents of fm_flags will be unmodified.
40It is up to userspace to determine whether rejection of a particular
41flag is fatal to its operation. This scheme is intended to allow the
42fiemap interface to grow in the future but without losing
43compatibility with old software.
44
45fm_extent_count specifies the number of elements in the fm_extents[] array
46that can be used to return extents. If fm_extent_count is zero, then the
47fm_extents[] array is ignored (no extents will be returned), and the
48fm_mapped_extents count will hold the number of extents needed in
49fm_extents[] to hold the file's current mapping. Note that there is
50nothing to prevent the file from changing between calls to FIEMAP.
51
52The following flags can be set in fm_flags:
53
54* FIEMAP_FLAG_SYNC
55If this flag is set, the kernel will sync the file before mapping extents.
56
57* FIEMAP_FLAG_XATTR
58If this flag is set, the extents returned will describe the inode's
59extended attribute lookup tree, instead of its data tree.
60
61
62Extent Mapping
63--------------
64
65Extent information is returned within the embedded fm_extents array
66which userspace must allocate along with the fiemap structure. The
67number of elements in the fiemap_extents[] array should be passed via
68fm_extent_count. The number of extents mapped by kernel will be
69returned via fm_mapped_extents. If the number of fiemap_extents
70allocated is less than would be required to map the requested range,
71the maximum number of extents that can be mapped in the fm_extents[]
72array will be returned and fm_mapped_extents will be equal to
73fm_extent_count. In that case, the last extent in the array will not
74complete the requested range and will not have the FIEMAP_EXTENT_LAST
75flag set (see the next section on extent flags).
76
77Each extent is described by a single fiemap_extent structure as
78returned in fm_extents.
79
80struct fiemap_extent {
81 __u64 fe_logical; /* logical offset in bytes for the start of
82 * the extent */
83 __u64 fe_physical; /* physical offset in bytes for the start
84 * of the extent */
85 __u64 fe_length; /* length in bytes for the extent */
86 __u64 fe_reserved64[2];
87 __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
88 __u32 fe_reserved[3];
89};
90
91All offsets and lengths are in bytes and mirror those on disk. It is valid
92for an extent's logical offset to start before the request or its logical
93length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
94returned, fe_logical, fe_physical, and fe_length will be aligned to the
95block size of the file system. With the exception of extents flagged as
96FIEMAP_EXTENT_MERGED, adjacent extents will not be merged.
97
98The fe_flags field contains flags which describe the extent returned.
99A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in
100the file so that the process making fiemap calls can determine when no
101more extents are available, without having to call the ioctl again.
102
103Some flags are intentionally vague and will always be set in the
104presence of other more specific flags. This way a program looking for
105a general property does not have to know all existing and future flags
106which imply that property.
107
108For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL
109are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking
110for inline or tail-packed data can key on the specific flag. Software
111which simply cares not to try operating on non-aligned extents
112however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to
113worry about all present and future flags which might imply unaligned
114data. Note that the opposite is not true - it would be valid for
115FIEMAP_EXTENT_NOT_ALIGNED to appear alone.
116
117* FIEMAP_EXTENT_LAST
118This is the last extent in the file. A mapping attempt past this
119extent will return nothing.
120
121* FIEMAP_EXTENT_UNKNOWN
122The location of this extent is currently unknown. This may indicate
123the data is stored on an inaccessible volume or that no storage has
124been allocated for the file yet.
125
126* FIEMAP_EXTENT_DELALLOC
127 - This will also set FIEMAP_EXTENT_UNKNOWN.
128Delayed allocation - while there is data for this extent, its
129physical location has not been allocated yet.
130
131* FIEMAP_EXTENT_ENCODED
132This extent does not consist of plain filesystem blocks but is
133encoded (e.g. encrypted or compressed). Reading the data in this
134extent via I/O to the block device will have undefined results.
135
136Note that it is *always* undefined to try to update the data
137in-place by writing to the indicated location without the
138assistance of the filesystem, or to access the data using the
139information returned by the FIEMAP interface while the filesystem
140is mounted. In other words, user applications may only read the
141extent data via I/O to the block device while the filesystem is
142unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is
143clear; user applications must not try reading or writing to the
144filesystem via the block device under any other circumstances.
145
146* FIEMAP_EXTENT_DATA_ENCRYPTED
147 - This will also set FIEMAP_EXTENT_ENCODED
148The data in this extent has been encrypted by the file system.
149
150* FIEMAP_EXTENT_NOT_ALIGNED
151Extent offsets and length are not guaranteed to be block aligned.
152
153* FIEMAP_EXTENT_DATA_INLINE
154 This will also set FIEMAP_EXTENT_NOT_ALIGNED
155Data is located within a meta data block.
156
157* FIEMAP_EXTENT_DATA_TAIL
158 This will also set FIEMAP_EXTENT_NOT_ALIGNED
159Data is packed into a block with data from other files.
160
161* FIEMAP_EXTENT_UNWRITTEN
162Unwritten extent - the extent is allocated but its data has not been
163initialized. This indicates the extent's data will be all zero if read
164through the filesystem but the contents are undefined if read directly from
165the device.
166
167* FIEMAP_EXTENT_MERGED
168This will be set when a file does not support extents, i.e., it uses a block
169based addressing scheme. Since returning an extent for each block back to
170userspace would be highly inefficient, the kernel will try to merge most
171adjacent blocks into 'extents'.
172
173
174VFS -> File System Implementation
175---------------------------------
176
177File systems wishing to support fiemap must implement a ->fiemap callback on
178their inode_operations structure. The fs ->fiemap call is responsible for
179defining its set of supported fiemap flags, and calling a helper function on
180each discovered extent:
181
182struct inode_operations {
183 ...
184
185 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
186 u64 len);
187
188->fiemap is passed struct fiemap_extent_info which describes the
189fiemap request:
190
191struct fiemap_extent_info {
192 unsigned int fi_flags; /* Flags as passed from user */
193 unsigned int fi_extents_mapped; /* Number of mapped extents */
194 unsigned int fi_extents_max; /* Size of fiemap_extent array */
195 struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
196};
197
198It is intended that the file system should not need to access any of this
199structure directly.
200
201
202Flag checking should be done at the beginning of the ->fiemap callback via the
203fiemap_check_flags() helper:
204
205int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
206
207The struct fieinfo should be passed in as received from ioctl_fiemap(). The
208set of fiemap flags which the fs understands should be passed via fs_flags. If
209fiemap_check_flags finds invalid user flags, it will place the bad values in
210fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
211fiemap_check_flags(), it should immediately exit, returning that error back to
212ioctl_fiemap().
213
214
215For each extent in the request range, the file system should call
216the helper function, fiemap_fill_next_extent():
217
218int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
219 u64 phys, u64 len, u32 flags, u32 dev);
220
221fiemap_fill_next_extent() will use the passed values to populate the
222next free extent in the fm_extents array. 'General' extent flags will
223automatically be set from specific flags on behalf of the calling file
224system so that the userspace API is not broken.
225
226fiemap_fill_next_extent() returns 0 on success, and 1 when the
227user-supplied fm_extents array is full. If an error is encountered
228while copying the extent to user memory, -EFAULT will be returned.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f566ad9bcb7..d831d24d2a6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -923,45 +923,44 @@ CPUs.
923The "procs_blocked" line gives the number of processes currently blocked, 923The "procs_blocked" line gives the number of processes currently blocked,
924waiting for I/O to complete. 924waiting for I/O to complete.
925 925
926
9261.9 Ext4 file system parameters 9271.9 Ext4 file system parameters
927------------------------------ 928------------------------------
928Ext4 file system have one directory per partition under /proc/fs/ext4/ 929
929# ls /proc/fs/ext4/hdc/ 930Information about mounted ext4 file systems can be found in
930group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req 931/proc/fs/ext4. Each mounted filesystem will have a directory in
931stats stream_req 932/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
932 933/proc/fs/ext4/dm-0). The files in each per-device directory are shown
933mb_groups: 934in Table 1-10, below.
934This file gives the details of multiblock allocator buddy cache of free blocks 935
935 936Table 1-10: Files in /proc/fs/ext4/<devname>
936mb_history: 937..............................................................................
937Multiblock allocation history. 938 File Content
938 939 mb_groups details of multiblock allocator buddy cache of free blocks
939stats: 940 mb_history multiblock allocation history
940This file indicate whether the multiblock allocator should start collecting 941 stats controls whether the multiblock allocator should start
941statistics. The statistics are shown during unmount 942 collecting statistics, which are shown during the unmount
942 943 group_prealloc the multiblock allocator will round up allocation
943group_prealloc: 944 requests to a multiple of this tuning parameter if the
944The multiblock allocator normalize the block allocation request to 945 stripe size is not set in the ext4 superblock
945group_prealloc filesystem blocks if we don't have strip value set. 946 max_to_scan The maximum number of extents the multiblock allocator
946The stripe value can be specified at mount time or during mke2fs. 947 will search to find the best extent
947 948 min_to_scan The minimum number of extents the multiblock allocator
948max_to_scan: 949 will search to find the best extent
949How long multiblock allocator can look for a best extent (in found extents) 950 order2_req Tuning parameter which controls the minimum size for
950 951 requests (as a power of 2) where the buddy cache is
951min_to_scan: 952 used
952How long multiblock allocator must look for a best extent 953 stream_req Files which have fewer blocks than this tunable
953 954 parameter will have their blocks allocated out of a
954order2_req: 955 block group specific preallocation pool, so that small
955Multiblock allocator use 2^N search using buddies only for requests greater 956 files are packed closely together. Each large file
956than or equal to order2_req. The request size is specfied in file system 957 will have its blocks allocated out of its own unique
957blocks. A value of 2 indicate only if the requests are greater than or equal 958 preallocation pool.
958to 4 blocks. 959inode_readahead Tuning parameter which controls the maximum number of
959 960 inode table blocks that ext4's inode table readahead
960stream_req: 961 algorithm will pre-read into the buffer cache
961Files smaller than stream_req are served by the stream allocator, whose 962..............................................................................
962purpose is to pack requests as close each to other as possible to 963
963produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
964filesystem block size will use group based preallocation.
965 964
966------------------------------------------------------------------------------ 965------------------------------------------------------------------------------
967Summary 966Summary
diff --git a/MAINTAINERS b/MAINTAINERS
index 68781ed2b73..587f418ed00 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1659,9 +1659,10 @@ L: linux-ext4@vger.kernel.org
1659S: Maintained 1659S: Maintained
1660 1660
1661EXT4 FILE SYSTEM 1661EXT4 FILE SYSTEM
1662P: Stephen Tweedie, Andrew Morton 1662P: Theodore Ts'o
1663M: sct@redhat.com, akpm@linux-foundation.org, adilger@sun.com 1663M: tytso@mit.edu, adilger@sun.com
1664L: linux-ext4@vger.kernel.org 1664L: linux-ext4@vger.kernel.org
1665W: http://ext4.wiki.kernel.org
1665S: Maintained 1666S: Maintained
1666 1667
1667F71805F HARDWARE MONITORING DRIVER 1668F71805F HARDWARE MONITORING DRIVER
diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5dab9a..40183d94b68 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
136 If you are not using a security module that requires using 136 If you are not using a security module that requires using
137 extended attributes for file security labels, say N. 137 extended attributes for file security labels, say N.
138 138
139config EXT4DEV_FS 139config EXT4_FS
140 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" 140 tristate "The Extended 4 (ext4) filesystem"
141 depends on EXPERIMENTAL
142 select JBD2 141 select JBD2
143 select CRC16 142 select CRC16
144 help 143 help
145 Ext4dev is a predecessor filesystem of the next generation 144 This is the next generation of the ext3 filesystem.
146 extended fs ext4, based on ext3 filesystem code. It will be
147 renamed ext4 fs later, once ext4dev is mature and stabilized.
148 145
149 Unlike the change from ext2 filesystem to ext3 filesystem, 146 Unlike the change from ext2 filesystem to ext3 filesystem,
150 the on-disk format of ext4dev is not the same as ext3 any more: 147 the on-disk format of ext4 is not forwards compatible with
151 it is based on extent maps and it supports 48-bit physical block 148 ext3; it is based on extent maps and it supports 48-bit
152 numbers. These combined on-disk format changes will allow 149 physical block numbers. The ext4 filesystem also supports delayed
153 ext4dev/ext4 to handle more than 16 TB filesystem volumes -- 150 allocation, persistent preallocation, high resolution time stamps,
154 a hard limit that ext3 cannot overcome without changing the 151 and a number of other features to improve performance and speed
155 on-disk format. 152 up fsck time. For more information, please see the web pages at
156 153 http://ext4.wiki.kernel.org.
157 Other than extent maps and 48-bit block numbers, ext4dev also is 154
158 likely to have other new features such as persistent preallocation, 155 The ext4 filesystem will support mounting an ext3
159 high resolution time stamps, and larger file support etc. These 156 filesystem; while there will be some performance gains from
160 features will be added to ext4dev gradually. 157 the delayed allocation and inode table readahead, the best
158 performance gains will require enabling ext4 features in the
159 filesystem, or formating a new filesystem as an ext4
160 filesystem initially.
161 161
162 To compile this file system support as a module, choose M here. The 162 To compile this file system support as a module, choose M here. The
163 module will be called ext4dev. 163 module will be called ext4dev.
164 164
165 If unsure, say N. 165 If unsure, say N.
166 166
167config EXT4DEV_FS_XATTR 167config EXT4DEV_COMPAT
168 bool "Ext4dev extended attributes" 168 bool "Enable ext4dev compatibility"
169 depends on EXT4DEV_FS 169 depends on EXT4_FS
170 help
171 Starting with 2.6.28, the name of the ext4 filesystem was
172 renamed from ext4dev to ext4. Unfortunately there are some
173 legacy userspace programs (such as klibc's fstype) that have
174 "ext4dev" hardcoded.
175
176 To enable backwards compatibility so that systems that are
177 still expecting to mount ext4 filesystems using ext4dev,
178 choose Y here. This feature will go away by 2.6.31, so
179 please arrange to get your userspace programs fixed!
180
181config EXT4_FS_XATTR
182 bool "Ext4 extended attributes"
183 depends on EXT4_FS
170 default y 184 default y
171 help 185 help
172 Extended attributes are name:value pairs associated with inodes by 186 Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
175 189
176 If unsure, say N. 190 If unsure, say N.
177 191
178 You need this for POSIX ACL support on ext4dev/ext4. 192 You need this for POSIX ACL support on ext4.
179 193
180config EXT4DEV_FS_POSIX_ACL 194config EXT4_FS_POSIX_ACL
181 bool "Ext4dev POSIX Access Control Lists" 195 bool "Ext4 POSIX Access Control Lists"
182 depends on EXT4DEV_FS_XATTR 196 depends on EXT4_FS_XATTR
183 select FS_POSIX_ACL 197 select FS_POSIX_ACL
184 help 198 help
185 POSIX Access Control Lists (ACLs) support permissions for users and 199 POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
190 204
191 If you don't know what Access Control Lists are, say N 205 If you don't know what Access Control Lists are, say N
192 206
193config EXT4DEV_FS_SECURITY 207config EXT4_FS_SECURITY
194 bool "Ext4dev Security Labels" 208 bool "Ext4 Security Labels"
195 depends on EXT4DEV_FS_XATTR 209 depends on EXT4_FS_XATTR
196 help 210 help
197 Security labels support alternative access control models 211 Security labels support alternative access control models
198 implemented by security modules like SELinux. This option 212 implemented by security modules like SELinux. This option
199 enables an extended attribute handler for file security 213 enables an extended attribute handler for file security
200 labels in the ext4dev/ext4 filesystem. 214 labels in the ext4 filesystem.
201 215
202 If you are not using a security module that requires using 216 If you are not using a security module that requires using
203 extended attributes for file security labels, say N. 217 extended attributes for file security labels, say N.
@@ -240,22 +254,22 @@ config JBD2
240 help 254 help
241 This is a generic journaling layer for block devices that support 255 This is a generic journaling layer for block devices that support
242 both 32-bit and 64-bit block numbers. It is currently used by 256 both 32-bit and 64-bit block numbers. It is currently used by
243 the ext4dev/ext4 filesystem, but it could also be used to add 257 the ext4 filesystem, but it could also be used to add
244 journal support to other file systems or block devices such 258 journal support to other file systems or block devices such
245 as RAID or LVM. 259 as RAID or LVM.
246 260
247 If you are using ext4dev/ext4, you need to say Y here. If you are not 261 If you are using ext4, you need to say Y here. If you are not
248 using ext4dev/ext4 then you will probably want to say N. 262 using ext4 then you will probably want to say N.
249 263
250 To compile this device as a module, choose M here. The module will be 264 To compile this device as a module, choose M here. The module will be
251 called jbd2. If you are compiling ext4dev/ext4 into the kernel, 265 called jbd2. If you are compiling ext4 into the kernel,
252 you cannot compile this code as a module. 266 you cannot compile this code as a module.
253 267
254config JBD2_DEBUG 268config JBD2_DEBUG
255 bool "JBD2 (ext4dev/ext4) debugging support" 269 bool "JBD2 (ext4) debugging support"
256 depends on JBD2 && DEBUG_FS 270 depends on JBD2 && DEBUG_FS
257 help 271 help
258 If you are using the ext4dev/ext4 journaled file system (or 272 If you are using the ext4 journaled file system (or
259 potentially any other filesystem/device using JBD2), this option 273 potentially any other filesystem/device using JBD2), this option
260 allows you to enable debugging output while the system is running, 274 allows you to enable debugging output while the system is running,
261 in order to help track down any problems you are having. 275 in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
270config FS_MBCACHE 284config FS_MBCACHE
271# Meta block cache for Extended Attributes (ext2/ext3/ext4) 285# Meta block cache for Extended Attributes (ext2/ext3/ext4)
272 tristate 286 tristate
273 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR 287 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
274 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y 288 default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
275 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m 289 default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
276 290
277config REISERFS_FS 291config REISERFS_FS
278 tristate "Reiserfs support" 292 tristate "Reiserfs support"
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff1..de404b00eb0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_DLM) += dlm/
69# Do not add any filesystems before this line 69# Do not add any filesystems before this line
70obj-$(CONFIG_REISERFS_FS) += reiserfs/ 70obj-$(CONFIG_REISERFS_FS) += reiserfs/
71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
72obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev 72obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
73obj-$(CONFIG_JBD) += jbd/ 73obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 74obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/ 75obj-$(CONFIG_EXT2_FS) += ext2/
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33..bae998c1e44 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
133extern int ext2_setattr (struct dentry *, struct iattr *); 133extern int ext2_setattr (struct dentry *, struct iattr *);
134extern void ext2_set_inode_flags(struct inode *inode); 134extern void ext2_set_inode_flags(struct inode *inode);
135extern void ext2_get_inode_flags(struct ext2_inode_info *); 135extern void ext2_get_inode_flags(struct ext2_inode_info *);
136extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
137 u64 start, u64 len);
136int __ext2_write_begin(struct file *file, struct address_space *mapping, 138int __ext2_write_begin(struct file *file, struct address_space *mapping,
137 loff_t pos, unsigned len, unsigned flags, 139 loff_t pos, unsigned len, unsigned flags,
138 struct page **pagep, void **fsdata); 140 struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c3629..45ed0712218 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .permission = ext2_permission,
89 .fiemap = ext2_fiemap,
89}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51..7658b33e265 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
31#include <linux/writeback.h> 31#include <linux/writeback.h>
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "acl.h" 36#include "acl.h"
36#include "xip.h" 37#include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
704 705
705} 706}
706 707
708int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
709 u64 start, u64 len)
710{
711 return generic_block_fiemap(inode, fieinfo, start, len,
712 ext2_get_block);
713}
714
707static int ext2_writepage(struct page *page, struct writeback_control *wbc) 715static int ext2_writepage(struct page *page, struct writeback_control *wbc)
708{ 716{
709 return block_write_full_page(page, ext2_get_block, wbc); 717 return block_write_full_page(page, ext2_get_block, wbc);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d301..3be1e0689c9 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
134 .removexattr = generic_removexattr, 134 .removexattr = generic_removexattr,
135#endif 135#endif
136 .permission = ext3_permission, 136 .permission = ext3_permission,
137 .fiemap = ext3_fiemap,
137}; 138};
138 139
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b11..ebfec4d0148 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h>
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
41 42
@@ -981,6 +982,13 @@ out:
981 return ret; 982 return ret;
982} 983}
983 984
985int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
986 u64 start, u64 len)
987{
988 return generic_block_fiemap(inode, fieinfo, start, len,
989 ext3_get_block);
990}
991
984/* 992/*
985 * `handle' can be NULL if create is zero 993 * `handle' can be NULL if create is zero
986 */ 994 */
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2..a8ff003a00f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
2# Makefile for the linux ext4-filesystem routines. 2# Makefile for the linux ext4-filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
13ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o 13ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d..cb45257a246 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
51 } 51 }
52} 52}
53 53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl 56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */ 57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1) 58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59 59
60/* acl.c */ 60/* acl.c */
61extern int ext4_permission (struct inode *, int); 61extern int ext4_permission(struct inode *, int);
62extern int ext4_acl_chmod (struct inode *); 62extern int ext4_acl_chmod(struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); 63extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
64 64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 65#else /* CONFIG_EXT4_FS_POSIX_ACL */
66#include <linux/sched.h> 66#include <linux/sched.h>
67#define ext4_permission NULL 67#define ext4_permission NULL
68 68
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{ 77{
78 return 0; 78 return 0;
79} 79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 80#endif /* CONFIG_EXT4_FS_POSIX_ACL */
81 81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e9fa960ba6d..bd2ece22882 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
83 } 83 }
84 return used_blocks; 84 return used_blocks;
85} 85}
86
86/* Initializes an uninitialized block bitmap if given, and returns the 87/* Initializes an uninitialized block bitmap if given, and returns the
87 * number of blocks free in the group. */ 88 * number of blocks free in the group. */
88unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 89unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
132 */ 133 */
133 group_blocks = ext4_blocks_count(sbi->s_es) - 134 group_blocks = ext4_blocks_count(sbi->s_es) -
134 le32_to_cpu(sbi->s_es->s_first_data_block) - 135 le32_to_cpu(sbi->s_es->s_first_data_block) -
135 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); 136 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
136 } else { 137 } else {
137 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 138 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
138 } 139 }
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
200 * @bh: pointer to the buffer head to store the block 201 * @bh: pointer to the buffer head to store the block
201 * group descriptor 202 * group descriptor
202 */ 203 */
203struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 204struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
204 ext4_group_t block_group, 205 ext4_group_t block_group,
205 struct buffer_head ** bh) 206 struct buffer_head **bh)
206{ 207{
207 unsigned long group_desc; 208 unsigned long group_desc;
208 unsigned long offset; 209 unsigned long offset;
209 struct ext4_group_desc * desc; 210 struct ext4_group_desc *desc;
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 211 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 212
212 if (block_group >= sbi->s_groups_count) { 213 if (block_group >= sbi->s_groups_count) {
213 ext4_error (sb, "ext4_get_group_desc", 214 ext4_error(sb, "ext4_get_group_desc",
214 "block_group >= groups_count - " 215 "block_group >= groups_count - "
215 "block_group = %lu, groups_count = %lu", 216 "block_group = %lu, groups_count = %lu",
216 block_group, sbi->s_groups_count); 217 block_group, sbi->s_groups_count);
217 218
218 return NULL; 219 return NULL;
219 } 220 }
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
222 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 223 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
223 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 224 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
224 if (!sbi->s_group_desc[group_desc]) { 225 if (!sbi->s_group_desc[group_desc]) {
225 ext4_error (sb, "ext4_get_group_desc", 226 ext4_error(sb, "ext4_get_group_desc",
226 "Group descriptor not loaded - " 227 "Group descriptor not loaded - "
227 "block_group = %lu, group_desc = %lu, desc = %lu", 228 "block_group = %lu, group_desc = %lu, desc = %lu",
228 block_group, group_desc, offset); 229 block_group, group_desc, offset);
229 return NULL; 230 return NULL;
230 } 231 }
231 232
@@ -302,8 +303,8 @@ err_out:
302struct buffer_head * 303struct buffer_head *
303ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) 304ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
304{ 305{
305 struct ext4_group_desc * desc; 306 struct ext4_group_desc *desc;
306 struct buffer_head * bh = NULL; 307 struct buffer_head *bh = NULL;
307 ext4_fsblk_t bitmap_blk; 308 ext4_fsblk_t bitmap_blk;
308 309
309 desc = ext4_get_group_desc(sb, block_group, NULL); 310 desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
318 block_group, bitmap_blk); 319 block_group, bitmap_blk);
319 return NULL; 320 return NULL;
320 } 321 }
321 if (bh_uptodate_or_lock(bh)) 322 if (buffer_uptodate(bh) &&
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
322 return bh; 324 return bh;
323 325
326 lock_buffer(bh);
324 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
325 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
326 ext4_init_block_bitmap(sb, bh, block_group, desc); 329 ext4_init_block_bitmap(sb, bh, block_group, desc);
@@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
345 */ 348 */
346 return bh; 349 return bh;
347} 350}
348/*
349 * The reservation window structure operations
350 * --------------------------------------------
351 * Operations include:
352 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
353 *
354 * We use a red-black tree to represent per-filesystem reservation
355 * windows.
356 *
357 */
358
359/**
360 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
361 * @rb_root: root of per-filesystem reservation rb tree
362 * @verbose: verbose mode
363 * @fn: function which wishes to dump the reservation map
364 *
365 * If verbose is turned on, it will print the whole block reservation
366 * windows(start, end). Otherwise, it will only print out the "bad" windows,
367 * those windows that overlap with their immediate neighbors.
368 */
369#if 1
370static void __rsv_window_dump(struct rb_root *root, int verbose,
371 const char *fn)
372{
373 struct rb_node *n;
374 struct ext4_reserve_window_node *rsv, *prev;
375 int bad;
376
377restart:
378 n = rb_first(root);
379 bad = 0;
380 prev = NULL;
381
382 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
383 while (n) {
384 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
385 if (verbose)
386 printk("reservation window 0x%p "
387 "start: %llu, end: %llu\n",
388 rsv, rsv->rsv_start, rsv->rsv_end);
389 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
390 printk("Bad reservation %p (start >= end)\n",
391 rsv);
392 bad = 1;
393 }
394 if (prev && prev->rsv_end >= rsv->rsv_start) {
395 printk("Bad reservation %p (prev->end >= start)\n",
396 rsv);
397 bad = 1;
398 }
399 if (bad) {
400 if (!verbose) {
401 printk("Restarting reservation walk in verbose mode\n");
402 verbose = 1;
403 goto restart;
404 }
405 }
406 n = rb_next(n);
407 prev = rsv;
408 }
409 printk("Window map complete.\n");
410 BUG_ON(bad);
411}
412#define rsv_window_dump(root, verbose) \
413 __rsv_window_dump((root), (verbose), __func__)
414#else
415#define rsv_window_dump(root, verbose) do {} while (0)
416#endif
417
418/**
419 * goal_in_my_reservation()
420 * @rsv: inode's reservation window
421 * @grp_goal: given goal block relative to the allocation block group
422 * @group: the current allocation block group
423 * @sb: filesystem super block
424 *
425 * Test if the given goal block (group relative) is within the file's
426 * own block reservation window range.
427 *
428 * If the reservation window is outside the goal allocation group, return 0;
429 * grp_goal (given goal block) could be -1, which means no specific
430 * goal block. In this case, always return 1.
431 * If the goal block is within the reservation window, return 1;
432 * otherwise, return 0;
433 */
434static int
435goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
436 ext4_group_t group, struct super_block *sb)
437{
438 ext4_fsblk_t group_first_block, group_last_block;
439
440 group_first_block = ext4_group_first_block_no(sb, group);
441 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
442
443 if ((rsv->_rsv_start > group_last_block) ||
444 (rsv->_rsv_end < group_first_block))
445 return 0;
446 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
447 || (grp_goal + group_first_block > rsv->_rsv_end)))
448 return 0;
449 return 1;
450}
451
452/**
453 * search_reserve_window()
454 * @rb_root: root of reservation tree
455 * @goal: target allocation block
456 *
457 * Find the reserved window which includes the goal, or the previous one
458 * if the goal is not in any window.
459 * Returns NULL if there are no windows or if all windows start after the goal.
460 */
461static struct ext4_reserve_window_node *
462search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
463{
464 struct rb_node *n = root->rb_node;
465 struct ext4_reserve_window_node *rsv;
466
467 if (!n)
468 return NULL;
469
470 do {
471 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
472
473 if (goal < rsv->rsv_start)
474 n = n->rb_left;
475 else if (goal > rsv->rsv_end)
476 n = n->rb_right;
477 else
478 return rsv;
479 } while (n);
480 /*
481 * We've fallen off the end of the tree: the goal wasn't inside
482 * any particular node. OK, the previous node must be to one
483 * side of the interval containing the goal. If it's the RHS,
484 * we need to back up one.
485 */
486 if (rsv->rsv_start > goal) {
487 n = rb_prev(&rsv->rsv_node);
488 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
489 }
490 return rsv;
491}
492
493/**
494 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
495 * @sb: super block
496 * @rsv: reservation window to add
497 *
498 * Must be called with rsv_lock hold.
499 */
500void ext4_rsv_window_add(struct super_block *sb,
501 struct ext4_reserve_window_node *rsv)
502{
503 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
504 struct rb_node *node = &rsv->rsv_node;
505 ext4_fsblk_t start = rsv->rsv_start;
506
507 struct rb_node ** p = &root->rb_node;
508 struct rb_node * parent = NULL;
509 struct ext4_reserve_window_node *this;
510
511 while (*p)
512 {
513 parent = *p;
514 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
515
516 if (start < this->rsv_start)
517 p = &(*p)->rb_left;
518 else if (start > this->rsv_end)
519 p = &(*p)->rb_right;
520 else {
521 rsv_window_dump(root, 1);
522 BUG();
523 }
524 }
525
526 rb_link_node(node, parent, p);
527 rb_insert_color(node, root);
528}
529
530/**
531 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
532 * @sb: super block
533 * @rsv: reservation window to remove
534 *
535 * Mark the block reservation window as not allocated, and unlink it
536 * from the filesystem reservation window rb tree. Must be called with
537 * rsv_lock hold.
538 */
539static void rsv_window_remove(struct super_block *sb,
540 struct ext4_reserve_window_node *rsv)
541{
542 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
543 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
544 rsv->rsv_alloc_hit = 0;
545 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
546}
547
548/*
549 * rsv_is_empty() -- Check if the reservation window is allocated.
550 * @rsv: given reservation window to check
551 *
552 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
553 */
554static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
555{
556 /* a valid reservation end block could not be 0 */
557 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
558}
559
560/**
561 * ext4_init_block_alloc_info()
562 * @inode: file inode structure
563 *
564 * Allocate and initialize the reservation window structure, and
565 * link the window to the ext4 inode structure at last
566 *
567 * The reservation window structure is only dynamically allocated
568 * and linked to ext4 inode the first time the open file
569 * needs a new block. So, before every ext4_new_block(s) call, for
570 * regular files, we should check whether the reservation window
571 * structure exists or not. In the latter case, this function is called.
572 * Fail to do so will result in block reservation being turned off for that
573 * open file.
574 *
575 * This function is called from ext4_get_blocks_handle(), also called
576 * when setting the reservation window size through ioctl before the file
577 * is open for write (needs block allocation).
578 *
579 * Needs down_write(i_data_sem) protection prior to call this function.
580 */
581void ext4_init_block_alloc_info(struct inode *inode)
582{
583 struct ext4_inode_info *ei = EXT4_I(inode);
584 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
585 struct super_block *sb = inode->i_sb;
586
587 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
588 if (block_i) {
589 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
590
591 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
592 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
593
594 /*
595 * if filesystem is mounted with NORESERVATION, the goal
596 * reservation window size is set to zero to indicate
597 * block reservation is off
598 */
599 if (!test_opt(sb, RESERVATION))
600 rsv->rsv_goal_size = 0;
601 else
602 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
603 rsv->rsv_alloc_hit = 0;
604 block_i->last_alloc_logical_block = 0;
605 block_i->last_alloc_physical_block = 0;
606 }
607 ei->i_block_alloc_info = block_i;
608}
609
610/**
611 * ext4_discard_reservation()
612 * @inode: inode
613 *
614 * Discard(free) block reservation window on last file close, or truncate
615 * or at last iput().
616 *
617 * It is being called in three cases:
618 * ext4_release_file(): last writer close the file
619 * ext4_clear_inode(): last iput(), when nobody link to this file.
620 * ext4_truncate(): when the block indirect map is about to change.
621 *
622 */
623void ext4_discard_reservation(struct inode *inode)
624{
625 struct ext4_inode_info *ei = EXT4_I(inode);
626 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
627 struct ext4_reserve_window_node *rsv;
628 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
629
630 ext4_mb_discard_inode_preallocations(inode);
631
632 if (!block_i)
633 return;
634
635 rsv = &block_i->rsv_window_node;
636 if (!rsv_is_empty(&rsv->rsv_window)) {
637 spin_lock(rsv_lock);
638 if (!rsv_is_empty(&rsv->rsv_window))
639 rsv_window_remove(inode->i_sb, rsv);
640 spin_unlock(rsv_lock);
641 }
642}
643 351
644/** 352/**
645 * ext4_free_blocks_sb() -- Free given blocks and update quota 353 * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
648 * @block: start physcial block to free 356 * @block: start physcial block to free
649 * @count: number of blocks to free 357 * @count: number of blocks to free
650 * @pdquot_freed_blocks: pointer to quota 358 * @pdquot_freed_blocks: pointer to quota
359 *
360 * XXX This function is only used by the on-line resizing code, which
361 * should probably be fixed up to call the mballoc variant. There
362 * this needs to be cleaned up later; in fact, I'm not convinced this
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
651 */ 366 */
652void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
653 ext4_fsblk_t block, unsigned long count, 368 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
659 ext4_grpblk_t bit; 374 ext4_grpblk_t bit;
660 unsigned long i; 375 unsigned long i;
661 unsigned long overflow; 376 unsigned long overflow;
662 struct ext4_group_desc * desc; 377 struct ext4_group_desc *desc;
663 struct ext4_super_block * es; 378 struct ext4_super_block *es;
664 struct ext4_sb_info *sbi; 379 struct ext4_sb_info *sbi;
665 int err = 0, ret; 380 int err = 0, ret;
666 ext4_grpblk_t group_freed; 381 ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
671 if (block < le32_to_cpu(es->s_first_data_block) || 386 if (block < le32_to_cpu(es->s_first_data_block) ||
672 block + count < block || 387 block + count < block ||
673 block + count > ext4_blocks_count(es)) { 388 block + count > ext4_blocks_count(es)) {
674 ext4_error (sb, "ext4_free_blocks", 389 ext4_error(sb, "ext4_free_blocks",
675 "Freeing blocks not in datazone - " 390 "Freeing blocks not in datazone - "
676 "block = %llu, count = %lu", block, count); 391 "block = %llu, count = %lu", block, count);
677 goto error_return; 392 goto error_return;
678 } 393 }
679 394
680 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); 395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
681 396
682do_more: 397do_more:
683 overflow = 0; 398 overflow = 0;
@@ -694,7 +409,7 @@ do_more:
694 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 409 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
695 if (!bitmap_bh) 410 if (!bitmap_bh)
696 goto error_return; 411 goto error_return;
697 desc = ext4_get_group_desc (sb, block_group, &gd_bh); 412 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
698 if (!desc) 413 if (!desc)
699 goto error_return; 414 goto error_return;
700 415
@@ -703,10 +418,10 @@ do_more:
703 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
704 in_range(block + count - 1, ext4_inode_table(sb, desc), 419 in_range(block + count - 1, ext4_inode_table(sb, desc),
705 sbi->s_itb_per_group)) { 420 sbi->s_itb_per_group)) {
706 ext4_error (sb, "ext4_free_blocks", 421 ext4_error(sb, "ext4_free_blocks",
707 "Freeing blocks in system zones - " 422 "Freeing blocks in system zones - "
708 "Block = %llu, count = %lu", 423 "Block = %llu, count = %lu",
709 block, count); 424 block, count);
710 goto error_return; 425 goto error_return;
711 } 426 }
712 427
@@ -848,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
848 ext4_fsblk_t block, unsigned long count, 563 ext4_fsblk_t block, unsigned long count,
849 int metadata) 564 int metadata)
850{ 565{
851 struct super_block * sb; 566 struct super_block *sb;
852 unsigned long dquot_freed_blocks; 567 unsigned long dquot_freed_blocks;
853 568
854 /* this isn't the right place to decide whether block is metadata 569 /* this isn't the right place to decide whether block is metadata
@@ -859,748 +574,52 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
859 574
860 sb = inode->i_sb; 575 sb = inode->i_sb;
861 576
862 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) 577 ext4_mb_free_blocks(handle, inode, block, count,
863 ext4_free_blocks_sb(handle, sb, block, count, 578 metadata, &dquot_freed_blocks);
864 &dquot_freed_blocks);
865 else
866 ext4_mb_free_blocks(handle, inode, block, count,
867 metadata, &dquot_freed_blocks);
868 if (dquot_freed_blocks) 579 if (dquot_freed_blocks)
869 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 580 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
870 return; 581 return;
871} 582}
872 583
873/** 584int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
874 * ext4_test_allocatable() 585 s64 nblocks)
875 * @nr: given allocation block group
876 * @bh: bufferhead contains the bitmap of the given block group
877 *
878 * For ext4 allocations, we must not reuse any blocks which are
879 * allocated in the bitmap buffer's "last committed data" copy. This
880 * prevents deletes from freeing up the page for reuse until we have
881 * committed the delete transaction.
882 *
883 * If we didn't do this, then deleting something and reallocating it as
884 * data would allow the old block to be overwritten before the
885 * transaction committed (because we force data to disk before commit).
886 * This would lead to corruption if we crashed between overwriting the
887 * data and committing the delete.
888 *
889 * @@@ We may want to make this allocation behaviour conditional on
890 * data-writes at some point, and disable it for metadata allocations or
891 * sync-data inodes.
892 */
893static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
894{
895 int ret;
896 struct journal_head *jh = bh2jh(bh);
897
898 if (ext4_test_bit(nr, bh->b_data))
899 return 0;
900
901 jbd_lock_bh_state(bh);
902 if (!jh->b_committed_data)
903 ret = 1;
904 else
905 ret = !ext4_test_bit(nr, jh->b_committed_data);
906 jbd_unlock_bh_state(bh);
907 return ret;
908}
909
910/**
911 * bitmap_search_next_usable_block()
912 * @start: the starting block (group relative) of the search
913 * @bh: bufferhead contains the block group bitmap
914 * @maxblocks: the ending block (group relative) of the reservation
915 *
916 * The bitmap search --- search forward alternately through the actual
917 * bitmap on disk and the last-committed copy in journal, until we find a
918 * bit free in both bitmaps.
919 */
920static ext4_grpblk_t
921bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
922 ext4_grpblk_t maxblocks)
923{ 586{
924 ext4_grpblk_t next; 587 s64 free_blocks, dirty_blocks;
925 struct journal_head *jh = bh2jh(bh); 588 s64 root_blocks = 0;
926 589 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
927 while (start < maxblocks) { 590 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
928 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
929 if (next >= maxblocks)
930 return -1;
931 if (ext4_test_allocatable(next, bh))
932 return next;
933 jbd_lock_bh_state(bh);
934 if (jh->b_committed_data)
935 start = ext4_find_next_zero_bit(jh->b_committed_data,
936 maxblocks, next);
937 jbd_unlock_bh_state(bh);
938 }
939 return -1;
940}
941 591
942/** 592 free_blocks = percpu_counter_read_positive(fbc);
943 * find_next_usable_block() 593 dirty_blocks = percpu_counter_read_positive(dbc);
944 * @start: the starting block (group relative) to find next
945 * allocatable block in bitmap.
946 * @bh: bufferhead contains the block group bitmap
947 * @maxblocks: the ending block (group relative) for the search
948 *
949 * Find an allocatable block in a bitmap. We honor both the bitmap and
950 * its last-committed copy (if that exists), and perform the "most
951 * appropriate allocation" algorithm of looking for a free block near
952 * the initial goal; then for a free byte somewhere in the bitmap; then
953 * for any free bit in the bitmap.
954 */
955static ext4_grpblk_t
956find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
957 ext4_grpblk_t maxblocks)
958{
959 ext4_grpblk_t here, next;
960 char *p, *r;
961
962 if (start > 0) {
963 /*
964 * The goal was occupied; search forward for a free
965 * block within the next XX blocks.
966 *
967 * end_goal is more or less random, but it has to be
968 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
969 * next 64-bit boundary is simple..
970 */
971 ext4_grpblk_t end_goal = (start + 63) & ~63;
972 if (end_goal > maxblocks)
973 end_goal = maxblocks;
974 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
975 if (here < end_goal && ext4_test_allocatable(here, bh))
976 return here;
977 ext4_debug("Bit not found near goal\n");
978 }
979
980 here = start;
981 if (here < 0)
982 here = 0;
983
984 p = ((char *)bh->b_data) + (here >> 3);
985 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
986 next = (r - ((char *)bh->b_data)) << 3;
987
988 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
989 return next;
990
991 /*
992 * The bitmap search --- search forward alternately through the actual
993 * bitmap and the last-committed copy until we find a bit free in
994 * both
995 */
996 here = bitmap_search_next_usable_block(here, bh, maxblocks);
997 return here;
998}
999
1000/**
1001 * claim_block()
1002 * @block: the free block (group relative) to allocate
1003 * @bh: the bufferhead containts the block group bitmap
1004 *
1005 * We think we can allocate this block in this bitmap. Try to set the bit.
1006 * If that succeeds then check that nobody has allocated and then freed the
1007 * block since we saw that is was not marked in b_committed_data. If it _was_
1008 * allocated and freed then clear the bit in the bitmap again and return
1009 * zero (failure).
1010 */
1011static inline int
1012claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
1013{
1014 struct journal_head *jh = bh2jh(bh);
1015 int ret;
1016
1017 if (ext4_set_bit_atomic(lock, block, bh->b_data))
1018 return 0;
1019 jbd_lock_bh_state(bh);
1020 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
1021 ext4_clear_bit_atomic(lock, block, bh->b_data);
1022 ret = 0;
1023 } else {
1024 ret = 1;
1025 }
1026 jbd_unlock_bh_state(bh);
1027 return ret;
1028}
1029 594
1030/** 595 if (!capable(CAP_SYS_RESOURCE) &&
1031 * ext4_try_to_allocate() 596 sbi->s_resuid != current->fsuid &&
1032 * @sb: superblock 597 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1033 * @handle: handle to this transaction 598 root_blocks = ext4_r_blocks_count(sbi->s_es);
1034 * @group: given allocation block group
1035 * @bitmap_bh: bufferhead holds the block bitmap
1036 * @grp_goal: given target block within the group
1037 * @count: target number of blocks to allocate
1038 * @my_rsv: reservation window
1039 *
1040 * Attempt to allocate blocks within a give range. Set the range of allocation
1041 * first, then find the first free bit(s) from the bitmap (within the range),
1042 * and at last, allocate the blocks by claiming the found free bit as allocated.
1043 *
1044 * To set the range of this allocation:
1045 * if there is a reservation window, only try to allocate block(s) from the
1046 * file's own reservation window;
1047 * Otherwise, the allocation range starts from the give goal block, ends at
1048 * the block group's last block.
1049 *
1050 * If we failed to allocate the desired block then we may end up crossing to a
1051 * new bitmap. In that case we must release write access to the old one via
1052 * ext4_journal_release_buffer(), else we'll run out of credits.
1053 */
1054static ext4_grpblk_t
1055ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
1056 ext4_group_t group, struct buffer_head *bitmap_bh,
1057 ext4_grpblk_t grp_goal, unsigned long *count,
1058 struct ext4_reserve_window *my_rsv)
1059{
1060 ext4_fsblk_t group_first_block;
1061 ext4_grpblk_t start, end;
1062 unsigned long num = 0;
1063
1064 /* we do allocation within the reservation window if we have a window */
1065 if (my_rsv) {
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 if (my_rsv->_rsv_start >= group_first_block)
1068 start = my_rsv->_rsv_start - group_first_block;
1069 else
1070 /* reservation window cross group boundary */
1071 start = 0;
1072 end = my_rsv->_rsv_end - group_first_block + 1;
1073 if (end > EXT4_BLOCKS_PER_GROUP(sb))
1074 /* reservation window crosses group boundary */
1075 end = EXT4_BLOCKS_PER_GROUP(sb);
1076 if ((start <= grp_goal) && (grp_goal < end))
1077 start = grp_goal;
1078 else
1079 grp_goal = -1;
1080 } else {
1081 if (grp_goal > 0)
1082 start = grp_goal;
1083 else
1084 start = 0;
1085 end = EXT4_BLOCKS_PER_GROUP(sb);
1086 }
1087
1088 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
1089
1090repeat:
1091 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
1092 grp_goal = find_next_usable_block(start, bitmap_bh, end);
1093 if (grp_goal < 0)
1094 goto fail_access;
1095 if (!my_rsv) {
1096 int i;
1097
1098 for (i = 0; i < 7 && grp_goal > start &&
1099 ext4_test_allocatable(grp_goal - 1,
1100 bitmap_bh);
1101 i++, grp_goal--)
1102 ;
1103 }
1104 }
1105 start = grp_goal;
1106
1107 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1108 grp_goal, bitmap_bh)) {
1109 /*
1110 * The block was allocated by another thread, or it was
1111 * allocated and then freed by another thread
1112 */
1113 start++;
1114 grp_goal++;
1115 if (start >= end)
1116 goto fail_access;
1117 goto repeat;
1118 }
1119 num++;
1120 grp_goal++;
1121 while (num < *count && grp_goal < end
1122 && ext4_test_allocatable(grp_goal, bitmap_bh)
1123 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1124 grp_goal, bitmap_bh)) {
1125 num++;
1126 grp_goal++;
1127 }
1128 *count = num;
1129 return grp_goal - num;
1130fail_access:
1131 *count = num;
1132 return -1;
1133}
1134
1135/**
1136 * find_next_reservable_window():
1137 * find a reservable space within the given range.
1138 * It does not allocate the reservation window for now:
1139 * alloc_new_reservation() will do the work later.
1140 *
1141 * @search_head: the head of the searching list;
1142 * This is not necessarily the list head of the whole filesystem
1143 *
1144 * We have both head and start_block to assist the search
1145 * for the reservable space. The list starts from head,
1146 * but we will shift to the place where start_block is,
1147 * then start from there, when looking for a reservable space.
1148 *
1149 * @size: the target new reservation window size
1150 *
1151 * @group_first_block: the first block we consider to start
1152 * the real search from
1153 *
1154 * @last_block:
1155 * the maximum block number that our goal reservable space
1156 * could start from. This is normally the last block in this
1157 * group. The search will end when we found the start of next
1158 * possible reservable space is out of this boundary.
1159 * This could handle the cross boundary reservation window
1160 * request.
1161 *
1162 * basically we search from the given range, rather than the whole
1163 * reservation double linked list, (start_block, last_block)
1164 * to find a free region that is of my size and has not
1165 * been reserved.
1166 *
1167 */
1168static int find_next_reservable_window(
1169 struct ext4_reserve_window_node *search_head,
1170 struct ext4_reserve_window_node *my_rsv,
1171 struct super_block * sb,
1172 ext4_fsblk_t start_block,
1173 ext4_fsblk_t last_block)
1174{
1175 struct rb_node *next;
1176 struct ext4_reserve_window_node *rsv, *prev;
1177 ext4_fsblk_t cur;
1178 int size = my_rsv->rsv_goal_size;
1179
1180 /* TODO: make the start of the reservation window byte-aligned */
1181 /* cur = *start_block & ~7;*/
1182 cur = start_block;
1183 rsv = search_head;
1184 if (!rsv)
1185 return -1;
1186
1187 while (1) {
1188 if (cur <= rsv->rsv_end)
1189 cur = rsv->rsv_end + 1;
1190
1191 /* TODO?
1192 * in the case we could not find a reservable space
1193 * that is what is expected, during the re-search, we could
1194 * remember what's the largest reservable space we could have
1195 * and return that one.
1196 *
1197 * For now it will fail if we could not find the reservable
1198 * space with expected-size (or more)...
1199 */
1200 if (cur > last_block)
1201 return -1; /* fail */
1202
1203 prev = rsv;
1204 next = rb_next(&rsv->rsv_node);
1205 rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
1206 599
1207 /* 600 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1208 * Reached the last reservation, we can just append to the 601 EXT4_FREEBLOCKS_WATERMARK) {
1209 * previous one. 602 free_blocks = percpu_counter_sum(fbc);
1210 */ 603 dirty_blocks = percpu_counter_sum(dbc);
1211 if (!next) 604 if (dirty_blocks < 0) {
1212 break; 605 printk(KERN_CRIT "Dirty block accounting "
1213 606 "went wrong %lld\n",
1214 if (cur + size <= rsv->rsv_start) { 607 dirty_blocks);
1215 /*
1216 * Found a reserveable space big enough. We could
1217 * have a reservation across the group boundary here
1218 */
1219 break;
1220 } 608 }
1221 } 609 }
1222 /* 610 /* Check whether we have space after
1223 * we come here either : 611 * accounting for current dirty blocks
1224 * when we reach the end of the whole list,
1225 * and there is empty reservable space after last entry in the list.
1226 * append it to the end of the list.
1227 *
1228 * or we found one reservable space in the middle of the list,
1229 * return the reservation window that we could append to.
1230 * succeed.
1231 */ 612 */
613 if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
614 /* we don't have free space */
615 return -ENOSPC;
1232 616
1233 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) 617 /* Add the blocks to nblocks */
1234 rsv_window_remove(sb, my_rsv); 618 percpu_counter_add(dbc, nblocks);
1235
1236 /*
1237 * Let's book the whole avaliable window for now. We will check the
1238 * disk bitmap later and then, if there are free blocks then we adjust
1239 * the window size if it's larger than requested.
1240 * Otherwise, we will remove this node from the tree next time
1241 * call find_next_reservable_window.
1242 */
1243 my_rsv->rsv_start = cur;
1244 my_rsv->rsv_end = cur + size - 1;
1245 my_rsv->rsv_alloc_hit = 0;
1246
1247 if (prev != my_rsv)
1248 ext4_rsv_window_add(sb, my_rsv);
1249
1250 return 0; 619 return 0;
1251} 620}
1252 621
1253/** 622/**
1254 * alloc_new_reservation()--allocate a new reservation window
1255 *
1256 * To make a new reservation, we search part of the filesystem
1257 * reservation list (the list that inside the group). We try to
1258 * allocate a new reservation window near the allocation goal,
1259 * or the beginning of the group, if there is no goal.
1260 *
1261 * We first find a reservable space after the goal, then from
1262 * there, we check the bitmap for the first free block after
1263 * it. If there is no free block until the end of group, then the
1264 * whole group is full, we failed. Otherwise, check if the free
1265 * block is inside the expected reservable space, if so, we
1266 * succeed.
1267 * If the first free block is outside the reservable space, then
1268 * start from the first free block, we search for next available
1269 * space, and go on.
1270 *
1271 * on succeed, a new reservation will be found and inserted into the list
1272 * It contains at least one free block, and it does not overlap with other
1273 * reservation windows.
1274 *
1275 * failed: we failed to find a reservation window in this group
1276 *
1277 * @rsv: the reservation
1278 *
1279 * @grp_goal: The goal (group-relative). It is where the search for a
1280 * free reservable space should start from.
1281 * if we have a grp_goal(grp_goal >0 ), then start from there,
1282 * no grp_goal(grp_goal = -1), we start from the first block
1283 * of the group.
1284 *
1285 * @sb: the super block
1286 * @group: the group we are trying to allocate in
1287 * @bitmap_bh: the block group block bitmap
1288 *
1289 */
1290static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1291 ext4_grpblk_t grp_goal, struct super_block *sb,
1292 ext4_group_t group, struct buffer_head *bitmap_bh)
1293{
1294 struct ext4_reserve_window_node *search_head;
1295 ext4_fsblk_t group_first_block, group_end_block, start_block;
1296 ext4_grpblk_t first_free_block;
1297 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1298 unsigned long size;
1299 int ret;
1300 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1301
1302 group_first_block = ext4_group_first_block_no(sb, group);
1303 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1304
1305 if (grp_goal < 0)
1306 start_block = group_first_block;
1307 else
1308 start_block = grp_goal + group_first_block;
1309
1310 size = my_rsv->rsv_goal_size;
1311
1312 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1313 /*
1314 * if the old reservation is cross group boundary
1315 * and if the goal is inside the old reservation window,
1316 * we will come here when we just failed to allocate from
1317 * the first part of the window. We still have another part
1318 * that belongs to the next group. In this case, there is no
1319 * point to discard our window and try to allocate a new one
1320 * in this group(which will fail). we should
1321 * keep the reservation window, just simply move on.
1322 *
1323 * Maybe we could shift the start block of the reservation
1324 * window to the first block of next group.
1325 */
1326
1327 if ((my_rsv->rsv_start <= group_end_block) &&
1328 (my_rsv->rsv_end > group_end_block) &&
1329 (start_block >= my_rsv->rsv_start))
1330 return -1;
1331
1332 if ((my_rsv->rsv_alloc_hit >
1333 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1334 /*
1335 * if the previously allocation hit ratio is
1336 * greater than 1/2, then we double the size of
1337 * the reservation window the next time,
1338 * otherwise we keep the same size window
1339 */
1340 size = size * 2;
1341 if (size > EXT4_MAX_RESERVE_BLOCKS)
1342 size = EXT4_MAX_RESERVE_BLOCKS;
1343 my_rsv->rsv_goal_size= size;
1344 }
1345 }
1346
1347 spin_lock(rsv_lock);
1348 /*
1349 * shift the search start to the window near the goal block
1350 */
1351 search_head = search_reserve_window(fs_rsv_root, start_block);
1352
1353 /*
1354 * find_next_reservable_window() simply finds a reservable window
1355 * inside the given range(start_block, group_end_block).
1356 *
1357 * To make sure the reservation window has a free bit inside it, we
1358 * need to check the bitmap after we found a reservable window.
1359 */
1360retry:
1361 ret = find_next_reservable_window(search_head, my_rsv, sb,
1362 start_block, group_end_block);
1363
1364 if (ret == -1) {
1365 if (!rsv_is_empty(&my_rsv->rsv_window))
1366 rsv_window_remove(sb, my_rsv);
1367 spin_unlock(rsv_lock);
1368 return -1;
1369 }
1370
1371 /*
1372 * On success, find_next_reservable_window() returns the
1373 * reservation window where there is a reservable space after it.
1374 * Before we reserve this reservable space, we need
1375 * to make sure there is at least a free block inside this region.
1376 *
1377 * searching the first free bit on the block bitmap and copy of
1378 * last committed bitmap alternatively, until we found a allocatable
1379 * block. Search start from the start block of the reservable space
1380 * we just found.
1381 */
1382 spin_unlock(rsv_lock);
1383 first_free_block = bitmap_search_next_usable_block(
1384 my_rsv->rsv_start - group_first_block,
1385 bitmap_bh, group_end_block - group_first_block + 1);
1386
1387 if (first_free_block < 0) {
1388 /*
1389 * no free block left on the bitmap, no point
1390 * to reserve the space. return failed.
1391 */
1392 spin_lock(rsv_lock);
1393 if (!rsv_is_empty(&my_rsv->rsv_window))
1394 rsv_window_remove(sb, my_rsv);
1395 spin_unlock(rsv_lock);
1396 return -1; /* failed */
1397 }
1398
1399 start_block = first_free_block + group_first_block;
1400 /*
1401 * check if the first free block is within the
1402 * free space we just reserved
1403 */
1404 if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1405 return 0; /* success */
1406 /*
1407 * if the first free bit we found is out of the reservable space
1408 * continue search for next reservable space,
1409 * start from where the free block is,
1410 * we also shift the list head to where we stopped last time
1411 */
1412 search_head = my_rsv;
1413 spin_lock(rsv_lock);
1414 goto retry;
1415}
1416
1417/**
1418 * try_to_extend_reservation()
1419 * @my_rsv: given reservation window
1420 * @sb: super block
1421 * @size: the delta to extend
1422 *
1423 * Attempt to expand the reservation window large enough to have
1424 * required number of free blocks
1425 *
1426 * Since ext4_try_to_allocate() will always allocate blocks within
1427 * the reservation window range, if the window size is too small,
1428 * multiple blocks allocation has to stop at the end of the reservation
1429 * window. To make this more efficient, given the total number of
1430 * blocks needed and the current size of the window, we try to
1431 * expand the reservation window size if necessary on a best-effort
1432 * basis before ext4_new_blocks() tries to allocate blocks,
1433 */
1434static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1435 struct super_block *sb, int size)
1436{
1437 struct ext4_reserve_window_node *next_rsv;
1438 struct rb_node *next;
1439 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1440
1441 if (!spin_trylock(rsv_lock))
1442 return;
1443
1444 next = rb_next(&my_rsv->rsv_node);
1445
1446 if (!next)
1447 my_rsv->rsv_end += size;
1448 else {
1449 next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1450
1451 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1452 my_rsv->rsv_end += size;
1453 else
1454 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1455 }
1456 spin_unlock(rsv_lock);
1457}
1458
1459/**
1460 * ext4_try_to_allocate_with_rsv()
1461 * @sb: superblock
1462 * @handle: handle to this transaction
1463 * @group: given allocation block group
1464 * @bitmap_bh: bufferhead holds the block bitmap
1465 * @grp_goal: given target block within the group
1466 * @count: target number of blocks to allocate
1467 * @my_rsv: reservation window
1468 * @errp: pointer to store the error code
1469 *
1470 * This is the main function used to allocate a new block and its reservation
1471 * window.
1472 *
1473 * Each time when a new block allocation is need, first try to allocate from
1474 * its own reservation. If it does not have a reservation window, instead of
1475 * looking for a free bit on bitmap first, then look up the reservation list to
1476 * see if it is inside somebody else's reservation window, we try to allocate a
1477 * reservation window for it starting from the goal first. Then do the block
1478 * allocation within the reservation window.
1479 *
1480 * This will avoid keeping on searching the reservation list again and
1481 * again when somebody is looking for a free block (without
1482 * reservation), and there are lots of free blocks, but they are all
1483 * being reserved.
1484 *
1485 * We use a red-black tree for the per-filesystem reservation list.
1486 *
1487 */
1488static ext4_grpblk_t
1489ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1490 ext4_group_t group, struct buffer_head *bitmap_bh,
1491 ext4_grpblk_t grp_goal,
1492 struct ext4_reserve_window_node * my_rsv,
1493 unsigned long *count, int *errp)
1494{
1495 ext4_fsblk_t group_first_block, group_last_block;
1496 ext4_grpblk_t ret = 0;
1497 int fatal;
1498 unsigned long num = *count;
1499
1500 *errp = 0;
1501
1502 /*
1503 * Make sure we use undo access for the bitmap, because it is critical
1504 * that we do the frozen_data COW on bitmap buffers in all cases even
1505 * if the buffer is in BJ_Forget state in the committing transaction.
1506 */
1507 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1508 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1509 if (fatal) {
1510 *errp = fatal;
1511 return -1;
1512 }
1513
1514 /*
1515 * we don't deal with reservation when
1516 * filesystem is mounted without reservation
1517 * or the file is not a regular file
1518 * or last attempt to allocate a block with reservation turned on failed
1519 */
1520 if (my_rsv == NULL ) {
1521 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1522 grp_goal, count, NULL);
1523 goto out;
1524 }
1525 /*
1526 * grp_goal is a group relative block number (if there is a goal)
1527 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
1528 * first block is a filesystem wide block number
1529 * first block is the block number of the first block in this group
1530 */
1531 group_first_block = ext4_group_first_block_no(sb, group);
1532 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1533
1534 /*
1535 * Basically we will allocate a new block from inode's reservation
1536 * window.
1537 *
1538 * We need to allocate a new reservation window, if:
1539 * a) inode does not have a reservation window; or
1540 * b) last attempt to allocate a block from existing reservation
1541 * failed; or
1542 * c) we come here with a goal and with a reservation window
1543 *
1544 * We do not need to allocate a new reservation window if we come here
1545 * at the beginning with a goal and the goal is inside the window, or
1546 * we don't have a goal but already have a reservation window.
1547 * then we could go to allocate from the reservation window directly.
1548 */
1549 while (1) {
1550 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1551 !goal_in_my_reservation(&my_rsv->rsv_window,
1552 grp_goal, group, sb)) {
1553 if (my_rsv->rsv_goal_size < *count)
1554 my_rsv->rsv_goal_size = *count;
1555 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1556 group, bitmap_bh);
1557 if (ret < 0)
1558 break; /* failed */
1559
1560 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1561 grp_goal, group, sb))
1562 grp_goal = -1;
1563 } else if (grp_goal >= 0) {
1564 int curr = my_rsv->rsv_end -
1565 (grp_goal + group_first_block) + 1;
1566
1567 if (curr < *count)
1568 try_to_extend_reservation(my_rsv, sb,
1569 *count - curr);
1570 }
1571
1572 if ((my_rsv->rsv_start > group_last_block) ||
1573 (my_rsv->rsv_end < group_first_block)) {
1574 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1575 BUG();
1576 }
1577 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1578 grp_goal, &num, &my_rsv->rsv_window);
1579 if (ret >= 0) {
1580 my_rsv->rsv_alloc_hit += num;
1581 *count = num;
1582 break; /* succeed */
1583 }
1584 num = *count;
1585 }
1586out:
1587 if (ret >= 0) {
1588 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1589 "bitmap block");
1590 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1591 if (fatal) {
1592 *errp = fatal;
1593 return -1;
1594 }
1595 return ret;
1596 }
1597
1598 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1599 ext4_journal_release_buffer(handle, bitmap_bh);
1600 return ret;
1601}
1602
1603/**
1604 * ext4_has_free_blocks() 623 * ext4_has_free_blocks()
1605 * @sbi: in-core super block structure. 624 * @sbi: in-core super block structure.
1606 * @nblocks: number of neeed blocks 625 * @nblocks: number of neeed blocks
@@ -1610,29 +629,34 @@ out:
1610 * On success, return nblocks 629 * On success, return nblocks
1611 */ 630 */
1612ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 631ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1613 ext4_fsblk_t nblocks) 632 s64 nblocks)
1614{ 633{
1615 ext4_fsblk_t free_blocks; 634 s64 free_blocks, dirty_blocks;
1616 ext4_fsblk_t root_blocks = 0; 635 s64 root_blocks = 0;
636 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
637 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
1617 638
1618 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 639 free_blocks = percpu_counter_read_positive(fbc);
640 dirty_blocks = percpu_counter_read_positive(dbc);
1619 641
1620 if (!capable(CAP_SYS_RESOURCE) && 642 if (!capable(CAP_SYS_RESOURCE) &&
1621 sbi->s_resuid != current->fsuid && 643 sbi->s_resuid != current->fsuid &&
1622 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) 644 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1623 root_blocks = ext4_r_blocks_count(sbi->s_es); 645 root_blocks = ext4_r_blocks_count(sbi->s_es);
1624#ifdef CONFIG_SMP 646
1625 if (free_blocks - root_blocks < FBC_BATCH) 647 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1626 free_blocks = 648 EXT4_FREEBLOCKS_WATERMARK) {
1627 percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); 649 free_blocks = percpu_counter_sum(fbc);
1628#endif 650 dirty_blocks = percpu_counter_sum(dbc);
1629 if (free_blocks <= root_blocks) 651 }
652 if (free_blocks <= (root_blocks + dirty_blocks))
1630 /* we don't have free space */ 653 /* we don't have free space */
1631 return 0; 654 return 0;
1632 if (free_blocks - root_blocks < nblocks) 655
1633 return free_blocks - root_blocks; 656 if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
657 return free_blocks - (root_blocks + dirty_blocks);
1634 return nblocks; 658 return nblocks;
1635 } 659}
1636 660
1637 661
1638/** 662/**
@@ -1657,303 +681,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1657 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 681 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1658} 682}
1659 683
1660/**
1661 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1662 *
1663 * @handle: handle to this transaction
1664 * @inode: file inode
1665 * @goal: given target block(filesystem wide)
1666 * @count: target number of blocks to allocate
1667 * @errp: error code
1668 *
1669 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1670 * the block bitmap directly to do block allocation. It tries to
1671 * allocate block(s) from the block group contains the goal block first. If
1672 * that fails, it will try to allocate block(s) from other block groups
1673 * without any specific goal block.
1674 *
1675 * This function is called when -o nomballoc mount option is enabled
1676 *
1677 */
1678ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1679 ext4_fsblk_t goal, unsigned long *count, int *errp)
1680{
1681 struct buffer_head *bitmap_bh = NULL;
1682 struct buffer_head *gdp_bh;
1683 ext4_group_t group_no;
1684 ext4_group_t goal_group;
1685 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1686 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1687 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1688 ext4_group_t bgi; /* blockgroup iteration index */
1689 int fatal = 0, err;
1690 int performed_allocation = 0;
1691 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1692 struct super_block *sb;
1693 struct ext4_group_desc *gdp;
1694 struct ext4_super_block *es;
1695 struct ext4_sb_info *sbi;
1696 struct ext4_reserve_window_node *my_rsv = NULL;
1697 struct ext4_block_alloc_info *block_i;
1698 unsigned short windowsz = 0;
1699 ext4_group_t ngroups;
1700 unsigned long num = *count;
1701
1702 sb = inode->i_sb;
1703 if (!sb) {
1704 *errp = -ENODEV;
1705 printk("ext4_new_block: nonexistent device");
1706 return 0;
1707 }
1708
1709 sbi = EXT4_SB(sb);
1710 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1711 /*
1712 * With delalloc we already reserved the blocks
1713 */
1714 *count = ext4_has_free_blocks(sbi, *count);
1715 }
1716 if (*count == 0) {
1717 *errp = -ENOSPC;
1718 return 0; /*return with ENOSPC error */
1719 }
1720 num = *count;
1721
1722 /*
1723 * Check quota for allocation of this block.
1724 */
1725 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1726 *errp = -EDQUOT;
1727 return 0;
1728 }
1729
1730 sbi = EXT4_SB(sb);
1731 es = EXT4_SB(sb)->s_es;
1732 ext4_debug("goal=%llu.\n", goal);
1733 /*
1734 * Allocate a block from reservation only when
1735 * filesystem is mounted with reservation(default,-o reservation), and
1736 * it's a regular file, and
1737 * the desired window size is greater than 0 (One could use ioctl
1738 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1739 * reservation on that particular file)
1740 */
1741 block_i = EXT4_I(inode)->i_block_alloc_info;
1742 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1743 my_rsv = &block_i->rsv_window_node;
1744
1745 /*
1746 * First, test whether the goal block is free.
1747 */
1748 if (goal < le32_to_cpu(es->s_first_data_block) ||
1749 goal >= ext4_blocks_count(es))
1750 goal = le32_to_cpu(es->s_first_data_block);
1751 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1752 goal_group = group_no;
1753retry_alloc:
1754 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1755 if (!gdp)
1756 goto io_error;
1757
1758 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1759 /*
1760 * if there is not enough free blocks to make a new resevation
1761 * turn off reservation for this allocation
1762 */
1763 if (my_rsv && (free_blocks < windowsz)
1764 && (rsv_is_empty(&my_rsv->rsv_window)))
1765 my_rsv = NULL;
1766
1767 if (free_blocks > 0) {
1768 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1769 if (!bitmap_bh)
1770 goto io_error;
1771 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1772 group_no, bitmap_bh, grp_target_blk,
1773 my_rsv, &num, &fatal);
1774 if (fatal)
1775 goto out;
1776 if (grp_alloc_blk >= 0)
1777 goto allocated;
1778 }
1779
1780 ngroups = EXT4_SB(sb)->s_groups_count;
1781 smp_rmb();
1782
1783 /*
1784 * Now search the rest of the groups. We assume that
1785 * group_no and gdp correctly point to the last group visited.
1786 */
1787 for (bgi = 0; bgi < ngroups; bgi++) {
1788 group_no++;
1789 if (group_no >= ngroups)
1790 group_no = 0;
1791 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1792 if (!gdp)
1793 goto io_error;
1794 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1795 /*
1796 * skip this group if the number of
1797 * free blocks is less than half of the reservation
1798 * window size.
1799 */
1800 if (free_blocks <= (windowsz/2))
1801 continue;
1802
1803 brelse(bitmap_bh);
1804 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1805 if (!bitmap_bh)
1806 goto io_error;
1807 /*
1808 * try to allocate block(s) from this group, without a goal(-1).
1809 */
1810 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1811 group_no, bitmap_bh, -1, my_rsv,
1812 &num, &fatal);
1813 if (fatal)
1814 goto out;
1815 if (grp_alloc_blk >= 0)
1816 goto allocated;
1817 }
1818 /*
1819 * We may end up a bogus ealier ENOSPC error due to
1820 * filesystem is "full" of reservations, but
1821 * there maybe indeed free blocks avaliable on disk
1822 * In this case, we just forget about the reservations
1823 * just do block allocation as without reservations.
1824 */
1825 if (my_rsv) {
1826 my_rsv = NULL;
1827 windowsz = 0;
1828 group_no = goal_group;
1829 goto retry_alloc;
1830 }
1831 /* No space left on the device */
1832 *errp = -ENOSPC;
1833 goto out;
1834
1835allocated:
1836
1837 ext4_debug("using block group %lu(%d)\n",
1838 group_no, gdp->bg_free_blocks_count);
1839
1840 BUFFER_TRACE(gdp_bh, "get_write_access");
1841 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1842 if (fatal)
1843 goto out;
1844
1845 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1846
1847 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1848 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1849 in_range(ret_block, ext4_inode_table(sb, gdp),
1850 EXT4_SB(sb)->s_itb_per_group) ||
1851 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1852 EXT4_SB(sb)->s_itb_per_group)) {
1853 ext4_error(sb, "ext4_new_block",
1854 "Allocating block in system zone - "
1855 "blocks from %llu, length %lu",
1856 ret_block, num);
1857 /*
1858 * claim_block marked the blocks we allocated
1859 * as in use. So we may want to selectively
1860 * mark some of the blocks as free
1861 */
1862 goto retry_alloc;
1863 }
1864
1865 performed_allocation = 1;
1866
1867#ifdef CONFIG_JBD2_DEBUG
1868 {
1869 struct buffer_head *debug_bh;
1870
1871 /* Record bitmap buffer state in the newly allocated block */
1872 debug_bh = sb_find_get_block(sb, ret_block);
1873 if (debug_bh) {
1874 BUFFER_TRACE(debug_bh, "state when allocated");
1875 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1876 brelse(debug_bh);
1877 }
1878 }
1879 jbd_lock_bh_state(bitmap_bh);
1880 spin_lock(sb_bgl_lock(sbi, group_no));
1881 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1882 int i;
1883
1884 for (i = 0; i < num; i++) {
1885 if (ext4_test_bit(grp_alloc_blk+i,
1886 bh2jh(bitmap_bh)->b_committed_data)) {
1887 printk("%s: block was unexpectedly set in "
1888 "b_committed_data\n", __func__);
1889 }
1890 }
1891 }
1892 ext4_debug("found bit %d\n", grp_alloc_blk);
1893 spin_unlock(sb_bgl_lock(sbi, group_no));
1894 jbd_unlock_bh_state(bitmap_bh);
1895#endif
1896
1897 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1898 ext4_error(sb, "ext4_new_block",
1899 "block(%llu) >= blocks count(%llu) - "
1900 "block_group = %lu, es == %p ", ret_block,
1901 ext4_blocks_count(es), group_no, es);
1902 goto out;
1903 }
1904
1905 /*
1906 * It is up to the caller to add the new buffer to a journal
1907 * list of some description. We don't know in advance whether
1908 * the caller wants to use it as metadata or data.
1909 */
1910 spin_lock(sb_bgl_lock(sbi, group_no));
1911 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1912 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1913 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1914 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1915 spin_unlock(sb_bgl_lock(sbi, group_no));
1916 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1917 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1918
1919 if (sbi->s_log_groups_per_flex) {
1920 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1921 spin_lock(sb_bgl_lock(sbi, flex_group));
1922 sbi->s_flex_groups[flex_group].free_blocks -= num;
1923 spin_unlock(sb_bgl_lock(sbi, flex_group));
1924 }
1925
1926 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1927 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1928 if (!fatal)
1929 fatal = err;
1930
1931 sb->s_dirt = 1;
1932 if (fatal)
1933 goto out;
1934
1935 *errp = 0;
1936 brelse(bitmap_bh);
1937 DQUOT_FREE_BLOCK(inode, *count-num);
1938 *count = num;
1939 return ret_block;
1940
1941io_error:
1942 *errp = -EIO;
1943out:
1944 if (fatal) {
1945 *errp = fatal;
1946 ext4_std_error(sb, fatal);
1947 }
1948 /*
1949 * Undo the block allocation
1950 */
1951 if (!performed_allocation)
1952 DQUOT_FREE_BLOCK(inode, *count);
1953 brelse(bitmap_bh);
1954 return 0;
1955}
1956
1957#define EXT4_META_BLOCK 0x1 684#define EXT4_META_BLOCK 0x1
1958 685
1959static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, 686static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1963,10 +690,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1963 struct ext4_allocation_request ar; 690 struct ext4_allocation_request ar;
1964 ext4_fsblk_t ret; 691 ext4_fsblk_t ret;
1965 692
1966 if (!test_opt(inode->i_sb, MBALLOC)) {
1967 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1968 }
1969
1970 memset(&ar, 0, sizeof(ar)); 693 memset(&ar, 0, sizeof(ar));
1971 /* Fill with neighbour allocated blocks */ 694 /* Fill with neighbour allocated blocks */
1972 695
@@ -2008,7 +731,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2008 /* 731 /*
2009 * Account for the allocated meta blocks 732 * Account for the allocated meta blocks
2010 */ 733 */
2011 if (!(*errp)) { 734 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
2012 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 735 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2013 EXT4_I(inode)->i_allocated_meta_blocks += *count; 736 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2014 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 737 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2093,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2093 bitmap_count += x; 816 bitmap_count += x;
2094 } 817 }
2095 brelse(bitmap_bh); 818 brelse(bitmap_bh);
2096 printk("ext4_count_free_blocks: stored = %llu" 819 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
2097 ", computed = %llu, %llu\n", 820 ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
2098 ext4_free_blocks_count(es), 821 desc_count, bitmap_count);
2099 desc_count, bitmap_count);
2100 return bitmap_count; 822 return bitmap_count;
2101#else 823#else
2102 desc_count = 0; 824 desc_count = 0;
@@ -2183,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2183 905
2184 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || 906 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2185 metagroup < first_meta_bg) 907 metagroup < first_meta_bg)
2186 return ext4_bg_num_gdb_nometa(sb,group); 908 return ext4_bg_num_gdb_nometa(sb, group);
2187 909
2188 return ext4_bg_num_gdb_meta(sb,group); 910 return ext4_bg_num_gdb_meta(sb,group);
2189 911
2190} 912}
913
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea675045..0a7a6663c19 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return 0;
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum); 28 return sum;
29} 29}
30 30
31#endif /* EXT4FS_DEBUG */ 31#endif /* EXT4FS_DEBUG */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ec8e33b4521..3ca6a2b7632 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
33}; 33};
34 34
35static int ext4_readdir(struct file *, void *, filldir_t); 35static int ext4_readdir(struct file *, void *, filldir_t);
36static int ext4_dx_readdir(struct file * filp, 36static int ext4_dx_readdir(struct file *filp,
37 void * dirent, filldir_t filldir); 37 void *dirent, filldir_t filldir);
38static int ext4_release_dir (struct inode * inode, 38static int ext4_release_dir(struct inode *inode,
39 struct file * filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry (const char * function, struct inode * dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 * de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head * bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error (dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
93 93
94static int ext4_readdir(struct file * filp, 94static int ext4_readdir(struct file *filp,
95 void * dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
102 int err; 102 int err;
103 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0;
105 106
106 sb = inode->i_sb; 107 sb = inode->i_sb;
107 108
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
148 * of recovering data when there's a bad sector 149 * of recovering data when there's a bad sector
149 */ 150 */
150 if (!bh) { 151 if (!bh) {
151 ext4_error (sb, "ext4_readdir", 152 if (!dir_has_error) {
152 "directory #%lu contains a hole at offset %lu", 153 ext4_error(sb, __func__, "directory #%lu "
153 inode->i_ino, (unsigned long)filp->f_pos); 154 "contains a hole at offset %Lu",
155 inode->i_ino,
156 (unsigned long long) filp->f_pos);
157 dir_has_error = 1;
158 }
154 /* corrupt size? Maybe no more blocks to read */ 159 /* corrupt size? Maybe no more blocks to read */
155 if (filp->f_pos > inode->i_blocks << 9) 160 if (filp->f_pos > inode->i_blocks << 9)
156 break; 161 break;
@@ -187,14 +192,14 @@ revalidate:
187 while (!error && filp->f_pos < inode->i_size 192 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) { 193 && offset < sb->s_blocksize) {
189 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 194 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext4_check_dir_entry ("ext4_readdir", inode, de, 195 if (!ext4_check_dir_entry("ext4_readdir", inode, de,
191 bh, offset)) { 196 bh, offset)) {
192 /* 197 /*
193 * On error, skip the f_pos to the next block 198 * On error, skip the f_pos to the next block
194 */ 199 */
195 filp->f_pos = (filp->f_pos | 200 filp->f_pos = (filp->f_pos |
196 (sb->s_blocksize - 1)) + 1; 201 (sb->s_blocksize - 1)) + 1;
197 brelse (bh); 202 brelse(bh);
198 ret = stored; 203 ret = stored;
199 goto out; 204 goto out;
200 } 205 }
@@ -218,12 +223,12 @@ revalidate:
218 break; 223 break;
219 if (version != filp->f_version) 224 if (version != filp->f_version)
220 goto revalidate; 225 goto revalidate;
221 stored ++; 226 stored++;
222 } 227 }
223 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
224 } 229 }
225 offset = 0; 230 offset = 0;
226 brelse (bh); 231 brelse(bh);
227 } 232 }
228out: 233out:
229 return ret; 234 return ret;
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
290 parent = rb_parent(n); 295 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash); 296 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) { 297 while (fname) {
293 struct fname * old = fname; 298 struct fname *old = fname;
294 fname = fname->next; 299 fname = fname->next;
295 kfree (old); 300 kfree(old);
296 } 301 }
297 if (!parent) 302 if (!parent)
298 root->rb_node = NULL; 303 root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
331 struct ext4_dir_entry_2 *dirent) 336 struct ext4_dir_entry_2 *dirent)
332{ 337{
333 struct rb_node **p, *parent = NULL; 338 struct rb_node **p, *parent = NULL;
334 struct fname * fname, *new_fn; 339 struct fname *fname, *new_fn;
335 struct dir_private_info *info; 340 struct dir_private_info *info;
336 int len; 341 int len;
337 342
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
388 * for all entres on the fname linked list. (Normally there is only 393 * for all entres on the fname linked list. (Normally there is only
389 * one entry on the linked list, unless there are 62 bit hash collisions.) 394 * one entry on the linked list, unless there are 62 bit hash collisions.)
390 */ 395 */
391static int call_filldir(struct file * filp, void * dirent, 396static int call_filldir(struct file *filp, void *dirent,
392 filldir_t filldir, struct fname *fname) 397 filldir_t filldir, struct fname *fname)
393{ 398{
394 struct dir_private_info *info = filp->private_data; 399 struct dir_private_info *info = filp->private_data;
395 loff_t curr_pos; 400 loff_t curr_pos;
396 struct inode *inode = filp->f_path.dentry->d_inode; 401 struct inode *inode = filp->f_path.dentry->d_inode;
397 struct super_block * sb; 402 struct super_block *sb;
398 int error; 403 int error;
399 404
400 sb = inode->i_sb; 405 sb = inode->i_sb;
401 406
402 if (!fname) { 407 if (!fname) {
403 printk("call_filldir: called with null fname?!?\n"); 408 printk(KERN_ERR "ext4: call_filldir: called with "
409 "null fname?!?\n");
404 return 0; 410 return 0;
405 } 411 }
406 curr_pos = hash2pos(fname->hash, fname->minor_hash); 412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
419 return 0; 425 return 0;
420} 426}
421 427
422static int ext4_dx_readdir(struct file * filp, 428static int ext4_dx_readdir(struct file *filp,
423 void * dirent, filldir_t filldir) 429 void *dirent, filldir_t filldir)
424{ 430{
425 struct dir_private_info *info = filp->private_data; 431 struct dir_private_info *info = filp->private_data;
426 struct inode *inode = filp->f_path.dentry->d_inode; 432 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -511,7 +517,7 @@ finished:
511 return 0; 517 return 0;
512} 518}
513 519
514static int ext4_release_dir (struct inode * inode, struct file * filp) 520static int ext4_release_dir(struct inode *inode, struct file *filp)
515{ 521{
516 if (filp->private_data) 522 if (filp->private_data)
517 ext4_htree_free_dir_info(filp->private_data); 523 ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 295003241d3..f46a513a515 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
44#ifdef EXT4FS_DEBUG 44#ifdef EXT4FS_DEBUG
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __func__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk(KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
52#define ext4_debug(f, a...) do {} while (0) 52#define ext4_debug(f, a...) do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
128#else 128#else
129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
130#endif 130#endif
131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) 131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
134#else 134#else
@@ -245,7 +245,7 @@ struct flex_groups {
245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
246 246
247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
248#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 248#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
249 249
250/* 250/*
251 * Inode dynamic state flags 251 * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS 291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
292#define EXT4_IOC_GETVERSION _IOR('f', 3, long) 292#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
293#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 293#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
294#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
295#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
296#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 294#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
297#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 295#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
298#ifdef CONFIG_JBD2_DEBUG 296#ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
300#endif 298#endif
301#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 299#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
302#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 300#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
303#define EXT4_IOC_MIGRATE _IO('f', 7) 301#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
302#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
303#define EXT4_IOC_MIGRATE _IO('f', 9)
304 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
304 305
305/* 306/*
306 * ioctl commands in 32 bit emulation 307 * ioctl commands in 32 bit emulation
@@ -538,7 +539,6 @@ do { \
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
544#ifndef _LINUX_EXT2_FS_H 544#ifndef _LINUX_EXT2_FS_H
@@ -667,7 +667,7 @@ struct ext4_super_block {
667}; 667};
668 668
669#ifdef __KERNEL__ 669#ifdef __KERNEL__
670static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) 670static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
671{ 671{
672 return sb->s_fs_info; 672 return sb->s_fs_info;
673} 673}
@@ -725,11 +725,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
725 */ 725 */
726 726
727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
728 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) 728 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
730 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) 730 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
732 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) 732 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -789,6 +789,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
789#define EXT4_DEF_RESUID 0 789#define EXT4_DEF_RESUID 0
790#define EXT4_DEF_RESGID 0 790#define EXT4_DEF_RESGID 0
791 791
792#define EXT4_DEF_INODE_READAHEAD_BLKS 32
793
792/* 794/*
793 * Default mount options 795 * Default mount options
794 */ 796 */
@@ -954,6 +956,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
954void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 956void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
955 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 957 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
956 958
959extern struct proc_dir_entry *ext4_proc_root;
960
961#ifdef CONFIG_PROC_FS
962extern const struct file_operations ext4_ui_proc_fops;
963
964#define EXT4_PROC_HANDLER(name, var) \
965do { \
966 proc = proc_create_data(name, mode, sbi->s_proc, \
967 &ext4_ui_proc_fops, &sbi->s_##var); \
968 if (proc == NULL) { \
969 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
970 goto err_out; \
971 } \
972} while (0)
973#else
974#define EXT4_PROC_HANDLER(name, var)
975#endif
976
957/* 977/*
958 * Function prototypes 978 * Function prototypes
959 */ 979 */
@@ -981,23 +1001,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1001extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal, 1002 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp); 1003 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, 1004extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 1005extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks); 1006 s64 nblocks);
988extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
989 ext4_fsblk_t block, unsigned long count, int metadata); 1008 ext4_fsblk_t block, unsigned long count, int metadata);
990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
991 ext4_fsblk_t block, unsigned long count, 1010 ext4_fsblk_t block, unsigned long count,
992 unsigned long *pdquot_freed_blocks); 1011 unsigned long *pdquot_freed_blocks);
993extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); 1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
994extern void ext4_check_blocks_bitmap (struct super_block *); 1013extern void ext4_check_blocks_bitmap(struct super_block *);
995extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
996 ext4_group_t block_group, 1015 ext4_group_t block_group,
997 struct buffer_head ** bh); 1016 struct buffer_head ** bh);
998extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1017extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
999extern void ext4_init_block_alloc_info(struct inode *);
1000extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
1001 1018
1002/* dir.c */ 1019/* dir.c */
1003extern int ext4_check_dir_entry(const char *, struct inode *, 1020extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1026,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1009extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1026extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1010 1027
1011/* fsync.c */ 1028/* fsync.c */
1012extern int ext4_sync_file (struct file *, struct dentry *, int); 1029extern int ext4_sync_file(struct file *, struct dentry *, int);
1013 1030
1014/* hash.c */ 1031/* hash.c */
1015extern int ext4fs_dirhash(const char *name, int len, struct 1032extern int ext4fs_dirhash(const char *name, int len, struct
1016 dx_hash_info *hinfo); 1033 dx_hash_info *hinfo);
1017 1034
1018/* ialloc.c */ 1035/* ialloc.c */
1019extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); 1036extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
1020extern void ext4_free_inode (handle_t *, struct inode *); 1037extern void ext4_free_inode(handle_t *, struct inode *);
1021extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); 1038extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1022extern unsigned long ext4_count_free_inodes (struct super_block *); 1039extern unsigned long ext4_count_free_inodes(struct super_block *);
1023extern unsigned long ext4_count_dirs (struct super_block *); 1040extern unsigned long ext4_count_dirs(struct super_block *);
1024extern void ext4_check_inodes_bitmap (struct super_block *); 1041extern void ext4_check_inodes_bitmap(struct super_block *);
1025extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1026 1043
1027/* mballoc.c */ 1044/* mballoc.c */
1028extern long ext4_mb_stats; 1045extern long ext4_mb_stats;
@@ -1032,7 +1049,7 @@ extern int ext4_mb_release(struct super_block *);
1032extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1049extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1033 struct ext4_allocation_request *, int *); 1050 struct ext4_allocation_request *, int *);
1034extern int ext4_mb_reserve_blocks(struct super_block *, int); 1051extern int ext4_mb_reserve_blocks(struct super_block *, int);
1035extern void ext4_mb_discard_inode_preallocations(struct inode *); 1052extern void ext4_discard_preallocations(struct inode *);
1036extern int __init init_ext4_mballoc(void); 1053extern int __init init_ext4_mballoc(void);
1037extern void exit_ext4_mballoc(void); 1054extern void exit_ext4_mballoc(void);
1038extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1055extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,24 +1067,25 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1050 ext4_lblk_t, int, int *); 1067 ext4_lblk_t, int, int *);
1051struct buffer_head *ext4_bread(handle_t *, struct inode *, 1068struct buffer_head *ext4_bread(handle_t *, struct inode *,
1052 ext4_lblk_t, int, int *); 1069 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create);
1053int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1054 ext4_lblk_t iblock, unsigned long maxblocks, 1073 ext4_lblk_t iblock, unsigned long maxblocks,
1055 struct buffer_head *bh_result, 1074 struct buffer_head *bh_result,
1056 int create, int extend_disksize); 1075 int create, int extend_disksize);
1057 1076
1058extern struct inode *ext4_iget(struct super_block *, unsigned long); 1077extern struct inode *ext4_iget(struct super_block *, unsigned long);
1059extern int ext4_write_inode (struct inode *, int); 1078extern int ext4_write_inode(struct inode *, int);
1060extern int ext4_setattr (struct dentry *, struct iattr *); 1079extern int ext4_setattr(struct dentry *, struct iattr *);
1061extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1080extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1062 struct kstat *stat); 1081 struct kstat *stat);
1063extern void ext4_delete_inode (struct inode *); 1082extern void ext4_delete_inode(struct inode *);
1064extern int ext4_sync_inode (handle_t *, struct inode *); 1083extern int ext4_sync_inode(handle_t *, struct inode *);
1065extern void ext4_discard_reservation (struct inode *);
1066extern void ext4_dirty_inode(struct inode *); 1084extern void ext4_dirty_inode(struct inode *);
1067extern int ext4_change_inode_journal_flag(struct inode *, int); 1085extern int ext4_change_inode_journal_flag(struct inode *, int);
1068extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1086extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1069extern int ext4_can_truncate(struct inode *inode); 1087extern int ext4_can_truncate(struct inode *inode);
1070extern void ext4_truncate (struct inode *); 1088extern void ext4_truncate(struct inode *);
1071extern void ext4_set_inode_flags(struct inode *); 1089extern void ext4_set_inode_flags(struct inode *);
1072extern void ext4_get_inode_flags(struct ext4_inode_info *); 1090extern void ext4_get_inode_flags(struct ext4_inode_info *);
1073extern void ext4_set_aops(struct inode *inode); 1091extern void ext4_set_aops(struct inode *inode);
@@ -1080,11 +1098,10 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1080 1098
1081/* ioctl.c */ 1099/* ioctl.c */
1082extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1100extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1083extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); 1101extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1084 1102
1085/* migrate.c */ 1103/* migrate.c */
1086extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, 1104extern int ext4_ext_migrate(struct inode *);
1087 unsigned long);
1088/* namei.c */ 1105/* namei.c */
1089extern int ext4_orphan_add(handle_t *, struct inode *); 1106extern int ext4_orphan_add(handle_t *, struct inode *);
1090extern int ext4_orphan_del(handle_t *, struct inode *); 1107extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1099,14 +1116,14 @@ extern int ext4_group_extend(struct super_block *sb,
1099 ext4_fsblk_t n_blocks_count); 1116 ext4_fsblk_t n_blocks_count);
1100 1117
1101/* super.c */ 1118/* super.c */
1102extern void ext4_error (struct super_block *, const char *, const char *, ...) 1119extern void ext4_error(struct super_block *, const char *, const char *, ...)
1103 __attribute__ ((format (printf, 3, 4))); 1120 __attribute__ ((format (printf, 3, 4)));
1104extern void __ext4_std_error (struct super_block *, const char *, int); 1121extern void __ext4_std_error(struct super_block *, const char *, int);
1105extern void ext4_abort (struct super_block *, const char *, const char *, ...) 1122extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1106 __attribute__ ((format (printf, 3, 4))); 1123 __attribute__ ((format (printf, 3, 4)));
1107extern void ext4_warning (struct super_block *, const char *, const char *, ...) 1124extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1108 __attribute__ ((format (printf, 3, 4))); 1125 __attribute__ ((format (printf, 3, 4)));
1109extern void ext4_update_dynamic_rev (struct super_block *sb); 1126extern void ext4_update_dynamic_rev(struct super_block *sb);
1110extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1111 __u32 compat); 1128 __u32 compat);
1112extern int ext4_update_rocompat_feature(handle_t *handle, 1129extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1179,7 +1196,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1179 1196
1180static inline 1197static inline
1181struct ext4_group_info *ext4_get_group_info(struct super_block *sb, 1198struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1182 ext4_group_t group) 1199 ext4_group_t group)
1183{ 1200{
1184 struct ext4_group_info ***grp_info; 1201 struct ext4_group_info ***grp_info;
1185 long indexv, indexh; 1202 long indexv, indexh;
@@ -1207,6 +1224,28 @@ do { \
1207 __ext4_std_error((sb), __func__, (errno)); \ 1224 __ext4_std_error((sb), __func__, (errno)); \
1208} while (0) 1225} while (0)
1209 1226
1227#ifdef CONFIG_SMP
1228/* Each CPU can accumulate FBC_BATCH blocks in their local
1229 * counters. So we need to make sure we have free blocks more
1230 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
1231 */
1232#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
1233#else
1234#define EXT4_FREEBLOCKS_WATERMARK 0
1235#endif
1236
1237static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1238{
1239 /*
1240 * XXX: replace with spinlock if seen contended -bzzz
1241 */
1242 down_write(&EXT4_I(inode)->i_data_sem);
1243 if (newsize > EXT4_I(inode)->i_disksize)
1244 EXT4_I(inode)->i_disksize = newsize;
1245 up_write(&EXT4_I(inode)->i_data_sem);
1246 return ;
1247}
1248
1210/* 1249/*
1211 * Inodes and files operations 1250 * Inodes and files operations
1212 */ 1251 */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index d33dc56d698..bec7ce59fc0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
124#define EXT4_EXT_CACHE_GAP 1 124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2 125#define EXT4_EXT_CACHE_EXTENT 2
126 126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 struct ext4_extent *, void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
127 140
128#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
129 142
@@ -224,6 +237,8 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
224 struct ext4_extent *); 237 struct ext4_extent *);
225extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 238extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
226extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 239extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
240extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
241 ext_prepare_callback, void *);
227extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 242extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
228 struct ext4_ext_path *); 243 struct ext4_ext_path *);
229extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, 244extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e..5c124c0ac6d 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned long ext4_group_t;
35 35
36struct ext4_reserve_window {
37 ext4_fsblk_t _rsv_start; /* First byte reserved */
38 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
39};
40
41struct ext4_reserve_window_node {
42 struct rb_node rsv_node;
43 __u32 rsv_goal_size;
44 __u32 rsv_alloc_hit;
45 struct ext4_reserve_window rsv_window;
46};
47
48struct ext4_block_alloc_info {
49 /* information about reservation window */
50 struct ext4_reserve_window_node rsv_window_node;
51 /*
52 * was i_next_alloc_block in ext4_inode_info
53 * is the logical (file-relative) number of the
54 * most-recently-allocated block in this file.
55 * We use this for detecting linearly ascending allocation requests.
56 */
57 ext4_lblk_t last_alloc_logical_block;
58 /*
59 * Was i_next_alloc_goal in ext4_inode_info
60 * is the *physical* companion to i_next_alloc_block.
61 * it the physical block number of the block which was most-recentl
62 * allocated to this file. This give us the goal (target) for the next
63 * allocation when we detect linearly ascending requests.
64 */
65 ext4_fsblk_t last_alloc_physical_block;
66};
67
68#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
69#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
70 38
@@ -97,11 +65,8 @@ struct ext4_inode_info {
97 ext4_group_t i_block_group; 65 ext4_group_t i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */ 66 __u32 i_state; /* Dynamic state flags for ext4 */
99 67
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 ext4_lblk_t i_dir_start_lookup; 68 ext4_lblk_t i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR 69#ifdef CONFIG_EXT4_FS_XATTR
105 /* 70 /*
106 * Extended attributes can be read independently of the main file 71 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention 72 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
111 */ 76 */
112 struct rw_semaphore xattr_sem; 77 struct rw_semaphore xattr_sem;
113#endif 78#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 79#ifdef CONFIG_EXT4_FS_POSIX_ACL
115 struct posix_acl *i_acl; 80 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl; 81 struct posix_acl *i_default_acl;
117#endif 82#endif
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d553..6a0b40d4326 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
40 unsigned long s_blocks_last; /* Last seen block count */ 40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */ 42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ 43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc; 44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt; 45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block; 46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid; 47 uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
59 struct percpu_counter s_freeblocks_counter; 60 struct percpu_counter s_freeblocks_counter;
60 struct percpu_counter s_freeinodes_counter; 61 struct percpu_counter s_freeinodes_counter;
61 struct percpu_counter s_dirs_counter; 62 struct percpu_counter s_dirs_counter;
63 struct percpu_counter s_dirtyblocks_counter;
62 struct blockgroup_lock s_blockgroup_lock; 64 struct blockgroup_lock s_blockgroup_lock;
65 struct proc_dir_entry *s_proc;
63 66
64 /* root of the per fs reservation window tree */ 67 /* root of the per fs reservation window tree */
65 spinlock_t s_rsv_window_lock; 68 spinlock_t s_rsv_window_lock;
66 struct rb_root s_rsv_window_root; 69 struct rb_root s_rsv_window_root;
67 struct ext4_reserve_window_node s_rsv_window_head;
68 70
69 /* Journaling */ 71 /* Journaling */
70 struct inode * s_journal_inode; 72 struct inode *s_journal_inode;
71 struct journal_s * s_journal; 73 struct journal_s *s_journal;
72 struct list_head s_orphan; 74 struct list_head s_orphan;
73 unsigned long s_commit_interval; 75 unsigned long s_commit_interval;
74 struct block_device *journal_bdev; 76 struct block_device *journal_bdev;
@@ -106,12 +108,12 @@ struct ext4_sb_info {
106 108
107 /* tunables */ 109 /* tunables */
108 unsigned long s_stripe; 110 unsigned long s_stripe;
109 unsigned long s_mb_stream_request; 111 unsigned int s_mb_stream_request;
110 unsigned long s_mb_max_to_scan; 112 unsigned int s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan; 113 unsigned int s_mb_min_to_scan;
112 unsigned long s_mb_stats; 114 unsigned int s_mb_stats;
113 unsigned long s_mb_order2_reqs; 115 unsigned int s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc; 116 unsigned int s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */ 117 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group; 118 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start; 119 unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@ struct ext4_sb_info {
121 int s_mb_history_cur; 123 int s_mb_history_cur;
122 int s_mb_history_max; 124 int s_mb_history_max;
123 int s_mb_history_num; 125 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock; 126 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter; 127 int s_mb_history_filter;
127 128
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b24d3c53f20..ea2ce3c0ae6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/falloc.h> 41#include <linux/falloc.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "ext4_extents.h" 45#include "ext4_extents.h"
45 46
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
383 ext_debug("\n"); 384 ext_debug("\n");
384} 385}
385#else 386#else
386#define ext4_ext_show_path(inode,path) 387#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode,path) 388#define ext4_ext_show_leaf(inode, path)
388#endif 389#endif
389 390
390void ext4_ext_drop_refs(struct ext4_ext_path *path) 391void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
440 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 441 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441 if (k != 0 && 442 if (k != 0 &&
442 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 443 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443 printk("k=%d, ix=0x%p, first=0x%p\n", k, 444 printk(KERN_DEBUG "k=%d, ix=0x%p, "
444 ix, EXT_FIRST_INDEX(eh)); 445 "first=0x%p\n", k,
445 printk("%u <= %u\n", 446 ix, EXT_FIRST_INDEX(eh));
447 printk(KERN_DEBUG "%u <= %u\n",
446 le32_to_cpu(ix->ei_block), 448 le32_to_cpu(ix->ei_block),
447 le32_to_cpu(ix[-1].ei_block)); 449 le32_to_cpu(ix[-1].ei_block));
448 } 450 }
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1475 struct ext4_ext_path *path, 1477 struct ext4_ext_path *path,
1476 struct ext4_extent *newext) 1478 struct ext4_extent *newext)
1477{ 1479{
1478 struct ext4_extent_header * eh; 1480 struct ext4_extent_header *eh;
1479 struct ext4_extent *ex, *fex; 1481 struct ext4_extent *ex, *fex;
1480 struct ext4_extent *nearex; /* nearest extent */ 1482 struct ext4_extent *nearex; /* nearest extent */
1481 struct ext4_ext_path *npath = NULL; 1483 struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
1625 return err; 1627 return err;
1626} 1628}
1627 1629
1630int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1631 ext4_lblk_t num, ext_prepare_callback func,
1632 void *cbdata)
1633{
1634 struct ext4_ext_path *path = NULL;
1635 struct ext4_ext_cache cbex;
1636 struct ext4_extent *ex;
1637 ext4_lblk_t next, start = 0, end = 0;
1638 ext4_lblk_t last = block + num;
1639 int depth, exists, err = 0;
1640
1641 BUG_ON(func == NULL);
1642 BUG_ON(inode == NULL);
1643
1644 while (block < last && block != EXT_MAX_BLOCK) {
1645 num = last - block;
1646 /* find extent for this block */
1647 path = ext4_ext_find_extent(inode, block, path);
1648 if (IS_ERR(path)) {
1649 err = PTR_ERR(path);
1650 path = NULL;
1651 break;
1652 }
1653
1654 depth = ext_depth(inode);
1655 BUG_ON(path[depth].p_hdr == NULL);
1656 ex = path[depth].p_ext;
1657 next = ext4_ext_next_allocated_block(path);
1658
1659 exists = 0;
1660 if (!ex) {
1661 /* there is no extent yet, so try to allocate
1662 * all requested space */
1663 start = block;
1664 end = block + num;
1665 } else if (le32_to_cpu(ex->ee_block) > block) {
1666 /* need to allocate space before found extent */
1667 start = block;
1668 end = le32_to_cpu(ex->ee_block);
1669 if (block + num < end)
1670 end = block + num;
1671 } else if (block >= le32_to_cpu(ex->ee_block)
1672 + ext4_ext_get_actual_len(ex)) {
1673 /* need to allocate space after found extent */
1674 start = block;
1675 end = block + num;
1676 if (end >= next)
1677 end = next;
1678 } else if (block >= le32_to_cpu(ex->ee_block)) {
1679 /*
1680 * some part of requested space is covered
1681 * by found extent
1682 */
1683 start = block;
1684 end = le32_to_cpu(ex->ee_block)
1685 + ext4_ext_get_actual_len(ex);
1686 if (block + num < end)
1687 end = block + num;
1688 exists = 1;
1689 } else {
1690 BUG();
1691 }
1692 BUG_ON(end <= start);
1693
1694 if (!exists) {
1695 cbex.ec_block = start;
1696 cbex.ec_len = end - start;
1697 cbex.ec_start = 0;
1698 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1699 } else {
1700 cbex.ec_block = le32_to_cpu(ex->ee_block);
1701 cbex.ec_len = ext4_ext_get_actual_len(ex);
1702 cbex.ec_start = ext_pblock(ex);
1703 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1704 }
1705
1706 BUG_ON(cbex.ec_len == 0);
1707 err = func(inode, path, &cbex, ex, cbdata);
1708 ext4_ext_drop_refs(path);
1709
1710 if (err < 0)
1711 break;
1712
1713 if (err == EXT_REPEAT)
1714 continue;
1715 else if (err == EXT_BREAK) {
1716 err = 0;
1717 break;
1718 }
1719
1720 if (ext_depth(inode) != depth) {
1721 /* depth was changed. we have to realloc path */
1722 kfree(path);
1723 path = NULL;
1724 }
1725
1726 block = cbex.ec_block + cbex.ec_len;
1727 }
1728
1729 if (path) {
1730 ext4_ext_drop_refs(path);
1731 kfree(path);
1732 }
1733
1734 return err;
1735}
1736
1628static void 1737static void
1629ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1738ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1630 __u32 len, ext4_fsblk_t start, int type) 1739 __u32 len, ext4_fsblk_t start, int type)
@@ -2142,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
2142 */ 2251 */
2143 2252
2144 if (test_opt(sb, EXTENTS)) { 2253 if (test_opt(sb, EXTENTS)) {
2145 printk("EXT4-fs: file extents enabled"); 2254 printk(KERN_INFO "EXT4-fs: file extents enabled");
2146#ifdef AGGRESSIVE_TEST 2255#ifdef AGGRESSIVE_TEST
2147 printk(", aggressive tests"); 2256 printk(", aggressive tests");
2148#endif 2257#endif
@@ -2696,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2696 goto out2; 2805 goto out2;
2697 } 2806 }
2698 /* 2807 /*
2699 * Okay, we need to do block allocation. Lazily initialize the block 2808 * Okay, we need to do block allocation.
2700 * allocation info here if necessary.
2701 */ 2809 */
2702 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2703 ext4_init_block_alloc_info(inode);
2704 2810
2705 /* find neighbour allocated blocks */ 2811 /* find neighbour allocated blocks */
2706 ar.lleft = iblock; 2812 ar.lleft = iblock;
@@ -2760,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2760 /* free data blocks we just allocated */ 2866 /* free data blocks we just allocated */
2761 /* not a good idea to call discard here directly, 2867 /* not a good idea to call discard here directly,
2762 * but otherwise we'd need to call it every free() */ 2868 * but otherwise we'd need to call it every free() */
2763 ext4_mb_discard_inode_preallocations(inode); 2869 ext4_discard_preallocations(inode);
2764 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2870 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2765 ext4_ext_get_actual_len(&newex), 0); 2871 ext4_ext_get_actual_len(&newex), 0);
2766 goto out2; 2872 goto out2;
@@ -2824,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
2824 down_write(&EXT4_I(inode)->i_data_sem); 2930 down_write(&EXT4_I(inode)->i_data_sem);
2825 ext4_ext_invalidate_cache(inode); 2931 ext4_ext_invalidate_cache(inode);
2826 2932
2827 ext4_discard_reservation(inode); 2933 ext4_discard_preallocations(inode);
2828 2934
2829 /* 2935 /*
2830 * TODO: optimization is possible here. 2936 * TODO: optimization is possible here.
@@ -2877,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
2877 * Update only when preallocation was requested beyond 2983 * Update only when preallocation was requested beyond
2878 * the file size. 2984 * the file size.
2879 */ 2985 */
2880 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2986 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2881 new_size > i_size_read(inode)) { 2987 if (new_size > i_size_read(inode))
2882 i_size_write(inode, new_size); 2988 i_size_write(inode, new_size);
2883 EXT4_I(inode)->i_disksize = new_size; 2989 if (new_size > EXT4_I(inode)->i_disksize)
2990 ext4_update_i_disksize(inode, new_size);
2884 } 2991 }
2885 2992
2886} 2993}
@@ -2972,3 +3079,143 @@ retry:
2972 mutex_unlock(&inode->i_mutex); 3079 mutex_unlock(&inode->i_mutex);
2973 return ret > 0 ? ret2 : ret; 3080 return ret > 0 ? ret2 : ret;
2974} 3081}
3082
3083/*
3084 * Callback function called for each extent to gather FIEMAP information.
3085 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data)
3089{
3090 struct fiemap_extent_info *fieinfo = data;
3091 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3092 __u64 logical;
3093 __u64 physical;
3094 __u64 length;
3095 __u32 flags = 0;
3096 int error;
3097
3098 logical = (__u64)newex->ec_block << blksize_bits;
3099
3100 if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3101 pgoff_t offset;
3102 struct page *page;
3103 struct buffer_head *bh = NULL;
3104
3105 offset = logical >> PAGE_SHIFT;
3106 page = find_get_page(inode->i_mapping, offset);
3107 if (!page || !page_has_buffers(page))
3108 return EXT_CONTINUE;
3109
3110 bh = page_buffers(page);
3111
3112 if (!bh)
3113 return EXT_CONTINUE;
3114
3115 if (buffer_delay(bh)) {
3116 flags |= FIEMAP_EXTENT_DELALLOC;
3117 page_cache_release(page);
3118 } else {
3119 page_cache_release(page);
3120 return EXT_CONTINUE;
3121 }
3122 }
3123
3124 physical = (__u64)newex->ec_start << blksize_bits;
3125 length = (__u64)newex->ec_len << blksize_bits;
3126
3127 if (ex && ext4_ext_is_uninitialized(ex))
3128 flags |= FIEMAP_EXTENT_UNWRITTEN;
3129
3130 /*
3131 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3132 *
3133 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3134 * this also indicates no more allocated blocks.
3135 *
3136 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3137 */
3138 if (logical + length - 1 == EXT_MAX_BLOCK ||
3139 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3140 flags |= FIEMAP_EXTENT_LAST;
3141
3142 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3143 length, flags);
3144 if (error < 0)
3145 return error;
3146 if (error == 1)
3147 return EXT_BREAK;
3148
3149 return EXT_CONTINUE;
3150}
3151
3152/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
3156{
3157 __u64 physical = 0;
3158 __u64 length;
3159 __u32 flags = FIEMAP_EXTENT_LAST;
3160 int blockbits = inode->i_sb->s_blocksize_bits;
3161 int error = 0;
3162
3163 /* in-inode? */
3164 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3165 struct ext4_iloc iloc;
3166 int offset; /* offset of xattr in inode */
3167
3168 error = ext4_get_inode_loc(inode, &iloc);
3169 if (error)
3170 return error;
3171 physical = iloc.bh->b_blocknr << blockbits;
3172 offset = EXT4_GOOD_OLD_INODE_SIZE +
3173 EXT4_I(inode)->i_extra_isize;
3174 physical += offset;
3175 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3176 flags |= FIEMAP_EXTENT_DATA_INLINE;
3177 } else { /* external block */
3178 physical = EXT4_I(inode)->i_file_acl << blockbits;
3179 length = inode->i_sb->s_blocksize;
3180 }
3181
3182 if (physical)
3183 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3184 length, flags);
3185 return (error < 0 ? error : 0);
3186}
3187
3188int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3189 __u64 start, __u64 len)
3190{
3191 ext4_lblk_t start_blk;
3192 ext4_lblk_t len_blks;
3193 int error = 0;
3194
3195 /* fallback to generic here if not in extents fmt */
3196 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3197 return generic_block_fiemap(inode, fieinfo, start, len,
3198 ext4_get_block);
3199
3200 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3201 return -EBADR;
3202
3203 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3204 error = ext4_xattr_fiemap(inode, fieinfo);
3205 } else {
3206 start_blk = start >> inode->i_sb->s_blocksize_bits;
3207 len_blks = len >> inode->i_sb->s_blocksize_bits;
3208
3209 /*
3210 * Walk the extent tree gathering extent information.
3211 * ext4_ext_fiemap_cb will push extents back to user.
3212 */
3213 down_write(&EXT4_I(inode)->i_data_sem);
3214 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3215 ext4_ext_fiemap_cb, fieinfo);
3216 up_write(&EXT4_I(inode)->i_data_sem);
3217 }
3218
3219 return error;
3220}
3221
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db..6bd11fba71f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
31 * from ext4_file_open: open gets called at every open, but release 31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed. 32 * gets called only when /all/ the files are closed.
33 */ 33 */
34static int ext4_release_file (struct inode * inode, struct file * filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 /* if we are the last writer on the inode, drop the block reservation */ 36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 down_write(&EXT4_I(inode)->i_data_sem); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_preallocations(inode);
42 up_write(&EXT4_I(inode)->i_data_sem); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
143const struct file_operations ext4_file_operations = { 146const struct file_operations ext4_file_operations = {
144 .llseek = generic_file_llseek, 147 .llseek = generic_file_llseek,
145 .read = do_sync_read, 148 .read = do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
162 .truncate = ext4_truncate, 165 .truncate = ext4_truncate,
163 .setattr = ext4_setattr, 166 .setattr = ext4_setattr,
164 .getattr = ext4_getattr, 167 .getattr = ext4_getattr,
165#ifdef CONFIG_EXT4DEV_FS_XATTR 168#ifdef CONFIG_EXT4_FS_XATTR
166 .setxattr = generic_setxattr, 169 .setxattr = generic_setxattr,
167 .getxattr = generic_getxattr, 170 .getxattr = generic_getxattr,
168 .listxattr = ext4_listxattr, 171 .listxattr = ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
170#endif 173#endif
171 .permission = ext4_permission, 174 .permission = ext4_permission,
172 .fallocate = ext4_fallocate, 175 .fallocate = ext4_fallocate,
176 .fiemap = ext4_fiemap,
173}; 177};
174 178
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad3..5afe4370840 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33 34
@@ -43,7 +44,7 @@
43 * inode to disk. 44 * inode to disk.
44 */ 45 */
45 46
46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 47int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
47{ 48{
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
51 52
52 J_ASSERT(ext4_journal_current_handle() == NULL); 53 J_ASSERT(ext4_journal_current_handle() == NULL);
53 54
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58
54 /* 59 /*
55 * data=writeback: 60 * data=writeback:
56 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe39..556ca8eba3d 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
27 sum += DELTA; 27 sum += DELTA;
28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
30 } while(--n); 30 } while (--n);
31 31
32 buf[0] += b0; 32 buf[0] += b0;
33 buf[1] += b1; 33 buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 while (len--) { 41 while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
59 val = pad; 59 val = pad;
60 if (len > num*4) 60 if (len > num*4)
61 len = num * 4; 61 len = num * 4;
62 for (i=0; i < len; i++) { 62 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 63 if ((i % 4) == 0)
64 val = pad; 64 val = pad;
65 val = msg[i] + (val << 8); 65 val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
104 104
105 /* Check to see if the seed is all zero's */ 105 /* Check to see if the seed is all zero's */
106 if (hinfo->seed) { 106 if (hinfo->seed) {
107 for (i=0; i < 4; i++) { 107 for (i = 0; i < 4; i++) {
108 if (hinfo->seed[i]) 108 if (hinfo->seed[i])
109 break; 109 break;
110 } 110 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f344834bbf5..fe34d74cfb1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (bh_uptodate_or_lock(bh)) 118 if (buffer_uptodate(bh) &&
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
119 return bh; 120 return bh;
120 121
122 lock_buffer(bh);
121 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
122 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
123 ext4_init_inode_bitmap(sb, bh, block_group, desc); 125 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 * though), and then we'd have two inodes sharing the 156 * though), and then we'd have two inodes sharing the
155 * same inode number and space on the harddisk. 157 * same inode number and space on the harddisk.
156 */ 158 */
157void ext4_free_inode (handle_t *handle, struct inode * inode) 159void ext4_free_inode(handle_t *handle, struct inode *inode)
158{ 160{
159 struct super_block * sb = inode->i_sb; 161 struct super_block *sb = inode->i_sb;
160 int is_directory; 162 int is_directory;
161 unsigned long ino; 163 unsigned long ino;
162 struct buffer_head *bitmap_bh = NULL; 164 struct buffer_head *bitmap_bh = NULL;
163 struct buffer_head *bh2; 165 struct buffer_head *bh2;
164 ext4_group_t block_group; 166 ext4_group_t block_group;
165 unsigned long bit; 167 unsigned long bit;
166 struct ext4_group_desc * gdp; 168 struct ext4_group_desc *gdp;
167 struct ext4_super_block * es; 169 struct ext4_super_block *es;
168 struct ext4_sb_info *sbi; 170 struct ext4_sb_info *sbi;
169 int fatal = 0, err; 171 int fatal = 0, err;
170 ext4_group_t flex_group; 172 ext4_group_t flex_group;
171 173
172 if (atomic_read(&inode->i_count) > 1) { 174 if (atomic_read(&inode->i_count) > 1) {
173 printk ("ext4_free_inode: inode has count=%d\n", 175 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
174 atomic_read(&inode->i_count)); 176 atomic_read(&inode->i_count));
175 return; 177 return;
176 } 178 }
177 if (inode->i_nlink) { 179 if (inode->i_nlink) {
178 printk ("ext4_free_inode: inode has nlink=%d\n", 180 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
179 inode->i_nlink); 181 inode->i_nlink);
180 return; 182 return;
181 } 183 }
182 if (!sb) { 184 if (!sb) {
183 printk("ext4_free_inode: inode on nonexistent device\n"); 185 printk(KERN_ERR "ext4_free_inode: inode on "
186 "nonexistent device\n");
184 return; 187 return;
185 } 188 }
186 sbi = EXT4_SB(sb); 189 sbi = EXT4_SB(sb);
187 190
188 ino = inode->i_ino; 191 ino = inode->i_ino;
189 ext4_debug ("freeing inode %lu\n", ino); 192 ext4_debug("freeing inode %lu\n", ino);
190 193
191 /* 194 /*
192 * Note: we must free any quota before locking the superblock, 195 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
200 is_directory = S_ISDIR(inode->i_mode); 203 is_directory = S_ISDIR(inode->i_mode);
201 204
202 /* Do this BEFORE marking the inode not in use or returning an error */ 205 /* Do this BEFORE marking the inode not in use or returning an error */
203 clear_inode (inode); 206 clear_inode(inode);
204 207
205 es = EXT4_SB(sb)->s_es; 208 es = EXT4_SB(sb)->s_es;
206 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 209 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
207 ext4_error (sb, "ext4_free_inode", 210 ext4_error(sb, "ext4_free_inode",
208 "reserved or nonexistent inode %lu", ino); 211 "reserved or nonexistent inode %lu", ino);
209 goto error_return; 212 goto error_return;
210 } 213 }
211 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 214 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
222 /* Ok, now we can actually update the inode bitmaps.. */ 225 /* Ok, now we can actually update the inode bitmaps.. */
223 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 226 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
224 bit, bitmap_bh->b_data)) 227 bit, bitmap_bh->b_data))
225 ext4_error (sb, "ext4_free_inode", 228 ext4_error(sb, "ext4_free_inode",
226 "bit already cleared for inode %lu", ino); 229 "bit already cleared for inode %lu", ino);
227 else { 230 else {
228 gdp = ext4_get_group_desc (sb, block_group, &bh2); 231 gdp = ext4_get_group_desc(sb, block_group, &bh2);
229 232
230 BUFFER_TRACE(bh2, "get_write_access"); 233 BUFFER_TRACE(bh2, "get_write_access");
231 fatal = ext4_journal_get_write_access(handle, bh2); 234 fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
287 avefreei = freei / ngroups; 290 avefreei = freei / ngroups;
288 291
289 for (group = 0; group < ngroups; group++) { 292 for (group = 0; group < ngroups; group++) {
290 desc = ext4_get_group_desc (sb, group, NULL); 293 desc = ext4_get_group_desc(sb, group, NULL);
291 if (!desc || !desc->bg_free_inodes_count) 294 if (!desc || !desc->bg_free_inodes_count)
292 continue; 295 continue;
293 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
576 * For other inodes, search forward from the parent directory's block 579 * For other inodes, search forward from the parent directory's block
577 * group to find a free inode. 580 * group to find a free inode.
578 */ 581 */
579struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) 582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
580{ 583{
581 struct super_block *sb; 584 struct super_block *sb;
582 struct buffer_head *bitmap_bh = NULL; 585 struct buffer_head *bitmap_bh = NULL;
583 struct buffer_head *bh2; 586 struct buffer_head *bh2;
584 ext4_group_t group = 0; 587 ext4_group_t group = 0;
585 unsigned long ino = 0; 588 unsigned long ino = 0;
586 struct inode * inode; 589 struct inode *inode;
587 struct ext4_group_desc * gdp = NULL; 590 struct ext4_group_desc *gdp = NULL;
588 struct ext4_super_block * es; 591 struct ext4_super_block *es;
589 struct ext4_inode_info *ei; 592 struct ext4_inode_info *ei;
590 struct ext4_sb_info *sbi; 593 struct ext4_sb_info *sbi;
591 int ret2, err = 0; 594 int ret2, err = 0;
@@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
613 } 616 }
614 617
615 if (S_ISDIR(mode)) { 618 if (S_ISDIR(mode)) {
616 if (test_opt (sb, OLDALLOC)) 619 if (test_opt(sb, OLDALLOC))
617 ret2 = find_group_dir(sb, dir, &group); 620 ret2 = find_group_dir(sb, dir, &group);
618 else 621 else
619 ret2 = find_group_orlov(sb, dir, &group); 622 ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@ got:
783 } 786 }
784 787
785 inode->i_uid = current->fsuid; 788 inode->i_uid = current->fsuid;
786 if (test_opt (sb, GRPID)) 789 if (test_opt(sb, GRPID))
787 inode->i_gid = dir->i_gid; 790 inode->i_gid = dir->i_gid;
788 else if (dir->i_mode & S_ISGID) { 791 else if (dir->i_mode & S_ISGID) {
789 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@ got:
816 ei->i_flags &= ~EXT4_DIRSYNC_FL; 819 ei->i_flags &= ~EXT4_DIRSYNC_FL;
817 ei->i_file_acl = 0; 820 ei->i_file_acl = 0;
818 ei->i_dtime = 0; 821 ei->i_dtime = 0;
819 ei->i_block_alloc_info = NULL;
820 ei->i_block_group = group; 822 ei->i_block_group = group;
821 823
822 ext4_set_inode_flags(inode); 824 ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@ got:
832 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 834 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
833 835
834 ret = inode; 836 ret = inode;
835 if(DQUOT_ALLOC_INODE(inode)) { 837 if (DQUOT_ALLOC_INODE(inode)) {
836 err = -EDQUOT; 838 err = -EDQUOT;
837 goto fail_drop; 839 goto fail_drop;
838 } 840 }
@@ -841,7 +843,7 @@ got:
841 if (err) 843 if (err)
842 goto fail_free_drop; 844 goto fail_free_drop;
843 845
844 err = ext4_init_security(handle,inode, dir); 846 err = ext4_init_security(handle, inode, dir);
845 if (err) 847 if (err)
846 goto fail_free_drop; 848 goto fail_free_drop;
847 849
@@ -959,7 +961,7 @@ error:
959 return ERR_PTR(err); 961 return ERR_PTR(err);
960} 962}
961 963
962unsigned long ext4_count_free_inodes (struct super_block * sb) 964unsigned long ext4_count_free_inodes(struct super_block *sb)
963{ 965{
964 unsigned long desc_count; 966 unsigned long desc_count;
965 struct ext4_group_desc *gdp; 967 struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
974 bitmap_count = 0; 976 bitmap_count = 0;
975 gdp = NULL; 977 gdp = NULL;
976 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 978 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
977 gdp = ext4_get_group_desc (sb, i, NULL); 979 gdp = ext4_get_group_desc(sb, i, NULL);
978 if (!gdp) 980 if (!gdp)
979 continue; 981 continue;
980 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 982 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
989 bitmap_count += x; 991 bitmap_count += x;
990 } 992 }
991 brelse(bitmap_bh); 993 brelse(bitmap_bh);
992 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", 994 printk(KERN_DEBUG "ext4_count_free_inodes: "
993 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); 995 "stored = %u, computed = %lu, %lu\n",
996 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
994 return desc_count; 997 return desc_count;
995#else 998#else
996 desc_count = 0; 999 desc_count = 0;
997 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1000 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
998 gdp = ext4_get_group_desc (sb, i, NULL); 1001 gdp = ext4_get_group_desc(sb, i, NULL);
999 if (!gdp) 1002 if (!gdp)
1000 continue; 1003 continue;
1001 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1004 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
1006} 1009}
1007 1010
1008/* Called at mount-time, super-block is locked */ 1011/* Called at mount-time, super-block is locked */
1009unsigned long ext4_count_dirs (struct super_block * sb) 1012unsigned long ext4_count_dirs(struct super_block * sb)
1010{ 1013{
1011 unsigned long count = 0; 1014 unsigned long count = 0;
1012 ext4_group_t i; 1015 ext4_group_t i;
1013 1016
1014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1017 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
1015 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 1018 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1016 if (!gdp) 1019 if (!gdp)
1017 continue; 1020 continue;
1018 count += le16_to_cpu(gdp->bg_used_dirs_count); 1021 count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7e91913e325..9b4ec9decfd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -190,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
190/* 190/*
191 * Called at the last iput() if i_nlink is zero. 191 * Called at the last iput() if i_nlink is zero.
192 */ 192 */
193void ext4_delete_inode (struct inode * inode) 193void ext4_delete_inode(struct inode *inode)
194{ 194{
195 handle_t *handle; 195 handle_t *handle;
196 int err; 196 int err;
@@ -330,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
330 int final = 0; 330 int final = 0;
331 331
332 if (i_block < 0) { 332 if (i_block < 0) {
333 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 333 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
334 } else if (i_block < direct_blocks) { 334 } else if (i_block < direct_blocks) {
335 offsets[n++] = i_block; 335 offsets[n++] = i_block;
336 final = direct_blocks; 336 final = direct_blocks;
337 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 337 } else if ((i_block -= direct_blocks) < indirect_blocks) {
338 offsets[n++] = EXT4_IND_BLOCK; 338 offsets[n++] = EXT4_IND_BLOCK;
339 offsets[n++] = i_block; 339 offsets[n++] = i_block;
340 final = ptrs; 340 final = ptrs;
@@ -400,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
400 400
401 *err = 0; 401 *err = 0;
402 /* i_data is not going away, no lock needed */ 402 /* i_data is not going away, no lock needed */
403 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 403 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
404 if (!p->key) 404 if (!p->key)
405 goto no_block; 405 goto no_block;
406 while (--depth) { 406 while (--depth) {
407 bh = sb_bread(sb, le32_to_cpu(p->key)); 407 bh = sb_bread(sb, le32_to_cpu(p->key));
408 if (!bh) 408 if (!bh)
409 goto failure; 409 goto failure;
410 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 410 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
411 /* Reader: end */ 411 /* Reader: end */
412 if (!p->key) 412 if (!p->key)
413 goto no_block; 413 goto no_block;
@@ -443,7 +443,7 @@ no_block:
443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
444{ 444{
445 struct ext4_inode_info *ei = EXT4_I(inode); 445 struct ext4_inode_info *ei = EXT4_I(inode);
446 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 446 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
447 __le32 *p; 447 __le32 *p;
448 ext4_fsblk_t bg_start; 448 ext4_fsblk_t bg_start;
449 ext4_fsblk_t last_block; 449 ext4_fsblk_t last_block;
@@ -486,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
487 Indirect *partial) 487 Indirect *partial)
488{ 488{
489 struct ext4_block_alloc_info *block_i;
490
491 block_i = EXT4_I(inode)->i_block_alloc_info;
492
493 /* 489 /*
494 * try the heuristic for sequential allocation, 490 * XXX need to get goal block from mballoc's data structures
495 * failing that at least try to get decent locality.
496 */ 491 */
497 if (block_i && (block == block_i->last_alloc_logical_block + 1)
498 && (block_i->last_alloc_physical_block != 0)) {
499 return block_i->last_alloc_physical_block + 1;
500 }
501 492
502 return ext4_find_near(inode, partial); 493 return ext4_find_near(inode, partial);
503} 494}
@@ -630,7 +621,7 @@ allocated:
630 *err = 0; 621 *err = 0;
631 return ret; 622 return ret;
632failed_out: 623failed_out:
633 for (i = 0; i <index; i++) 624 for (i = 0; i < index; i++)
634 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 625 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
635 return ret; 626 return ret;
636} 627}
@@ -703,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
703 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 694 branch[n].p = (__le32 *) bh->b_data + offsets[n];
704 branch[n].key = cpu_to_le32(new_blocks[n]); 695 branch[n].key = cpu_to_le32(new_blocks[n]);
705 *branch[n].p = branch[n].key; 696 *branch[n].p = branch[n].key;
706 if ( n == indirect_blks) { 697 if (n == indirect_blks) {
707 current_block = new_blocks[n]; 698 current_block = new_blocks[n];
708 /* 699 /*
709 * End of chain, update the last new metablock of 700 * End of chain, update the last new metablock of
@@ -730,7 +721,7 @@ failed:
730 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 721 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
731 ext4_journal_forget(handle, branch[i].bh); 722 ext4_journal_forget(handle, branch[i].bh);
732 } 723 }
733 for (i = 0; i <indirect_blks; i++) 724 for (i = 0; i < indirect_blks; i++)
734 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 725 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
735 726
736 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 727 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -757,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
757{ 748{
758 int i; 749 int i;
759 int err = 0; 750 int err = 0;
760 struct ext4_block_alloc_info *block_i;
761 ext4_fsblk_t current_block; 751 ext4_fsblk_t current_block;
762 752
763 block_i = EXT4_I(inode)->i_block_alloc_info;
764 /* 753 /*
765 * If we're splicing into a [td]indirect block (as opposed to the 754 * If we're splicing into a [td]indirect block (as opposed to the
766 * inode) then we need to get write access to the [td]indirect block 755 * inode) then we need to get write access to the [td]indirect block
@@ -783,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
783 if (num == 0 && blks > 1) { 772 if (num == 0 && blks > 1) {
784 current_block = le32_to_cpu(where->key) + 1; 773 current_block = le32_to_cpu(where->key) + 1;
785 for (i = 1; i < blks; i++) 774 for (i = 1; i < blks; i++)
786 *(where->p + i ) = cpu_to_le32(current_block++); 775 *(where->p + i) = cpu_to_le32(current_block++);
787 }
788
789 /*
790 * update the most recently allocated logical & physical block
791 * in i_block_alloc_info, to assist find the proper goal block for next
792 * allocation
793 */
794 if (block_i) {
795 block_i->last_alloc_logical_block = block + blks - 1;
796 block_i->last_alloc_physical_block =
797 le32_to_cpu(where[num].key) + blks - 1;
798 } 776 }
799 777
800 /* We are done with atomic stuff, now do the rest of housekeeping */ 778 /* We are done with atomic stuff, now do the rest of housekeeping */
@@ -914,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
914 goto cleanup; 892 goto cleanup;
915 893
916 /* 894 /*
917 * Okay, we need to do block allocation. Lazily initialize the block 895 * Okay, we need to do block allocation.
918 * allocation info here if necessary
919 */ 896 */
920 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
921 ext4_init_block_alloc_info(inode);
922
923 goal = ext4_find_goal(inode, iblock, partial); 897 goal = ext4_find_goal(inode, iblock, partial);
924 898
925 /* the number of blocks need to allocate for [d,t]indirect blocks */ 899 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1030,19 +1004,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1030 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1004 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1031 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1005 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1032 1006
1033 /* Account for allocated meta_blocks */ 1007 if (mdb_free) {
1034 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1008 /* Account for allocated meta_blocks */
1009 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1035 1010
1036 /* update fs free blocks counter for truncate case */ 1011 /* update fs dirty blocks counter */
1037 percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); 1012 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1013 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1014 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1015 }
1038 1016
1039 /* update per-inode reservations */ 1017 /* update per-inode reservations */
1040 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1018 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1041 EXT4_I(inode)->i_reserved_data_blocks -= used; 1019 EXT4_I(inode)->i_reserved_data_blocks -= used;
1042 1020
1043 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1044 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1045 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1046 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1021 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1047} 1022}
1048 1023
@@ -1160,8 +1135,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1160/* Maximum number of blocks we map for direct IO at once. */ 1135/* Maximum number of blocks we map for direct IO at once. */
1161#define DIO_MAX_BLOCKS 4096 1136#define DIO_MAX_BLOCKS 4096
1162 1137
1163static int ext4_get_block(struct inode *inode, sector_t iblock, 1138int ext4_get_block(struct inode *inode, sector_t iblock,
1164 struct buffer_head *bh_result, int create) 1139 struct buffer_head *bh_result, int create)
1165{ 1140{
1166 handle_t *handle = ext4_journal_current_handle(); 1141 handle_t *handle = ext4_journal_current_handle();
1167 int ret = 0, started = 0; 1142 int ret = 0, started = 0;
@@ -1241,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1241 BUFFER_TRACE(bh, "call get_create_access"); 1216 BUFFER_TRACE(bh, "call get_create_access");
1242 fatal = ext4_journal_get_create_access(handle, bh); 1217 fatal = ext4_journal_get_create_access(handle, bh);
1243 if (!fatal && !buffer_uptodate(bh)) { 1218 if (!fatal && !buffer_uptodate(bh)) {
1244 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1219 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1245 set_buffer_uptodate(bh); 1220 set_buffer_uptodate(bh);
1246 } 1221 }
1247 unlock_buffer(bh); 1222 unlock_buffer(bh);
@@ -1266,7 +1241,7 @@ err:
1266struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1241struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1267 ext4_lblk_t block, int create, int *err) 1242 ext4_lblk_t block, int create, int *err)
1268{ 1243{
1269 struct buffer_head * bh; 1244 struct buffer_head *bh;
1270 1245
1271 bh = ext4_getblk(handle, inode, block, create, err); 1246 bh = ext4_getblk(handle, inode, block, create, err);
1272 if (!bh) 1247 if (!bh)
@@ -1282,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1282 return NULL; 1257 return NULL;
1283} 1258}
1284 1259
1285static int walk_page_buffers( handle_t *handle, 1260static int walk_page_buffers(handle_t *handle,
1286 struct buffer_head *head, 1261 struct buffer_head *head,
1287 unsigned from, 1262 unsigned from,
1288 unsigned to, 1263 unsigned to,
1289 int *partial, 1264 int *partial,
1290 int (*fn)( handle_t *handle, 1265 int (*fn)(handle_t *handle,
1291 struct buffer_head *bh)) 1266 struct buffer_head *bh))
1292{ 1267{
1293 struct buffer_head *bh; 1268 struct buffer_head *bh;
1294 unsigned block_start, block_end; 1269 unsigned block_start, block_end;
@@ -1296,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
1296 int err, ret = 0; 1271 int err, ret = 0;
1297 struct buffer_head *next; 1272 struct buffer_head *next;
1298 1273
1299 for ( bh = head, block_start = 0; 1274 for (bh = head, block_start = 0;
1300 ret == 0 && (bh != head || !block_start); 1275 ret == 0 && (bh != head || !block_start);
1301 block_start = block_end, bh = next) 1276 block_start = block_end, bh = next)
1302 { 1277 {
1303 next = bh->b_this_page; 1278 next = bh->b_this_page;
1304 block_end = block_start + blocksize; 1279 block_end = block_start + blocksize;
@@ -1351,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1351 loff_t pos, unsigned len, unsigned flags, 1326 loff_t pos, unsigned len, unsigned flags,
1352 struct page **pagep, void **fsdata) 1327 struct page **pagep, void **fsdata)
1353{ 1328{
1354 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1355 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1330 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1356 handle_t *handle; 1331 handle_t *handle;
1357 int retries = 0; 1332 int retries = 0;
1358 struct page *page; 1333 struct page *page;
1359 pgoff_t index; 1334 pgoff_t index;
1360 unsigned from, to; 1335 unsigned from, to;
1361 1336
1362 index = pos >> PAGE_CACHE_SHIFT; 1337 index = pos >> PAGE_CACHE_SHIFT;
1363 from = pos & (PAGE_CACHE_SIZE - 1); 1338 from = pos & (PAGE_CACHE_SIZE - 1);
1364 to = from + len; 1339 to = from + len;
1365 1340
1366retry: 1341retry:
1367 handle = ext4_journal_start(inode, needed_blocks); 1342 handle = ext4_journal_start(inode, needed_blocks);
1368 if (IS_ERR(handle)) { 1343 if (IS_ERR(handle)) {
1369 ret = PTR_ERR(handle); 1344 ret = PTR_ERR(handle);
1370 goto out; 1345 goto out;
1371 } 1346 }
1372 1347
1373 page = __grab_cache_page(mapping, index); 1348 page = __grab_cache_page(mapping, index);
@@ -1387,9 +1362,16 @@ retry:
1387 } 1362 }
1388 1363
1389 if (ret) { 1364 if (ret) {
1390 unlock_page(page); 1365 unlock_page(page);
1391 ext4_journal_stop(handle); 1366 ext4_journal_stop(handle);
1392 page_cache_release(page); 1367 page_cache_release(page);
1368 /*
1369 * block_write_begin may have instantiated a few blocks
1370 * outside i_size. Trim these off again. Don't need
1371 * i_size_read because we hold i_mutex.
1372 */
1373 if (pos + len > inode->i_size)
1374 vmtruncate(inode, inode->i_size);
1393 } 1375 }
1394 1376
1395 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1426,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file,
1426 ret = ext4_jbd2_file_inode(handle, inode); 1408 ret = ext4_jbd2_file_inode(handle, inode);
1427 1409
1428 if (ret == 0) { 1410 if (ret == 0) {
1429 /*
1430 * generic_write_end() will run mark_inode_dirty() if i_size
1431 * changes. So let's piggyback the i_disksize mark_inode_dirty
1432 * into that.
1433 */
1434 loff_t new_i_size; 1411 loff_t new_i_size;
1435 1412
1436 new_i_size = pos + copied; 1413 new_i_size = pos + copied;
1437 if (new_i_size > EXT4_I(inode)->i_disksize) 1414 if (new_i_size > EXT4_I(inode)->i_disksize) {
1438 EXT4_I(inode)->i_disksize = new_i_size; 1415 ext4_update_i_disksize(inode, new_i_size);
1416 /* We need to mark inode dirty even if
1417 * new_i_size is less that inode->i_size
1418 * bu greater than i_disksize.(hint delalloc)
1419 */
1420 ext4_mark_inode_dirty(handle, inode);
1421 }
1422
1439 ret2 = generic_write_end(file, mapping, pos, len, copied, 1423 ret2 = generic_write_end(file, mapping, pos, len, copied,
1440 page, fsdata); 1424 page, fsdata);
1441 copied = ret2; 1425 copied = ret2;
@@ -1460,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
1460 loff_t new_i_size; 1444 loff_t new_i_size;
1461 1445
1462 new_i_size = pos + copied; 1446 new_i_size = pos + copied;
1463 if (new_i_size > EXT4_I(inode)->i_disksize) 1447 if (new_i_size > EXT4_I(inode)->i_disksize) {
1464 EXT4_I(inode)->i_disksize = new_i_size; 1448 ext4_update_i_disksize(inode, new_i_size);
1449 /* We need to mark inode dirty even if
1450 * new_i_size is less that inode->i_size
1451 * bu greater than i_disksize.(hint delalloc)
1452 */
1453 ext4_mark_inode_dirty(handle, inode);
1454 }
1465 1455
1466 ret2 = generic_write_end(file, mapping, pos, len, copied, 1456 ret2 = generic_write_end(file, mapping, pos, len, copied,
1467 page, fsdata); 1457 page, fsdata);
@@ -1486,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
1486 int ret = 0, ret2; 1476 int ret = 0, ret2;
1487 int partial = 0; 1477 int partial = 0;
1488 unsigned from, to; 1478 unsigned from, to;
1479 loff_t new_i_size;
1489 1480
1490 from = pos & (PAGE_CACHE_SIZE - 1); 1481 from = pos & (PAGE_CACHE_SIZE - 1);
1491 to = from + len; 1482 to = from + len;
@@ -1500,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
1500 to, &partial, write_end_fn); 1491 to, &partial, write_end_fn);
1501 if (!partial) 1492 if (!partial)
1502 SetPageUptodate(page); 1493 SetPageUptodate(page);
1503 if (pos+copied > inode->i_size) 1494 new_i_size = pos + copied;
1495 if (new_i_size > inode->i_size)
1504 i_size_write(inode, pos+copied); 1496 i_size_write(inode, pos+copied);
1505 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1497 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1506 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1498 if (new_i_size > EXT4_I(inode)->i_disksize) {
1507 EXT4_I(inode)->i_disksize = inode->i_size; 1499 ext4_update_i_disksize(inode, new_i_size);
1508 ret2 = ext4_mark_inode_dirty(handle, inode); 1500 ret2 = ext4_mark_inode_dirty(handle, inode);
1509 if (!ret) 1501 if (!ret)
1510 ret = ret2; 1502 ret = ret2;
@@ -1521,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file,
1521 1513
1522static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1514static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1523{ 1515{
1516 int retries = 0;
1524 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1525 unsigned long md_needed, mdblocks, total = 0; 1518 unsigned long md_needed, mdblocks, total = 0;
1526 1519
@@ -1529,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1529 * in order to allocate nrblocks 1522 * in order to allocate nrblocks
1530 * worse case is one extent per block 1523 * worse case is one extent per block
1531 */ 1524 */
1525repeat:
1532 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1526 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1533 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1527 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1534 mdblocks = ext4_calc_metadata_amount(inode, total); 1528 mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1537,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1537 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1531 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1538 total = md_needed + nrblocks; 1532 total = md_needed + nrblocks;
1539 1533
1540 if (ext4_has_free_blocks(sbi, total) < total) { 1534 if (ext4_claim_free_blocks(sbi, total)) {
1541 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1535 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1536 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1537 yield();
1538 goto repeat;
1539 }
1542 return -ENOSPC; 1540 return -ENOSPC;
1543 } 1541 }
1544 /* reduce fs free blocks counter */
1545 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1546
1547 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1542 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1548 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1543 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1549 1544
@@ -1585,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1585 1580
1586 release = to_free + mdb_free; 1581 release = to_free + mdb_free;
1587 1582
1588 /* update fs free blocks counter for truncate case */ 1583 /* update fs dirty blocks counter for truncate case */
1589 percpu_counter_add(&sbi->s_freeblocks_counter, release); 1584 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1590 1585
1591 /* update per-inode reservations */ 1586 /* update per-inode reservations */
1592 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1587 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1630,6 +1625,7 @@ struct mpage_da_data {
1630 struct writeback_control *wbc; 1625 struct writeback_control *wbc;
1631 int io_done; 1626 int io_done;
1632 long pages_written; 1627 long pages_written;
1628 int retval;
1633}; 1629};
1634 1630
1635/* 1631/*
@@ -1783,6 +1779,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1783 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1779 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1784} 1780}
1785 1781
1782static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1783 sector_t logical, long blk_cnt)
1784{
1785 int nr_pages, i;
1786 pgoff_t index, end;
1787 struct pagevec pvec;
1788 struct inode *inode = mpd->inode;
1789 struct address_space *mapping = inode->i_mapping;
1790
1791 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1792 end = (logical + blk_cnt - 1) >>
1793 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 while (index <= end) {
1795 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1796 if (nr_pages == 0)
1797 break;
1798 for (i = 0; i < nr_pages; i++) {
1799 struct page *page = pvec.pages[i];
1800 index = page->index;
1801 if (index > end)
1802 break;
1803 index++;
1804
1805 BUG_ON(!PageLocked(page));
1806 BUG_ON(PageWriteback(page));
1807 block_invalidatepage(page, 0);
1808 ClearPageUptodate(page);
1809 unlock_page(page);
1810 }
1811 }
1812 return;
1813}
1814
1815static void ext4_print_free_blocks(struct inode *inode)
1816{
1817 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1818 printk(KERN_EMERG "Total free blocks count %lld\n",
1819 ext4_count_free_blocks(inode->i_sb));
1820 printk(KERN_EMERG "Free/Dirty block details\n");
1821 printk(KERN_EMERG "free_blocks=%lld\n",
1822 percpu_counter_sum(&sbi->s_freeblocks_counter));
1823 printk(KERN_EMERG "dirty_blocks=%lld\n",
1824 percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1825 printk(KERN_EMERG "Block reservation details\n");
1826 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
1827 EXT4_I(inode)->i_reserved_data_blocks);
1828 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
1829 EXT4_I(inode)->i_reserved_meta_blocks);
1830 return;
1831}
1832
1786/* 1833/*
1787 * mpage_da_map_blocks - go through given space 1834 * mpage_da_map_blocks - go through given space
1788 * 1835 *
@@ -1792,32 +1839,69 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1792 * The function skips space we know is already mapped to disk blocks. 1839 * The function skips space we know is already mapped to disk blocks.
1793 * 1840 *
1794 */ 1841 */
1795static void mpage_da_map_blocks(struct mpage_da_data *mpd) 1842static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1796{ 1843{
1797 int err = 0; 1844 int err = 0;
1798 struct buffer_head *lbh = &mpd->lbh;
1799 sector_t next = lbh->b_blocknr;
1800 struct buffer_head new; 1845 struct buffer_head new;
1846 struct buffer_head *lbh = &mpd->lbh;
1847 sector_t next;
1801 1848
1802 /* 1849 /*
1803 * We consider only non-mapped and non-allocated blocks 1850 * We consider only non-mapped and non-allocated blocks
1804 */ 1851 */
1805 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1852 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1806 return; 1853 return 0;
1807
1808 new.b_state = lbh->b_state; 1854 new.b_state = lbh->b_state;
1809 new.b_blocknr = 0; 1855 new.b_blocknr = 0;
1810 new.b_size = lbh->b_size; 1856 new.b_size = lbh->b_size;
1811 1857 next = lbh->b_blocknr;
1812 /* 1858 /*
1813 * If we didn't accumulate anything 1859 * If we didn't accumulate anything
1814 * to write simply return 1860 * to write simply return
1815 */ 1861 */
1816 if (!new.b_size) 1862 if (!new.b_size)
1817 return; 1863 return 0;
1818 err = mpd->get_block(mpd->inode, next, &new, 1); 1864 err = mpd->get_block(mpd->inode, next, &new, 1);
1819 if (err) 1865 if (err) {
1820 return; 1866
1867 /* If get block returns with error
1868 * we simply return. Later writepage
1869 * will redirty the page and writepages
1870 * will find the dirty page again
1871 */
1872 if (err == -EAGAIN)
1873 return 0;
1874
1875 if (err == -ENOSPC &&
1876 ext4_count_free_blocks(mpd->inode->i_sb)) {
1877 mpd->retval = err;
1878 return 0;
1879 }
1880
1881 /*
1882 * get block failure will cause us
1883 * to loop in writepages. Because
1884 * a_ops->writepage won't be able to
1885 * make progress. The page will be redirtied
1886 * by writepage and writepages will again
1887 * try to write the same.
1888 */
1889 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1890 "at logical offset %llu with max blocks "
1891 "%zd with error %d\n",
1892 __func__, mpd->inode->i_ino,
1893 (unsigned long long)next,
1894 lbh->b_size >> mpd->inode->i_blkbits, err);
1895 printk(KERN_EMERG "This should not happen.!! "
1896 "Data will be lost\n");
1897 if (err == -ENOSPC) {
1898 ext4_print_free_blocks(mpd->inode);
1899 }
1900 /* invlaidate all the pages */
1901 ext4_da_block_invalidatepages(mpd, next,
1902 lbh->b_size >> mpd->inode->i_blkbits);
1903 return err;
1904 }
1821 BUG_ON(new.b_size == 0); 1905 BUG_ON(new.b_size == 0);
1822 1906
1823 if (buffer_new(&new)) 1907 if (buffer_new(&new))
@@ -1830,7 +1914,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1830 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1914 if (buffer_delay(lbh) || buffer_unwritten(lbh))
1831 mpage_put_bnr_to_bhs(mpd, next, &new); 1915 mpage_put_bnr_to_bhs(mpd, next, &new);
1832 1916
1833 return; 1917 return 0;
1834} 1918}
1835 1919
1836#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1920#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1899,8 +1983,8 @@ flush_it:
1899 * We couldn't merge the block to our extent, so we 1983 * We couldn't merge the block to our extent, so we
1900 * need to flush current extent and start new one 1984 * need to flush current extent and start new one
1901 */ 1985 */
1902 mpage_da_map_blocks(mpd); 1986 if (mpage_da_map_blocks(mpd) == 0)
1903 mpage_da_submit_io(mpd); 1987 mpage_da_submit_io(mpd);
1904 mpd->io_done = 1; 1988 mpd->io_done = 1;
1905 return; 1989 return;
1906} 1990}
@@ -1942,8 +2026,8 @@ static int __mpage_da_writepage(struct page *page,
1942 * and start IO on them using writepage() 2026 * and start IO on them using writepage()
1943 */ 2027 */
1944 if (mpd->next_page != mpd->first_page) { 2028 if (mpd->next_page != mpd->first_page) {
1945 mpage_da_map_blocks(mpd); 2029 if (mpage_da_map_blocks(mpd) == 0)
1946 mpage_da_submit_io(mpd); 2030 mpage_da_submit_io(mpd);
1947 /* 2031 /*
1948 * skip rest of the page in the page_vec 2032 * skip rest of the page in the page_vec
1949 */ 2033 */
@@ -2018,39 +2102,36 @@ static int __mpage_da_writepage(struct page *page,
2018 */ 2102 */
2019static int mpage_da_writepages(struct address_space *mapping, 2103static int mpage_da_writepages(struct address_space *mapping,
2020 struct writeback_control *wbc, 2104 struct writeback_control *wbc,
2021 get_block_t get_block) 2105 struct mpage_da_data *mpd)
2022{ 2106{
2023 struct mpage_da_data mpd;
2024 long to_write; 2107 long to_write;
2025 int ret; 2108 int ret;
2026 2109
2027 if (!get_block) 2110 if (!mpd->get_block)
2028 return generic_writepages(mapping, wbc); 2111 return generic_writepages(mapping, wbc);
2029 2112
2030 mpd.wbc = wbc; 2113 mpd->lbh.b_size = 0;
2031 mpd.inode = mapping->host; 2114 mpd->lbh.b_state = 0;
2032 mpd.lbh.b_size = 0; 2115 mpd->lbh.b_blocknr = 0;
2033 mpd.lbh.b_state = 0; 2116 mpd->first_page = 0;
2034 mpd.lbh.b_blocknr = 0; 2117 mpd->next_page = 0;
2035 mpd.first_page = 0; 2118 mpd->io_done = 0;
2036 mpd.next_page = 0; 2119 mpd->pages_written = 0;
2037 mpd.get_block = get_block; 2120 mpd->retval = 0;
2038 mpd.io_done = 0;
2039 mpd.pages_written = 0;
2040 2121
2041 to_write = wbc->nr_to_write; 2122 to_write = wbc->nr_to_write;
2042 2123
2043 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); 2124 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2044 2125
2045 /* 2126 /*
2046 * Handle last extent of pages 2127 * Handle last extent of pages
2047 */ 2128 */
2048 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2129 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2049 mpage_da_map_blocks(&mpd); 2130 if (mpage_da_map_blocks(mpd) == 0)
2050 mpage_da_submit_io(&mpd); 2131 mpage_da_submit_io(mpd);
2051 } 2132 }
2052 2133
2053 wbc->nr_to_write = to_write - mpd.pages_written; 2134 wbc->nr_to_write = to_write - mpd->pages_written;
2054 return ret; 2135 return ret;
2055} 2136}
2056 2137
@@ -2103,18 +2184,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2103 handle_t *handle = NULL; 2184 handle_t *handle = NULL;
2104 2185
2105 handle = ext4_journal_current_handle(); 2186 handle = ext4_journal_current_handle();
2106 if (!handle) { 2187 BUG_ON(!handle);
2107 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2188 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2108 bh_result, 0, 0, 0); 2189 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2109 BUG_ON(!ret);
2110 } else {
2111 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2112 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2113 }
2114
2115 if (ret > 0) { 2190 if (ret > 0) {
2191
2116 bh_result->b_size = (ret << inode->i_blkbits); 2192 bh_result->b_size = (ret << inode->i_blkbits);
2117 2193
2194 if (ext4_should_order_data(inode)) {
2195 int retval;
2196 retval = ext4_jbd2_file_inode(handle, inode);
2197 if (retval)
2198 /*
2199 * Failed to add inode for ordered
2200 * mode. Don't update file size
2201 */
2202 return retval;
2203 }
2204
2118 /* 2205 /*
2119 * Update on-disk size along with block allocation 2206 * Update on-disk size along with block allocation
2120 * we don't use 'extend_disksize' as size may change 2207 * we don't use 'extend_disksize' as size may change
@@ -2124,18 +2211,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2124 if (disksize > i_size_read(inode)) 2211 if (disksize > i_size_read(inode))
2125 disksize = i_size_read(inode); 2212 disksize = i_size_read(inode);
2126 if (disksize > EXT4_I(inode)->i_disksize) { 2213 if (disksize > EXT4_I(inode)->i_disksize) {
2127 /* 2214 ext4_update_i_disksize(inode, disksize);
2128 * XXX: replace with spinlock if seen contended -bzzz 2215 ret = ext4_mark_inode_dirty(handle, inode);
2129 */ 2216 return ret;
2130 down_write(&EXT4_I(inode)->i_data_sem);
2131 if (disksize > EXT4_I(inode)->i_disksize)
2132 EXT4_I(inode)->i_disksize = disksize;
2133 up_write(&EXT4_I(inode)->i_data_sem);
2134
2135 if (EXT4_I(inode)->i_disksize == disksize) {
2136 ret = ext4_mark_inode_dirty(handle, inode);
2137 return ret;
2138 }
2139 } 2217 }
2140 ret = 0; 2218 ret = 0;
2141 } 2219 }
@@ -2284,6 +2362,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2284{ 2362{
2285 handle_t *handle = NULL; 2363 handle_t *handle = NULL;
2286 loff_t range_start = 0; 2364 loff_t range_start = 0;
2365 struct mpage_da_data mpd;
2287 struct inode *inode = mapping->host; 2366 struct inode *inode = mapping->host;
2288 int needed_blocks, ret = 0, nr_to_writebump = 0; 2367 int needed_blocks, ret = 0, nr_to_writebump = 0;
2289 long to_write, pages_skipped = 0; 2368 long to_write, pages_skipped = 0;
@@ -2317,6 +2396,9 @@ static int ext4_da_writepages(struct address_space *mapping,
2317 range_start = wbc->range_start; 2396 range_start = wbc->range_start;
2318 pages_skipped = wbc->pages_skipped; 2397 pages_skipped = wbc->pages_skipped;
2319 2398
2399 mpd.wbc = wbc;
2400 mpd.inode = mapping->host;
2401
2320restart_loop: 2402restart_loop:
2321 to_write = wbc->nr_to_write; 2403 to_write = wbc->nr_to_write;
2322 while (!ret && to_write > 0) { 2404 while (!ret && to_write > 0) {
@@ -2340,23 +2422,17 @@ restart_loop:
2340 dump_stack(); 2422 dump_stack();
2341 goto out_writepages; 2423 goto out_writepages;
2342 } 2424 }
2343 if (ext4_should_order_data(inode)) {
2344 /*
2345 * With ordered mode we need to add
2346 * the inode to the journal handl
2347 * when we do block allocation.
2348 */
2349 ret = ext4_jbd2_file_inode(handle, inode);
2350 if (ret) {
2351 ext4_journal_stop(handle);
2352 goto out_writepages;
2353 }
2354 }
2355
2356 to_write -= wbc->nr_to_write; 2425 to_write -= wbc->nr_to_write;
2357 ret = mpage_da_writepages(mapping, wbc, 2426
2358 ext4_da_get_block_write); 2427 mpd.get_block = ext4_da_get_block_write;
2428 ret = mpage_da_writepages(mapping, wbc, &mpd);
2429
2359 ext4_journal_stop(handle); 2430 ext4_journal_stop(handle);
2431
2432 if (mpd.retval == -ENOSPC)
2433 jbd2_journal_force_commit_nested(sbi->s_journal);
2434
2435 /* reset the retry count */
2360 if (ret == MPAGE_DA_EXTENT_TAIL) { 2436 if (ret == MPAGE_DA_EXTENT_TAIL) {
2361 /* 2437 /*
2362 * got one extent now try with 2438 * got one extent now try with
@@ -2391,6 +2467,33 @@ out_writepages:
2391 return ret; 2467 return ret;
2392} 2468}
2393 2469
2470#define FALL_BACK_TO_NONDELALLOC 1
2471static int ext4_nonda_switch(struct super_block *sb)
2472{
2473 s64 free_blocks, dirty_blocks;
2474 struct ext4_sb_info *sbi = EXT4_SB(sb);
2475
2476 /*
2477 * switch to non delalloc mode if we are running low
2478 * on free block. The free block accounting via percpu
2479 * counters can get slightly wrong with FBC_BATCH getting
2480 * accumulated on each CPU without updating global counters
2481 * Delalloc need an accurate free block accounting. So switch
2482 * to non delalloc when we are near to error range.
2483 */
2484 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2485 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2486 if (2 * free_blocks < 3 * dirty_blocks ||
2487 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2488 /*
2489 * free block count is less that 150% of dirty blocks
2490 * or free blocks is less that watermark
2491 */
2492 return 1;
2493 }
2494 return 0;
2495}
2496
2394static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2497static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2395 loff_t pos, unsigned len, unsigned flags, 2498 loff_t pos, unsigned len, unsigned flags,
2396 struct page **pagep, void **fsdata) 2499 struct page **pagep, void **fsdata)
@@ -2406,6 +2509,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2406 from = pos & (PAGE_CACHE_SIZE - 1); 2509 from = pos & (PAGE_CACHE_SIZE - 1);
2407 to = from + len; 2510 to = from + len;
2408 2511
2512 if (ext4_nonda_switch(inode->i_sb)) {
2513 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2514 return ext4_write_begin(file, mapping, pos,
2515 len, flags, pagep, fsdata);
2516 }
2517 *fsdata = (void *)0;
2409retry: 2518retry:
2410 /* 2519 /*
2411 * With delayed allocation, we don't log the i_disksize update 2520 * With delayed allocation, we don't log the i_disksize update
@@ -2433,6 +2542,13 @@ retry:
2433 unlock_page(page); 2542 unlock_page(page);
2434 ext4_journal_stop(handle); 2543 ext4_journal_stop(handle);
2435 page_cache_release(page); 2544 page_cache_release(page);
2545 /*
2546 * block_write_begin may have instantiated a few blocks
2547 * outside i_size. Trim these off again. Don't need
2548 * i_size_read because we hold i_mutex.
2549 */
2550 if (pos + len > inode->i_size)
2551 vmtruncate(inode, inode->i_size);
2436 } 2552 }
2437 2553
2438 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2554 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2456,7 +2572,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2456 bh = page_buffers(page); 2572 bh = page_buffers(page);
2457 idx = offset >> inode->i_blkbits; 2573 idx = offset >> inode->i_blkbits;
2458 2574
2459 for (i=0; i < idx; i++) 2575 for (i = 0; i < idx; i++)
2460 bh = bh->b_this_page; 2576 bh = bh->b_this_page;
2461 2577
2462 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2578 if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2474,9 +2590,22 @@ static int ext4_da_write_end(struct file *file,
2474 handle_t *handle = ext4_journal_current_handle(); 2590 handle_t *handle = ext4_journal_current_handle();
2475 loff_t new_i_size; 2591 loff_t new_i_size;
2476 unsigned long start, end; 2592 unsigned long start, end;
2593 int write_mode = (int)(unsigned long)fsdata;
2594
2595 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2596 if (ext4_should_order_data(inode)) {
2597 return ext4_ordered_write_end(file, mapping, pos,
2598 len, copied, page, fsdata);
2599 } else if (ext4_should_writeback_data(inode)) {
2600 return ext4_writeback_write_end(file, mapping, pos,
2601 len, copied, page, fsdata);
2602 } else {
2603 BUG();
2604 }
2605 }
2477 2606
2478 start = pos & (PAGE_CACHE_SIZE - 1); 2607 start = pos & (PAGE_CACHE_SIZE - 1);
2479 end = start + copied -1; 2608 end = start + copied - 1;
2480 2609
2481 /* 2610 /*
2482 * generic_write_end() will run mark_inode_dirty() if i_size 2611 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2500,6 +2629,11 @@ static int ext4_da_write_end(struct file *file,
2500 EXT4_I(inode)->i_disksize = new_i_size; 2629 EXT4_I(inode)->i_disksize = new_i_size;
2501 } 2630 }
2502 up_write(&EXT4_I(inode)->i_data_sem); 2631 up_write(&EXT4_I(inode)->i_data_sem);
2632 /* We need to mark inode dirty even if
2633 * new_i_size is less that inode->i_size
2634 * bu greater than i_disksize.(hint delalloc)
2635 */
2636 ext4_mark_inode_dirty(handle, inode);
2503 } 2637 }
2504 } 2638 }
2505 ret2 = generic_write_end(file, mapping, pos, len, copied, 2639 ret2 = generic_write_end(file, mapping, pos, len, copied,
@@ -2591,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2591 return 0; 2725 return 0;
2592 } 2726 }
2593 2727
2594 return generic_block_bmap(mapping,block,ext4_get_block); 2728 return generic_block_bmap(mapping, block, ext4_get_block);
2595} 2729}
2596 2730
2597static int bget_one(handle_t *handle, struct buffer_head *bh) 2731static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3197,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3197 if (!partial->key && *partial->p) 3331 if (!partial->key && *partial->p)
3198 /* Writer: end */ 3332 /* Writer: end */
3199 goto no_top; 3333 goto no_top;
3200 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 3334 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3201 ; 3335 ;
3202 /* 3336 /*
3203 * OK, we've found the last block that must survive. The rest of our 3337 * OK, we've found the last block that must survive. The rest of our
@@ -3216,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3216 } 3350 }
3217 /* Writer: end */ 3351 /* Writer: end */
3218 3352
3219 while(partial > p) { 3353 while (partial > p) {
3220 brelse(partial->bh); 3354 brelse(partial->bh);
3221 partial--; 3355 partial--;
3222 } 3356 }
@@ -3408,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3408 /* This zaps the entire block. Bottom up. */ 3542 /* This zaps the entire block. Bottom up. */
3409 BUFFER_TRACE(bh, "free child branches"); 3543 BUFFER_TRACE(bh, "free child branches");
3410 ext4_free_branches(handle, inode, bh, 3544 ext4_free_branches(handle, inode, bh,
3411 (__le32*)bh->b_data, 3545 (__le32 *) bh->b_data,
3412 (__le32*)bh->b_data + addr_per_block, 3546 (__le32 *) bh->b_data + addr_per_block,
3413 depth); 3547 depth);
3414 3548
3415 /* 3549 /*
3416 * We've probably journalled the indirect block several 3550 * We've probably journalled the indirect block several
@@ -3578,7 +3712,7 @@ void ext4_truncate(struct inode *inode)
3578 */ 3712 */
3579 down_write(&ei->i_data_sem); 3713 down_write(&ei->i_data_sem);
3580 3714
3581 ext4_discard_reservation(inode); 3715 ext4_discard_preallocations(inode);
3582 3716
3583 /* 3717 /*
3584 * The orphan list entry will now protect us from any crash which 3718 * The orphan list entry will now protect us from any crash which
@@ -3673,41 +3807,6 @@ out_stop:
3673 ext4_journal_stop(handle); 3807 ext4_journal_stop(handle);
3674} 3808}
3675 3809
3676static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3677 unsigned long ino, struct ext4_iloc *iloc)
3678{
3679 ext4_group_t block_group;
3680 unsigned long offset;
3681 ext4_fsblk_t block;
3682 struct ext4_group_desc *gdp;
3683
3684 if (!ext4_valid_inum(sb, ino)) {
3685 /*
3686 * This error is already checked for in namei.c unless we are
3687 * looking at an NFS filehandle, in which case no error
3688 * report is needed
3689 */
3690 return 0;
3691 }
3692
3693 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3694 gdp = ext4_get_group_desc(sb, block_group, NULL);
3695 if (!gdp)
3696 return 0;
3697
3698 /*
3699 * Figure out the offset within the block group inode table
3700 */
3701 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3702 EXT4_INODE_SIZE(sb);
3703 block = ext4_inode_table(sb, gdp) +
3704 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3705
3706 iloc->block_group = block_group;
3707 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3708 return block;
3709}
3710
3711/* 3810/*
3712 * ext4_get_inode_loc returns with an extra refcount against the inode's 3811 * ext4_get_inode_loc returns with an extra refcount against the inode's
3713 * underlying buffer_head on success. If 'in_mem' is true, we have all 3812 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3717,19 +3816,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3717static int __ext4_get_inode_loc(struct inode *inode, 3816static int __ext4_get_inode_loc(struct inode *inode,
3718 struct ext4_iloc *iloc, int in_mem) 3817 struct ext4_iloc *iloc, int in_mem)
3719{ 3818{
3720 ext4_fsblk_t block; 3819 struct ext4_group_desc *gdp;
3721 struct buffer_head *bh; 3820 struct buffer_head *bh;
3821 struct super_block *sb = inode->i_sb;
3822 ext4_fsblk_t block;
3823 int inodes_per_block, inode_offset;
3824
3825 iloc->bh = 0;
3826 if (!ext4_valid_inum(sb, inode->i_ino))
3827 return -EIO;
3722 3828
3723 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3829 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3724 if (!block) 3830 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3831 if (!gdp)
3725 return -EIO; 3832 return -EIO;
3726 3833
3727 bh = sb_getblk(inode->i_sb, block); 3834 /*
3835 * Figure out the offset within the block group inode table
3836 */
3837 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3838 inode_offset = ((inode->i_ino - 1) %
3839 EXT4_INODES_PER_GROUP(sb));
3840 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3841 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3842
3843 bh = sb_getblk(sb, block);
3728 if (!bh) { 3844 if (!bh) {
3729 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3845 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3730 "unable to read inode block - " 3846 "inode block - inode=%lu, block=%llu",
3731 "inode=%lu, block=%llu", 3847 inode->i_ino, block);
3732 inode->i_ino, block);
3733 return -EIO; 3848 return -EIO;
3734 } 3849 }
3735 if (!buffer_uptodate(bh)) { 3850 if (!buffer_uptodate(bh)) {
@@ -3757,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3757 */ 3872 */
3758 if (in_mem) { 3873 if (in_mem) {
3759 struct buffer_head *bitmap_bh; 3874 struct buffer_head *bitmap_bh;
3760 struct ext4_group_desc *desc; 3875 int i, start;
3761 int inodes_per_buffer;
3762 int inode_offset, i;
3763 ext4_group_t block_group;
3764 int start;
3765
3766 block_group = (inode->i_ino - 1) /
3767 EXT4_INODES_PER_GROUP(inode->i_sb);
3768 inodes_per_buffer = bh->b_size /
3769 EXT4_INODE_SIZE(inode->i_sb);
3770 inode_offset = ((inode->i_ino - 1) %
3771 EXT4_INODES_PER_GROUP(inode->i_sb));
3772 start = inode_offset & ~(inodes_per_buffer - 1);
3773 3876
3774 /* Is the inode bitmap in cache? */ 3877 start = inode_offset & ~(inodes_per_block - 1);
3775 desc = ext4_get_group_desc(inode->i_sb,
3776 block_group, NULL);
3777 if (!desc)
3778 goto make_io;
3779 3878
3780 bitmap_bh = sb_getblk(inode->i_sb, 3879 /* Is the inode bitmap in cache? */
3781 ext4_inode_bitmap(inode->i_sb, desc)); 3880 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3782 if (!bitmap_bh) 3881 if (!bitmap_bh)
3783 goto make_io; 3882 goto make_io;
3784 3883
@@ -3791,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3791 brelse(bitmap_bh); 3890 brelse(bitmap_bh);
3792 goto make_io; 3891 goto make_io;
3793 } 3892 }
3794 for (i = start; i < start + inodes_per_buffer; i++) { 3893 for (i = start; i < start + inodes_per_block; i++) {
3795 if (i == inode_offset) 3894 if (i == inode_offset)
3796 continue; 3895 continue;
3797 if (ext4_test_bit(i, bitmap_bh->b_data)) 3896 if (ext4_test_bit(i, bitmap_bh->b_data))
3798 break; 3897 break;
3799 } 3898 }
3800 brelse(bitmap_bh); 3899 brelse(bitmap_bh);
3801 if (i == start + inodes_per_buffer) { 3900 if (i == start + inodes_per_block) {
3802 /* all other inodes are free, so skip I/O */ 3901 /* all other inodes are free, so skip I/O */
3803 memset(bh->b_data, 0, bh->b_size); 3902 memset(bh->b_data, 0, bh->b_size);
3804 set_buffer_uptodate(bh); 3903 set_buffer_uptodate(bh);
@@ -3809,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3809 3908
3810make_io: 3909make_io:
3811 /* 3910 /*
3911 * If we need to do any I/O, try to pre-readahead extra
3912 * blocks from the inode table.
3913 */
3914 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3915 ext4_fsblk_t b, end, table;
3916 unsigned num;
3917
3918 table = ext4_inode_table(sb, gdp);
3919 /* Make sure s_inode_readahead_blks is a power of 2 */
3920 while (EXT4_SB(sb)->s_inode_readahead_blks &
3921 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3922 EXT4_SB(sb)->s_inode_readahead_blks =
3923 (EXT4_SB(sb)->s_inode_readahead_blks &
3924 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3925 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3926 if (table > b)
3927 b = table;
3928 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3929 num = EXT4_INODES_PER_GROUP(sb);
3930 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3931 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3932 num -= le16_to_cpu(gdp->bg_itable_unused);
3933 table += num / inodes_per_block;
3934 if (end > table)
3935 end = table;
3936 while (b <= end)
3937 sb_breadahead(sb, b++);
3938 }
3939
3940 /*
3812 * There are other valid inodes in the buffer, this inode 3941 * There are other valid inodes in the buffer, this inode
3813 * has in-inode xattrs, or we don't have this inode in memory. 3942 * has in-inode xattrs, or we don't have this inode in memory.
3814 * Read the block from disk. 3943 * Read the block from disk.
@@ -3818,10 +3947,9 @@ make_io:
3818 submit_bh(READ_META, bh); 3947 submit_bh(READ_META, bh);
3819 wait_on_buffer(bh); 3948 wait_on_buffer(bh);
3820 if (!buffer_uptodate(bh)) { 3949 if (!buffer_uptodate(bh)) {
3821 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3950 ext4_error(sb, __func__,
3822 "unable to read inode block - " 3951 "unable to read inode block - inode=%lu, "
3823 "inode=%lu, block=%llu", 3952 "block=%llu", inode->i_ino, block);
3824 inode->i_ino, block);
3825 brelse(bh); 3953 brelse(bh);
3826 return -EIO; 3954 return -EIO;
3827 } 3955 }
@@ -3913,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3913 return inode; 4041 return inode;
3914 4042
3915 ei = EXT4_I(inode); 4043 ei = EXT4_I(inode);
3916#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 4044#ifdef CONFIG_EXT4_FS_POSIX_ACL
3917 ei->i_acl = EXT4_ACL_NOT_CACHED; 4045 ei->i_acl = EXT4_ACL_NOT_CACHED;
3918 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4046 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3919#endif 4047#endif
3920 ei->i_block_alloc_info = NULL;
3921 4048
3922 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4049 ret = __ext4_get_inode_loc(inode, &iloc, 0);
3923 if (ret < 0) 4050 if (ret < 0)
@@ -3927,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3927 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4054 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3928 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4055 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3929 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4056 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3930 if(!(test_opt (inode->i_sb, NO_UID32))) { 4057 if (!(test_opt(inode->i_sb, NO_UID32))) {
3931 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4058 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3932 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4059 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3933 } 4060 }
@@ -3945,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3945 if (inode->i_mode == 0 || 4072 if (inode->i_mode == 0 ||
3946 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4073 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3947 /* this inode is deleted */ 4074 /* this inode is deleted */
3948 brelse (bh); 4075 brelse(bh);
3949 ret = -ESTALE; 4076 ret = -ESTALE;
3950 goto bad_inode; 4077 goto bad_inode;
3951 } 4078 }
@@ -3978,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3978 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4105 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3979 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4106 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3980 EXT4_INODE_SIZE(inode->i_sb)) { 4107 EXT4_INODE_SIZE(inode->i_sb)) {
3981 brelse (bh); 4108 brelse(bh);
3982 ret = -EIO; 4109 ret = -EIO;
3983 goto bad_inode; 4110 goto bad_inode;
3984 } 4111 }
@@ -4031,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4031 init_special_inode(inode, inode->i_mode, 4158 init_special_inode(inode, inode->i_mode,
4032 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4159 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4033 } 4160 }
4034 brelse (iloc.bh); 4161 brelse(iloc.bh);
4035 ext4_set_inode_flags(inode); 4162 ext4_set_inode_flags(inode);
4036 unlock_new_inode(inode); 4163 unlock_new_inode(inode);
4037 return inode; 4164 return inode;
@@ -4113,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle,
4113 4240
4114 ext4_get_inode_flags(ei); 4241 ext4_get_inode_flags(ei);
4115 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4242 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4116 if(!(test_opt(inode->i_sb, NO_UID32))) { 4243 if (!(test_opt(inode->i_sb, NO_UID32))) {
4117 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4244 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4118 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4245 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4119/* 4246/*
4120 * Fix up interoperability with old kernels. Otherwise, old inodes get 4247 * Fix up interoperability with old kernels. Otherwise, old inodes get
4121 * re-used with the upper 16 bits of the uid/gid intact 4248 * re-used with the upper 16 bits of the uid/gid intact
4122 */ 4249 */
4123 if(!ei->i_dtime) { 4250 if (!ei->i_dtime) {
4124 raw_inode->i_uid_high = 4251 raw_inode->i_uid_high =
4125 cpu_to_le16(high_16_bits(inode->i_uid)); 4252 cpu_to_le16(high_16_bits(inode->i_uid));
4126 raw_inode->i_gid_high = 4253 raw_inode->i_gid_high =
@@ -4208,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle,
4208 ei->i_state &= ~EXT4_STATE_NEW; 4335 ei->i_state &= ~EXT4_STATE_NEW;
4209 4336
4210out_brelse: 4337out_brelse:
4211 brelse (bh); 4338 brelse(bh);
4212 ext4_std_error(inode->i_sb, err); 4339 ext4_std_error(inode->i_sb, err);
4213 return err; 4340 return err;
4214} 4341}
@@ -4811,6 +4938,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4811 loff_t size; 4938 loff_t size;
4812 unsigned long len; 4939 unsigned long len;
4813 int ret = -EINVAL; 4940 int ret = -EINVAL;
4941 void *fsdata;
4814 struct file *file = vma->vm_file; 4942 struct file *file = vma->vm_file;
4815 struct inode *inode = file->f_path.dentry->d_inode; 4943 struct inode *inode = file->f_path.dentry->d_inode;
4816 struct address_space *mapping = inode->i_mapping; 4944 struct address_space *mapping = inode->i_mapping;
@@ -4849,11 +4977,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4849 * on the same page though 4977 * on the same page though
4850 */ 4978 */
4851 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4979 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4852 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 4980 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
4853 if (ret < 0) 4981 if (ret < 0)
4854 goto out_unlock; 4982 goto out_unlock;
4855 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4983 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4856 len, len, page, NULL); 4984 len, len, page, fsdata);
4857 if (ret < 0) 4985 if (ret < 0)
4858 goto out_unlock; 4986 goto out_unlock;
4859 ret = 0; 4987 ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba..ea27eaa0cfe 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext4_inode_info *ei = EXT4_I(inode); 24 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 25 unsigned int flags;
26 unsigned short rsv_window_size;
27 26
28 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); 27 ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
29 28
30 switch (cmd) { 29 switch (cmd) {
31 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
35 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
36 handle_t *handle = NULL; 35 handle_t *handle = NULL;
37 int err; 36 int err, migrate = 0;
38 struct ext4_iloc iloc; 37 struct ext4_iloc iloc;
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82 if (!capable(CAP_SYS_RESOURCE)) 81 if (!capable(CAP_SYS_RESOURCE))
83 goto flags_out; 82 goto flags_out;
84 } 83 }
84 if (oldflags & EXT4_EXTENTS_FL) {
85 /* We don't support clearning extent flags */
86 if (!(flags & EXT4_EXTENTS_FL)) {
87 err = -EOPNOTSUPP;
88 goto flags_out;
89 }
90 } else if (flags & EXT4_EXTENTS_FL) {
91 /* migrate the file */
92 migrate = 1;
93 flags &= ~EXT4_EXTENTS_FL;
94 }
85 95
86 handle = ext4_journal_start(inode, 1); 96 handle = ext4_journal_start(inode, 1);
87 if (IS_ERR(handle)) { 97 if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@ flags_err:
109 119
110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 120 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
111 err = ext4_change_inode_journal_flag(inode, jflag); 121 err = ext4_change_inode_journal_flag(inode, jflag);
122 if (err)
123 goto flags_out;
124 if (migrate)
125 err = ext4_ext_migrate(inode);
112flags_out: 126flags_out:
113 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt); 128 mnt_drop_write(filp->f_path.mnt);
@@ -175,49 +189,6 @@ setversion_out:
175 return ret; 189 return ret;
176 } 190 }
177#endif 191#endif
178 case EXT4_IOC_GETRSVSZ:
179 if (test_opt(inode->i_sb, RESERVATION)
180 && S_ISREG(inode->i_mode)
181 && ei->i_block_alloc_info) {
182 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
183 return put_user(rsv_window_size, (int __user *)arg);
184 }
185 return -ENOTTY;
186 case EXT4_IOC_SETRSVSZ: {
187 int err;
188
189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
190 return -ENOTTY;
191
192 if (!is_owner_or_cap(inode))
193 return -EACCES;
194
195 if (get_user(rsv_window_size, (int __user *)arg))
196 return -EFAULT;
197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
204
205 /*
206 * need to allocate reservation structure for this inode
207 * before set the window size
208 */
209 down_write(&ei->i_data_sem);
210 if (!ei->i_block_alloc_info)
211 ext4_init_block_alloc_info(inode);
212
213 if (ei->i_block_alloc_info){
214 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
215 rsv->rsv_goal_size = rsv_window_size;
216 }
217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
219 return 0;
220 }
221 case EXT4_IOC_GROUP_EXTEND: { 192 case EXT4_IOC_GROUP_EXTEND: {
222 ext4_fsblk_t n_blocks_count; 193 ext4_fsblk_t n_blocks_count;
223 struct super_block *sb = inode->i_sb; 194 struct super_block *sb = inode->i_sb;
@@ -267,7 +238,26 @@ setversion_out:
267 } 238 }
268 239
269 case EXT4_IOC_MIGRATE: 240 case EXT4_IOC_MIGRATE:
270 return ext4_ext_migrate(inode, filp, cmd, arg); 241 {
242 int err;
243 if (!is_owner_or_cap(inode))
244 return -EACCES;
245
246 err = mnt_want_write(filp->f_path.mnt);
247 if (err)
248 return err;
249 /*
250 * inode_mutex prevent write and truncate on the file.
251 * Read still goes through. We take i_data_sem in
252 * ext4_ext_swap_inode_data before we switch the
253 * inode format to prevent read.
254 */
255 mutex_lock(&(inode->i_mutex));
256 err = ext4_ext_migrate(inode);
257 mutex_unlock(&(inode->i_mutex));
258 mnt_drop_write(filp->f_path.mnt);
259 return err;
260 }
271 261
272 default: 262 default:
273 return -ENOTTY; 263 return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e0e3a5eb1dd..b580714f0d8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 477 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 479 if (b1[i] != b2[i]) {
480 printk("corruption in group %lu at byte %u(%u):" 480 printk(KERN_ERR "corruption in group %lu "
481 " %x in copy != %x on disk/prealloc\n", 481 "at byte %u(%u): %x in copy != %x "
482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 482 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]);
483 BUG(); 484 BUG();
484 } 485 }
485 } 486 }
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
533 void *buddy; 534 void *buddy;
534 void *buddy2; 535 void *buddy2;
535 536
536 if (!test_opt(sb, MBALLOC))
537 return 0;
538
539 { 537 {
540 static int mb_check_counter; 538 static int mb_check_counter;
541 if (mb_check_counter++ % 100 != 0) 539 if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
784 if (bh[i] == NULL) 782 if (bh[i] == NULL)
785 goto out; 783 goto out;
786 784
787 if (bh_uptodate_or_lock(bh[i])) 785 if (buffer_uptodate(bh[i]) &&
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
788 continue; 787 continue;
789 788
789 lock_buffer(bh[i]);
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 792 ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
2169{ 2169{
2170 struct ext4_sb_info *sbi = EXT4_SB(sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(sb);
2171 2171
2172 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2172 if (sbi->s_proc != NULL) {
2173 remove_proc_entry("mb_history", sbi->s_mb_proc); 2173 remove_proc_entry("mb_groups", sbi->s_proc);
2174 2174 remove_proc_entry("mb_history", sbi->s_proc);
2175 }
2175 kfree(sbi->s_mb_history); 2176 kfree(sbi->s_mb_history);
2176} 2177}
2177 2178
@@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
2180 struct ext4_sb_info *sbi = EXT4_SB(sb); 2181 struct ext4_sb_info *sbi = EXT4_SB(sb);
2181 int i; 2182 int i;
2182 2183
2183 if (sbi->s_mb_proc != NULL) { 2184 if (sbi->s_proc != NULL) {
2184 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2185 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2185 &ext4_mb_seq_history_fops, sb); 2186 &ext4_mb_seq_history_fops, sb);
2186 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2187 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2187 &ext4_mb_seq_groups_fops, sb); 2188 &ext4_mb_seq_groups_fops, sb);
2188 } 2189 }
2189 2190
@@ -2485,19 +2486,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2485 unsigned max; 2486 unsigned max;
2486 int ret; 2487 int ret;
2487 2488
2488 if (!test_opt(sb, MBALLOC))
2489 return 0;
2490
2491 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2489 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2492 2490
2493 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2491 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2494 if (sbi->s_mb_offsets == NULL) { 2492 if (sbi->s_mb_offsets == NULL) {
2495 clear_opt(sbi->s_mount_opt, MBALLOC);
2496 return -ENOMEM; 2493 return -ENOMEM;
2497 } 2494 }
2498 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2499 if (sbi->s_mb_maxs == NULL) { 2496 if (sbi->s_mb_maxs == NULL) {
2500 clear_opt(sbi->s_mount_opt, MBALLOC);
2501 kfree(sbi->s_mb_maxs); 2497 kfree(sbi->s_mb_maxs);
2502 return -ENOMEM; 2498 return -ENOMEM;
2503 } 2499 }
@@ -2520,7 +2516,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2520 /* init file for buddy data */ 2516 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2518 if (ret != 0) {
2523 clear_opt(sbi->s_mount_opt, MBALLOC);
2524 kfree(sbi->s_mb_offsets); 2519 kfree(sbi->s_mb_offsets);
2525 kfree(sbi->s_mb_maxs); 2520 kfree(sbi->s_mb_maxs);
2526 return ret; 2521 return ret;
@@ -2540,17 +2535,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2540 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2535 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2541 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2536 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2542 2537
2543 i = sizeof(struct ext4_locality_group) * nr_cpu_ids; 2538 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2544 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2545 if (sbi->s_locality_groups == NULL) { 2539 if (sbi->s_locality_groups == NULL) {
2546 clear_opt(sbi->s_mount_opt, MBALLOC);
2547 kfree(sbi->s_mb_offsets); 2540 kfree(sbi->s_mb_offsets);
2548 kfree(sbi->s_mb_maxs); 2541 kfree(sbi->s_mb_maxs);
2549 return -ENOMEM; 2542 return -ENOMEM;
2550 } 2543 }
2551 for (i = 0; i < nr_cpu_ids; i++) { 2544 for_each_possible_cpu(i) {
2552 struct ext4_locality_group *lg; 2545 struct ext4_locality_group *lg;
2553 lg = &sbi->s_locality_groups[i]; 2546 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2554 mutex_init(&lg->lg_mutex); 2547 mutex_init(&lg->lg_mutex);
2555 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2548 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2556 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2549 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
@@ -2560,7 +2553,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2560 ext4_mb_init_per_dev_proc(sb); 2553 ext4_mb_init_per_dev_proc(sb);
2561 ext4_mb_history_init(sb); 2554 ext4_mb_history_init(sb);
2562 2555
2563 printk("EXT4-fs: mballoc enabled\n"); 2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2564 return 0; 2557 return 0;
2565} 2558}
2566 2559
@@ -2589,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
2589 struct ext4_group_info *grinfo; 2582 struct ext4_group_info *grinfo;
2590 struct ext4_sb_info *sbi = EXT4_SB(sb); 2583 struct ext4_sb_info *sbi = EXT4_SB(sb);
2591 2584
2592 if (!test_opt(sb, MBALLOC))
2593 return 0;
2594
2595 /* release freed, non-committed blocks */ 2585 /* release freed, non-committed blocks */
2596 spin_lock(&sbi->s_md_lock); 2586 spin_lock(&sbi->s_md_lock);
2597 list_splice_init(&sbi->s_closed_transaction, 2587 list_splice_init(&sbi->s_closed_transaction,
@@ -2647,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb)
2647 atomic_read(&sbi->s_mb_discarded)); 2637 atomic_read(&sbi->s_mb_discarded));
2648 } 2638 }
2649 2639
2650 kfree(sbi->s_locality_groups); 2640 free_percpu(sbi->s_locality_groups);
2651
2652 ext4_mb_history_release(sb); 2641 ext4_mb_history_release(sb);
2653 ext4_mb_destroy_per_dev_proc(sb); 2642 ext4_mb_destroy_per_dev_proc(sb);
2654 2643
@@ -2721,118 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2721#define EXT4_MB_STREAM_REQ "stream_req" 2710#define EXT4_MB_STREAM_REQ "stream_req"
2722#define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2711#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2723 2712
2724
2725
2726#define MB_PROC_FOPS(name) \
2727static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2728{ \
2729 struct ext4_sb_info *sbi = m->private; \
2730 \
2731 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2732 return 0; \
2733} \
2734 \
2735static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2736{ \
2737 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2738} \
2739 \
2740static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2741 const char __user *buf, size_t cnt, loff_t *ppos) \
2742{ \
2743 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2744 char str[32]; \
2745 long value; \
2746 if (cnt >= sizeof(str)) \
2747 return -EINVAL; \
2748 if (copy_from_user(str, buf, cnt)) \
2749 return -EFAULT; \
2750 value = simple_strtol(str, NULL, 0); \
2751 if (value <= 0) \
2752 return -ERANGE; \
2753 sbi->s_mb_##name = value; \
2754 return cnt; \
2755} \
2756 \
2757static const struct file_operations ext4_mb_##name##_proc_fops = { \
2758 .owner = THIS_MODULE, \
2759 .open = ext4_mb_##name##_proc_open, \
2760 .read = seq_read, \
2761 .llseek = seq_lseek, \
2762 .release = single_release, \
2763 .write = ext4_mb_##name##_proc_write, \
2764};
2765
2766MB_PROC_FOPS(stats);
2767MB_PROC_FOPS(max_to_scan);
2768MB_PROC_FOPS(min_to_scan);
2769MB_PROC_FOPS(order2_reqs);
2770MB_PROC_FOPS(stream_request);
2771MB_PROC_FOPS(group_prealloc);
2772
2773#define MB_PROC_HANDLER(name, var) \
2774do { \
2775 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2776 &ext4_mb_##var##_proc_fops, sbi); \
2777 if (proc == NULL) { \
2778 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2779 goto err_out; \
2780 } \
2781} while (0)
2782
2783static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2713static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2784{ 2714{
2785 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2715 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2786 struct ext4_sb_info *sbi = EXT4_SB(sb); 2716 struct ext4_sb_info *sbi = EXT4_SB(sb);
2787 struct proc_dir_entry *proc; 2717 struct proc_dir_entry *proc;
2788 char devname[64];
2789 2718
2790 if (proc_root_ext4 == NULL) { 2719 if (sbi->s_proc == NULL)
2791 sbi->s_mb_proc = NULL;
2792 return -EINVAL; 2720 return -EINVAL;
2793 }
2794 bdevname(sb->s_bdev, devname);
2795 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2796
2797 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2798 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2799 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2800 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2801 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2802 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2803 2721
2722 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2723 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2724 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2725 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2726 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2727 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2804 return 0; 2728 return 0;
2805 2729
2806err_out: 2730err_out:
2807 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2731 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2808 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2732 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2809 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2733 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2810 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2734 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2811 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2735 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2812 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2736 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2813 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2814 remove_proc_entry(devname, proc_root_ext4);
2815 sbi->s_mb_proc = NULL;
2816
2817 return -ENOMEM; 2737 return -ENOMEM;
2818} 2738}
2819 2739
2820static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2740static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2821{ 2741{
2822 struct ext4_sb_info *sbi = EXT4_SB(sb); 2742 struct ext4_sb_info *sbi = EXT4_SB(sb);
2823 char devname[64];
2824 2743
2825 if (sbi->s_mb_proc == NULL) 2744 if (sbi->s_proc == NULL)
2826 return -EINVAL; 2745 return -EINVAL;
2827 2746
2828 bdevname(sb->s_bdev, devname); 2747 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2829 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2748 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2830 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2749 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2831 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2750 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2832 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2751 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2833 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2752 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2834 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2835 remove_proc_entry(devname, proc_root_ext4);
2836 2753
2837 return 0; 2754 return 0;
2838} 2755}
@@ -2854,11 +2771,6 @@ int __init init_ext4_mballoc(void)
2854 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2855 return -ENOMEM; 2772 return -ENOMEM;
2856 } 2773 }
2857#ifdef CONFIG_PROC_FS
2858 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
2859 if (proc_root_ext4 == NULL)
2860 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
2861#endif
2862 return 0; 2774 return 0;
2863} 2775}
2864 2776
@@ -2867,9 +2779,6 @@ void exit_ext4_mballoc(void)
2867 /* XXX: synchronize_rcu(); */ 2779 /* XXX: synchronize_rcu(); */
2868 kmem_cache_destroy(ext4_pspace_cachep); 2780 kmem_cache_destroy(ext4_pspace_cachep);
2869 kmem_cache_destroy(ext4_ac_cachep); 2781 kmem_cache_destroy(ext4_ac_cachep);
2870#ifdef CONFIG_PROC_FS
2871 remove_proc_entry("fs/ext4", NULL);
2872#endif
2873} 2782}
2874 2783
2875 2784
@@ -2879,7 +2788,7 @@ void exit_ext4_mballoc(void)
2879 */ 2788 */
2880static noinline_for_stack int 2789static noinline_for_stack int
2881ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2790ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2882 handle_t *handle) 2791 handle_t *handle, unsigned long reserv_blks)
2883{ 2792{
2884 struct buffer_head *bitmap_bh = NULL; 2793 struct buffer_head *bitmap_bh = NULL;
2885 struct ext4_super_block *es; 2794 struct ext4_super_block *es;
@@ -2968,15 +2877,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2968 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2877 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2969 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2878 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2970 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2879 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2971 2880 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2972 /* 2881 /*
2973 * free blocks account has already be reduced/reserved 2882 * Now reduce the dirty block count also. Should not go negative
2974 * at write_begin() time for delayed allocation
2975 * do not double accounting
2976 */ 2883 */
2977 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2884 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2978 percpu_counter_sub(&sbi->s_freeblocks_counter, 2885 /* release all the reserved blocks if non delalloc */
2979 ac->ac_b_ex.fe_len); 2886 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2887 else
2888 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2889 ac->ac_b_ex.fe_len);
2980 2890
2981 if (sbi->s_log_groups_per_flex) { 2891 if (sbi->s_log_groups_per_flex) {
2982 ext4_group_t flex_group = ext4_flex_group(sbi, 2892 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3884,7 +3794,7 @@ out:
3884 * 3794 *
3885 * FIXME!! Make sure it is valid at all the call sites 3795 * FIXME!! Make sure it is valid at all the call sites
3886 */ 3796 */
3887void ext4_mb_discard_inode_preallocations(struct inode *inode) 3797void ext4_discard_preallocations(struct inode *inode)
3888{ 3798{
3889 struct ext4_inode_info *ei = EXT4_I(inode); 3799 struct ext4_inode_info *ei = EXT4_I(inode);
3890 struct super_block *sb = inode->i_sb; 3800 struct super_block *sb = inode->i_sb;
@@ -3896,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
3896 struct ext4_buddy e4b; 3806 struct ext4_buddy e4b;
3897 int err; 3807 int err;
3898 3808
3899 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3809 if (!S_ISREG(inode->i_mode)) {
3900 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3810 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3901 return; 3811 return;
3902 } 3812 }
@@ -4094,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4094 * per cpu locality group is to reduce the contention between block 4004 * per cpu locality group is to reduce the contention between block
4095 * request from multiple CPUs. 4005 * request from multiple CPUs.
4096 */ 4006 */
4097 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4007 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
4098 put_cpu();
4099 4008
4100 /* we're going to use group allocation */ 4009 /* we're going to use group allocation */
4101 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4010 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4369,33 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4369ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4278ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4370 struct ext4_allocation_request *ar, int *errp) 4279 struct ext4_allocation_request *ar, int *errp)
4371{ 4280{
4281 int freed;
4372 struct ext4_allocation_context *ac = NULL; 4282 struct ext4_allocation_context *ac = NULL;
4373 struct ext4_sb_info *sbi; 4283 struct ext4_sb_info *sbi;
4374 struct super_block *sb; 4284 struct super_block *sb;
4375 ext4_fsblk_t block = 0; 4285 ext4_fsblk_t block = 0;
4376 int freed; 4286 unsigned long inquota;
4377 int inquota; 4287 unsigned long reserv_blks = 0;
4378 4288
4379 sb = ar->inode->i_sb; 4289 sb = ar->inode->i_sb;
4380 sbi = EXT4_SB(sb); 4290 sbi = EXT4_SB(sb);
4381 4291
4382 if (!test_opt(sb, MBALLOC)) {
4383 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4384 &(ar->len), errp);
4385 return block;
4386 }
4387 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4292 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4388 /* 4293 /*
4389 * With delalloc we already reserved the blocks 4294 * With delalloc we already reserved the blocks
4390 */ 4295 */
4391 ar->len = ext4_has_free_blocks(sbi, ar->len); 4296 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4392 } 4297 /* let others to free the space */
4393 4298 yield();
4394 if (ar->len == 0) { 4299 ar->len = ar->len >> 1;
4395 *errp = -ENOSPC; 4300 }
4396 return 0; 4301 if (!ar->len) {
4302 *errp = -ENOSPC;
4303 return 0;
4304 }
4305 reserv_blks = ar->len;
4397 } 4306 }
4398
4399 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4307 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4400 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4308 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4401 ar->len--; 4309 ar->len--;
@@ -4441,7 +4349,7 @@ repeat:
4441 } 4349 }
4442 4350
4443 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4351 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4444 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4352 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4445 if (*errp == -EAGAIN) { 4353 if (*errp == -EAGAIN) {
4446 ac->ac_b_ex.fe_group = 0; 4354 ac->ac_b_ex.fe_group = 0;
4447 ac->ac_b_ex.fe_start = 0; 4355 ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a7..b3b4828f8b8 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
257 257
258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
259 259
260static struct proc_dir_entry *proc_root_ext4;
261struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 260struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
262 261
263static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 262static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46fc0b5b12b..f2a9cf498ec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -447,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
447 447
448} 448}
449 449
450int ext4_ext_migrate(struct inode *inode, struct file *filp, 450int ext4_ext_migrate(struct inode *inode)
451 unsigned int cmd, unsigned long arg)
452{ 451{
453 handle_t *handle; 452 handle_t *handle;
454 int retval = 0, i; 453 int retval = 0, i;
@@ -516,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
516 * when we add extents we extent the journal 515 * when we add extents we extent the journal
517 */ 516 */
518 /* 517 /*
519 * inode_mutex prevent write and truncate on the file. Read still goes
520 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
521 * switch the inode format to prevent read.
522 */
523 mutex_lock(&(inode->i_mutex));
524 /*
525 * Even though we take i_mutex we can still cause block allocation 518 * Even though we take i_mutex we can still cause block allocation
526 * via mmap write to holes. If we have allocated new blocks we fail 519 * via mmap write to holes. If we have allocated new blocks we fail
527 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 520 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -623,7 +616,6 @@ err_out:
623 tmp_inode->i_nlink = 0; 616 tmp_inode->i_nlink = 0;
624 617
625 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
626 mutex_unlock(&(inode->i_mutex));
627 619
628 if (tmp_inode) 620 if (tmp_inode)
629 iput(tmp_inode); 621 iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c..92db9e94514 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
151 151
152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
154static inline unsigned dx_get_hash (struct dx_entry *entry); 154static inline unsigned dx_get_hash(struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value); 155static void dx_set_hash(struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries); 156static unsigned dx_get_count(struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries); 157static unsigned dx_get_limit(struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value); 158static void dx_set_count(struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value); 159static void dx_set_limit(struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize); 160static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir); 161static unsigned dx_node_limit(struct inode *dir);
162static struct dx_frame *dx_probe(struct dentry *dentry, 162static struct dx_frame *dx_probe(const struct qstr *d_name,
163 struct inode *dir, 163 struct inode *dir,
164 struct dx_hash_info *hinfo, 164 struct dx_hash_info *hinfo,
165 struct dx_frame *frame, 165 struct dx_frame *frame,
166 int *err); 166 int *err);
167static void dx_release (struct dx_frame *frames); 167static void dx_release(struct dx_frame *frames);
168static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 168static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
169 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 169 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count); 170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 171static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
172 struct dx_map_entry *offsets, int count); 172 struct dx_map_entry *offsets, int count);
173static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 173static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
174static void dx_insert_block(struct dx_frame *frame, 174static void dx_insert_block(struct dx_frame *frame,
175 u32 hash, ext4_lblk_t block); 175 u32 hash, ext4_lblk_t block);
176static int ext4_htree_next_block(struct inode *dir, __u32 hash, 176static int ext4_htree_next_block(struct inode *dir, __u32 hash,
177 struct dx_frame *frame, 177 struct dx_frame *frame,
178 struct dx_frame *frames, 178 struct dx_frame *frames,
179 __u32 *start_hash); 179 __u32 *start_hash);
180static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 180static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
181 struct ext4_dir_entry_2 **res_dir, int *err); 181 const struct qstr *d_name,
182 struct ext4_dir_entry_2 **res_dir,
183 int *err);
182static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 184static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 185 struct inode *inode);
184 186
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
207 entry->block = cpu_to_le32(value); 209 entry->block = cpu_to_le32(value);
208} 210}
209 211
210static inline unsigned dx_get_hash (struct dx_entry *entry) 212static inline unsigned dx_get_hash(struct dx_entry *entry)
211{ 213{
212 return le32_to_cpu(entry->hash); 214 return le32_to_cpu(entry->hash);
213} 215}
214 216
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value) 217static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
216{ 218{
217 entry->hash = cpu_to_le32(value); 219 entry->hash = cpu_to_le32(value);
218} 220}
219 221
220static inline unsigned dx_get_count (struct dx_entry *entries) 222static inline unsigned dx_get_count(struct dx_entry *entries)
221{ 223{
222 return le16_to_cpu(((struct dx_countlimit *) entries)->count); 224 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223} 225}
224 226
225static inline unsigned dx_get_limit (struct dx_entry *entries) 227static inline unsigned dx_get_limit(struct dx_entry *entries)
226{ 228{
227 return le16_to_cpu(((struct dx_countlimit *) entries)->limit); 229 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228} 230}
229 231
230static inline void dx_set_count (struct dx_entry *entries, unsigned value) 232static inline void dx_set_count(struct dx_entry *entries, unsigned value)
231{ 233{
232 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); 234 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233} 235}
234 236
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value) 237static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
236{ 238{
237 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); 239 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238} 240}
239 241
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 242static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
241{ 243{
242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 244 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
243 EXT4_DIR_REC_LEN(2) - infosize; 245 EXT4_DIR_REC_LEN(2) - infosize;
244 return entry_space / sizeof(struct dx_entry); 246 return entry_space / sizeof(struct dx_entry);
245} 247}
246 248
247static inline unsigned dx_node_limit (struct inode *dir) 249static inline unsigned dx_node_limit(struct inode *dir)
248{ 250{
249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 251 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
250 return entry_space / sizeof(struct dx_entry); 252 return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
254 * Debug 256 * Debug
255 */ 257 */
256#ifdef DX_DEBUG 258#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries) 259static void dx_show_index(char * label, struct dx_entry *entries)
258{ 260{
259 int i, n = dx_get_count (entries); 261 int i, n = dx_get_count (entries);
260 printk("%s index ", label); 262 printk(KERN_DEBUG "%s index ", label);
261 for (i = 0; i < n; i++) { 263 for (i = 0; i < n; i++) {
262 printk("%x->%lu ", i? dx_get_hash(entries + i) : 264 printk("%x->%lu ", i ? dx_get_hash(entries + i) :
263 0, (unsigned long)dx_get_block(entries + i)); 265 0, (unsigned long)dx_get_block(entries + i));
264 } 266 }
265 printk("\n"); 267 printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306 struct dx_entry *entries, int levels) 308 struct dx_entry *entries, int levels)
307{ 309{
308 unsigned blocksize = dir->i_sb->s_blocksize; 310 unsigned blocksize = dir->i_sb->s_blocksize;
309 unsigned count = dx_get_count (entries), names = 0, space = 0, i; 311 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
310 unsigned bcount = 0; 312 unsigned bcount = 0;
311 struct buffer_head *bh; 313 struct buffer_head *bh;
312 int err; 314 int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
325 names += stats.names; 327 names += stats.names;
326 space += stats.space; 328 space += stats.space;
327 bcount += stats.bcount; 329 bcount += stats.bcount;
328 brelse (bh); 330 brelse(bh);
329 } 331 }
330 if (bcount) 332 if (bcount)
331 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", 333 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
332 names, space/bcount,(space/bcount)*100/blocksize); 334 levels ? "" : " ", names, space/bcount,
335 (space/bcount)*100/blocksize);
333 return (struct stats) { names, space, bcount}; 336 return (struct stats) { names, space, bcount};
334} 337}
335#endif /* DX_DEBUG */ 338#endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
344 * back to userspace. 347 * back to userspace.
345 */ 348 */
346static struct dx_frame * 349static struct dx_frame *
347dx_probe(struct dentry *dentry, struct inode *dir, 350dx_probe(const struct qstr *d_name, struct inode *dir,
348 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 351 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
349{ 352{
350 unsigned count, indirect; 353 unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
355 u32 hash; 358 u32 hash;
356 359
357 frame->bh = NULL; 360 frame->bh = NULL;
358 if (dentry)
359 dir = dentry->d_parent->d_inode;
360 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 361 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
361 goto fail; 362 goto fail;
362 root = (struct dx_root *) bh->b_data; 363 root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
372 } 373 }
373 hinfo->hash_version = root->info.hash_version; 374 hinfo->hash_version = root->info.hash_version;
374 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
375 if (dentry) 376 if (d_name)
376 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); 377 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
377 hash = hinfo->hash; 378 hash = hinfo->hash;
378 379
379 if (root->info.unused_flags & 1) { 380 if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
406 goto fail; 407 goto fail;
407 } 408 }
408 409
409 dxtrace (printk("Look up %x", hash)); 410 dxtrace(printk("Look up %x", hash));
410 while (1) 411 while (1)
411 { 412 {
412 count = dx_get_count(entries); 413 count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
555 0, &err))) 556 0, &err)))
556 return err; /* Failure */ 557 return err; /* Failure */
557 p++; 558 p++;
558 brelse (p->bh); 559 brelse(p->bh);
559 p->bh = bh; 560 p->bh = bh;
560 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; 561 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
561 } 562 }
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
593 /* On error, skip the f_pos to the next block. */ 594 /* On error, skip the f_pos to the next block. */
594 dir_file->f_pos = (dir_file->f_pos | 595 dir_file->f_pos = (dir_file->f_pos |
595 (dir->i_sb->s_blocksize - 1)) + 1; 596 (dir->i_sb->s_blocksize - 1)) + 1;
596 brelse (bh); 597 brelse(bh);
597 return count; 598 return count;
598 } 599 }
599 ext4fs_dirhash(de->name, de->name_len, hinfo); 600 ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
635 int ret, err; 636 int ret, err;
636 __u32 hashval; 637 __u32 hashval;
637 638
638 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 639 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
639 start_minor_hash)); 640 start_hash, start_minor_hash));
640 dir = dir_file->f_path.dentry->d_inode; 641 dir = dir_file->f_path.dentry->d_inode;
641 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
642 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
648 } 649 }
649 hinfo.hash = start_hash; 650 hinfo.hash = start_hash;
650 hinfo.minor_hash = 0; 651 hinfo.minor_hash = 0;
651 frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); 652 frame = dx_probe(NULL, dir, &hinfo, frames, &err);
652 if (!frame) 653 if (!frame)
653 return err; 654 return err;
654 655
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
694 break; 695 break;
695 } 696 }
696 dx_release(frames); 697 dx_release(frames);
697 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 698 dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
698 count, *next_hash)); 699 "next hash: %x\n", count, *next_hash));
699 return count; 700 return count;
700errout: 701errout:
701 dx_release(frames); 702 dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
802/* 803/*
803 * Returns 0 if not found, -1 on failure, and 1 on success 804 * Returns 0 if not found, -1 on failure, and 1 on success
804 */ 805 */
805static inline int search_dirblock(struct buffer_head * bh, 806static inline int search_dirblock(struct buffer_head *bh,
806 struct inode *dir, 807 struct inode *dir,
807 struct dentry *dentry, 808 const struct qstr *d_name,
808 unsigned long offset, 809 unsigned long offset,
809 struct ext4_dir_entry_2 ** res_dir) 810 struct ext4_dir_entry_2 ** res_dir)
810{ 811{
811 struct ext4_dir_entry_2 * de; 812 struct ext4_dir_entry_2 * de;
812 char * dlimit; 813 char * dlimit;
813 int de_len; 814 int de_len;
814 const char *name = dentry->d_name.name; 815 const char *name = d_name->name;
815 int namelen = dentry->d_name.len; 816 int namelen = d_name->len;
816 817
817 de = (struct ext4_dir_entry_2 *) bh->b_data; 818 de = (struct ext4_dir_entry_2 *) bh->b_data;
818 dlimit = bh->b_data + dir->i_sb->s_blocksize; 819 dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
851 * The returned buffer_head has ->b_count elevated. The caller is expected 852 * The returned buffer_head has ->b_count elevated. The caller is expected
852 * to brelse() it when appropriate. 853 * to brelse() it when appropriate.
853 */ 854 */
854static struct buffer_head * ext4_find_entry (struct dentry *dentry, 855static struct buffer_head * ext4_find_entry (struct inode *dir,
856 const struct qstr *d_name,
855 struct ext4_dir_entry_2 ** res_dir) 857 struct ext4_dir_entry_2 ** res_dir)
856{ 858{
857 struct super_block * sb; 859 struct super_block *sb;
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 860 struct buffer_head *bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 861 struct buffer_head *bh, *ret = NULL;
860 ext4_lblk_t start, block, b; 862 ext4_lblk_t start, block, b;
861 int ra_max = 0; /* Number of bh's in the readahead 863 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 864 buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
865 int num = 0; 867 int num = 0;
866 ext4_lblk_t nblocks; 868 ext4_lblk_t nblocks;
867 int i, err; 869 int i, err;
868 struct inode *dir = dentry->d_parent->d_inode;
869 int namelen; 870 int namelen;
870 871
871 *res_dir = NULL; 872 *res_dir = NULL;
872 sb = dir->i_sb; 873 sb = dir->i_sb;
873 namelen = dentry->d_name.len; 874 namelen = d_name->len;
874 if (namelen > EXT4_NAME_LEN) 875 if (namelen > EXT4_NAME_LEN)
875 return NULL; 876 return NULL;
876 if (is_dx(dir)) { 877 if (is_dx(dir)) {
877 bh = ext4_dx_find_entry(dentry, res_dir, &err); 878 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
878 /* 879 /*
879 * On success, or if the error was file not found, 880 * On success, or if the error was file not found,
880 * return. Otherwise, fall back to doing a search the 881 * return. Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
882 */ 883 */
883 if (bh || (err != ERR_BAD_DX_DIR)) 884 if (bh || (err != ERR_BAD_DX_DIR))
884 return bh; 885 return bh;
885 dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); 886 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
887 "falling back\n"));
886 } 888 }
887 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 889 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
888 start = EXT4_I(dir)->i_dir_start_lookup; 890 start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
926 brelse(bh); 928 brelse(bh);
927 goto next; 929 goto next;
928 } 930 }
929 i = search_dirblock(bh, dir, dentry, 931 i = search_dirblock(bh, dir, d_name,
930 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 932 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
931 if (i == 1) { 933 if (i == 1) {
932 EXT4_I(dir)->i_dir_start_lookup = block; 934 EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
956cleanup_and_exit: 958cleanup_and_exit:
957 /* Clean up the read-ahead blocks */ 959 /* Clean up the read-ahead blocks */
958 for (; ra_ptr < ra_max; ra_ptr++) 960 for (; ra_ptr < ra_max; ra_ptr++)
959 brelse (bh_use[ra_ptr]); 961 brelse(bh_use[ra_ptr]);
960 return ret; 962 return ret;
961} 963}
962 964
963static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 965static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
964 struct ext4_dir_entry_2 **res_dir, int *err) 966 struct ext4_dir_entry_2 **res_dir, int *err)
965{ 967{
966 struct super_block * sb; 968 struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
971 struct buffer_head *bh; 973 struct buffer_head *bh;
972 ext4_lblk_t block; 974 ext4_lblk_t block;
973 int retval; 975 int retval;
974 int namelen = dentry->d_name.len; 976 int namelen = d_name->len;
975 const u8 *name = dentry->d_name.name; 977 const u8 *name = d_name->name;
976 struct inode *dir = dentry->d_parent->d_inode;
977 978
978 sb = dir->i_sb; 979 sb = dir->i_sb;
979 /* NFS may look up ".." - look at dx_root directory block */ 980 /* NFS may look up ".." - look at dx_root directory block */
980 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ 981 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
981 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
982 return NULL; 983 return NULL;
983 } else { 984 } else {
984 frame = frames; 985 frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1010 return bh; 1011 return bh;
1011 } 1012 }
1012 } 1013 }
1013 brelse (bh); 1014 brelse(bh);
1014 /* Check to see if we should continue to search */ 1015 /* Check to see if we should continue to search */
1015 retval = ext4_htree_next_block(dir, hash, frame, 1016 retval = ext4_htree_next_block(dir, hash, frame,
1016 frames, NULL); 1017 frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1025 1026
1026 *err = -ENOENT; 1027 *err = -ENOENT;
1027errout: 1028errout:
1028 dxtrace(printk("%s not found\n", name)); 1029 dxtrace(printk(KERN_DEBUG "%s not found\n", name));
1029 dx_release (frames); 1030 dx_release (frames);
1030 return NULL; 1031 return NULL;
1031} 1032}
1032 1033
1033static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) 1034static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034{ 1035{
1035 struct inode * inode; 1036 struct inode *inode;
1036 struct ext4_dir_entry_2 * de; 1037 struct ext4_dir_entry_2 *de;
1037 struct buffer_head * bh; 1038 struct buffer_head *bh;
1038 1039
1039 if (dentry->d_name.len > EXT4_NAME_LEN) 1040 if (dentry->d_name.len > EXT4_NAME_LEN)
1040 return ERR_PTR(-ENAMETOOLONG); 1041 return ERR_PTR(-ENAMETOOLONG);
1041 1042
1042 bh = ext4_find_entry(dentry, &de); 1043 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1043 inode = NULL; 1044 inode = NULL;
1044 if (bh) { 1045 if (bh) {
1045 unsigned long ino = le32_to_cpu(de->inode); 1046 unsigned long ino = le32_to_cpu(de->inode);
1046 brelse (bh); 1047 brelse(bh);
1047 if (!ext4_valid_inum(dir->i_sb, ino)) { 1048 if (!ext4_valid_inum(dir->i_sb, ino)) {
1048 ext4_error(dir->i_sb, "ext4_lookup", 1049 ext4_error(dir->i_sb, "ext4_lookup",
1049 "bad inode number: %lu", ino); 1050 "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1062 unsigned long ino; 1063 unsigned long ino;
1063 struct dentry *parent; 1064 struct dentry *parent;
1064 struct inode *inode; 1065 struct inode *inode;
1065 struct dentry dotdot; 1066 static const struct qstr dotdot = {
1067 .name = "..",
1068 .len = 2,
1069 };
1066 struct ext4_dir_entry_2 * de; 1070 struct ext4_dir_entry_2 * de;
1067 struct buffer_head *bh; 1071 struct buffer_head *bh;
1068 1072
1069 dotdot.d_name.name = ".."; 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1070 dotdot.d_name.len = 2;
1071 dotdot.d_parent = child; /* confusing, isn't it! */
1072
1073 bh = ext4_find_entry(&dotdot, &de);
1074 inode = NULL; 1074 inode = NULL;
1075 if (!bh) 1075 if (!bh)
1076 return ERR_PTR(-ENOENT); 1076 return ERR_PTR(-ENOENT);
@@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1201 1201
1202 /* create map in the end of data2 block */ 1202 /* create map in the end of data2 block */
1203 map = (struct dx_map_entry *) (data2 + blocksize); 1203 map = (struct dx_map_entry *) (data2 + blocksize);
1204 count = dx_make_map ((struct ext4_dir_entry_2 *) data1, 1204 count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1205 blocksize, hinfo, map); 1205 blocksize, hinfo, map);
1206 map -= count; 1206 map -= count;
1207 dx_sort_map (map, count); 1207 dx_sort_map(map, count);
1208 /* Split the existing block in the middle, size-wise */ 1208 /* Split the existing block in the middle, size-wise */
1209 size = 0; 1209 size = 0;
1210 move = 0; 1210 move = 0;
@@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1225 1225
1226 /* Fancy dance to stay within two buffers */ 1226 /* Fancy dance to stay within two buffers */
1227 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1227 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1228 de = dx_pack_dirents(data1,blocksize); 1228 de = dx_pack_dirents(data1, blocksize);
1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1237 swap(*bh, bh2); 1237 swap(*bh, bh2);
1238 de = de2; 1238 de = de2;
1239 } 1239 }
1240 dx_insert_block (frame, hash2 + continued, newblock); 1240 dx_insert_block(frame, hash2 + continued, newblock);
1241 err = ext4_journal_dirty_metadata (handle, bh2); 1241 err = ext4_journal_dirty_metadata(handle, bh2);
1242 if (err) 1242 if (err)
1243 goto journal_error; 1243 goto journal_error;
1244 err = ext4_journal_dirty_metadata (handle, frame->bh); 1244 err = ext4_journal_dirty_metadata(handle, frame->bh);
1245 if (err) 1245 if (err)
1246 goto journal_error; 1246 goto journal_error;
1247 brelse (bh2); 1247 brelse(bh2);
1248 dxtrace(dx_show_index ("frame", frame->entries)); 1248 dxtrace(dx_show_index("frame", frame->entries));
1249 return de; 1249 return de;
1250 1250
1251journal_error: 1251journal_error:
@@ -1271,7 +1271,7 @@ errout:
1271 */ 1271 */
1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1273 struct inode *inode, struct ext4_dir_entry_2 *de, 1273 struct inode *inode, struct ext4_dir_entry_2 *de,
1274 struct buffer_head * bh) 1274 struct buffer_head *bh)
1275{ 1275{
1276 struct inode *dir = dentry->d_parent->d_inode; 1276 struct inode *dir = dentry->d_parent->d_inode;
1277 const char *name = dentry->d_name.name; 1277 const char *name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1288 while ((char *) de <= top) { 1288 while ((char *) de <= top) {
1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1290 bh, offset)) { 1290 bh, offset)) {
1291 brelse (bh); 1291 brelse(bh);
1292 return -EIO; 1292 return -EIO;
1293 } 1293 }
1294 if (ext4_match (namelen, name, de)) { 1294 if (ext4_match(namelen, name, de)) {
1295 brelse (bh); 1295 brelse(bh);
1296 return -EEXIST; 1296 return -EEXIST;
1297 } 1297 }
1298 nlen = EXT4_DIR_REC_LEN(de->name_len); 1298 nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1329 } else 1329 } else
1330 de->inode = 0; 1330 de->inode = 0;
1331 de->name_len = namelen; 1331 de->name_len = namelen;
1332 memcpy (de->name, name, namelen); 1332 memcpy(de->name, name, namelen);
1333 /* 1333 /*
1334 * XXX shouldn't update any times until successful 1334 * XXX shouldn't update any times until successful
1335 * completion of syscall, but too many callers depend 1335 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 struct fake_dirent *fde; 1377 struct fake_dirent *fde;
1378 1378
1379 blocksize = dir->i_sb->s_blocksize; 1379 blocksize = dir->i_sb->s_blocksize;
1380 dxtrace(printk("Creating index\n")); 1380 dxtrace(printk(KERN_DEBUG "Creating index\n"));
1381 retval = ext4_journal_get_write_access(handle, bh); 1381 retval = ext4_journal_get_write_access(handle, bh);
1382 if (retval) { 1382 if (retval) {
1383 ext4_std_error(dir->i_sb, retval); 1383 ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1386 } 1386 }
1387 root = (struct dx_root *) bh->b_data; 1387 root = (struct dx_root *) bh->b_data;
1388 1388
1389 bh2 = ext4_append (handle, dir, &block, &retval); 1389 bh2 = ext4_append(handle, dir, &block, &retval);
1390 if (!(bh2)) { 1390 if (!(bh2)) {
1391 brelse(bh); 1391 brelse(bh);
1392 return retval; 1392 return retval;
@@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1412 root->info.info_length = sizeof(root->info); 1412 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1414 entries = root->entries; 1414 entries = root->entries;
1415 dx_set_block (entries, 1); 1415 dx_set_block(entries, 1);
1416 dx_set_count (entries, 1); 1416 dx_set_count(entries, 1);
1417 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); 1417 dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1418 1418
1419 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1420 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1443 * may not sleep between calling this and putting something into 1443 * may not sleep between calling this and putting something into
1444 * the entry, as someone else might have used it while you slept. 1444 * the entry, as someone else might have used it while you slept.
1445 */ 1445 */
1446static int ext4_add_entry (handle_t *handle, struct dentry *dentry, 1446static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset; 1450 unsigned long offset;
1451 struct buffer_head * bh; 1451 struct buffer_head *bh;
1452 struct ext4_dir_entry_2 *de; 1452 struct ext4_dir_entry_2 *de;
1453 struct super_block * sb; 1453 struct super_block *sb;
1454 int retval; 1454 int retval;
1455 int dx_fallback=0; 1455 int dx_fallback=0;
1456 unsigned blocksize; 1456 unsigned blocksize;
@@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1500 struct dx_frame frames[2], *frame; 1500 struct dx_frame frames[2], *frame;
1501 struct dx_entry *entries, *at; 1501 struct dx_entry *entries, *at;
1502 struct dx_hash_info hinfo; 1502 struct dx_hash_info hinfo;
1503 struct buffer_head * bh; 1503 struct buffer_head *bh;
1504 struct inode *dir = dentry->d_parent->d_inode; 1504 struct inode *dir = dentry->d_parent->d_inode;
1505 struct super_block * sb = dir->i_sb; 1505 struct super_block *sb = dir->i_sb;
1506 struct ext4_dir_entry_2 *de; 1506 struct ext4_dir_entry_2 *de;
1507 int err; 1507 int err;
1508 1508
1509 frame = dx_probe(dentry, NULL, &hinfo, frames, &err); 1509 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1510 if (!frame) 1510 if (!frame)
1511 return err; 1511 return err;
1512 entries = frame->entries; 1512 entries = frame->entries;
@@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1527 } 1527 }
1528 1528
1529 /* Block full, should compress but for now just split */ 1529 /* Block full, should compress but for now just split */
1530 dxtrace(printk("using %u of %u node entries\n", 1530 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1531 dx_get_count(entries), dx_get_limit(entries))); 1531 dx_get_count(entries), dx_get_limit(entries)));
1532 /* Need to split index? */ 1532 /* Need to split index? */
1533 if (dx_get_count(entries) == dx_get_limit(entries)) { 1533 if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1559 if (levels) { 1559 if (levels) {
1560 unsigned icount1 = icount/2, icount2 = icount - icount1; 1560 unsigned icount1 = icount/2, icount2 = icount - icount1;
1561 unsigned hash2 = dx_get_hash(entries + icount1); 1561 unsigned hash2 = dx_get_hash(entries + icount1);
1562 dxtrace(printk("Split index %i/%i\n", icount1, icount2)); 1562 dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1563 icount1, icount2));
1563 1564
1564 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ 1565 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1565 err = ext4_journal_get_write_access(handle, 1566 err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1567 if (err) 1568 if (err)
1568 goto journal_error; 1569 goto journal_error;
1569 1570
1570 memcpy ((char *) entries2, (char *) (entries + icount1), 1571 memcpy((char *) entries2, (char *) (entries + icount1),
1571 icount2 * sizeof(struct dx_entry)); 1572 icount2 * sizeof(struct dx_entry));
1572 dx_set_count (entries, icount1); 1573 dx_set_count(entries, icount1);
1573 dx_set_count (entries2, icount2); 1574 dx_set_count(entries2, icount2);
1574 dx_set_limit (entries2, dx_node_limit(dir)); 1575 dx_set_limit(entries2, dx_node_limit(dir));
1575 1576
1576 /* Which index block gets the new entry? */ 1577 /* Which index block gets the new entry? */
1577 if (at - entries >= icount1) { 1578 if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1579 frame->entries = entries = entries2; 1580 frame->entries = entries = entries2;
1580 swap(frame->bh, bh2); 1581 swap(frame->bh, bh2);
1581 } 1582 }
1582 dx_insert_block (frames + 0, hash2, newblock); 1583 dx_insert_block(frames + 0, hash2, newblock);
1583 dxtrace(dx_show_index ("node", frames[1].entries)); 1584 dxtrace(dx_show_index("node", frames[1].entries));
1584 dxtrace(dx_show_index ("node", 1585 dxtrace(dx_show_index("node",
1585 ((struct dx_node *) bh2->b_data)->entries)); 1586 ((struct dx_node *) bh2->b_data)->entries));
1586 err = ext4_journal_dirty_metadata(handle, bh2); 1587 err = ext4_journal_dirty_metadata(handle, bh2);
1587 if (err) 1588 if (err)
1588 goto journal_error; 1589 goto journal_error;
1589 brelse (bh2); 1590 brelse (bh2);
1590 } else { 1591 } else {
1591 dxtrace(printk("Creating second level index...\n")); 1592 dxtrace(printk(KERN_DEBUG
1593 "Creating second level index...\n"));
1592 memcpy((char *) entries2, (char *) entries, 1594 memcpy((char *) entries2, (char *) entries,
1593 icount * sizeof(struct dx_entry)); 1595 icount * sizeof(struct dx_entry));
1594 dx_set_limit(entries2, dx_node_limit(dir)); 1596 dx_set_limit(entries2, dx_node_limit(dir));
@@ -1630,12 +1632,12 @@ cleanup:
1630 * ext4_delete_entry deletes a directory entry by merging it with the 1632 * ext4_delete_entry deletes a directory entry by merging it with the
1631 * previous entry 1633 * previous entry
1632 */ 1634 */
1633static int ext4_delete_entry (handle_t *handle, 1635static int ext4_delete_entry(handle_t *handle,
1634 struct inode * dir, 1636 struct inode *dir,
1635 struct ext4_dir_entry_2 * de_del, 1637 struct ext4_dir_entry_2 *de_del,
1636 struct buffer_head * bh) 1638 struct buffer_head *bh)
1637{ 1639{
1638 struct ext4_dir_entry_2 * de, * pde; 1640 struct ext4_dir_entry_2 *de, *pde;
1639 int i; 1641 int i;
1640 1642
1641 i = 0; 1643 i = 0;
@@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
1716 * If the create succeeds, we fill in the inode information 1718 * If the create succeeds, we fill in the inode information
1717 * with d_instantiate(). 1719 * with d_instantiate().
1718 */ 1720 */
1719static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, 1721static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1720 struct nameidata *nd) 1722 struct nameidata *nd)
1721{ 1723{
1722 handle_t *handle; 1724 handle_t *handle;
1723 struct inode * inode; 1725 struct inode *inode;
1724 int err, retries = 0; 1726 int err, retries = 0;
1725 1727
1726retry: 1728retry:
@@ -1747,8 +1749,8 @@ retry:
1747 return err; 1749 return err;
1748} 1750}
1749 1751
1750static int ext4_mknod (struct inode * dir, struct dentry *dentry, 1752static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1751 int mode, dev_t rdev) 1753 int mode, dev_t rdev)
1752{ 1754{
1753 handle_t *handle; 1755 handle_t *handle;
1754 struct inode *inode; 1756 struct inode *inode;
@@ -1767,11 +1769,11 @@ retry:
1767 if (IS_DIRSYNC(dir)) 1769 if (IS_DIRSYNC(dir))
1768 handle->h_sync = 1; 1770 handle->h_sync = 1;
1769 1771
1770 inode = ext4_new_inode (handle, dir, mode); 1772 inode = ext4_new_inode(handle, dir, mode);
1771 err = PTR_ERR(inode); 1773 err = PTR_ERR(inode);
1772 if (!IS_ERR(inode)) { 1774 if (!IS_ERR(inode)) {
1773 init_special_inode(inode, inode->i_mode, rdev); 1775 init_special_inode(inode, inode->i_mode, rdev);
1774#ifdef CONFIG_EXT4DEV_FS_XATTR 1776#ifdef CONFIG_EXT4_FS_XATTR
1775 inode->i_op = &ext4_special_inode_operations; 1777 inode->i_op = &ext4_special_inode_operations;
1776#endif 1778#endif
1777 err = ext4_add_nondir(handle, dentry, inode); 1779 err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@ retry:
1782 return err; 1784 return err;
1783} 1785}
1784 1786
1785static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) 1787static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1786{ 1788{
1787 handle_t *handle; 1789 handle_t *handle;
1788 struct inode * inode; 1790 struct inode *inode;
1789 struct buffer_head * dir_block; 1791 struct buffer_head *dir_block;
1790 struct ext4_dir_entry_2 * de; 1792 struct ext4_dir_entry_2 *de;
1791 int err, retries = 0; 1793 int err, retries = 0;
1792 1794
1793 if (EXT4_DIR_LINK_MAX(dir)) 1795 if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@ retry:
1803 if (IS_DIRSYNC(dir)) 1805 if (IS_DIRSYNC(dir))
1804 handle->h_sync = 1; 1806 handle->h_sync = 1;
1805 1807
1806 inode = ext4_new_inode (handle, dir, S_IFDIR | mode); 1808 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1807 err = PTR_ERR(inode); 1809 err = PTR_ERR(inode);
1808 if (IS_ERR(inode)) 1810 if (IS_ERR(inode))
1809 goto out_stop; 1811 goto out_stop;
@@ -1811,7 +1813,7 @@ retry:
1811 inode->i_op = &ext4_dir_inode_operations; 1813 inode->i_op = &ext4_dir_inode_operations;
1812 inode->i_fop = &ext4_dir_operations; 1814 inode->i_fop = &ext4_dir_operations;
1813 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1815 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1814 dir_block = ext4_bread (handle, inode, 0, 1, &err); 1816 dir_block = ext4_bread(handle, inode, 0, 1, &err);
1815 if (!dir_block) 1817 if (!dir_block)
1816 goto out_clear_inode; 1818 goto out_clear_inode;
1817 BUFFER_TRACE(dir_block, "get_write_access"); 1819 BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@ retry:
1820 de->inode = cpu_to_le32(inode->i_ino); 1822 de->inode = cpu_to_le32(inode->i_ino);
1821 de->name_len = 1; 1823 de->name_len = 1;
1822 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1824 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
1823 strcpy (de->name, "."); 1825 strcpy(de->name, ".");
1824 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1826 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1825 de = ext4_next_entry(de); 1827 de = ext4_next_entry(de);
1826 de->inode = cpu_to_le32(dir->i_ino); 1828 de->inode = cpu_to_le32(dir->i_ino);
1827 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1829 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
1828 EXT4_DIR_REC_LEN(1)); 1830 EXT4_DIR_REC_LEN(1));
1829 de->name_len = 2; 1831 de->name_len = 2;
1830 strcpy (de->name, ".."); 1832 strcpy(de->name, "..");
1831 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1833 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1832 inode->i_nlink = 2; 1834 inode->i_nlink = 2;
1833 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1835 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1834 ext4_journal_dirty_metadata(handle, dir_block); 1836 ext4_journal_dirty_metadata(handle, dir_block);
1835 brelse (dir_block); 1837 brelse(dir_block);
1836 ext4_mark_inode_dirty(handle, inode); 1838 ext4_mark_inode_dirty(handle, inode);
1837 err = ext4_add_entry (handle, dentry, inode); 1839 err = ext4_add_entry(handle, dentry, inode);
1838 if (err) { 1840 if (err) {
1839out_clear_inode: 1841out_clear_inode:
1840 clear_nlink(inode); 1842 clear_nlink(inode);
1841 ext4_mark_inode_dirty(handle, inode); 1843 ext4_mark_inode_dirty(handle, inode);
1842 iput (inode); 1844 iput(inode);
1843 goto out_stop; 1845 goto out_stop;
1844 } 1846 }
1845 ext4_inc_count(handle, dir); 1847 ext4_inc_count(handle, dir);
@@ -1856,17 +1858,17 @@ out_stop:
1856/* 1858/*
1857 * routine to check that the specified directory is empty (for rmdir) 1859 * routine to check that the specified directory is empty (for rmdir)
1858 */ 1860 */
1859static int empty_dir (struct inode * inode) 1861static int empty_dir(struct inode *inode)
1860{ 1862{
1861 unsigned long offset; 1863 unsigned long offset;
1862 struct buffer_head * bh; 1864 struct buffer_head *bh;
1863 struct ext4_dir_entry_2 * de, * de1; 1865 struct ext4_dir_entry_2 *de, *de1;
1864 struct super_block * sb; 1866 struct super_block *sb;
1865 int err = 0; 1867 int err = 0;
1866 1868
1867 sb = inode->i_sb; 1869 sb = inode->i_sb;
1868 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1870 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1869 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { 1871 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1870 if (err) 1872 if (err)
1871 ext4_error(inode->i_sb, __func__, 1873 ext4_error(inode->i_sb, __func__,
1872 "error %d reading directory #%lu offset 0", 1874 "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode)
1881 de1 = ext4_next_entry(de); 1883 de1 = ext4_next_entry(de);
1882 if (le32_to_cpu(de->inode) != inode->i_ino || 1884 if (le32_to_cpu(de->inode) != inode->i_ino ||
1883 !le32_to_cpu(de1->inode) || 1885 !le32_to_cpu(de1->inode) ||
1884 strcmp (".", de->name) || 1886 strcmp(".", de->name) ||
1885 strcmp ("..", de1->name)) { 1887 strcmp("..", de1->name)) {
1886 ext4_warning (inode->i_sb, "empty_dir", 1888 ext4_warning(inode->i_sb, "empty_dir",
1887 "bad directory (dir #%lu) - no `.' or `..'", 1889 "bad directory (dir #%lu) - no `.' or `..'",
1888 inode->i_ino); 1890 inode->i_ino);
1889 brelse (bh); 1891 brelse(bh);
1890 return 1; 1892 return 1;
1891 } 1893 }
1892 offset = ext4_rec_len_from_disk(de->rec_len) + 1894 offset = ext4_rec_len_from_disk(de->rec_len) +
1893 ext4_rec_len_from_disk(de1->rec_len); 1895 ext4_rec_len_from_disk(de1->rec_len);
1894 de = ext4_next_entry(de1); 1896 de = ext4_next_entry(de1);
1895 while (offset < inode->i_size ) { 1897 while (offset < inode->i_size) {
1896 if (!bh || 1898 if (!bh ||
1897 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1899 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1898 err = 0; 1900 err = 0;
1899 brelse (bh); 1901 brelse(bh);
1900 bh = ext4_bread (NULL, inode, 1902 bh = ext4_bread(NULL, inode,
1901 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1903 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1902 if (!bh) { 1904 if (!bh) {
1903 if (err) 1905 if (err)
@@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode)
1917 continue; 1919 continue;
1918 } 1920 }
1919 if (le32_to_cpu(de->inode)) { 1921 if (le32_to_cpu(de->inode)) {
1920 brelse (bh); 1922 brelse(bh);
1921 return 0; 1923 return 0;
1922 } 1924 }
1923 offset += ext4_rec_len_from_disk(de->rec_len); 1925 offset += ext4_rec_len_from_disk(de->rec_len);
1924 de = ext4_next_entry(de); 1926 de = ext4_next_entry(de);
1925 } 1927 }
1926 brelse (bh); 1928 brelse(bh);
1927 return 1; 1929 return 1;
1928} 1930}
1929 1931
@@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1954 * ->i_nlink. For, say it, character device. Not a regular file, 1956 * ->i_nlink. For, say it, character device. Not a regular file,
1955 * not a directory, not a symlink and ->i_nlink > 0. 1957 * not a directory, not a symlink and ->i_nlink > 0.
1956 */ 1958 */
1957 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1959 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1958 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1960 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1959 1961
1960 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); 1962 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1961 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 1963 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@ out_brelse:
2069 goto out_err; 2071 goto out_err;
2070} 2072}
2071 2073
2072static int ext4_rmdir (struct inode * dir, struct dentry *dentry) 2074static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2073{ 2075{
2074 int retval; 2076 int retval;
2075 struct inode * inode; 2077 struct inode *inode;
2076 struct buffer_head * bh; 2078 struct buffer_head *bh;
2077 struct ext4_dir_entry_2 * de; 2079 struct ext4_dir_entry_2 *de;
2078 handle_t *handle; 2080 handle_t *handle;
2079 2081
2080 /* Initialize quotas before so that eventual writes go in 2082 /* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2085 return PTR_ERR(handle); 2087 return PTR_ERR(handle);
2086 2088
2087 retval = -ENOENT; 2089 retval = -ENOENT;
2088 bh = ext4_find_entry (dentry, &de); 2090 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2089 if (!bh) 2091 if (!bh)
2090 goto end_rmdir; 2092 goto end_rmdir;
2091 2093
@@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2099 goto end_rmdir; 2101 goto end_rmdir;
2100 2102
2101 retval = -ENOTEMPTY; 2103 retval = -ENOTEMPTY;
2102 if (!empty_dir (inode)) 2104 if (!empty_dir(inode))
2103 goto end_rmdir; 2105 goto end_rmdir;
2104 2106
2105 retval = ext4_delete_entry(handle, dir, de, bh); 2107 retval = ext4_delete_entry(handle, dir, de, bh);
2106 if (retval) 2108 if (retval)
2107 goto end_rmdir; 2109 goto end_rmdir;
2108 if (!EXT4_DIR_LINK_EMPTY(inode)) 2110 if (!EXT4_DIR_LINK_EMPTY(inode))
2109 ext4_warning (inode->i_sb, "ext4_rmdir", 2111 ext4_warning(inode->i_sb, "ext4_rmdir",
2110 "empty directory has too many links (%d)", 2112 "empty directory has too many links (%d)",
2111 inode->i_nlink); 2113 inode->i_nlink);
2112 inode->i_version++; 2114 inode->i_version++;
2113 clear_nlink(inode); 2115 clear_nlink(inode);
2114 /* There's no need to set i_disksize: the fact that i_nlink is 2116 /* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2124 2126
2125end_rmdir: 2127end_rmdir:
2126 ext4_journal_stop(handle); 2128 ext4_journal_stop(handle);
2127 brelse (bh); 2129 brelse(bh);
2128 return retval; 2130 return retval;
2129} 2131}
2130 2132
2131static int ext4_unlink(struct inode * dir, struct dentry *dentry) 2133static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2132{ 2134{
2133 int retval; 2135 int retval;
2134 struct inode * inode; 2136 struct inode *inode;
2135 struct buffer_head * bh; 2137 struct buffer_head *bh;
2136 struct ext4_dir_entry_2 * de; 2138 struct ext4_dir_entry_2 *de;
2137 handle_t *handle; 2139 handle_t *handle;
2138 2140
2139 /* Initialize quotas before so that eventual writes go 2141 /* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2147 handle->h_sync = 1; 2149 handle->h_sync = 1;
2148 2150
2149 retval = -ENOENT; 2151 retval = -ENOENT;
2150 bh = ext4_find_entry (dentry, &de); 2152 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2151 if (!bh) 2153 if (!bh)
2152 goto end_unlink; 2154 goto end_unlink;
2153 2155
@@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2158 goto end_unlink; 2160 goto end_unlink;
2159 2161
2160 if (!inode->i_nlink) { 2162 if (!inode->i_nlink) {
2161 ext4_warning (inode->i_sb, "ext4_unlink", 2163 ext4_warning(inode->i_sb, "ext4_unlink",
2162 "Deleting nonexistent file (%lu), %d", 2164 "Deleting nonexistent file (%lu), %d",
2163 inode->i_ino, inode->i_nlink); 2165 inode->i_ino, inode->i_nlink);
2164 inode->i_nlink = 1; 2166 inode->i_nlink = 1;
2165 } 2167 }
2166 retval = ext4_delete_entry(handle, dir, de, bh); 2168 retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2178 2180
2179end_unlink: 2181end_unlink:
2180 ext4_journal_stop(handle); 2182 ext4_journal_stop(handle);
2181 brelse (bh); 2183 brelse(bh);
2182 return retval; 2184 return retval;
2183} 2185}
2184 2186
2185static int ext4_symlink (struct inode * dir, 2187static int ext4_symlink(struct inode *dir,
2186 struct dentry *dentry, const char * symname) 2188 struct dentry *dentry, const char *symname)
2187{ 2189{
2188 handle_t *handle; 2190 handle_t *handle;
2189 struct inode * inode; 2191 struct inode *inode;
2190 int l, err, retries = 0; 2192 int l, err, retries = 0;
2191 2193
2192 l = strlen(symname)+1; 2194 l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@ retry:
2203 if (IS_DIRSYNC(dir)) 2205 if (IS_DIRSYNC(dir))
2204 handle->h_sync = 1; 2206 handle->h_sync = 1;
2205 2207
2206 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2208 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2207 err = PTR_ERR(inode); 2209 err = PTR_ERR(inode);
2208 if (IS_ERR(inode)) 2210 if (IS_ERR(inode))
2209 goto out_stop; 2211 goto out_stop;
2210 2212
2211 if (l > sizeof (EXT4_I(inode)->i_data)) { 2213 if (l > sizeof(EXT4_I(inode)->i_data)) {
2212 inode->i_op = &ext4_symlink_inode_operations; 2214 inode->i_op = &ext4_symlink_inode_operations;
2213 ext4_set_aops(inode); 2215 ext4_set_aops(inode);
2214 /* 2216 /*
@@ -2221,14 +2223,14 @@ retry:
2221 if (err) { 2223 if (err) {
2222 clear_nlink(inode); 2224 clear_nlink(inode);
2223 ext4_mark_inode_dirty(handle, inode); 2225 ext4_mark_inode_dirty(handle, inode);
2224 iput (inode); 2226 iput(inode);
2225 goto out_stop; 2227 goto out_stop;
2226 } 2228 }
2227 } else { 2229 } else {
2228 /* clear the extent format for fast symlink */ 2230 /* clear the extent format for fast symlink */
2229 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2231 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
2230 inode->i_op = &ext4_fast_symlink_inode_operations; 2232 inode->i_op = &ext4_fast_symlink_inode_operations;
2231 memcpy((char*)&EXT4_I(inode)->i_data,symname,l); 2233 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2232 inode->i_size = l-1; 2234 inode->i_size = l-1;
2233 } 2235 }
2234 EXT4_I(inode)->i_disksize = inode->i_size; 2236 EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2240,8 +2242,8 @@ out_stop:
2240 return err; 2242 return err;
2241} 2243}
2242 2244
2243static int ext4_link (struct dentry * old_dentry, 2245static int ext4_link(struct dentry *old_dentry,
2244 struct inode * dir, struct dentry *dentry) 2246 struct inode *dir, struct dentry *dentry)
2245{ 2247{
2246 handle_t *handle; 2248 handle_t *handle;
2247 struct inode *inode = old_dentry->d_inode; 2249 struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@ retry:
2284 * Anybody can rename anything with this: the permission checks are left to the 2286 * Anybody can rename anything with this: the permission checks are left to the
2285 * higher-level routines. 2287 * higher-level routines.
2286 */ 2288 */
2287static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, 2289static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2288 struct inode * new_dir,struct dentry *new_dentry) 2290 struct inode *new_dir, struct dentry *new_dentry)
2289{ 2291{
2290 handle_t *handle; 2292 handle_t *handle;
2291 struct inode * old_inode, * new_inode; 2293 struct inode *old_inode, *new_inode;
2292 struct buffer_head * old_bh, * new_bh, * dir_bh; 2294 struct buffer_head *old_bh, *new_bh, *dir_bh;
2293 struct ext4_dir_entry_2 * old_de, * new_de; 2295 struct ext4_dir_entry_2 *old_de, *new_de;
2294 int retval; 2296 int retval;
2295 2297
2296 old_bh = new_bh = dir_bh = NULL; 2298 old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2308 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2310 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2309 handle->h_sync = 1; 2311 handle->h_sync = 1;
2310 2312
2311 old_bh = ext4_find_entry (old_dentry, &old_de); 2313 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2312 /* 2314 /*
2313 * Check for inode number is _not_ due to possible IO errors. 2315 * Check for inode number is _not_ due to possible IO errors.
2314 * We might rmdir the source, keep it as pwd of some process 2316 * We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2321 goto end_rename; 2323 goto end_rename;
2322 2324
2323 new_inode = new_dentry->d_inode; 2325 new_inode = new_dentry->d_inode;
2324 new_bh = ext4_find_entry (new_dentry, &new_de); 2326 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2325 if (new_bh) { 2327 if (new_bh) {
2326 if (!new_inode) { 2328 if (!new_inode) {
2327 brelse (new_bh); 2329 brelse(new_bh);
2328 new_bh = NULL; 2330 new_bh = NULL;
2329 } 2331 }
2330 } 2332 }
2331 if (S_ISDIR(old_inode->i_mode)) { 2333 if (S_ISDIR(old_inode->i_mode)) {
2332 if (new_inode) { 2334 if (new_inode) {
2333 retval = -ENOTEMPTY; 2335 retval = -ENOTEMPTY;
2334 if (!empty_dir (new_inode)) 2336 if (!empty_dir(new_inode))
2335 goto end_rename; 2337 goto end_rename;
2336 } 2338 }
2337 retval = -EIO; 2339 retval = -EIO;
2338 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); 2340 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2339 if (!dir_bh) 2341 if (!dir_bh)
2340 goto end_rename; 2342 goto end_rename;
2341 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2343 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2342 goto end_rename; 2344 goto end_rename;
2343 retval = -EMLINK; 2345 retval = -EMLINK;
2344 if (!new_inode && new_dir!=old_dir && 2346 if (!new_inode && new_dir != old_dir &&
2345 new_dir->i_nlink >= EXT4_LINK_MAX) 2347 new_dir->i_nlink >= EXT4_LINK_MAX)
2346 goto end_rename; 2348 goto end_rename;
2347 } 2349 }
2348 if (!new_bh) { 2350 if (!new_bh) {
2349 retval = ext4_add_entry (handle, new_dentry, old_inode); 2351 retval = ext4_add_entry(handle, new_dentry, old_inode);
2350 if (retval) 2352 if (retval)
2351 goto end_rename; 2353 goto end_rename;
2352 } else { 2354 } else {
@@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2388 struct buffer_head *old_bh2; 2390 struct buffer_head *old_bh2;
2389 struct ext4_dir_entry_2 *old_de2; 2391 struct ext4_dir_entry_2 *old_de2;
2390 2392
2391 old_bh2 = ext4_find_entry(old_dentry, &old_de2); 2393 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2392 if (old_bh2) { 2394 if (old_bh2) {
2393 retval = ext4_delete_entry(handle, old_dir, 2395 retval = ext4_delete_entry(handle, old_dir,
2394 old_de2, old_bh2); 2396 old_de2, old_bh2);
@@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2433 retval = 0; 2435 retval = 0;
2434 2436
2435end_rename: 2437end_rename:
2436 brelse (dir_bh); 2438 brelse(dir_bh);
2437 brelse (old_bh); 2439 brelse(old_bh);
2438 brelse (new_bh); 2440 brelse(new_bh);
2439 ext4_journal_stop(handle); 2441 ext4_journal_stop(handle);
2440 return retval; 2442 return retval;
2441} 2443}
@@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2454 .mknod = ext4_mknod, 2456 .mknod = ext4_mknod,
2455 .rename = ext4_rename, 2457 .rename = ext4_rename,
2456 .setattr = ext4_setattr, 2458 .setattr = ext4_setattr,
2457#ifdef CONFIG_EXT4DEV_FS_XATTR 2459#ifdef CONFIG_EXT4_FS_XATTR
2458 .setxattr = generic_setxattr, 2460 .setxattr = generic_setxattr,
2459 .getxattr = generic_getxattr, 2461 .getxattr = generic_getxattr,
2460 .listxattr = ext4_listxattr, 2462 .listxattr = ext4_listxattr,
@@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2465 2467
2466const struct inode_operations ext4_special_inode_operations = { 2468const struct inode_operations ext4_special_inode_operations = {
2467 .setattr = ext4_setattr, 2469 .setattr = ext4_setattr,
2468#ifdef CONFIG_EXT4DEV_FS_XATTR 2470#ifdef CONFIG_EXT4_FS_XATTR
2469 .setxattr = generic_setxattr, 2471 .setxattr = generic_setxattr,
2470 .getxattr = generic_getxattr, 2472 .getxattr = generic_getxattr,
2471 .listxattr = ext4_listxattr, 2473 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b3d35604ea1..b6ec1843a01 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", 416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
417 gdb_num); 417 gdb_num);
418 418
419 /* 419 /*
420 * If we are not using the primary superblock/GDT copy don't resize, 420 * If we are not using the primary superblock/GDT copy don't resize,
421 * because the user tools have no way of handling this. Probably a 421 * because the user tools have no way of handling this. Probably a
422 * bad time to do it anyways. 422 * bad time to do it anyways.
423 */ 423 */
@@ -870,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
870 * We can allocate memory for mb_alloc based on the new group 870 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 871 * descriptor
872 */ 872 */
873 if (test_opt(sb, MBALLOC)) { 873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
874 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 874 if (err)
875 if (err) 875 goto exit_journal;
876 goto exit_journal; 876
877 }
878 /* 877 /*
879 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
880 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
@@ -929,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
929 percpu_counter_add(&sbi->s_freeinodes_counter, 928 percpu_counter_add(&sbi->s_freeinodes_counter,
930 EXT4_INODES_PER_GROUP(sb)); 929 EXT4_INODES_PER_GROUP(sb));
931 930
931 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
932 ext4_group_t flex_group;
933 flex_group = ext4_flex_group(sbi, input->group);
934 sbi->s_flex_groups[flex_group].free_blocks +=
935 input->free_blocks_count;
936 sbi->s_flex_groups[flex_group].free_inodes +=
937 EXT4_INODES_PER_GROUP(sb);
938 }
939
932 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 940 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
933 sb->s_dirt = 1; 941 sb->s_dirt = 1;
934 942
@@ -964,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
964 ext4_group_t o_groups_count; 972 ext4_group_t o_groups_count;
965 ext4_grpblk_t last; 973 ext4_grpblk_t last;
966 ext4_grpblk_t add; 974 ext4_grpblk_t add;
967 struct buffer_head * bh; 975 struct buffer_head *bh;
968 handle_t *handle; 976 handle_t *handle;
969 int err; 977 int err;
970 unsigned long freed_blocks; 978 unsigned long freed_blocks;
@@ -1077,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1077 /* 1085 /*
1078 * Mark mballoc pages as not up to date so that they will be updated 1086 * Mark mballoc pages as not up to date so that they will be updated
1079 * next time they are loaded by ext4_mb_load_buddy. 1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on the bitmap bh, as a way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * reloaded. A user could take an LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1080 */ 1095 */
1081 if (test_opt(sb, MBALLOC)) { 1096 {
1082 struct ext4_sb_info *sbi = EXT4_SB(sb); 1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1083 struct inode *inode = sbi->s_buddy_cache; 1098 struct inode *inode = sbi->s_buddy_cache;
1084 int blocks_per_page; 1099 int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 566344b926b..0e661c56966 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h>
38#include <linux/marker.h>
37#include <linux/log2.h> 39#include <linux/log2.h>
38#include <linux/crc16.h> 40#include <linux/crc16.h>
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -45,6 +47,8 @@
45#include "namei.h" 47#include "namei.h"
46#include "group.h" 48#include "group.h"
47 49
50struct proc_dir_entry *ext4_proc_root;
51
48static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
49 unsigned long journal_devnum); 53 unsigned long journal_devnum);
50static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -508,10 +512,12 @@ static void ext4_put_super(struct super_block *sb)
508 if (!(sb->s_flags & MS_RDONLY)) { 512 if (!(sb->s_flags & MS_RDONLY)) {
509 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 513 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
510 es->s_state = cpu_to_le16(sbi->s_mount_state); 514 es->s_state = cpu_to_le16(sbi->s_mount_state);
511 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
512 mark_buffer_dirty(sbi->s_sbh);
513 ext4_commit_super(sb, es, 1); 515 ext4_commit_super(sb, es, 1);
514 } 516 }
517 if (sbi->s_proc) {
518 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
519 remove_proc_entry(sb->s_id, ext4_proc_root);
520 }
515 521
516 for (i = 0; i < sbi->s_gdb_count; i++) 522 for (i = 0; i < sbi->s_gdb_count; i++)
517 brelse(sbi->s_group_desc[i]); 523 brelse(sbi->s_group_desc[i]);
@@ -520,6 +526,7 @@ static void ext4_put_super(struct super_block *sb)
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 526 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 527 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 528 percpu_counter_destroy(&sbi->s_dirs_counter);
529 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
523 brelse(sbi->s_sbh); 530 brelse(sbi->s_sbh);
524#ifdef CONFIG_QUOTA 531#ifdef CONFIG_QUOTA
525 for (i = 0; i < MAXQUOTAS; i++) 532 for (i = 0; i < MAXQUOTAS; i++)
@@ -562,11 +569,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
562 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); 569 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
563 if (!ei) 570 if (!ei)
564 return NULL; 571 return NULL;
565#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 572#ifdef CONFIG_EXT4_FS_POSIX_ACL
566 ei->i_acl = EXT4_ACL_NOT_CACHED; 573 ei->i_acl = EXT4_ACL_NOT_CACHED;
567 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 574 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
568#endif 575#endif
569 ei->i_block_alloc_info = NULL;
570 ei->vfs_inode.i_version = 1; 576 ei->vfs_inode.i_version = 1;
571 ei->vfs_inode.i_data.writeback_index = 0; 577 ei->vfs_inode.i_data.writeback_index = 0;
572 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 578 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -599,7 +605,7 @@ static void init_once(void *foo)
599 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 605 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
600 606
601 INIT_LIST_HEAD(&ei->i_orphan); 607 INIT_LIST_HEAD(&ei->i_orphan);
602#ifdef CONFIG_EXT4DEV_FS_XATTR 608#ifdef CONFIG_EXT4_FS_XATTR
603 init_rwsem(&ei->xattr_sem); 609 init_rwsem(&ei->xattr_sem);
604#endif 610#endif
605 init_rwsem(&ei->i_data_sem); 611 init_rwsem(&ei->i_data_sem);
@@ -625,8 +631,7 @@ static void destroy_inodecache(void)
625 631
626static void ext4_clear_inode(struct inode *inode) 632static void ext4_clear_inode(struct inode *inode)
627{ 633{
628 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; 634#ifdef CONFIG_EXT4_FS_POSIX_ACL
629#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
630 if (EXT4_I(inode)->i_acl && 635 if (EXT4_I(inode)->i_acl &&
631 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { 636 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
632 posix_acl_release(EXT4_I(inode)->i_acl); 637 posix_acl_release(EXT4_I(inode)->i_acl);
@@ -638,10 +643,7 @@ static void ext4_clear_inode(struct inode *inode)
638 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; 643 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
639 } 644 }
640#endif 645#endif
641 ext4_discard_reservation(inode); 646 ext4_discard_preallocations(inode);
642 EXT4_I(inode)->i_block_alloc_info = NULL;
643 if (unlikely(rsv))
644 kfree(rsv);
645 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 647 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
646 &EXT4_I(inode)->jinode); 648 &EXT4_I(inode)->jinode);
647} 649}
@@ -654,7 +656,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
654 656
655 if (sbi->s_jquota_fmt) 657 if (sbi->s_jquota_fmt)
656 seq_printf(seq, ",jqfmt=%s", 658 seq_printf(seq, ",jqfmt=%s",
657 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); 659 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
658 660
659 if (sbi->s_qf_names[USRQUOTA]) 661 if (sbi->s_qf_names[USRQUOTA])
660 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 662 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -718,7 +720,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
718 seq_puts(seq, ",debug"); 720 seq_puts(seq, ",debug");
719 if (test_opt(sb, OLDALLOC)) 721 if (test_opt(sb, OLDALLOC))
720 seq_puts(seq, ",oldalloc"); 722 seq_puts(seq, ",oldalloc");
721#ifdef CONFIG_EXT4DEV_FS_XATTR 723#ifdef CONFIG_EXT4_FS_XATTR
722 if (test_opt(sb, XATTR_USER) && 724 if (test_opt(sb, XATTR_USER) &&
723 !(def_mount_opts & EXT4_DEFM_XATTR_USER)) 725 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
724 seq_puts(seq, ",user_xattr"); 726 seq_puts(seq, ",user_xattr");
@@ -727,7 +729,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
727 seq_puts(seq, ",nouser_xattr"); 729 seq_puts(seq, ",nouser_xattr");
728 } 730 }
729#endif 731#endif
730#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 732#ifdef CONFIG_EXT4_FS_POSIX_ACL
731 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 733 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
732 seq_puts(seq, ",acl"); 734 seq_puts(seq, ",acl");
733 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 735 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -752,8 +754,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
752 seq_puts(seq, ",nobh"); 754 seq_puts(seq, ",nobh");
753 if (!test_opt(sb, EXTENTS)) 755 if (!test_opt(sb, EXTENTS))
754 seq_puts(seq, ",noextents"); 756 seq_puts(seq, ",noextents");
755 if (!test_opt(sb, MBALLOC))
756 seq_puts(seq, ",nomballoc");
757 if (test_opt(sb, I_VERSION)) 757 if (test_opt(sb, I_VERSION))
758 seq_puts(seq, ",i_version"); 758 seq_puts(seq, ",i_version");
759 if (!test_opt(sb, DELALLOC)) 759 if (!test_opt(sb, DELALLOC))
@@ -773,6 +773,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
773 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 773 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
774 seq_puts(seq, ",data=writeback"); 774 seq_puts(seq, ",data=writeback");
775 775
776 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
777 seq_printf(seq, ",inode_readahead_blks=%u",
778 sbi->s_inode_readahead_blks);
779
776 ext4_show_quota_options(seq, sb); 780 ext4_show_quota_options(seq, sb);
777 return 0; 781 return 0;
778} 782}
@@ -822,7 +826,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
822} 826}
823 827
824#ifdef CONFIG_QUOTA 828#ifdef CONFIG_QUOTA
825#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") 829#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
826#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 830#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
827 831
828static int ext4_dquot_initialize(struct inode *inode, int type); 832static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -907,6 +911,7 @@ enum {
907 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 911 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
908 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 912 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
909 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, 913 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
914 Opt_inode_readahead_blks
910}; 915};
911 916
912static match_table_t tokens = { 917static match_table_t tokens = {
@@ -967,6 +972,7 @@ static match_table_t tokens = {
967 {Opt_resize, "resize"}, 972 {Opt_resize, "resize"},
968 {Opt_delalloc, "delalloc"}, 973 {Opt_delalloc, "delalloc"},
969 {Opt_nodelalloc, "nodelalloc"}, 974 {Opt_nodelalloc, "nodelalloc"},
975 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
970 {Opt_err, NULL}, 976 {Opt_err, NULL},
971}; 977};
972 978
@@ -981,7 +987,7 @@ static ext4_fsblk_t get_sb_block(void **data)
981 /*todo: use simple_strtoll with >32bit ext4 */ 987 /*todo: use simple_strtoll with >32bit ext4 */
982 sb_block = simple_strtoul(options, &options, 0); 988 sb_block = simple_strtoul(options, &options, 0);
983 if (*options && *options != ',') { 989 if (*options && *options != ',') {
984 printk("EXT4-fs: Invalid sb specification: %s\n", 990 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
985 (char *) *data); 991 (char *) *data);
986 return 1; 992 return 1;
987 } 993 }
@@ -1072,7 +1078,7 @@ static int parse_options(char *options, struct super_block *sb,
1072 case Opt_orlov: 1078 case Opt_orlov:
1073 clear_opt(sbi->s_mount_opt, OLDALLOC); 1079 clear_opt(sbi->s_mount_opt, OLDALLOC);
1074 break; 1080 break;
1075#ifdef CONFIG_EXT4DEV_FS_XATTR 1081#ifdef CONFIG_EXT4_FS_XATTR
1076 case Opt_user_xattr: 1082 case Opt_user_xattr:
1077 set_opt(sbi->s_mount_opt, XATTR_USER); 1083 set_opt(sbi->s_mount_opt, XATTR_USER);
1078 break; 1084 break;
@@ -1082,10 +1088,11 @@ static int parse_options(char *options, struct super_block *sb,
1082#else 1088#else
1083 case Opt_user_xattr: 1089 case Opt_user_xattr:
1084 case Opt_nouser_xattr: 1090 case Opt_nouser_xattr:
1085 printk("EXT4 (no)user_xattr options not supported\n"); 1091 printk(KERN_ERR "EXT4 (no)user_xattr options "
1092 "not supported\n");
1086 break; 1093 break;
1087#endif 1094#endif
1088#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 1095#ifdef CONFIG_EXT4_FS_POSIX_ACL
1089 case Opt_acl: 1096 case Opt_acl:
1090 set_opt(sbi->s_mount_opt, POSIX_ACL); 1097 set_opt(sbi->s_mount_opt, POSIX_ACL);
1091 break; 1098 break;
@@ -1095,7 +1102,8 @@ static int parse_options(char *options, struct super_block *sb,
1095#else 1102#else
1096 case Opt_acl: 1103 case Opt_acl:
1097 case Opt_noacl: 1104 case Opt_noacl:
1098 printk("EXT4 (no)acl options not supported\n"); 1105 printk(KERN_ERR "EXT4 (no)acl options "
1106 "not supported\n");
1099 break; 1107 break;
1100#endif 1108#endif
1101 case Opt_reservation: 1109 case Opt_reservation:
@@ -1189,8 +1197,8 @@ set_qf_name:
1189 sb_any_quota_suspended(sb)) && 1197 sb_any_quota_suspended(sb)) &&
1190 !sbi->s_qf_names[qtype]) { 1198 !sbi->s_qf_names[qtype]) {
1191 printk(KERN_ERR 1199 printk(KERN_ERR
1192 "EXT4-fs: Cannot change journaled " 1200 "EXT4-fs: Cannot change journaled "
1193 "quota options when quota turned on.\n"); 1201 "quota options when quota turned on.\n");
1194 return 0; 1202 return 0;
1195 } 1203 }
1196 qname = match_strdup(&args[0]); 1204 qname = match_strdup(&args[0]);
@@ -1357,12 +1365,6 @@ set_qf_format:
1357 case Opt_nodelalloc: 1365 case Opt_nodelalloc:
1358 clear_opt(sbi->s_mount_opt, DELALLOC); 1366 clear_opt(sbi->s_mount_opt, DELALLOC);
1359 break; 1367 break;
1360 case Opt_mballoc:
1361 set_opt(sbi->s_mount_opt, MBALLOC);
1362 break;
1363 case Opt_nomballoc:
1364 clear_opt(sbi->s_mount_opt, MBALLOC);
1365 break;
1366 case Opt_stripe: 1368 case Opt_stripe:
1367 if (match_int(&args[0], &option)) 1369 if (match_int(&args[0], &option))
1368 return 0; 1370 return 0;
@@ -1373,6 +1375,13 @@ set_qf_format:
1373 case Opt_delalloc: 1375 case Opt_delalloc:
1374 set_opt(sbi->s_mount_opt, DELALLOC); 1376 set_opt(sbi->s_mount_opt, DELALLOC);
1375 break; 1377 break;
1378 case Opt_inode_readahead_blks:
1379 if (match_int(&args[0], &option))
1380 return 0;
1381 if (option < 0 || option > (1 << 30))
1382 return 0;
1383 sbi->s_inode_readahead_blks = option;
1384 break;
1376 default: 1385 default:
1377 printk(KERN_ERR 1386 printk(KERN_ERR
1378 "EXT4-fs: Unrecognized mount option \"%s\" " 1387 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1473,15 +1482,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1473 EXT4_INODES_PER_GROUP(sb), 1482 EXT4_INODES_PER_GROUP(sb),
1474 sbi->s_mount_opt); 1483 sbi->s_mount_opt);
1475 1484
1476 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); 1485 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1477 if (EXT4_SB(sb)->s_journal->j_inode == NULL) { 1486 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1478 char b[BDEVNAME_SIZE]; 1487 "external", EXT4_SB(sb)->s_journal->j_devname);
1479
1480 printk("external journal on %s\n",
1481 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1482 } else {
1483 printk("internal journal\n");
1484 }
1485 return res; 1488 return res;
1486} 1489}
1487 1490
@@ -1504,8 +1507,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1504 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1507 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1505 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1508 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1506 1509
1507 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / 1510 /* We allocate both existing and potentially added groups */
1508 groups_per_flex; 1511 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1512 ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
1513 EXT4_DESC_PER_BLOCK_BITS(sb))) /
1514 groups_per_flex;
1509 sbi->s_flex_groups = kzalloc(flex_group_count * 1515 sbi->s_flex_groups = kzalloc(flex_group_count *
1510 sizeof(struct flex_groups), GFP_KERNEL); 1516 sizeof(struct flex_groups), GFP_KERNEL);
1511 if (sbi->s_flex_groups == NULL) { 1517 if (sbi->s_flex_groups == NULL) {
@@ -1584,7 +1590,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1584 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1590 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1585 flexbg_flag = 1; 1591 flexbg_flag = 1;
1586 1592
1587 ext4_debug ("Checking group descriptors"); 1593 ext4_debug("Checking group descriptors");
1588 1594
1589 for (i = 0; i < sbi->s_groups_count; i++) { 1595 for (i = 0; i < sbi->s_groups_count; i++) {
1590 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1596 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1623,8 +1629,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1623 "Checksum for group %lu failed (%u!=%u)\n", 1629 "Checksum for group %lu failed (%u!=%u)\n",
1624 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1630 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1625 gdp)), le16_to_cpu(gdp->bg_checksum)); 1631 gdp)), le16_to_cpu(gdp->bg_checksum));
1626 if (!(sb->s_flags & MS_RDONLY)) 1632 if (!(sb->s_flags & MS_RDONLY)) {
1633 spin_unlock(sb_bgl_lock(sbi, i));
1627 return 0; 1634 return 0;
1635 }
1628 } 1636 }
1629 spin_unlock(sb_bgl_lock(sbi, i)); 1637 spin_unlock(sb_bgl_lock(sbi, i));
1630 if (!flexbg_flag) 1638 if (!flexbg_flag)
@@ -1714,9 +1722,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1714 DQUOT_INIT(inode); 1722 DQUOT_INIT(inode);
1715 if (inode->i_nlink) { 1723 if (inode->i_nlink) {
1716 printk(KERN_DEBUG 1724 printk(KERN_DEBUG
1717 "%s: truncating inode %lu to %Ld bytes\n", 1725 "%s: truncating inode %lu to %lld bytes\n",
1718 __func__, inode->i_ino, inode->i_size); 1726 __func__, inode->i_ino, inode->i_size);
1719 jbd_debug(2, "truncating inode %lu to %Ld bytes\n", 1727 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
1720 inode->i_ino, inode->i_size); 1728 inode->i_ino, inode->i_size);
1721 ext4_truncate(inode); 1729 ext4_truncate(inode);
1722 nr_truncates++; 1730 nr_truncates++;
@@ -1914,6 +1922,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1914 unsigned long journal_devnum = 0; 1922 unsigned long journal_devnum = 0;
1915 unsigned long def_mount_opts; 1923 unsigned long def_mount_opts;
1916 struct inode *root; 1924 struct inode *root;
1925 char *cp;
1917 int ret = -EINVAL; 1926 int ret = -EINVAL;
1918 int blocksize; 1927 int blocksize;
1919 int db_count; 1928 int db_count;
@@ -1930,10 +1939,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1930 sbi->s_mount_opt = 0; 1939 sbi->s_mount_opt = 0;
1931 sbi->s_resuid = EXT4_DEF_RESUID; 1940 sbi->s_resuid = EXT4_DEF_RESUID;
1932 sbi->s_resgid = EXT4_DEF_RESGID; 1941 sbi->s_resgid = EXT4_DEF_RESGID;
1942 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
1933 sbi->s_sb_block = sb_block; 1943 sbi->s_sb_block = sb_block;
1934 1944
1935 unlock_kernel(); 1945 unlock_kernel();
1936 1946
1947 /* Cleanup superblock name */
1948 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
1949 *cp = '!';
1950
1937 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 1951 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1938 if (!blocksize) { 1952 if (!blocksize) {
1939 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); 1953 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -1973,11 +1987,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1973 set_opt(sbi->s_mount_opt, GRPID); 1987 set_opt(sbi->s_mount_opt, GRPID);
1974 if (def_mount_opts & EXT4_DEFM_UID16) 1988 if (def_mount_opts & EXT4_DEFM_UID16)
1975 set_opt(sbi->s_mount_opt, NO_UID32); 1989 set_opt(sbi->s_mount_opt, NO_UID32);
1976#ifdef CONFIG_EXT4DEV_FS_XATTR 1990#ifdef CONFIG_EXT4_FS_XATTR
1977 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 1991 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1978 set_opt(sbi->s_mount_opt, XATTR_USER); 1992 set_opt(sbi->s_mount_opt, XATTR_USER);
1979#endif 1993#endif
1980#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 1994#ifdef CONFIG_EXT4_FS_POSIX_ACL
1981 if (def_mount_opts & EXT4_DEFM_ACL) 1995 if (def_mount_opts & EXT4_DEFM_ACL)
1982 set_opt(sbi->s_mount_opt, POSIX_ACL); 1996 set_opt(sbi->s_mount_opt, POSIX_ACL);
1983#endif 1997#endif
@@ -2012,11 +2026,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2012 ext4_warning(sb, __func__, 2026 ext4_warning(sb, __func__,
2013 "extents feature not enabled on this filesystem, " 2027 "extents feature not enabled on this filesystem, "
2014 "use tune2fs.\n"); 2028 "use tune2fs.\n");
2015 /*
2016 * turn on mballoc code by default in ext4 filesystem
2017 * Use -o nomballoc to turn it off
2018 */
2019 set_opt(sbi->s_mount_opt, MBALLOC);
2020 2029
2021 /* 2030 /*
2022 * enable delayed allocation by default 2031 * enable delayed allocation by default
@@ -2041,16 +2050,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2041 "running e2fsck is recommended\n"); 2050 "running e2fsck is recommended\n");
2042 2051
2043 /* 2052 /*
2044 * Since ext4 is still considered development code, we require
2045 * that the TEST_FILESYS flag in s->flags be set.
2046 */
2047 if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
2048 printk(KERN_WARNING "EXT4-fs: %s: not marked "
2049 "OK to use with test code.\n", sb->s_id);
2050 goto failed_mount;
2051 }
2052
2053 /*
2054 * Check feature flags regardless of the revision level, since we 2053 * Check feature flags regardless of the revision level, since we
2055 * previously didn't change the revision level when setting the flags, 2054 * previously didn't change the revision level when setting the flags,
2056 * so there is a chance incompat flags are set on a rev 0 filesystem. 2055 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2219,6 +2218,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2219 goto failed_mount; 2218 goto failed_mount;
2220 } 2219 }
2221 2220
2221 if (ext4_proc_root)
2222 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2223
2224 if (sbi->s_proc)
2225 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2226 &ext4_ui_proc_fops,
2227 &sbi->s_inode_readahead_blks);
2228
2222 bgl_lock_init(&sbi->s_blockgroup_lock); 2229 bgl_lock_init(&sbi->s_blockgroup_lock);
2223 2230
2224 for (i = 0; i < db_count; i++) { 2231 for (i = 0; i < db_count; i++) {
@@ -2257,24 +2264,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 err = percpu_counter_init(&sbi->s_dirs_counter, 2264 err = percpu_counter_init(&sbi->s_dirs_counter,
2258 ext4_count_dirs(sb)); 2265 ext4_count_dirs(sb));
2259 } 2266 }
2267 if (!err) {
2268 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2269 }
2260 if (err) { 2270 if (err) {
2261 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2271 printk(KERN_ERR "EXT4-fs: insufficient memory\n");
2262 goto failed_mount3; 2272 goto failed_mount3;
2263 } 2273 }
2264 2274
2265 /* per fileystem reservation list head & lock */
2266 spin_lock_init(&sbi->s_rsv_window_lock);
2267 sbi->s_rsv_window_root = RB_ROOT;
2268 /* Add a single, static dummy reservation to the start of the
2269 * reservation window list --- it gives us a placeholder for
2270 * append-at-start-of-list which makes the allocation logic
2271 * _much_ simpler. */
2272 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2273 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2274 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
2275 sbi->s_rsv_window_head.rsv_goal_size = 0;
2276 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
2277
2278 sbi->s_stripe = ext4_get_stripe_size(sbi); 2275 sbi->s_stripe = ext4_get_stripe_size(sbi);
2279 2276
2280 /* 2277 /*
@@ -2471,7 +2468,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2471 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2468 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2472 2469
2473 ext4_ext_init(sb); 2470 ext4_ext_init(sb);
2474 ext4_mb_init(sb, needs_recovery); 2471 err = ext4_mb_init(sb, needs_recovery);
2472 if (err) {
2473 printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
2474 err);
2475 goto failed_mount4;
2476 }
2475 2477
2476 lock_kernel(); 2478 lock_kernel();
2477 return 0; 2479 return 0;
@@ -2489,11 +2491,16 @@ failed_mount3:
2489 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2491 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2490 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2492 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2491 percpu_counter_destroy(&sbi->s_dirs_counter); 2493 percpu_counter_destroy(&sbi->s_dirs_counter);
2494 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2492failed_mount2: 2495failed_mount2:
2493 for (i = 0; i < db_count; i++) 2496 for (i = 0; i < db_count; i++)
2494 brelse(sbi->s_group_desc[i]); 2497 brelse(sbi->s_group_desc[i]);
2495 kfree(sbi->s_group_desc); 2498 kfree(sbi->s_group_desc);
2496failed_mount: 2499failed_mount:
2500 if (sbi->s_proc) {
2501 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2502 remove_proc_entry(sb->s_id, ext4_proc_root);
2503 }
2497#ifdef CONFIG_QUOTA 2504#ifdef CONFIG_QUOTA
2498 for (i = 0; i < MAXQUOTAS; i++) 2505 for (i = 0; i < MAXQUOTAS; i++)
2499 kfree(sbi->s_qf_names[i]); 2506 kfree(sbi->s_qf_names[i]);
@@ -2552,7 +2559,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2552 return NULL; 2559 return NULL;
2553 } 2560 }
2554 2561
2555 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2562 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2556 journal_inode, journal_inode->i_size); 2563 journal_inode, journal_inode->i_size);
2557 if (!S_ISREG(journal_inode->i_mode)) { 2564 if (!S_ISREG(journal_inode->i_mode)) {
2558 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2565 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2715,6 +2722,11 @@ static int ext4_load_journal(struct super_block *sb,
2715 return -EINVAL; 2722 return -EINVAL;
2716 } 2723 }
2717 2724
2725 if (journal->j_flags & JBD2_BARRIER)
2726 printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2727 else
2728 printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2729
2718 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2730 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2719 err = jbd2_journal_update_format(journal); 2731 err = jbd2_journal_update_format(journal);
2720 if (err) { 2732 if (err) {
@@ -2799,13 +2811,34 @@ static void ext4_commit_super(struct super_block *sb,
2799 2811
2800 if (!sbh) 2812 if (!sbh)
2801 return; 2813 return;
2814 if (buffer_write_io_error(sbh)) {
2815 /*
2816 * Oh, dear. A previous attempt to write the
2817 * superblock failed. This could happen because the
2818 * USB device was yanked out. Or it could happen to
2819 * be a transient write error and maybe the block will
2820 * be remapped. Nothing we can do but to retry the
2821 * write and hope for the best.
2822 */
2823 printk(KERN_ERR "ext4: previous I/O error to "
2824 "superblock detected for %s.\n", sb->s_id);
2825 clear_buffer_write_io_error(sbh);
2826 set_buffer_uptodate(sbh);
2827 }
2802 es->s_wtime = cpu_to_le32(get_seconds()); 2828 es->s_wtime = cpu_to_le32(get_seconds());
2803 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2829 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2804 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2830 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2805 BUFFER_TRACE(sbh, "marking dirty"); 2831 BUFFER_TRACE(sbh, "marking dirty");
2806 mark_buffer_dirty(sbh); 2832 mark_buffer_dirty(sbh);
2807 if (sync) 2833 if (sync) {
2808 sync_dirty_buffer(sbh); 2834 sync_dirty_buffer(sbh);
2835 if (buffer_write_io_error(sbh)) {
2836 printk(KERN_ERR "ext4: I/O error while writing "
2837 "superblock for %s.\n", sb->s_id);
2838 clear_buffer_write_io_error(sbh);
2839 set_buffer_uptodate(sbh);
2840 }
2841 }
2809} 2842}
2810 2843
2811 2844
@@ -2907,6 +2940,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2907{ 2940{
2908 tid_t target; 2941 tid_t target;
2909 2942
2943 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2910 sb->s_dirt = 0; 2944 sb->s_dirt = 0;
2911 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 2945 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2912 if (wait) 2946 if (wait)
@@ -3162,7 +3196,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3162 buf->f_type = EXT4_SUPER_MAGIC; 3196 buf->f_type = EXT4_SUPER_MAGIC;
3163 buf->f_bsize = sb->s_blocksize; 3197 buf->f_bsize = sb->s_blocksize;
3164 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3198 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3165 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); 3199 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3200 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3166 ext4_free_blocks_count_set(es, buf->f_bfree); 3201 ext4_free_blocks_count_set(es, buf->f_bfree);
3167 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3202 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3168 if (buf->f_bfree < ext4_r_blocks_count(es)) 3203 if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3432,7 +3467,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3432 handle_t *handle = journal_current_handle(); 3467 handle_t *handle = journal_current_handle();
3433 3468
3434 if (!handle) { 3469 if (!handle) {
3435 printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" 3470 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3436 " cancelled because transaction is not started.\n", 3471 " cancelled because transaction is not started.\n",
3437 (unsigned long long)off, (unsigned long long)len); 3472 (unsigned long long)off, (unsigned long long)len);
3438 return -EIO; 3473 return -EIO;
@@ -3493,18 +3528,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3493 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3528 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3494} 3529}
3495 3530
3531#ifdef CONFIG_PROC_FS
3532static int ext4_ui_proc_show(struct seq_file *m, void *v)
3533{
3534 unsigned int *p = m->private;
3535
3536 seq_printf(m, "%u\n", *p);
3537 return 0;
3538}
3539
3540static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3541{
3542 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3543}
3544
3545static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3546 size_t cnt, loff_t *ppos)
3547{
3548 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3549 char str[32];
3550 unsigned long value;
3551
3552 if (cnt >= sizeof(str))
3553 return -EINVAL;
3554 if (copy_from_user(str, buf, cnt))
3555 return -EFAULT;
3556 value = simple_strtol(str, NULL, 0);
3557 if (value < 0)
3558 return -ERANGE;
3559 *p = value;
3560 return cnt;
3561}
3562
3563const struct file_operations ext4_ui_proc_fops = {
3564 .owner = THIS_MODULE,
3565 .open = ext4_ui_proc_open,
3566 .read = seq_read,
3567 .llseek = seq_lseek,
3568 .release = single_release,
3569 .write = ext4_ui_proc_write,
3570};
3571#endif
3572
3573static struct file_system_type ext4_fs_type = {
3574 .owner = THIS_MODULE,
3575 .name = "ext4",
3576 .get_sb = ext4_get_sb,
3577 .kill_sb = kill_block_super,
3578 .fs_flags = FS_REQUIRES_DEV,
3579};
3580
3581#ifdef CONFIG_EXT4DEV_COMPAT
3582static int ext4dev_get_sb(struct file_system_type *fs_type,
3583 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3584{
3585 printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3586 "to mount using ext4\n");
3587 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3588 "will go away by 2.6.31\n");
3589 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3590}
3591
3496static struct file_system_type ext4dev_fs_type = { 3592static struct file_system_type ext4dev_fs_type = {
3497 .owner = THIS_MODULE, 3593 .owner = THIS_MODULE,
3498 .name = "ext4dev", 3594 .name = "ext4dev",
3499 .get_sb = ext4_get_sb, 3595 .get_sb = ext4dev_get_sb,
3500 .kill_sb = kill_block_super, 3596 .kill_sb = kill_block_super,
3501 .fs_flags = FS_REQUIRES_DEV, 3597 .fs_flags = FS_REQUIRES_DEV,
3502}; 3598};
3599MODULE_ALIAS("ext4dev");
3600#endif
3503 3601
3504static int __init init_ext4_fs(void) 3602static int __init init_ext4_fs(void)
3505{ 3603{
3506 int err; 3604 int err;
3507 3605
3606 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3508 err = init_ext4_mballoc(); 3607 err = init_ext4_mballoc();
3509 if (err) 3608 if (err)
3510 return err; 3609 return err;
@@ -3515,9 +3614,16 @@ static int __init init_ext4_fs(void)
3515 err = init_inodecache(); 3614 err = init_inodecache();
3516 if (err) 3615 if (err)
3517 goto out1; 3616 goto out1;
3518 err = register_filesystem(&ext4dev_fs_type); 3617 err = register_filesystem(&ext4_fs_type);
3519 if (err) 3618 if (err)
3520 goto out; 3619 goto out;
3620#ifdef CONFIG_EXT4DEV_COMPAT
3621 err = register_filesystem(&ext4dev_fs_type);
3622 if (err) {
3623 unregister_filesystem(&ext4_fs_type);
3624 goto out;
3625 }
3626#endif
3521 return 0; 3627 return 0;
3522out: 3628out:
3523 destroy_inodecache(); 3629 destroy_inodecache();
@@ -3530,10 +3636,14 @@ out2:
3530 3636
3531static void __exit exit_ext4_fs(void) 3637static void __exit exit_ext4_fs(void)
3532{ 3638{
3639 unregister_filesystem(&ext4_fs_type);
3640#ifdef CONFIG_EXT4DEV_COMPAT
3533 unregister_filesystem(&ext4dev_fs_type); 3641 unregister_filesystem(&ext4dev_fs_type);
3642#endif
3534 destroy_inodecache(); 3643 destroy_inodecache();
3535 exit_ext4_xattr(); 3644 exit_ext4_xattr();
3536 exit_ext4_mballoc(); 3645 exit_ext4_mballoc();
3646 remove_proc_entry("fs/ext4", NULL);
3537} 3647}
3538 3648
3539MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3649MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc0..00740cb32be 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
23#include "ext4.h" 23#include "ext4.h"
24#include "xattr.h" 24#include "xattr.h"
25 25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data); 29 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 30 return NULL;
31} 31}
32 32
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR 37#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 45const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR 48#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr, 51 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b489..80626d516fe 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, 104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
105#endif 105#endif
106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, 106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
107#ifdef CONFIG_EXT4DEV_FS_SECURITY 107#ifdef CONFIG_EXT4_FS_SECURITY
108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, 108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
109#endif 109#endif
110}; 110};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
112struct xattr_handler *ext4_xattr_handlers[] = { 112struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
116 &ext4_xattr_acl_access_handler, 116 &ext4_xattr_acl_access_handler,
117 &ext4_xattr_acl_default_handler, 117 &ext4_xattr_acl_default_handler,
118#endif 118#endif
119#ifdef CONFIG_EXT4DEV_FS_SECURITY 119#ifdef CONFIG_EXT4_FS_SECURITY
120 &ext4_xattr_security_handler, 120 &ext4_xattr_security_handler,
121#endif 121#endif
122 NULL 122 NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
959 struct ext4_xattr_block_find bs = { 959 struct ext4_xattr_block_find bs = {
960 .s = { .not_found = -ENODATA, }, 960 .s = { .not_found = -ENODATA, },
961 }; 961 };
962 unsigned long no_expand;
962 int error; 963 int error;
963 964
964 if (!name) 965 if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
966 if (strlen(name) > 255) 967 if (strlen(name) > 255)
967 return -ERANGE; 968 return -ERANGE;
968 down_write(&EXT4_I(inode)->xattr_sem); 969 down_write(&EXT4_I(inode)->xattr_sem);
970 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
971 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
972
969 error = ext4_get_inode_loc(inode, &is.iloc); 973 error = ext4_get_inode_loc(inode, &is.iloc);
970 if (error) 974 if (error)
971 goto cleanup; 975 goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1042cleanup: 1046cleanup:
1043 brelse(is.iloc.bh); 1047 brelse(is.iloc.bh);
1044 brelse(bs.bh); 1048 brelse(bs.bh);
1049 if (no_expand == 0)
1050 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1045 up_write(&EXT4_I(inode)->xattr_sem); 1051 up_write(&EXT4_I(inode)->xattr_sem);
1046 return error; 1052 return error;
1047} 1053}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb..8ede88b18c2 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
51 (((name_len) + EXT4_XATTR_ROUND + \ 51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) 52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \ 53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \ 54 ((struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) 55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
63 EXT4_I(inode)->i_extra_isize)) 63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65 65
66# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4DEV_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
93static inline int 93static inline int
94ext4_xattr_get(struct inode *inode, int name_index, const char *name, 94ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
141 141
142#define ext4_xattr_handlers NULL 142#define ext4_xattr_handlers NULL
143 143
144# endif /* CONFIG_EXT4DEV_FS_XATTR */ 144# endif /* CONFIG_EXT4_FS_XATTR */
145 145
146#ifdef CONFIG_EXT4DEV_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir);
149#else 149#else
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d..33a6b7ecb8b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h>
17#include <linux/buffer_head.h>
16 18
17#include <asm/ioctls.h> 19#include <asm/ioctls.h>
18 20
21/* So that the fiemap access checks can't overflow on 32 bit machines. */
22#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
23
19/** 24/**
20 * vfs_ioctl - call filesystem specific ioctl methods 25 * vfs_ioctl - call filesystem specific ioctl methods
21 * @filp: open file to invoke ioctl method on 26 * @filp: open file to invoke ioctl method on
@@ -71,6 +76,272 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
71 return put_user(res, p); 76 return put_user(res, p);
72} 77}
73 78
79/**
80 * fiemap_fill_next_extent - Fiemap helper function
81 * @fieinfo: Fiemap context passed into ->fiemap
82 * @logical: Extent logical start offset, in bytes
83 * @phys: Extent physical start offset, in bytes
84 * @len: Extent length, in bytes
85 * @flags: FIEMAP_EXTENT flags that describe this extent
86 *
87 * Called from file system ->fiemap callback. Will populate extent
88 * info as passed in via arguments and copy to user memory. On
89 * success, extent count on fieinfo is incremented.
90 *
91 * Returns 0 on success, -errno on error, 1 if this was the last
92 * extent that will fit in user array.
93 */
94#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
95#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
96#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
97int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
98 u64 phys, u64 len, u32 flags)
99{
100 struct fiemap_extent extent;
101 struct fiemap_extent *dest = fieinfo->fi_extents_start;
102
103 /* only count the extents */
104 if (fieinfo->fi_extents_max == 0) {
105 fieinfo->fi_extents_mapped++;
106 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
107 }
108
109 if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
110 return 1;
111
112 if (flags & SET_UNKNOWN_FLAGS)
113 flags |= FIEMAP_EXTENT_UNKNOWN;
114 if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
115 flags |= FIEMAP_EXTENT_ENCODED;
116 if (flags & SET_NOT_ALIGNED_FLAGS)
117 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
118
119 memset(&extent, 0, sizeof(extent));
120 extent.fe_logical = logical;
121 extent.fe_physical = phys;
122 extent.fe_length = len;
123 extent.fe_flags = flags;
124
125 dest += fieinfo->fi_extents_mapped;
126 if (copy_to_user(dest, &extent, sizeof(extent)))
127 return -EFAULT;
128
129 fieinfo->fi_extents_mapped++;
130 if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
131 return 1;
132 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
133}
134EXPORT_SYMBOL(fiemap_fill_next_extent);
135
136/**
137 * fiemap_check_flags - check validity of requested flags for fiemap
138 * @fieinfo: Fiemap context passed into ->fiemap
139 * @fs_flags: Set of fiemap flags that the file system understands
140 *
141 * Called from file system ->fiemap callback. This will compute the
142 * intersection of valid fiemap flags and those that the fs supports. That
143 * value is then compared against the user supplied flags. In case of bad user
144 * flags, the invalid values will be written into the fieinfo structure, and
145 * -EBADR is returned, which tells ioctl_fiemap() to return those values to
146 * userspace. For this reason, a return code of -EBADR should be preserved.
147 *
148 * Returns 0 on success, -EBADR on bad flags.
149 */
150int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
151{
152 u32 incompat_flags;
153
154 incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
155 if (incompat_flags) {
156 fieinfo->fi_flags = incompat_flags;
157 return -EBADR;
158 }
159 return 0;
160}
161EXPORT_SYMBOL(fiemap_check_flags);
162
163static int fiemap_check_ranges(struct super_block *sb,
164 u64 start, u64 len, u64 *new_len)
165{
166 *new_len = len;
167
168 if (len == 0)
169 return -EINVAL;
170
171 if (start > sb->s_maxbytes)
172 return -EFBIG;
173
174 /*
175 * Shrink request scope to what the fs can actually handle.
176 */
177 if ((len > sb->s_maxbytes) ||
178 (sb->s_maxbytes - len) < start)
179 *new_len = sb->s_maxbytes - start;
180
181 return 0;
182}
183
184static int ioctl_fiemap(struct file *filp, unsigned long arg)
185{
186 struct fiemap fiemap;
187 struct fiemap_extent_info fieinfo = { 0, };
188 struct inode *inode = filp->f_path.dentry->d_inode;
189 struct super_block *sb = inode->i_sb;
190 u64 len;
191 int error;
192
193 if (!inode->i_op->fiemap)
194 return -EOPNOTSUPP;
195
196 if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
197 sizeof(struct fiemap)))
198 return -EFAULT;
199
200 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
201 return -EINVAL;
202
203 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
204 &len);
205 if (error)
206 return error;
207
208 fieinfo.fi_flags = fiemap.fm_flags;
209 fieinfo.fi_extents_max = fiemap.fm_extent_count;
210 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
211
212 if (fiemap.fm_extent_count != 0 &&
213 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
214 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
215 return -EFAULT;
216
217 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
218 filemap_write_and_wait(inode->i_mapping);
219
220 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
221 fiemap.fm_flags = fieinfo.fi_flags;
222 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
223 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
224 error = -EFAULT;
225
226 return error;
227}
228
/*
 * Convert between filesystem block numbers and byte offsets using the
 * inode's block-size shift.  blk_to_logical() widens the block number to
 * 64 bits before shifting so that a 32-bit block number cannot overflow,
 * and both macros are fully parenthesised expressions (no trailing
 * semicolon) so they can be used inside larger expressions.
 */
#define blk_to_logical(inode, blk) \
	((unsigned long long)(blk) << (inode)->i_blkbits)
#define logical_to_blk(inode, offset) \
	((offset) >> (inode)->i_blkbits)
231
232/*
233 * @inode - the inode to map
234 * @arg - the pointer to userspace where we copy everything to
235 * @get_block - the fs's get_block function
236 *
237 * This does FIEMAP for block based inodes. Basically it will just loop
238 * through get_block until we hit the number of extents we want to map, or we
239 * go past the end of the file and hit a hole.
240 *
241 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
242 * please do not use this function, it will stop at the first unmapped block
243 * beyond i_size
244 */
245int generic_block_fiemap(struct inode *inode,
246 struct fiemap_extent_info *fieinfo, u64 start,
247 u64 len, get_block_t *get_block)
248{
249 struct buffer_head tmp;
250 unsigned int start_blk;
251 long long length = 0, map_len = 0;
252 u64 logical = 0, phys = 0, size = 0;
253 u32 flags = FIEMAP_EXTENT_MERGED;
254 int ret = 0;
255
256 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
257 return ret;
258
259 start_blk = logical_to_blk(inode, start);
260
261 /* guard against change */
262 mutex_lock(&inode->i_mutex);
263
264 length = (long long)min_t(u64, len, i_size_read(inode));
265 map_len = length;
266
267 do {
268 /*
269 * we set b_size to the total size we want so it will map as
270 * many contiguous blocks as possible at once
271 */
272 memset(&tmp, 0, sizeof(struct buffer_head));
273 tmp.b_size = map_len;
274
275 ret = get_block(inode, start_blk, &tmp, 0);
276 if (ret)
277 break;
278
279 /* HOLE */
280 if (!buffer_mapped(&tmp)) {
281 /*
282 * first hole after going past the EOF, this is our
283 * last extent
284 */
285 if (length <= 0) {
286 flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
287 ret = fiemap_fill_next_extent(fieinfo, logical,
288 phys, size,
289 flags);
290 break;
291 }
292
293 length -= blk_to_logical(inode, 1);
294
295 /* if we have holes up to/past EOF then we're done */
296 if (length <= 0)
297 break;
298
299 start_blk++;
300 } else {
301 if (length <= 0 && size) {
302 ret = fiemap_fill_next_extent(fieinfo, logical,
303 phys, size,
304 flags);
305 if (ret)
306 break;
307 }
308
309 logical = blk_to_logical(inode, start_blk);
310 phys = blk_to_logical(inode, tmp.b_blocknr);
311 size = tmp.b_size;
312 flags = FIEMAP_EXTENT_MERGED;
313
314 length -= tmp.b_size;
315 start_blk += logical_to_blk(inode, size);
316
317 /*
318 * if we are past the EOF we need to loop again to see
319 * if there is a hole so we can mark this extent as the
320 * last one, and if not keep mapping things until we
321 * find a hole, or we run out of slots in the extent
322 * array
323 */
324 if (length <= 0)
325 continue;
326
327 ret = fiemap_fill_next_extent(fieinfo, logical, phys,
328 size, flags);
329 if (ret)
330 break;
331 }
332 cond_resched();
333 } while (1);
334
335 mutex_unlock(&inode->i_mutex);
336
337 /* if ret is 1 then we just hit the end of the extent array */
338 if (ret == 1)
339 ret = 0;
340
341 return ret;
342}
343EXPORT_SYMBOL(generic_block_fiemap);
344
74static int file_ioctl(struct file *filp, unsigned int cmd, 345static int file_ioctl(struct file *filp, unsigned int cmd,
75 unsigned long arg) 346 unsigned long arg)
76{ 347{
@@ -80,6 +351,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
80 switch (cmd) { 351 switch (cmd) {
81 case FIBMAP: 352 case FIBMAP:
82 return ioctl_fibmap(filp, p); 353 return ioctl_fibmap(filp, p);
354 case FS_IOC_FIEMAP:
355 return ioctl_fiemap(filp, arg);
83 case FIGETBSZ: 356 case FIGETBSZ:
84 return put_user(inode->i_sb->s_blocksize, p); 357 return put_user(inode->i_sb->s_blocksize, p);
85 case FIONREAD: 358 case FIONREAD:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8..42895d36945 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25 26
@@ -126,14 +127,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
126 127
127 /* 128 /*
128 * Test again, another process may have checkpointed while we 129 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 130 * were waiting for the checkpoint lock. If there are no
131 * outstanding transactions there is nothing to checkpoint and
132 * we can't make progress. Abort the journal in this case.
130 */ 133 */
131 spin_lock(&journal->j_state_lock); 134 spin_lock(&journal->j_state_lock);
135 spin_lock(&journal->j_list_lock);
132 nblocks = jbd_space_needed(journal); 136 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 137 if (__jbd2_log_space_left(journal) < nblocks) {
138 int chkpt = journal->j_checkpoint_transactions != NULL;
139
140 spin_unlock(&journal->j_list_lock);
134 spin_unlock(&journal->j_state_lock); 141 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 142 if (chkpt) {
143 jbd2_log_do_checkpoint(journal);
144 } else {
145 printk(KERN_ERR "%s: no transactions\n",
146 __func__);
147 jbd2_journal_abort(journal, 0);
148 }
149
136 spin_lock(&journal->j_state_lock); 150 spin_lock(&journal->j_state_lock);
151 } else {
152 spin_unlock(&journal->j_list_lock);
137 } 153 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 154 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 155 }
@@ -313,6 +329,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
313 * journal straight away. 329 * journal straight away.
314 */ 330 */
315 result = jbd2_cleanup_journal_tail(journal); 331 result = jbd2_cleanup_journal_tail(journal);
332 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
333 journal->j_devname, result);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 334 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 335 if (result <= 0)
318 return result; 336 return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95e..0d3814a35ed 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
126 127
127 JBUFFER_TRACE(descriptor, "submit commit block"); 128 JBUFFER_TRACE(descriptor, "submit commit block");
128 lock_buffer(bh); 129 lock_buffer(bh);
129 get_bh(bh); 130 clear_buffer_dirty(bh);
130 set_buffer_dirty(bh);
131 set_buffer_uptodate(bh); 131 set_buffer_uptodate(bh);
132 bh->b_end_io = journal_end_buffer_io_sync; 132 bh->b_end_io = journal_end_buffer_io_sync;
133 133
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
147 * to remember if we sent a barrier request 147 * to remember if we sent a barrier request
148 */ 148 */
149 if (ret == -EOPNOTSUPP && barrier_done) { 149 if (ret == -EOPNOTSUPP && barrier_done) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING 150 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 151 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", 152 "disabling barriers\n", journal->j_devname);
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock); 153 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JBD2_BARRIER; 154 journal->j_flags &= ~JBD2_BARRIER;
158 spin_unlock(&journal->j_state_lock); 155 spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
160 /* And try again, without the barrier */ 157 /* And try again, without the barrier */
161 lock_buffer(bh); 158 lock_buffer(bh);
162 set_buffer_uptodate(bh); 159 set_buffer_uptodate(bh);
163 set_buffer_dirty(bh); 160 clear_buffer_dirty(bh);
164 ret = submit_bh(WRITE, bh); 161 ret = submit_bh(WRITE, bh);
165 } 162 }
166 *cbh = bh; 163 *cbh = bh;
@@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
371 commit_transaction = journal->j_running_transaction; 368 commit_transaction = journal->j_running_transaction;
372 J_ASSERT(commit_transaction->t_state == T_RUNNING); 369 J_ASSERT(commit_transaction->t_state == T_RUNNING);
373 370
371 trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 journal->j_devname, commit_transaction->t_tid);
374 jbd_debug(1, "JBD: starting commit of transaction %d\n", 373 jbd_debug(1, "JBD: starting commit of transaction %d\n",
375 commit_transaction->t_tid); 374 commit_transaction->t_tid);
376 375
@@ -681,11 +680,9 @@ start_journal_io:
681 */ 680 */
682 err = journal_finish_inode_data_buffers(journal, commit_transaction); 681 err = journal_finish_inode_data_buffers(journal, commit_transaction);
683 if (err) { 682 if (err) {
684 char b[BDEVNAME_SIZE];
685
686 printk(KERN_WARNING 683 printk(KERN_WARNING
687 "JBD2: Detected IO errors while flushing file data " 684 "JBD2: Detected IO errors while flushing file data "
688 "on %s\n", bdevname(journal->j_fs_dev, b)); 685 "on %s\n", journal->j_devname);
689 err = 0; 686 err = 0;
690 } 687 }
691 688
@@ -990,6 +987,9 @@ restart_loop:
990 } 987 }
991 spin_unlock(&journal->j_list_lock); 988 spin_unlock(&journal->j_list_lock);
992 989
990 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
991 journal->j_devname, commit_transaction->t_tid,
992 journal->j_tail_sequence);
993 jbd_debug(1, "JBD: commit %d complete, head %d\n", 993 jbd_debug(1, "JBD: commit %d complete, head %d\n",
994 journal->j_commit_sequence, journal->j_tail_sequence); 994 journal->j_commit_sequence, journal->j_tail_sequence);
995 995
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4ed..01c3901c3a0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
597 if (ret) 597 if (ret)
598 *retp = ret; 598 *retp = ret;
599 else { 599 else {
600 char b[BDEVNAME_SIZE];
601
602 printk(KERN_ALERT "%s: journal block not found " 600 printk(KERN_ALERT "%s: journal block not found "
603 "at offset %lu on %s\n", 601 "at offset %lu on %s\n",
604 __func__, 602 __func__, blocknr, journal->j_devname);
605 blocknr,
606 bdevname(journal->j_dev, b));
607 err = -EIO; 603 err = -EIO;
608 __journal_abort_soft(journal, err); 604 __journal_abort_soft(journal, err);
609 } 605 }
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
901 897
902static void jbd2_stats_proc_init(journal_t *journal) 898static void jbd2_stats_proc_init(journal_t *journal)
903{ 899{
904 char name[BDEVNAME_SIZE]; 900 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
905
906 bdevname(journal->j_dev, name);
907 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
908 if (journal->j_proc_entry) { 901 if (journal->j_proc_entry) {
909 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 902 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
910 &jbd2_seq_history_fops, journal); 903 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
915 908
916static void jbd2_stats_proc_exit(journal_t *journal) 909static void jbd2_stats_proc_exit(journal_t *journal)
917{ 910{
918 char name[BDEVNAME_SIZE];
919
920 bdevname(journal->j_dev, name);
921 remove_proc_entry("info", journal->j_proc_entry); 911 remove_proc_entry("info", journal->j_proc_entry);
922 remove_proc_entry("history", journal->j_proc_entry); 912 remove_proc_entry("history", journal->j_proc_entry);
923 remove_proc_entry(name, proc_jbd2_stats); 913 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
924} 914}
925 915
926static void journal_init_stats(journal_t *journal) 916static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1018{ 1008{
1019 journal_t *journal = journal_init_common(); 1009 journal_t *journal = journal_init_common();
1020 struct buffer_head *bh; 1010 struct buffer_head *bh;
1011 char *p;
1021 int n; 1012 int n;
1022 1013
1023 if (!journal) 1014 if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1039 journal->j_fs_dev = fs_dev; 1030 journal->j_fs_dev = fs_dev;
1040 journal->j_blk_offset = start; 1031 journal->j_blk_offset = start;
1041 journal->j_maxlen = len; 1032 journal->j_maxlen = len;
1033 bdevname(journal->j_dev, journal->j_devname);
1034 p = journal->j_devname;
1035 while ((p = strchr(p, '/')))
1036 *p = '!';
1042 jbd2_stats_proc_init(journal); 1037 jbd2_stats_proc_init(journal);
1043 1038
1044 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1061{ 1056{
1062 struct buffer_head *bh; 1057 struct buffer_head *bh;
1063 journal_t *journal = journal_init_common(); 1058 journal_t *journal = journal_init_common();
1059 char *p;
1064 int err; 1060 int err;
1065 int n; 1061 int n;
1066 unsigned long long blocknr; 1062 unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1070 1066
1071 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1067 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
1072 journal->j_inode = inode; 1068 journal->j_inode = inode;
1069 bdevname(journal->j_dev, journal->j_devname);
1070 p = journal->j_devname;
1071 while ((p = strchr(p, '/')))
1072 *p = '!';
1073 p = journal->j_devname + strlen(journal->j_devname);
1074 sprintf(p, ":%lu", journal->j_inode->i_ino);
1073 jbd_debug(1, 1075 jbd_debug(1,
1074 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1076 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1075 journal, inode->i_sb->s_id, inode->i_ino, 1077 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1253 goto out; 1255 goto out;
1254 } 1256 }
1255 1257
1258 if (buffer_write_io_error(bh)) {
1259 /*
1260 * Oh, dear. A previous attempt to write the journal
1261 * superblock failed. This could happen because the
1262 * USB device was yanked out. Or it could happen to
1263 * be a transient write error and maybe the block will
1264 * be remapped. Nothing we can do but to retry the
1265 * write and hope for the best.
1266 */
1267 printk(KERN_ERR "JBD2: previous I/O error detected "
1268 "for journal superblock update for %s.\n",
1269 journal->j_devname);
1270 clear_buffer_write_io_error(bh);
1271 set_buffer_uptodate(bh);
1272 }
1273
1256 spin_lock(&journal->j_state_lock); 1274 spin_lock(&journal->j_state_lock);
1257 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1275 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1258 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1276 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1264 1282
1265 BUFFER_TRACE(bh, "marking dirty"); 1283 BUFFER_TRACE(bh, "marking dirty");
1266 mark_buffer_dirty(bh); 1284 mark_buffer_dirty(bh);
1267 if (wait) 1285 if (wait) {
1268 sync_dirty_buffer(bh); 1286 sync_dirty_buffer(bh);
1269 else 1287 if (buffer_write_io_error(bh)) {
1288 printk(KERN_ERR "JBD2: I/O error detected "
1289 "when updating journal superblock for %s.\n",
1290 journal->j_devname);
1291 clear_buffer_write_io_error(bh);
1292 set_buffer_uptodate(bh);
1293 }
1294 } else
1270 ll_rw_block(SWRITE, 1, &bh); 1295 ll_rw_block(SWRITE, 1, &bh);
1271 1296
1272out: 1297out:
@@ -1761,23 +1786,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1761} 1786}
1762 1787
1763/* 1788/*
1764 * journal_dev_name: format a character string to describe on what
1765 * device this journal is present.
1766 */
1767
1768static const char *journal_dev_name(journal_t *journal, char *buffer)
1769{
1770 struct block_device *bdev;
1771
1772 if (journal->j_inode)
1773 bdev = journal->j_inode->i_sb->s_bdev;
1774 else
1775 bdev = journal->j_dev;
1776
1777 return bdevname(bdev, buffer);
1778}
1779
1780/*
1781 * Journal abort has very specific semantics, which we describe 1789 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1790 * for journal abort.
1783 * 1791 *
@@ -1793,13 +1801,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1793void __jbd2_journal_abort_hard(journal_t *journal) 1801void __jbd2_journal_abort_hard(journal_t *journal)
1794{ 1802{
1795 transaction_t *transaction; 1803 transaction_t *transaction;
1796 char b[BDEVNAME_SIZE];
1797 1804
1798 if (journal->j_flags & JBD2_ABORT) 1805 if (journal->j_flags & JBD2_ABORT)
1799 return; 1806 return;
1800 1807
1801 printk(KERN_ERR "Aborting journal on device %s.\n", 1808 printk(KERN_ERR "Aborting journal on device %s.\n",
1802 journal_dev_name(journal, b)); 1809 journal->j_devname);
1803 1810
1804 spin_lock(&journal->j_state_lock); 1811 spin_lock(&journal->j_state_lock);
1805 journal->j_flags |= JBD2_ABORT; 1812 journal->j_flags |= JBD2_ABORT;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e06..29ff57ec5d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -990,15 +990,6 @@ out:
990} 990}
991 991
992/* 992/*
993 * This is only valid for leaf nodes, which are the only ones that can
994 * have empty extents anyway.
995 */
996static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
997{
998 return !rec->e_leaf_clusters;
999}
1000
1001/*
1002 * This function will discard the rightmost extent record. 993 * This function will discard the rightmost extent record.
1003 */ 994 */
1004static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 995static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd801..60cd3d59230 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
146 return le16_to_cpu(rec->e_leaf_clusters); 146 return le16_to_cpu(rec->e_leaf_clusters);
147} 147}
148 148
149/*
150 * This is only valid for leaf nodes, which are the only ones that can
151 * have empty extents anyway.
152 */
153static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
154{
155 return !rec->e_leaf_clusters;
156}
157
149#endif /* OCFS2_ALLOC_H */ 158#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326f..aed268e80b4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fiemap.h>
28 29
29#define MLOG_MASK_PREFIX ML_EXTENT_MAP 30#define MLOG_MASK_PREFIX ML_EXTENT_MAP
30#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32#include "ocfs2.h" 33#include "ocfs2.h"
33 34
34#include "alloc.h" 35#include "alloc.h"
36#include "dlmglue.h"
35#include "extent_map.h" 37#include "extent_map.h"
36#include "inode.h" 38#include "inode.h"
37#include "super.h" 39#include "super.h"
@@ -282,6 +284,51 @@ out:
282 kfree(new_emi); 284 kfree(new_emi);
283} 285}
284 286
287static int ocfs2_last_eb_is_empty(struct inode *inode,
288 struct ocfs2_dinode *di)
289{
290 int ret, next_free;
291 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
292 struct buffer_head *eb_bh = NULL;
293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el;
295
296 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
297 &eb_bh, OCFS2_BH_CACHED, inode);
298 if (ret) {
299 mlog_errno(ret);
300 goto out;
301 }
302
303 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
304 el = &eb->h_list;
305
306 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
307 ret = -EROFS;
308 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
309 goto out;
310 }
311
312 if (el->l_tree_depth) {
313 ocfs2_error(inode->i_sb,
314 "Inode %lu has non zero tree depth in "
315 "leaf block %llu\n", inode->i_ino,
316 (unsigned long long)eb_bh->b_blocknr);
317 ret = -EROFS;
318 goto out;
319 }
320
321 next_free = le16_to_cpu(el->l_next_free_rec);
322
323 if (next_free == 0 ||
324 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
325 ret = 1;
326
327out:
328 brelse(eb_bh);
329 return ret;
330}
331
285/* 332/*
286 * Return the 1st index within el which contains an extent start 333 * Return the 1st index within el which contains an extent start
287 * larger than v_cluster. 334 * larger than v_cluster.
@@ -373,42 +420,28 @@ out:
373 return ret; 420 return ret;
374} 421}
375 422
376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 423static int ocfs2_get_clusters_nocache(struct inode *inode,
377 u32 *p_cluster, u32 *num_clusters, 424 struct buffer_head *di_bh,
378 unsigned int *extent_flags) 425 u32 v_cluster, unsigned int *hole_len,
426 struct ocfs2_extent_rec *ret_rec,
427 unsigned int *is_last)
379{ 428{
380 int ret, i; 429 int i, ret, tree_height, len;
381 unsigned int flags = 0;
382 struct buffer_head *di_bh = NULL;
383 struct buffer_head *eb_bh = NULL;
384 struct ocfs2_dinode *di; 430 struct ocfs2_dinode *di;
385 struct ocfs2_extent_block *eb; 431 struct ocfs2_extent_block *uninitialized_var(eb);
386 struct ocfs2_extent_list *el; 432 struct ocfs2_extent_list *el;
387 struct ocfs2_extent_rec *rec; 433 struct ocfs2_extent_rec *rec;
388 u32 coff; 434 struct buffer_head *eb_bh = NULL;
389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
397 num_clusters, extent_flags);
398 if (ret == 0)
399 goto out;
400 435
401 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 436 memset(ret_rec, 0, sizeof(*ret_rec));
402 &di_bh, OCFS2_BH_CACHED, inode); 437 if (is_last)
403 if (ret) { 438 *is_last = 0;
404 mlog_errno(ret);
405 goto out;
406 }
407 439
408 di = (struct ocfs2_dinode *) di_bh->b_data; 440 di = (struct ocfs2_dinode *) di_bh->b_data;
409 el = &di->id2.i_list; 441 el = &di->id2.i_list;
442 tree_height = le16_to_cpu(el->l_tree_depth);
410 443
411 if (el->l_tree_depth) { 444 if (tree_height > 0) {
412 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 445 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
413 if (ret) { 446 if (ret) {
414 mlog_errno(ret); 447 mlog_errno(ret);
@@ -431,46 +464,143 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
431 i = ocfs2_search_extent_list(el, v_cluster); 464 i = ocfs2_search_extent_list(el, v_cluster);
432 if (i == -1) { 465 if (i == -1) {
433 /* 466 /*
434 * A hole was found. Return some canned values that 467 * Holes can be larger than the maximum size of an
435 * callers can key on. If asked for, num_clusters will 468 * extent, so we return their lengths in a separate
436 * be populated with the size of the hole. 469 * field.
437 */ 470 */
438 *p_cluster = 0; 471 if (hole_len) {
439 if (num_clusters) {
440 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 472 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
441 v_cluster, 473 v_cluster, &len);
442 num_clusters);
443 if (ret) { 474 if (ret) {
444 mlog_errno(ret); 475 mlog_errno(ret);
445 goto out; 476 goto out;
446 } 477 }
478
479 *hole_len = len;
447 } 480 }
448 } else { 481 goto out_hole;
449 rec = &el->l_recs[i]; 482 }
450 483
451 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 484 rec = &el->l_recs[i];
452 485
453 if (!rec->e_blkno) { 486 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
454 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 487
455 "record (%u, %u, 0)", inode->i_ino, 488 if (!rec->e_blkno) {
456 le32_to_cpu(rec->e_cpos), 489 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
457 ocfs2_rec_clusters(el, rec)); 490 "record (%u, %u, 0)", inode->i_ino,
458 ret = -EROFS; 491 le32_to_cpu(rec->e_cpos),
459 goto out; 492 ocfs2_rec_clusters(el, rec));
493 ret = -EROFS;
494 goto out;
495 }
496
497 *ret_rec = *rec;
498
499 /*
500 * Checking for last extent is potentially expensive - we
501 * might have to look at the next leaf over to see if it's
502 * empty.
503 *
504 * The first two checks are to see whether the caller even
505 * cares for this information, and if the extent is at least
506 * the last in its list.
507 *
508 * If those hold true, then the extent is last if any of the
509 * additional conditions hold true:
510 * - Extent list is in-inode
511 * - Extent list is right-most
512 * - Extent list is 2nd to rightmost, with empty right-most
513 */
514 if (is_last) {
515 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
516 if (tree_height == 0)
517 *is_last = 1;
518 else if (eb->h_blkno == di->i_last_eb_blk)
519 *is_last = 1;
520 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
521 ret = ocfs2_last_eb_is_empty(inode, di);
522 if (ret < 0) {
523 mlog_errno(ret);
524 goto out;
525 }
526 if (ret == 1)
527 *is_last = 1;
528 }
460 } 529 }
530 }
531
532out_hole:
533 ret = 0;
534out:
535 brelse(eb_bh);
536 return ret;
537}
538
539static void ocfs2_relative_extent_offsets(struct super_block *sb,
540 u32 v_cluster,
541 struct ocfs2_extent_rec *rec,
542 u32 *p_cluster, u32 *num_clusters)
543
544{
545 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
546
547 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
548 *p_cluster = *p_cluster + coff;
549
550 if (num_clusters)
551 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
552}
553
554int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
555 u32 *p_cluster, u32 *num_clusters,
556 unsigned int *extent_flags)
557{
558 int ret;
559 unsigned int uninitialized_var(hole_len), flags = 0;
560 struct buffer_head *di_bh = NULL;
561 struct ocfs2_extent_rec rec;
461 562
462 coff = v_cluster - le32_to_cpu(rec->e_cpos); 563 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
564 ret = -ERANGE;
565 mlog_errno(ret);
566 goto out;
567 }
463 568
464 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 569 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
465 le64_to_cpu(rec->e_blkno)); 570 num_clusters, extent_flags);
466 *p_cluster = *p_cluster + coff; 571 if (ret == 0)
572 goto out;
467 573
468 if (num_clusters) 574 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
469 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 575 &di_bh, OCFS2_BH_CACHED, inode);
576 if (ret) {
577 mlog_errno(ret);
578 goto out;
579 }
470 580
471 flags = rec->e_flags; 581 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
582 &rec, NULL);
583 if (ret) {
584 mlog_errno(ret);
585 goto out;
586 }
472 587
473 ocfs2_extent_map_insert_rec(inode, rec); 588 if (rec.e_blkno == 0ULL) {
589 /*
590 * A hole was found. Return some canned values that
591 * callers can key on. If asked for, num_clusters will
592 * be populated with the size of the hole.
593 */
594 *p_cluster = 0;
595 if (num_clusters) {
596 *num_clusters = hole_len;
597 }
598 } else {
599 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
600 p_cluster, num_clusters);
601 flags = rec.e_flags;
602
603 ocfs2_extent_map_insert_rec(inode, &rec);
474 } 604 }
475 605
476 if (extent_flags) 606 if (extent_flags)
@@ -478,7 +608,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
478 608
479out: 609out:
480 brelse(di_bh); 610 brelse(di_bh);
481 brelse(eb_bh);
482 return ret; 611 return ret;
483} 612}
484 613
@@ -521,3 +650,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
521out: 650out:
522 return ret; 651 return ret;
523} 652}
653
654static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
655 struct fiemap_extent_info *fieinfo,
656 u64 map_start)
657{
658 int ret;
659 unsigned int id_count;
660 struct ocfs2_dinode *di;
661 u64 phys;
662 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
663 struct ocfs2_inode_info *oi = OCFS2_I(inode);
664
665 di = (struct ocfs2_dinode *)di_bh->b_data;
666 id_count = le16_to_cpu(di->id2.i_data.id_count);
667
668 if (map_start < id_count) {
669 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
670 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
671
672 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
673 flags);
674 if (ret < 0)
675 return ret;
676 }
677
678 return 0;
679}
680
681#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
682
683int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
684 u64 map_start, u64 map_len)
685{
686 int ret, is_last;
687 u32 mapping_end, cpos;
688 unsigned int hole_size;
689 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
690 u64 len_bytes, phys_bytes, virt_bytes;
691 struct buffer_head *di_bh = NULL;
692 struct ocfs2_extent_rec rec;
693
694 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
695 if (ret)
696 return ret;
697
698 ret = ocfs2_inode_lock(inode, &di_bh, 0);
699 if (ret) {
700 mlog_errno(ret);
701 goto out;
702 }
703
704 down_read(&OCFS2_I(inode)->ip_alloc_sem);
705
706 /*
707 * Handle inline-data separately.
708 */
709 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
710 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
711 goto out_unlock;
712 }
713
714 cpos = map_start >> osb->s_clustersize_bits;
715 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
716 map_start + map_len);
717 mapping_end -= cpos;
718 is_last = 0;
719 while (cpos < mapping_end && !is_last) {
720 u32 fe_flags;
721
722 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
723 &hole_size, &rec, &is_last);
724 if (ret) {
725 mlog_errno(ret);
726 goto out;
727 }
728
729 if (rec.e_blkno == 0ULL) {
730 cpos += hole_size;
731 continue;
732 }
733
734 fe_flags = 0;
735 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
736 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
737 if (is_last)
738 fe_flags |= FIEMAP_EXTENT_LAST;
739 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
740 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
741 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
742
743 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
744 len_bytes, fe_flags);
745 if (ret)
746 break;
747
748 cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
749 }
750
751 if (ret > 0)
752 ret = 0;
753
754out_unlock:
755 brelse(di_bh);
756
757 up_read(&OCFS2_I(inode)->ip_alloc_sem);
758
759 ocfs2_inode_unlock(inode, 0);
760out:
761
762 return ret;
763}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a2..1b97490e1ea 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags); 51 u64 *ret_count, unsigned int *extent_flags);
52 52
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len);
55
53#endif /* _EXTENT_MAP_H */ 56#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3da..ed38796052d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@ const struct inode_operations ocfs2_file_iops = {
2228 .getattr = ocfs2_getattr, 2228 .getattr = ocfs2_getattr,
2229 .permission = ocfs2_permission, 2229 .permission = ocfs2_permission,
2230 .fallocate = ocfs2_fallocate, 2230 .fallocate = ocfs2_fallocate,
2231 .fiemap = ocfs2_fiemap,
2231}; 2232};
2232 2233
2233const struct inode_operations ocfs2_special_file_iops = { 2234const struct inode_operations ocfs2_special_file_iops = {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 80171ee89a2..8120fa1bc23 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -837,6 +837,8 @@ extern void ext3_truncate (struct inode *);
837extern void ext3_set_inode_flags(struct inode *); 837extern void ext3_set_inode_flags(struct inode *);
838extern void ext3_get_inode_flags(struct ext3_inode_info *); 838extern void ext3_get_inode_flags(struct ext3_inode_info *);
839extern void ext3_set_aops(struct inode *inode); 839extern void ext3_set_aops(struct inode *inode);
840extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
841 u64 start, u64 len);
840 842
841/* ioctl.c */ 843/* ioctl.c */
842extern int ext3_ioctl (struct inode *, struct file *, unsigned int, 844extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h
new file mode 100644
index 00000000000..671decbd2ae
--- /dev/null
+++ b/include/linux/fiemap.h
@@ -0,0 +1,64 @@
1/*
2 * FS_IOC_FIEMAP ioctl infrastructure.
3 *
4 * Some portions copyright (C) 2007 Cluster File Systems, Inc
5 *
6 * Authors: Mark Fasheh <mfasheh@suse.com>
7 * Kalpak Shah <kalpak.shah@sun.com>
8 * Andreas Dilger <adilger@sun.com>
9 */
10
11#ifndef _LINUX_FIEMAP_H
12#define _LINUX_FIEMAP_H
13
14struct fiemap_extent {
15 __u64 fe_logical; /* logical offset in bytes for the start of
16 * the extent from the beginning of the file */
17 __u64 fe_physical; /* physical offset in bytes for the start
18 * of the extent from the beginning of the disk */
19 __u64 fe_length; /* length in bytes for this extent */
20 __u64 fe_reserved64[2];
21 __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
22 __u32 fe_reserved[3];
23};
24
25struct fiemap {
26 __u64 fm_start; /* logical offset (inclusive) at
27 * which to start mapping (in) */
28 __u64 fm_length; /* logical length of mapping which
29 * userspace wants (in) */
30 __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
31 __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
32 __u32 fm_extent_count; /* size of fm_extents array (in) */
33 __u32 fm_reserved;
34 struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
35};
36
37#define FIEMAP_MAX_OFFSET (~0ULL)
38
39#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
40#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
41
42#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
43
44#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
45#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
46#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
47 * Sets EXTENT_UNKNOWN. */
48#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
49 * while fs is unmounted */
50#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
51 * Sets EXTENT_NO_BYPASS. */
52#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
53 * block aligned. */
54#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
55 * Sets EXTENT_NOT_ALIGNED.*/
56#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
57 * Sets EXTENT_NOT_ALIGNED.*/
58#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
59 * no data (i.e. zero). */
60#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
61 * support extents. Result
62 * merged for efficiency. */
63
64#endif /* _LINUX_FIEMAP_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 32477e8872d..44e3cb2f196 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -234,6 +234,7 @@ extern int dir_notify_enable;
234#define FS_IOC_SETFLAGS _IOW('f', 2, long) 234#define FS_IOC_SETFLAGS _IOW('f', 2, long)
235#define FS_IOC_GETVERSION _IOR('v', 1, long) 235#define FS_IOC_GETVERSION _IOR('v', 1, long)
236#define FS_IOC_SETVERSION _IOW('v', 2, long) 236#define FS_IOC_SETVERSION _IOW('v', 2, long)
237#define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
237#define FS_IOC32_GETFLAGS _IOR('f', 1, int) 238#define FS_IOC32_GETFLAGS _IOR('f', 1, int)
238#define FS_IOC32_SETFLAGS _IOW('f', 2, int) 239#define FS_IOC32_SETFLAGS _IOW('f', 2, int)
239#define FS_IOC32_GETVERSION _IOR('v', 1, int) 240#define FS_IOC32_GETVERSION _IOR('v', 1, int)
@@ -294,6 +295,7 @@ extern int dir_notify_enable;
294#include <linux/mutex.h> 295#include <linux/mutex.h>
295#include <linux/capability.h> 296#include <linux/capability.h>
296#include <linux/semaphore.h> 297#include <linux/semaphore.h>
298#include <linux/fiemap.h>
297 299
298#include <asm/atomic.h> 300#include <asm/atomic.h>
299#include <asm/byteorder.h> 301#include <asm/byteorder.h>
@@ -1182,6 +1184,20 @@ extern void dentry_unhash(struct dentry *dentry);
1182extern int file_permission(struct file *, int); 1184extern int file_permission(struct file *, int);
1183 1185
1184/* 1186/*
1187 * VFS FS_IOC_FIEMAP helper definitions.
1188 */
1189struct fiemap_extent_info {
1190 unsigned int fi_flags; /* Flags as passed from user */
1191 unsigned int fi_extents_mapped; /* Number of mapped extents */
1192 unsigned int fi_extents_max; /* Size of fiemap_extent array */
1193 struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent
1194 * array */
1195};
1196int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
1197 u64 phys, u64 len, u32 flags);
1198int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
1199
1200/*
1185 * File types 1201 * File types
1186 * 1202 *
1187 * NOTE! These match bits 12..15 of stat.st_mode 1203 * NOTE! These match bits 12..15 of stat.st_mode
@@ -1290,6 +1306,8 @@ struct inode_operations {
1290 void (*truncate_range)(struct inode *, loff_t, loff_t); 1306 void (*truncate_range)(struct inode *, loff_t, loff_t);
1291 long (*fallocate)(struct inode *inode, int mode, loff_t offset, 1307 long (*fallocate)(struct inode *inode, int mode, loff_t offset,
1292 loff_t len); 1308 loff_t len);
1309 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
1310 u64 len);
1293}; 1311};
1294 1312
1295struct seq_file; 1313struct seq_file;
@@ -1987,6 +2005,9 @@ extern int vfs_fstat(unsigned int, struct kstat *);
1987 2005
1988extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, 2006extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
1989 unsigned long arg); 2007 unsigned long arg);
2008extern int generic_block_fiemap(struct inode *inode,
2009 struct fiemap_extent_info *fieinfo, u64 start,
2010 u64 len, get_block_t *get_block);
1990 2011
1991extern void get_filesystem(struct file_system_type *fs); 2012extern void get_filesystem(struct file_system_type *fs);
1992extern void put_filesystem(struct file_system_type *fs); 2013extern void put_filesystem(struct file_system_type *fs);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 3dd20900709..66c3499478b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -850,7 +850,8 @@ struct journal_s
850 */ 850 */
851 struct block_device *j_dev; 851 struct block_device *j_dev;
852 int j_blocksize; 852 int j_blocksize;
853 unsigned long long j_blk_offset; 853 unsigned long long j_blk_offset;
854 char j_devname[BDEVNAME_SIZE+24];
854 855
855 /* 856 /*
856 * Device which holds the client fs. For internal journal this will be 857 * Device which holds the client fs. For internal journal this will be
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 20838883535..9007ccdfc11 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
35void percpu_counter_destroy(struct percpu_counter *fbc); 35void percpu_counter_destroy(struct percpu_counter *fbc);
36void percpu_counter_set(struct percpu_counter *fbc, s64 amount); 36void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
37void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); 37void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
38s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); 38s64 __percpu_counter_sum(struct percpu_counter *fbc);
39 39
40static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) 40static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
41{ 41{
@@ -44,19 +44,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
44 44
45static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) 45static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
46{ 46{
47 s64 ret = __percpu_counter_sum(fbc, 0); 47 s64 ret = __percpu_counter_sum(fbc);
48 return ret < 0 ? 0 : ret; 48 return ret < 0 ? 0 : ret;
49} 49}
50 50
51static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
52{
53 return __percpu_counter_sum(fbc, 1);
54}
55
56
57static inline s64 percpu_counter_sum(struct percpu_counter *fbc) 51static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
58{ 52{
59 return __percpu_counter_sum(fbc, 0); 53 return __percpu_counter_sum(fbc);
60} 54}
61 55
62static inline s64 percpu_counter_read(struct percpu_counter *fbc) 56static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 4a8ba4bf5f6..a8663890a88 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
52 * Add up all the per-cpu counts, return the result. This is a more accurate 52 * Add up all the per-cpu counts, return the result. This is a more accurate
53 * but much slower version of percpu_counter_read_positive() 53 * but much slower version of percpu_counter_read_positive()
54 */ 54 */
55s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) 55s64 __percpu_counter_sum(struct percpu_counter *fbc)
56{ 56{
57 s64 ret; 57 s64 ret;
58 int cpu; 58 int cpu;
@@ -62,11 +62,9 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
62 for_each_online_cpu(cpu) { 62 for_each_online_cpu(cpu) {
63 s32 *pcount = per_cpu_ptr(fbc->counters, cpu); 63 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
64 ret += *pcount; 64 ret += *pcount;
65 if (set) 65 *pcount = 0;
66 *pcount = 0;
67 } 66 }
68 if (set) 67 fbc->count = ret;
69 fbc->count = ret;
70 68
71 spin_unlock(&fbc->lock); 69 spin_unlock(&fbc->lock);
72 return ret; 70 return ret;